fs/btrfs/tree-log.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2008 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/slab.h>
   8 #include <linux/blkdev.h>
   9 #include <linux/list_sort.h>
  10 #include <linux/iversion.h>
  11 #include "misc.h"
  12 #include "ctree.h"
  13 #include "tree-log.h"
  14 #include "disk-io.h"
  15 #include "locking.h"
  16 #include "backref.h"
  17 #include "compression.h"
  18 #include "qgroup.h"
  19 #include "block-group.h"
  20 #include "space-info.h"
  21 #include "inode-item.h"
  22 #include "fs.h"
  23 #include "accessors.h"
  24 #include "extent-tree.h"
  25 #include "root-tree.h"
  26 #include "dir-item.h"
  27 #include "file-item.h"
  28 #include "file.h"
  29 #include "orphan.h"
  30 #include "tree-checker.h"
  31
  32 #define MAX_CONFLICT_INODES 10
  33
  34 /* magic values for the inode_only field in btrfs_log_inode:
  35  *
  36  * LOG_INODE_ALL means to log everything
  37  * LOG_INODE_EXISTS means to log just enough to recreate the inode
  38  * during log replay
  39  */
  40 enum {
  41         LOG_INODE_ALL,
  42         LOG_INODE_EXISTS,
  43 };
  44
  45 /*
  46  * directory trouble cases
  47  *
  48  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
  49  * log, we must force a full commit before doing an fsync of the directory
  50  * where the unlink was done.
  51  * ---> record transid of last unlink/rename per directory
  52  *
  53  * mkdir foo/some_dir
  54  * normal commit
  55  * rename foo/some_dir foo2/some_dir
  56  * mkdir foo/some_dir
  57  * fsync foo/some_dir/some_file
  58  *
  59  * The fsync above will unlink the original some_dir without recording
  60  * it in its new location (foo2).  After a crash, some_dir will be gone
  61  * unless the fsync of some_file forces a full commit
  62  *
  63  * 2) we must log any new names for any file or dir that is in the fsync
  64  * log. ---> check inode while renaming/linking.
  65  *
  66  * 2a) we must log any new names for any file or dir during rename
  67  * when the directory they are being removed from was logged.
  68  * ---> check inode and old parent dir during rename
  69  *
  70  *  2a is actually the more important variant.  With the extra logging
  71  *  a crash might unlink the old name without recreating the new one
  72  *
  73  * 3) after a crash, we must go through any directories with a link count
  74  * of zero and redo the rm -rf
  75  *
  76  * mkdir f1/foo
  77  * normal commit
  78  * rm -rf f1/foo
  79  * fsync(f1)
  80  *
  81  * The directory f1 was fully removed from the FS, but fsync was never
  82  * called on f1, only its parent dir.  After a crash the rm -rf must
  83  * be replayed.  This must be able to recurse down the entire
  84  * directory tree.  The inode link count fixup code takes care of the
  85  * ugly details.
  86  */
  87
  88 /*
  89  * stages for the tree walking.  The first
  90  * stage (0) is to only pin down the blocks we find
  91  * the second stage (1) is to make sure that all the inodes
  92  * we find in the log are created in the subvolume.
  93  *
  94  * The last stage is to deal with directories and links and extents
  95  * and all the other fun semantics
  96  */
  97 enum {
  98         LOG_WALK_PIN_ONLY,
  99         LOG_WALK_REPLAY_INODES,
 100         LOG_WALK_REPLAY_DIR_INDEX,
 101         LOG_WALK_REPLAY_ALL,
 102 };
 103
 104 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 105                            struct btrfs_inode *inode,
 106                            int inode_only,
 107                            struct btrfs_log_ctx *ctx);
 108 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 109                              struct btrfs_root *root,
 110                              struct btrfs_path *path, u64 objectid);
 111 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 112                                        struct btrfs_root *root,
 113                                        struct btrfs_root *log,
 114                                        struct btrfs_path *path,
 115                                        u64 dirid, int del_all);
 116 static void wait_log_commit(struct btrfs_root *root, int transid);
 117
 118 /*
 119  * tree logging is a special write ahead log used to make sure that
 120  * fsyncs and O_SYNCs can happen without doing full tree commits.
 121  *
 122  * Full tree commits are expensive because they require commonly
 123  * modified blocks to be recowed, creating many dirty pages in the
 124  * extent tree an 4x-6x higher write load than ext3.
 125  *
 126  * Instead of doing a tree commit on every fsync, we use the
 127  * key ranges and transaction ids to find items for a given file or directory
 128  * that have changed in this transaction.  Those items are copied into
 129  * a special tree (one per subvolume root), that tree is written to disk
 130  * and then the fsync is considered complete.
 131  *
 132  * After a crash, items are copied out of the log-tree back into the
 133  * subvolume tree.  Any file data extents found are recorded in the extent
 134  * allocation tree, and the log-tree freed.
 135  *
 136  * The log tree is read three times, once to pin down all the extents it is
 137  * using in ram and once, once to create all the inodes logged in the tree
 138  * and once to do all the other items.
 139  */
 140
 141 /*
 142  * start a sub transaction and setup the log tree
 143  * this increments the log tree writer count to make the people
 144  * syncing the tree wait for us to finish
 145  */
 146 static int start_log_trans(struct btrfs_trans_handle *trans,
 147                            struct btrfs_root *root,
 148                            struct btrfs_log_ctx *ctx)
 149 {
 150         struct btrfs_fs_info *fs_info = root->fs_info;
 151         struct btrfs_root *tree_root = fs_info->tree_root;
 152         const bool zoned = btrfs_is_zoned(fs_info);
 153         int ret = 0;
 154         bool created = false;
 155
 156         /*
 157          * First check if the log root tree was already created. If not, create
 158          * it before locking the root's log_mutex, just to keep lockdep happy.
 159          */
 160         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state)) {
 161                 mutex_lock(&tree_root->log_mutex);
 162                 if (!fs_info->log_root_tree) {
 163                         ret = btrfs_init_log_root_tree(trans, fs_info);
 164                         if (!ret) {
 165                                 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &tree_root->state);
 166                                 created = true;
 167                         }
 168                 }
 169                 mutex_unlock(&tree_root->log_mutex);
 170                 if (ret)
 171                         return ret;
 172         }
 173
 174         mutex_lock(&root->log_mutex);
 175
 176 again:
 177         if (root->log_root) {
 178                 int index = (root->log_transid + 1) % 2;
 179
 180                 if (btrfs_need_log_full_commit(trans)) {
 181                         ret = BTRFS_LOG_FORCE_COMMIT;
 182                         goto out;
 183                 }
 184
 185                 if (zoned && atomic_read(&root->log_commit[index])) {
 186                         wait_log_commit(root, root->log_transid - 1);
 187                         goto again;
 188                 }
 189
 190                 if (!root->log_start_pid) {
 191                         clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 192                         root->log_start_pid = current->pid;
 193                 } else if (root->log_start_pid != current->pid) {
 194                         set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 195                 }
 196         } else {
 197                 /*
 198                  * This means fs_info->log_root_tree was already created
 199                  * for some other FS trees. Do the full commit not to mix
 200                  * nodes from multiple log transactions to do sequential
 201                  * writing.
 202                  */
 203                 if (zoned && !created) {
 204                         ret = BTRFS_LOG_FORCE_COMMIT;
 205                         goto out;
 206                 }
 207
 208                 ret = btrfs_add_log_tree(trans, root);
 209                 if (ret)
 210                         goto out;
 211
 212                 set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
 213                 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 214                 root->log_start_pid = current->pid;
 215         }
 216
 217         atomic_inc(&root->log_writers);
 218         if (!ctx->logging_new_name) {
 219                 int index = root->log_transid % 2;
 220                 list_add_tail(&ctx->list, &root->log_ctxs[index]);
 221                 ctx->log_transid = root->log_transid;
 222         }
 223
 224 out:
 225         mutex_unlock(&root->log_mutex);
 226         return ret;
 227 }
 228
 229 /*
 230  * returns 0 if there was a log transaction running and we were able
 231  * to join, or returns -ENOENT if there were not transactions
 232  * in progress
 233  */
 234 static int join_running_log_trans(struct btrfs_root *root)
 235 {
 236         const bool zoned = btrfs_is_zoned(root->fs_info);
 237         int ret = -ENOENT;
 238
 239         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
 240                 return ret;
 241
 242         mutex_lock(&root->log_mutex);
 243 again:
 244         if (root->log_root) {
 245                 int index = (root->log_transid + 1) % 2;
 246
 247                 ret = 0;
 248                 if (zoned && atomic_read(&root->log_commit[index])) {
 249                         wait_log_commit(root, root->log_transid - 1);
 250                         goto again;
 251                 }
 252                 atomic_inc(&root->log_writers);
 253         }
 254         mutex_unlock(&root->log_mutex);
 255         return ret;
 256 }
 257
 258 /*
 259  * This either makes the current running log transaction wait
 260  * until you call btrfs_end_log_trans() or it makes any future
 261  * log transactions wait until you call btrfs_end_log_trans()
 262  */
 263 void btrfs_pin_log_trans(struct btrfs_root *root)
 264 {
 265         atomic_inc(&root->log_writers);
 266 }
 267
 268 /*
 269  * indicate we're done making changes to the log tree
 270  * and wake up anyone waiting to do a sync
 271  */
 272 void btrfs_end_log_trans(struct btrfs_root *root)
 273 {
 274         if (atomic_dec_and_test(&root->log_writers)) {
 275                 /* atomic_dec_and_test implies a barrier */
 276                 cond_wake_up_nomb(&root->log_writer_wait);
 277         }
 278 }
 279
 280 /*
 281  * the walk control struct is used to pass state down the chain when
 282  * processing the log tree.  The stage field tells us which part
 283  * of the log tree processing we are currently doing.  The others
 284  * are state fields used for that specific part
 285  */
 286 struct walk_control {
 287         /* should we free the extent on disk when done?  This is used
 288          * at transaction commit time while freeing a log tree
 289          */
 290         int free;
 291
 292         /* pin only walk, we record which extents on disk belong to the
 293          * log trees
 294          */
 295         int pin;
 296
 297         /* what stage of the replay code we're currently in */
 298         int stage;
 299
 300         /*
 301          * Ignore any items from the inode currently being processed. Needs
 302          * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
 303          * the LOG_WALK_REPLAY_INODES stage.
 304          */
 305         bool ignore_cur_inode;
 306
 307         /* the root we are currently replaying */
 308         struct btrfs_root *replay_dest;
 309
 310         /* the trans handle for the current replay */
 311         struct btrfs_trans_handle *trans;
 312
 313         /* the function that gets used to process blocks we find in the
 314          * tree.  Note the extent_buffer might not be up to date when it is
 315          * passed in, and it must be checked or read if you need the data
 316          * inside it
 317          */
 318         int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
 319                             struct walk_control *wc, u64 gen, int level);
 320 };
 321
 322 /*
 323  * process_func used to pin down extents, write them or wait on them
 324  */
 325 static int process_one_buffer(struct btrfs_root *log,
 326                               struct extent_buffer *eb,
 327                               struct walk_control *wc, u64 gen, int level)
 328 {
 329         struct btrfs_fs_info *fs_info = log->fs_info;
 330         int ret = 0;
 331
 332         /*
 333          * If this fs is mixed then we need to be able to process the leaves to
 334          * pin down any logged extents, so we have to read the block.
 335          */
 336         if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
 337                 struct btrfs_tree_parent_check check = {
 338                         .level = level,
 339                         .transid = gen
 340                 };
 341
 342                 ret = btrfs_read_extent_buffer(eb, &check);
 343                 if (ret)
 344                         return ret;
 345         }
 346
 347         if (wc->pin) {
 348                 ret = btrfs_pin_extent_for_log_replay(wc->trans, eb);
 349                 if (ret)
 350                         return ret;
 351
 352                 if (btrfs_buffer_uptodate(eb, gen, 0) &&
 353                     btrfs_header_level(eb) == 0)
 354                         ret = btrfs_exclude_logged_extents(eb);
 355         }
 356         return ret;
 357 }
 358
 359 /*
 360  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 361  * to the src data we are copying out.
 362  *
 363  * root is the tree we are copying into, and path is a scratch
 364  * path for use in this function (it should be released on entry and
 365  * will be released on exit).
 366  *
 367  * If the key is already in the destination tree the existing item is
 368  * overwritten.  If the existing item isn't big enough, it is extended.
 369  * If it is too large, it is truncated.
 370  *
 371  * If the key isn't in the destination yet, a new item is inserted.
 372  */
 373 static int overwrite_item(struct btrfs_trans_handle *trans,
 374                           struct btrfs_root *root,
 375                           struct btrfs_path *path,
 376                           struct extent_buffer *eb, int slot,
 377                           struct btrfs_key *key)
 378 {
 379         int ret;
 380         u32 item_size;
 381         u64 saved_i_size = 0;
 382         int save_old_i_size = 0;
 383         unsigned long src_ptr;
 384         unsigned long dst_ptr;
 385         bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 386
 387         /*
 388          * This is only used during log replay, so the root is always from a
 389          * fs/subvolume tree. In case we ever need to support a log root, then
 390          * we'll have to clone the leaf in the path, release the path and use
 391          * the leaf before writing into the log tree. See the comments at
 392          * copy_items() for more details.
 393          */
 394         ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
 395
 396         item_size = btrfs_item_size(eb, slot);
 397         src_ptr = btrfs_item_ptr_offset(eb, slot);
 398
 399         /* Look for the key in the destination tree. */
 400         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 401         if (ret < 0)
 402                 return ret;
 403
 404         if (ret == 0) {
 405                 char *src_copy;
 406                 char *dst_copy;
 407                 u32 dst_size = btrfs_item_size(path->nodes[0],
 408                                                   path->slots[0]);
 409                 if (dst_size != item_size)
 410                         goto insert;
 411
 412                 if (item_size == 0) {
 413                         btrfs_release_path(path);
 414                         return 0;
 415                 }
 416                 dst_copy = kmalloc(item_size, GFP_NOFS);
 417                 src_copy = kmalloc(item_size, GFP_NOFS);
 418                 if (!dst_copy || !src_copy) {
 419                         btrfs_release_path(path);
 420                         kfree(dst_copy);
 421                         kfree(src_copy);
 422                         return -ENOMEM;
 423                 }
 424
 425                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
 426
 427                 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 428                 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
 429                                    item_size);
 430                 ret = memcmp(dst_copy, src_copy, item_size);
 431
 432                 kfree(dst_copy);
 433                 kfree(src_copy);
 434                 /*
 435                  * they have the same contents, just return, this saves
 436                  * us from cowing blocks in the destination tree and doing
 437                  * extra writes that may not have been done by a previous
 438                  * sync
 439                  */
 440                 if (ret == 0) {
 441                         btrfs_release_path(path);
 442                         return 0;
 443                 }
 444
 445                 /*
 446                  * We need to load the old nbytes into the inode so when we
 447                  * replay the extents we've logged we get the right nbytes.
 448                  */
 449                 if (inode_item) {
 450                         struct btrfs_inode_item *item;
 451                         u64 nbytes;
 452                         u32 mode;
 453
 454                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 455                                               struct btrfs_inode_item);
 456                         nbytes = btrfs_inode_nbytes(path->nodes[0], item);
 457                         item = btrfs_item_ptr(eb, slot,
 458                                               struct btrfs_inode_item);
 459                         btrfs_set_inode_nbytes(eb, item, nbytes);
 460
 461                         /*
 462                          * If this is a directory we need to reset the i_size to
 463                          * 0 so that we can set it up properly when replaying
 464                          * the rest of the items in this log.
 465                          */
 466                         mode = btrfs_inode_mode(eb, item);
 467                         if (S_ISDIR(mode))
 468                                 btrfs_set_inode_size(eb, item, 0);
 469                 }
 470         } else if (inode_item) {
 471                 struct btrfs_inode_item *item;
 472                 u32 mode;
 473
 474                 /*
 475                  * New inode, set nbytes to 0 so that the nbytes comes out
 476                  * properly when we replay the extents.
 477                  */
 478                 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
 479                 btrfs_set_inode_nbytes(eb, item, 0);
 480
 481                 /*
 482                  * If this is a directory we need to reset the i_size to 0 so
 483                  * that we can set it up properly when replaying the rest of
 484                  * the items in this log.
 485                  */
 486                 mode = btrfs_inode_mode(eb, item);
 487                 if (S_ISDIR(mode))
 488                         btrfs_set_inode_size(eb, item, 0);
 489         }
 490 insert:
 491         btrfs_release_path(path);
 492         /* try to insert the key into the destination tree */
 493         path->skip_release_on_error = 1;
 494         ret = btrfs_insert_empty_item(trans, root, path,
 495                                       key, item_size);
 496         path->skip_release_on_error = 0;
 497
 498         /* make sure any existing item is the correct size */
 499         if (ret == -EEXIST || ret == -EOVERFLOW) {
 500                 u32 found_size;
 501                 found_size = btrfs_item_size(path->nodes[0],
 502                                                 path->slots[0]);
 503                 if (found_size > item_size)
 504                         btrfs_truncate_item(trans, path, item_size, 1);
 505                 else if (found_size < item_size)
 506                         btrfs_extend_item(trans, path, item_size - found_size);
 507         } else if (ret) {
 508                 return ret;
 509         }
 510         dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
 511                                         path->slots[0]);
 512
 513         /* don't overwrite an existing inode if the generation number
 514          * was logged as zero.  This is done when the tree logging code
 515          * is just logging an inode to make sure it exists after recovery.
 516          *
 517          * Also, don't overwrite i_size on directories during replay.
 518          * log replay inserts and removes directory items based on the
 519          * state of the tree found in the subvolume, and i_size is modified
 520          * as it goes
 521          */
 522         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
 523                 struct btrfs_inode_item *src_item;
 524                 struct btrfs_inode_item *dst_item;
 525
 526                 src_item = (struct btrfs_inode_item *)src_ptr;
 527                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 528
 529                 if (btrfs_inode_generation(eb, src_item) == 0) {
 530                         struct extent_buffer *dst_eb = path->nodes[0];
 531                         const u64 ino_size = btrfs_inode_size(eb, src_item);
 532
 533                         /*
 534                          * For regular files an ino_size == 0 is used only when
 535                          * logging that an inode exists, as part of a directory
 536                          * fsync, and the inode wasn't fsynced before. In this
 537                          * case don't set the size of the inode in the fs/subvol
 538                          * tree, otherwise we would be throwing valid data away.
 539                          */
 540                         if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
 541                             S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
 542                             ino_size != 0)
 543                                 btrfs_set_inode_size(dst_eb, dst_item, ino_size);
 544                         goto no_copy;
 545                 }
 546
 547                 if (S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
 548                     S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
 549                         save_old_i_size = 1;
 550                         saved_i_size = btrfs_inode_size(path->nodes[0],
 551                                                         dst_item);
 552                 }
 553         }
 554
 555         copy_extent_buffer(path->nodes[0], eb, dst_ptr,
 556                            src_ptr, item_size);
 557
 558         if (save_old_i_size) {
 559                 struct btrfs_inode_item *dst_item;
 560                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 561                 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
 562         }
 563
 564         /* make sure the generation is filled in */
 565         if (key->type == BTRFS_INODE_ITEM_KEY) {
 566                 struct btrfs_inode_item *dst_item;
 567                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 568                 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
 569                         btrfs_set_inode_generation(path->nodes[0], dst_item,
 570                                                    trans->transid);
 571                 }
 572         }
 573 no_copy:
 574         btrfs_mark_buffer_dirty(trans, path->nodes[0]);
 575         btrfs_release_path(path);
 576         return 0;
 577 }
 578
 579 static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len,
 580                                struct fscrypt_str *name)
 581 {
 582         char *buf;
 583
 584         buf = kmalloc(len, GFP_NOFS);
 585         if (!buf)
 586                 return -ENOMEM;
 587
 588         read_extent_buffer(eb, buf, (unsigned long)start, len);
 589         name->name = buf;
 590         name->len = len;
 591         return 0;
 592 }
 593
 594 /*
 595  * simple helper to read an inode off the disk from a given root
 596  * This can only be called for subvolume roots and not for the log
 597  */
 598 static noinline struct inode *read_one_inode(struct btrfs_root *root,
 599                                              u64 objectid)
 600 {
 601         struct inode *inode;
 602
 603         inode = btrfs_iget(root->fs_info->sb, objectid, root);
 604         if (IS_ERR(inode))
 605                 inode = NULL;
 606         return inode;
 607 }
 608
 609 /* replays a single extent in 'eb' at 'slot' with 'key' into the
 610  * subvolume 'root'.  path is released on entry and should be released
 611  * on exit.
 612  *
 613  * extents in the log tree have not been allocated out of the extent
 614  * tree yet.  So, this completes the allocation, taking a reference
 615  * as required if the extent already exists or creating a new extent
 616  * if it isn't in the extent allocation tree yet.
 617  *
 618  * The extent is inserted into the file, dropping any existing extents
 619  * from the file that overlap the new one.
 620  */
 621 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 622                                       struct btrfs_root *root,
 623                                       struct btrfs_path *path,
 624                                       struct extent_buffer *eb, int slot,
 625                                       struct btrfs_key *key)
 626 {
 627         struct btrfs_drop_extents_args drop_args = { 0 };
 628         struct btrfs_fs_info *fs_info = root->fs_info;
 629         int found_type;
 630         u64 extent_end;
 631         u64 start = key->offset;
 632         u64 nbytes = 0;
 633         struct btrfs_file_extent_item *item;
 634         struct inode *inode = NULL;
 635         unsigned long size;
 636         int ret = 0;
 637
 638         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 639         found_type = btrfs_file_extent_type(eb, item);
 640
 641         if (found_type == BTRFS_FILE_EXTENT_REG ||
 642             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 643                 nbytes = btrfs_file_extent_num_bytes(eb, item);
 644                 extent_end = start + nbytes;
 645
 646                 /*
 647                  * We don't add to the inodes nbytes if we are prealloc or a
 648                  * hole.
 649                  */
 650                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 651                         nbytes = 0;
 652         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 653                 size = btrfs_file_extent_ram_bytes(eb, item);
 654                 nbytes = btrfs_file_extent_ram_bytes(eb, item);
 655                 extent_end = ALIGN(start + size,
 656                                    fs_info->sectorsize);
 657         } else {
 658                 ret = 0;
 659                 goto out;
 660         }
 661
 662         inode = read_one_inode(root, key->objectid);
 663         if (!inode) {
 664                 ret = -EIO;
 665                 goto out;
 666         }
 667
 668         /*
 669          * first check to see if we already have this extent in the
 670          * file.  This must be done before the btrfs_drop_extents run
 671          * so we don't try to drop this extent.
 672          */
 673         ret = btrfs_lookup_file_extent(trans, root, path,
 674                         btrfs_ino(BTRFS_I(inode)), start, 0);
 675
 676         if (ret == 0 &&
 677             (found_type == BTRFS_FILE_EXTENT_REG ||
 678              found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 679                 struct btrfs_file_extent_item cmp1;
 680                 struct btrfs_file_extent_item cmp2;
 681                 struct btrfs_file_extent_item *existing;
 682                 struct extent_buffer *leaf;
 683
 684                 leaf = path->nodes[0];
 685                 existing = btrfs_item_ptr(leaf, path->slots[0],
 686                                           struct btrfs_file_extent_item);
 687
 688                 read_extent_buffer(eb, &cmp1, (unsigned long)item,
 689                                    sizeof(cmp1));
 690                 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
 691                                    sizeof(cmp2));
 692
 693                 /*
 694                  * we already have a pointer to this exact extent,
 695                  * we don't have to do anything
 696                  */
 697                 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
 698                         btrfs_release_path(path);
 699                         goto out;
 700                 }
 701         }
 702         btrfs_release_path(path);
 703
 704         /* drop any overlapping extents */
 705         drop_args.start = start;
 706         drop_args.end = extent_end;
 707         drop_args.drop_cache = true;
 708         ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
 709         if (ret)
 710                 goto out;
 711
 712         if (found_type == BTRFS_FILE_EXTENT_REG ||
 713             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 714                 u64 offset;
 715                 unsigned long dest_offset;
 716                 struct btrfs_key ins;
 717
 718                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
 719                     btrfs_fs_incompat(fs_info, NO_HOLES))
 720                         goto update_inode;
 721
 722                 ret = btrfs_insert_empty_item(trans, root, path, key,
 723                                               sizeof(*item));
 724                 if (ret)
 725                         goto out;
 726                 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 727                                                     path->slots[0]);
 728                 copy_extent_buffer(path->nodes[0], eb, dest_offset,
 729                                 (unsigned long)item,  sizeof(*item));
 730
 731                 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 732                 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 733                 ins.type = BTRFS_EXTENT_ITEM_KEY;
 734                 offset = key->offset - btrfs_file_extent_offset(eb, item);
 735
 736                 /*
 737                  * Manually record dirty extent, as here we did a shallow
 738                  * file extent item copy and skip normal backref update,
 739                  * but modifying extent tree all by ourselves.
 740                  * So need to manually record dirty extent for qgroup,
 741                  * as the owner of the file extent changed from log tree
 742                  * (doesn't affect qgroup) to fs/file tree(affects qgroup)
 743                  */
 744                 ret = btrfs_qgroup_trace_extent(trans,
 745                                 btrfs_file_extent_disk_bytenr(eb, item),
 746                                 btrfs_file_extent_disk_num_bytes(eb, item));
 747                 if (ret < 0)
 748                         goto out;
 749
 750                 if (ins.objectid > 0) {
 751                         struct btrfs_ref ref = { 0 };
 752                         u64 csum_start;
 753                         u64 csum_end;
 754                         LIST_HEAD(ordered_sums);
 755
 756                         /*
 757                          * is this extent already allocated in the extent
 758                          * allocation tree?  If so, just add a reference
 759                          */
 760                         ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
 761                                                 ins.offset);
 762                         if (ret < 0) {
 763                                 goto out;
 764                         } else if (ret == 0) {
 765                                 btrfs_init_generic_ref(&ref,
 766                                                 BTRFS_ADD_DELAYED_REF,
 767                                                 ins.objectid, ins.offset, 0,
 768                                                 root->root_key.objectid);
 769                                 btrfs_init_data_ref(&ref,
 770                                                 root->root_key.objectid,
 771                                                 key->objectid, offset, 0, false);
 772                                 ret = btrfs_inc_extent_ref(trans, &ref);
 773                                 if (ret)
 774                                         goto out;
 775                         } else {
 776                                 /*
 777                                  * insert the extent pointer in the extent
 778                                  * allocation tree
 779                                  */
 780                                 ret = btrfs_alloc_logged_file_extent(trans,
 781                                                 root->root_key.objectid,
 782                                                 key->objectid, offset, &ins);
 783                                 if (ret)
 784                                         goto out;
 785                         }
 786                         btrfs_release_path(path);
 787
 788                         if (btrfs_file_extent_compression(eb, item)) {
 789                                 csum_start = ins.objectid;
 790                                 csum_end = csum_start + ins.offset;
 791                         } else {
 792                                 csum_start = ins.objectid +
 793                                         btrfs_file_extent_offset(eb, item);
 794                                 csum_end = csum_start +
 795                                         btrfs_file_extent_num_bytes(eb, item);
 796                         }
 797
 798                         ret = btrfs_lookup_csums_list(root->log_root,
 799                                                 csum_start, csum_end - 1,
 800                                                 &ordered_sums, 0, false);
 801                         if (ret)
 802                                 goto out;
 803                         /*
 804                          * Now delete all existing cums in the csum root that
 805                          * cover our range. We do this because we can have an
 806                          * extent that is completely referenced by one file
 807                          * extent item and partially referenced by another
 808                          * file extent item (like after using the clone or
 809                          * extent_same ioctls). In this case if we end up doing
 810                          * the replay of the one that partially references the
 811                          * extent first, and we do not do the csum deletion
 812                          * below, we can get 2 csum items in the csum tree that
 813                          * overlap each other. For example, imagine our log has
 814                          * the two following file extent items:
 815                          *
 816                          * key (257 EXTENT_DATA 409600)
 817                          *     extent data disk byte 12845056 nr 102400
 818                          *     extent data offset 20480 nr 20480 ram 102400
 819                          *
 820                          * key (257 EXTENT_DATA 819200)
 821                          *     extent data disk byte 12845056 nr 102400
 822                          *     extent data offset 0 nr 102400 ram 102400
 823                          *
 824                          * Where the second one fully references the 100K extent
 825                          * that starts at disk byte 12845056, and the log tree
 826                          * has a single csum item that covers the entire range
 827                          * of the extent:
 828                          *
 829                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 830                          *
 831                          * After the first file extent item is replayed, the
 832                          * csum tree gets the following csum item:
 833                          *
 834                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 835                          *
 836                          * Which covers the 20K sub-range starting at offset 20K
 837                          * of our extent. Now when we replay the second file
 838                          * extent item, if we do not delete existing csum items
 839                          * that cover any of its blocks, we end up getting two
 840                          * csum items in our csum tree that overlap each other:
 841                          *
 842                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 843                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 844                          *
 845                          * Which is a problem, because after this anyone trying
 846                          * to lookup up for the checksum of any block of our
 847                          * extent starting at an offset of 40K or higher, will
 848                          * end up looking at the second csum item only, which
 849                          * does not contain the checksum for any block starting
 850                          * at offset 40K or higher of our extent.
 851                          */
 852                         while (!list_empty(&ordered_sums)) {
 853                                 struct btrfs_ordered_sum *sums;
 854                                 struct btrfs_root *csum_root;
 855
 856                                 sums = list_entry(ordered_sums.next,
 857                                                 struct btrfs_ordered_sum,
 858                                                 list);
 859                                 csum_root = btrfs_csum_root(fs_info,
 860                                                             sums->logical);
 861                                 if (!ret)
 862                                         ret = btrfs_del_csums(trans, csum_root,
 863                                                               sums->logical,
 864                                                               sums->len);
 865                                 if (!ret)
 866                                         ret = btrfs_csum_file_blocks(trans,
 867                                                                      csum_root,
 868                                                                      sums);
 869                                 list_del(&sums->list);
 870                                 kfree(sums);
 871                         }
 872                         if (ret)
 873                                 goto out;
 874                 } else {
 875                         btrfs_release_path(path);
 876                 }
 877         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 878                 /* inline extents are easy, we just overwrite them */
 879                 ret = overwrite_item(trans, root, path, eb, slot, key);
 880                 if (ret)
 881                         goto out;
 882         }
 883
 884         ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
 885                                                 extent_end - start);
 886         if (ret)
 887                 goto out;
 888
 889 update_inode:
 890         btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
 891         ret = btrfs_update_inode(trans, BTRFS_I(inode));
 892 out:
 893         iput(inode);
 894         return ret;
 895 }
 896
 897 static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans,
 898                                        struct btrfs_inode *dir,
 899                                        struct btrfs_inode *inode,
 900                                        const struct fscrypt_str *name)
 901 {
 902         int ret;
 903
 904         ret = btrfs_unlink_inode(trans, dir, inode, name);
 905         if (ret)
 906                 return ret;
 907         /*
 908          * Whenever we need to check if a name exists or not, we check the
 909          * fs/subvolume tree. So after an unlink we must run delayed items, so
 910          * that future checks for a name during log replay see that the name
 911          * does not exists anymore.
 912          */
 913         return btrfs_run_delayed_items(trans);
 914 }
 915
 916 /*
 917  * when cleaning up conflicts between the directory names in the
 918  * subvolume, directory names in the log and directory names in the
 919  * inode back references, we may have to unlink inodes from directories.
 920  *
 921  * This is a helper function to do the unlink of a specific directory
 922  * item
 923  */
 924 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 925                                       struct btrfs_path *path,
 926                                       struct btrfs_inode *dir,
 927                                       struct btrfs_dir_item *di)
 928 {
 929         struct btrfs_root *root = dir->root;
 930         struct inode *inode;
 931         struct fscrypt_str name;
 932         struct extent_buffer *leaf;
 933         struct btrfs_key location;
 934         int ret;
 935
 936         leaf = path->nodes[0];
 937
 938         btrfs_dir_item_key_to_cpu(leaf, di, &location);
 939         ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name);
 940         if (ret)
 941                 return -ENOMEM;
 942
 943         btrfs_release_path(path);
 944
 945         inode = read_one_inode(root, location.objectid);
 946         if (!inode) {
 947                 ret = -EIO;
 948                 goto out;
 949         }
 950
 951         ret = link_to_fixup_dir(trans, root, path, location.objectid);
 952         if (ret)
 953                 goto out;
 954
 955         ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name);
 956 out:
 957         kfree(name.name);
 958         iput(inode);
 959         return ret;
 960 }
 961
 962 /*
 963  * See if a given name and sequence number found in an inode back reference are
 964  * already in a directory and correctly point to this inode.
 965  *
 966  * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
 967  * exists.
 968  */
 969 static noinline int inode_in_dir(struct btrfs_root *root,
 970                                  struct btrfs_path *path,
 971                                  u64 dirid, u64 objectid, u64 index,
 972                                  struct fscrypt_str *name)
 973 {
 974         struct btrfs_dir_item *di;
 975         struct btrfs_key location;
 976         int ret = 0;
 977
 978         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
 979                                          index, name, 0);
 980         if (IS_ERR(di)) {
 981                 ret = PTR_ERR(di);
 982                 goto out;
 983         } else if (di) {
 984                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 985                 if (location.objectid != objectid)
 986                         goto out;
 987         } else {
 988                 goto out;
 989         }
 990
 991         btrfs_release_path(path);
 992         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0);
 993         if (IS_ERR(di)) {
 994                 ret = PTR_ERR(di);
 995                 goto out;
 996         } else if (di) {
 997                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 998                 if (location.objectid == objectid)
 999                         ret = 1;
1000         }
1001 out:
1002         btrfs_release_path(path);
1003         return ret;
1004 }
1005
1006 /*
1007  * helper function to check a log tree for a named back reference in
1008  * an inode.  This is used to decide if a back reference that is
1009  * found in the subvolume conflicts with what we find in the log.
1010  *
1011  * inode backreferences may have multiple refs in a single item,
1012  * during replay we process one reference at a time, and we don't
1013  * want to delete valid links to a file from the subvolume if that
1014  * link is also in the log.
1015  */
1016 static noinline int backref_in_log(struct btrfs_root *log,
1017                                    struct btrfs_key *key,
1018                                    u64 ref_objectid,
1019                                    const struct fscrypt_str *name)
1020 {
1021         struct btrfs_path *path;
1022         int ret;
1023
1024         path = btrfs_alloc_path();
1025         if (!path)
1026                 return -ENOMEM;
1027
1028         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
1029         if (ret < 0) {
1030                 goto out;
1031         } else if (ret == 1) {
1032                 ret = 0;
1033                 goto out;
1034         }
1035
1036         if (key->type == BTRFS_INODE_EXTREF_KEY)
1037                 ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1038                                                        path->slots[0],
1039                                                        ref_objectid, name);
1040         else
1041                 ret = !!btrfs_find_name_in_backref(path->nodes[0],
1042                                                    path->slots[0], name);
1043 out:
1044         btrfs_free_path(path);
1045         return ret;
1046 }
1047
1048 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
1049                                   struct btrfs_root *root,
1050                                   struct btrfs_path *path,
1051                                   struct btrfs_root *log_root,
1052                                   struct btrfs_inode *dir,
1053                                   struct btrfs_inode *inode,
1054                                   u64 inode_objectid, u64 parent_objectid,
1055                                   u64 ref_index, struct fscrypt_str *name)
1056 {
1057         int ret;
1058         struct extent_buffer *leaf;
1059         struct btrfs_dir_item *di;
1060         struct btrfs_key search_key;
1061         struct btrfs_inode_extref *extref;
1062
1063 again:
1064         /* Search old style refs */
1065         search_key.objectid = inode_objectid;
1066         search_key.type = BTRFS_INODE_REF_KEY;
1067         search_key.offset = parent_objectid;
1068         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1069         if (ret == 0) {
1070                 struct btrfs_inode_ref *victim_ref;
1071                 unsigned long ptr;
1072                 unsigned long ptr_end;
1073
1074                 leaf = path->nodes[0];
1075
1076                 /* are we trying to overwrite a back ref for the root directory
1077                  * if so, just jump out, we're done
1078                  */
1079                 if (search_key.objectid == search_key.offset)
1080                         return 1;
1081
1082                 /* check all the names in this back reference to see
1083                  * if they are in the log.  if so, we allow them to stay
1084                  * otherwise they must be unlinked as a conflict
1085                  */
1086                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1087                 ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]);
1088                 while (ptr < ptr_end) {
1089                         struct fscrypt_str victim_name;
1090
1091                         victim_ref = (struct btrfs_inode_ref *)ptr;
1092                         ret = read_alloc_one_name(leaf, (victim_ref + 1),
1093                                  btrfs_inode_ref_name_len(leaf, victim_ref),
1094                                  &victim_name);
1095                         if (ret)
1096                                 return ret;
1097
1098                         ret = backref_in_log(log_root, &search_key,
1099                                              parent_objectid, &victim_name);
1100                         if (ret < 0) {
1101                                 kfree(victim_name.name);
1102                                 return ret;
1103                         } else if (!ret) {
1104                                 inc_nlink(&inode->vfs_inode);
1105                                 btrfs_release_path(path);
1106
1107                                 ret = unlink_inode_for_log_replay(trans, dir, inode,
1108                                                 &victim_name);
1109                                 kfree(victim_name.name);
1110                                 if (ret)
1111                                         return ret;
1112                                 goto again;
1113                         }
1114                         kfree(victim_name.name);
1115
1116                         ptr = (unsigned long)(victim_ref + 1) + victim_name.len;
1117                 }
1118         }
1119         btrfs_release_path(path);
1120
1121         /* Same search but for extended refs */
1122         extref = btrfs_lookup_inode_extref(NULL, root, path, name,
1123                                            inode_objectid, parent_objectid, 0,
1124                                            0);
1125         if (IS_ERR(extref)) {
1126                 return PTR_ERR(extref);
1127         } else if (extref) {
1128                 u32 item_size;
1129                 u32 cur_offset = 0;
1130                 unsigned long base;
1131                 struct inode *victim_parent;
1132
1133                 leaf = path->nodes[0];
1134
1135                 item_size = btrfs_item_size(leaf, path->slots[0]);
1136                 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1137
1138                 while (cur_offset < item_size) {
1139                         struct fscrypt_str victim_name;
1140
1141                         extref = (struct btrfs_inode_extref *)(base + cur_offset);
1142
1143                         if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1144                                 goto next;
1145
1146                         ret = read_alloc_one_name(leaf, &extref->name,
1147                                  btrfs_inode_extref_name_len(leaf, extref),
1148                                  &victim_name);
1149                         if (ret)
1150                                 return ret;
1151
1152                         search_key.objectid = inode_objectid;
1153                         search_key.type = BTRFS_INODE_EXTREF_KEY;
1154                         search_key.offset = btrfs_extref_hash(parent_objectid,
1155                                                               victim_name.name,
1156                                                               victim_name.len);
1157                         ret = backref_in_log(log_root, &search_key,
1158                                              parent_objectid, &victim_name);
1159                         if (ret < 0) {
1160                                 kfree(victim_name.name);
1161                                 return ret;
1162                         } else if (!ret) {
1163                                 ret = -ENOENT;
1164                                 victim_parent = read_one_inode(root,
1165                                                 parent_objectid);
1166                                 if (victim_parent) {
1167                                         inc_nlink(&inode->vfs_inode);
1168                                         btrfs_release_path(path);
1169
1170                                         ret = unlink_inode_for_log_replay(trans,
1171                                                         BTRFS_I(victim_parent),
1172                                                         inode, &victim_name);
1173                                 }
1174                                 iput(victim_parent);
1175                                 kfree(victim_name.name);
1176                                 if (ret)
1177                                         return ret;
1178                                 goto again;
1179                         }
1180                         kfree(victim_name.name);
1181 next:
1182                         cur_offset += victim_name.len + sizeof(*extref);
1183                 }
1184         }
1185         btrfs_release_path(path);
1186
1187         /* look for a conflicting sequence number */
1188         di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1189                                          ref_index, name, 0);
1190         if (IS_ERR(di)) {
1191                 return PTR_ERR(di);
1192         } else if (di) {
1193                 ret = drop_one_dir_item(trans, path, dir, di);
1194                 if (ret)
1195                         return ret;
1196         }
1197         btrfs_release_path(path);
1198
1199         /* look for a conflicting name */
1200         di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0);
1201         if (IS_ERR(di)) {
1202                 return PTR_ERR(di);
1203         } else if (di) {
1204                 ret = drop_one_dir_item(trans, path, dir, di);
1205                 if (ret)
1206                         return ret;
1207         }
1208         btrfs_release_path(path);
1209
1210         return 0;
1211 }
1212
1213 static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1214                              struct fscrypt_str *name, u64 *index,
1215                              u64 *parent_objectid)
1216 {
1217         struct btrfs_inode_extref *extref;
1218         int ret;
1219
1220         extref = (struct btrfs_inode_extref *)ref_ptr;
1221
1222         ret = read_alloc_one_name(eb, &extref->name,
1223                                   btrfs_inode_extref_name_len(eb, extref), name);
1224         if (ret)
1225                 return ret;
1226
1227         if (index)
1228                 *index = btrfs_inode_extref_index(eb, extref);
1229         if (parent_objectid)
1230                 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1231
1232         return 0;
1233 }
1234
1235 static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1236                           struct fscrypt_str *name, u64 *index)
1237 {
1238         struct btrfs_inode_ref *ref;
1239         int ret;
1240
1241         ref = (struct btrfs_inode_ref *)ref_ptr;
1242
1243         ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref),
1244                                   name);
1245         if (ret)
1246                 return ret;
1247
1248         if (index)
1249                 *index = btrfs_inode_ref_index(eb, ref);
1250
1251         return 0;
1252 }
1253
1254 /*
1255  * Take an inode reference item from the log tree and iterate all names from the
1256  * inode reference item in the subvolume tree with the same key (if it exists).
1257  * For any name that is not in the inode reference item from the log tree, do a
1258  * proper unlink of that name (that is, remove its entry from the inode
1259  * reference item and both dir index keys).
1260  */
1261 static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1262                                  struct btrfs_root *root,
1263                                  struct btrfs_path *path,
1264                                  struct btrfs_inode *inode,
1265                                  struct extent_buffer *log_eb,
1266                                  int log_slot,
1267                                  struct btrfs_key *key)
1268 {
1269         int ret;
1270         unsigned long ref_ptr;
1271         unsigned long ref_end;
1272         struct extent_buffer *eb;
1273
1274 again:
1275         btrfs_release_path(path);
1276         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1277         if (ret > 0) {
1278                 ret = 0;
1279                 goto out;
1280         }
1281         if (ret < 0)
1282                 goto out;
1283
1284         eb = path->nodes[0];
1285         ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1286         ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]);
1287         while (ref_ptr < ref_end) {
1288                 struct fscrypt_str name;
1289                 u64 parent_id;
1290
1291                 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1292                         ret = extref_get_fields(eb, ref_ptr, &name,
1293                                                 NULL, &parent_id);
1294                 } else {
1295                         parent_id = key->offset;
1296                         ret = ref_get_fields(eb, ref_ptr, &name, NULL);
1297                 }
1298                 if (ret)
1299                         goto out;
1300
1301                 if (key->type == BTRFS_INODE_EXTREF_KEY)
1302                         ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
1303                                                                parent_id, &name);
1304                 else
1305                         ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name);
1306
1307                 if (!ret) {
1308                         struct inode *dir;
1309
1310                         btrfs_release_path(path);
1311                         dir = read_one_inode(root, parent_id);
1312                         if (!dir) {
1313                                 ret = -ENOENT;
1314                                 kfree(name.name);
1315                                 goto out;
1316                         }
1317                         ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir),
1318                                                  inode, &name);
1319                         kfree(name.name);
1320                         iput(dir);
1321                         if (ret)
1322                                 goto out;
1323                         goto again;
1324                 }
1325
1326                 kfree(name.name);
1327                 ref_ptr += name.len;
1328                 if (key->type == BTRFS_INODE_EXTREF_KEY)
1329                         ref_ptr += sizeof(struct btrfs_inode_extref);
1330                 else
1331                         ref_ptr += sizeof(struct btrfs_inode_ref);
1332         }
1333         ret = 0;
1334  out:
1335         btrfs_release_path(path);
1336         return ret;
1337 }
1338
1339 /*
1340  * replay one inode back reference item found in the log tree.
1341  * eb, slot and key refer to the buffer and key found in the log tree.
1342  * root is the destination we are replaying into, and path is for temp
1343  * use by this function.  (it should be released on return).
1344  */
1345 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1346                                   struct btrfs_root *root,
1347                                   struct btrfs_root *log,
1348                                   struct btrfs_path *path,
1349                                   struct extent_buffer *eb, int slot,
1350                                   struct btrfs_key *key)
1351 {
1352         struct inode *dir = NULL;
1353         struct inode *inode = NULL;
1354         unsigned long ref_ptr;
1355         unsigned long ref_end;
1356         struct fscrypt_str name;
1357         int ret;
1358         int log_ref_ver = 0;
1359         u64 parent_objectid;
1360         u64 inode_objectid;
1361         u64 ref_index = 0;
1362         int ref_struct_size;
1363
1364         ref_ptr = btrfs_item_ptr_offset(eb, slot);
1365         ref_end = ref_ptr + btrfs_item_size(eb, slot);
1366
1367         if (key->type == BTRFS_INODE_EXTREF_KEY) {
1368                 struct btrfs_inode_extref *r;
1369
1370                 ref_struct_size = sizeof(struct btrfs_inode_extref);
1371                 log_ref_ver = 1;
1372                 r = (struct btrfs_inode_extref *)ref_ptr;
1373                 parent_objectid = btrfs_inode_extref_parent(eb, r);
1374         } else {
1375                 ref_struct_size = sizeof(struct btrfs_inode_ref);
1376                 parent_objectid = key->offset;
1377         }
1378         inode_objectid = key->objectid;
1379
1380         /*
1381          * it is possible that we didn't log all the parent directories
1382          * for a given inode.  If we don't find the dir, just don't
1383          * copy the back ref in.  The link count fixup code will take
1384          * care of the rest
1385          */
1386         dir = read_one_inode(root, parent_objectid);
1387         if (!dir) {
1388                 ret = -ENOENT;
1389                 goto out;
1390         }
1391
1392         inode = read_one_inode(root, inode_objectid);
1393         if (!inode) {
1394                 ret = -EIO;
1395                 goto out;
1396         }
1397
1398         while (ref_ptr < ref_end) {
1399                 if (log_ref_ver) {
1400                         ret = extref_get_fields(eb, ref_ptr, &name,
1401                                                 &ref_index, &parent_objectid);
1402                         /*
1403                          * parent object can change from one array
1404                          * item to another.
1405                          */
1406                         if (!dir)
1407                                 dir = read_one_inode(root, parent_objectid);
1408                         if (!dir) {
1409                                 ret = -ENOENT;
1410                                 goto out;
1411                         }
1412                 } else {
1413                         ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
1414                 }
1415                 if (ret)
1416                         goto out;
1417
1418                 ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1419                                    btrfs_ino(BTRFS_I(inode)), ref_index, &name);
1420                 if (ret < 0) {
1421                         goto out;
1422                 } else if (ret == 0) {
1423                         /*
1424                          * look for a conflicting back reference in the
1425                          * metadata. if we find one we have to unlink that name
1426                          * of the file before we add our new link.  Later on, we
1427                          * overwrite any existing back reference, and we don't
1428                          * want to create dangling pointers in the directory.
1429                          */
1430                         ret = __add_inode_ref(trans, root, path, log,
1431                                               BTRFS_I(dir), BTRFS_I(inode),
1432                                               inode_objectid, parent_objectid,
1433                                               ref_index, &name);
1434                         if (ret) {
1435                                 if (ret == 1)
1436                                         ret = 0;
1437                                 goto out;
1438                         }
1439
1440                         /* insert our name */
1441                         ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
1442                                              &name, 0, ref_index);
1443                         if (ret)
1444                                 goto out;
1445
1446                         ret = btrfs_update_inode(trans, BTRFS_I(inode));
1447                         if (ret)
1448                                 goto out;
1449                 }
1450                 /* Else, ret == 1, we already have a perfect match, we're done. */
1451
1452                 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
1453                 kfree(name.name);
1454                 name.name = NULL;
1455                 if (log_ref_ver) {
1456                         iput(dir);
1457                         dir = NULL;
1458                 }
1459         }
1460
1461         /*
1462          * Before we overwrite the inode reference item in the subvolume tree
1463          * with the item from the log tree, we must unlink all names from the
1464          * parent directory that are in the subvolume's tree inode reference
1465          * item, otherwise we end up with an inconsistent subvolume tree where
1466          * dir index entries exist for a name but there is no inode reference
1467          * item with the same name.
1468          */
1469         ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
1470                                     key);
1471         if (ret)
1472                 goto out;
1473
1474         /* finally write the back reference in the inode */
1475         ret = overwrite_item(trans, root, path, eb, slot, key);
1476 out:
1477         btrfs_release_path(path);
1478         kfree(name.name);
1479         iput(dir);
1480         iput(inode);
1481         return ret;
1482 }
1483
1484 static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path)
1485 {
1486         int ret = 0;
1487         int name_len;
1488         unsigned int nlink = 0;
1489         u32 item_size;
1490         u32 cur_offset = 0;
1491         u64 inode_objectid = btrfs_ino(inode);
1492         u64 offset = 0;
1493         unsigned long ptr;
1494         struct btrfs_inode_extref *extref;
1495         struct extent_buffer *leaf;
1496
1497         while (1) {
1498                 ret = btrfs_find_one_extref(inode->root, inode_objectid, offset,
1499                                             path, &extref, &offset);
1500                 if (ret)
1501                         break;
1502
1503                 leaf = path->nodes[0];
1504                 item_size = btrfs_item_size(leaf, path->slots[0]);
1505                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1506                 cur_offset = 0;
1507
1508                 while (cur_offset < item_size) {
1509                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1510                         name_len = btrfs_inode_extref_name_len(leaf, extref);
1511
1512                         nlink++;
1513
1514                         cur_offset += name_len + sizeof(*extref);
1515                 }
1516
1517                 offset++;
1518                 btrfs_release_path(path);
1519         }
1520         btrfs_release_path(path);
1521
1522         if (ret < 0 && ret != -ENOENT)
1523                 return ret;
1524         return nlink;
1525 }
1526
1527 static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path)
1528 {
1529         int ret;
1530         struct btrfs_key key;
1531         unsigned int nlink = 0;
1532         unsigned long ptr;
1533         unsigned long ptr_end;
1534         int name_len;
1535         u64 ino = btrfs_ino(inode);
1536
1537         key.objectid = ino;
1538         key.type = BTRFS_INODE_REF_KEY;
1539         key.offset = (u64)-1;
1540
1541         while (1) {
1542                 ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0);
1543                 if (ret < 0)
1544                         break;
1545                 if (ret > 0) {
1546                         if (path->slots[0] == 0)
1547                                 break;
1548                         path->slots[0]--;
1549                 }
1550 process_slot:
1551                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1552                                       path->slots[0]);
1553                 if (key.objectid != ino ||
1554                     key.type != BTRFS_INODE_REF_KEY)
1555                         break;
1556                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1557                 ptr_end = ptr + btrfs_item_size(path->nodes[0],
1558                                                    path->slots[0]);
1559                 while (ptr < ptr_end) {
1560                         struct btrfs_inode_ref *ref;
1561
1562                         ref = (struct btrfs_inode_ref *)ptr;
1563                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1564                                                             ref);
1565                         ptr = (unsigned long)(ref + 1) + name_len;
1566                         nlink++;
1567                 }
1568
1569                 if (key.offset == 0)
1570                         break;
1571                 if (path->slots[0] > 0) {
1572                         path->slots[0]--;
1573                         goto process_slot;
1574                 }
1575                 key.offset--;
1576                 btrfs_release_path(path);
1577         }
1578         btrfs_release_path(path);
1579
1580         return nlink;
1581 }
1582
1583 /*
1584  * There are a few corners where the link count of the file can't
1585  * be properly maintained during replay.  So, instead of adding
1586  * lots of complexity to the log code, we just scan the backrefs
1587  * for any file that has been through replay.
1588  *
1589  * The scan will update the link count on the inode to reflect the
1590  * number of back refs found.  If it goes down to zero, the iput
1591  * will free the inode.
1592  */
1593 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1594                                            struct inode *inode)
1595 {
1596         struct btrfs_root *root = BTRFS_I(inode)->root;
1597         struct btrfs_path *path;
1598         int ret;
1599         u64 nlink = 0;
1600         u64 ino = btrfs_ino(BTRFS_I(inode));
1601
1602         path = btrfs_alloc_path();
1603         if (!path)
1604                 return -ENOMEM;
1605
1606         ret = count_inode_refs(BTRFS_I(inode), path);
1607         if (ret < 0)
1608                 goto out;
1609
1610         nlink = ret;
1611
1612         ret = count_inode_extrefs(BTRFS_I(inode), path);
1613         if (ret < 0)
1614                 goto out;
1615
1616         nlink += ret;
1617
1618         ret = 0;
1619
1620         if (nlink != inode->i_nlink) {
1621                 set_nlink(inode, nlink);
1622                 ret = btrfs_update_inode(trans, BTRFS_I(inode));
1623                 if (ret)
1624                         goto out;
1625         }
1626         BTRFS_I(inode)->index_cnt = (u64)-1;
1627
1628         if (inode->i_nlink == 0) {
1629                 if (S_ISDIR(inode->i_mode)) {
1630                         ret = replay_dir_deletes(trans, root, NULL, path,
1631                                                  ino, 1);
1632                         if (ret)
1633                                 goto out;
1634                 }
1635                 ret = btrfs_insert_orphan_item(trans, root, ino);
1636                 if (ret == -EEXIST)
1637                         ret = 0;
1638         }
1639
1640 out:
1641         btrfs_free_path(path);
1642         return ret;
1643 }
1644
1645 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1646                                             struct btrfs_root *root,
1647                                             struct btrfs_path *path)
1648 {
1649         int ret;
1650         struct btrfs_key key;
1651         struct inode *inode;
1652
1653         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1654         key.type = BTRFS_ORPHAN_ITEM_KEY;
1655         key.offset = (u64)-1;
1656         while (1) {
1657                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1658                 if (ret < 0)
1659                         break;
1660
1661                 if (ret == 1) {
1662                         ret = 0;
1663                         if (path->slots[0] == 0)
1664                                 break;
1665                         path->slots[0]--;
1666                 }
1667
1668                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1669                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1670                     key.type != BTRFS_ORPHAN_ITEM_KEY)
1671                         break;
1672
1673                 ret = btrfs_del_item(trans, root, path);
1674                 if (ret)
1675                         break;
1676
1677                 btrfs_release_path(path);
1678                 inode = read_one_inode(root, key.offset);
1679                 if (!inode) {
1680                         ret = -EIO;
1681                         break;
1682                 }
1683
1684                 ret = fixup_inode_link_count(trans, inode);
1685                 iput(inode);
1686                 if (ret)
1687                         break;
1688
1689                 /*
1690                  * fixup on a directory may create new entries,
1691                  * make sure we always look for the highset possible
1692                  * offset
1693                  */
1694                 key.offset = (u64)-1;
1695         }
1696         btrfs_release_path(path);
1697         return ret;
1698 }
1699
1700
1701 /*
1702  * record a given inode in the fixup dir so we can check its link
1703  * count when replay is done.  The link count is incremented here
1704  * so the inode won't go away until we check it
1705  */
1706 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1707                                       struct btrfs_root *root,
1708                                       struct btrfs_path *path,
1709                                       u64 objectid)
1710 {
1711         struct btrfs_key key;
1712         int ret = 0;
1713         struct inode *inode;
1714
1715         inode = read_one_inode(root, objectid);
1716         if (!inode)
1717                 return -EIO;
1718
1719         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1720         key.type = BTRFS_ORPHAN_ITEM_KEY;
1721         key.offset = objectid;
1722
1723         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1724
1725         btrfs_release_path(path);
1726         if (ret == 0) {
1727                 if (!inode->i_nlink)
1728                         set_nlink(inode, 1);
1729                 else
1730                         inc_nlink(inode);
1731                 ret = btrfs_update_inode(trans, BTRFS_I(inode));
1732         } else if (ret == -EEXIST) {
1733                 ret = 0;
1734         }
1735         iput(inode);
1736
1737         return ret;
1738 }
1739
1740 /*
1741  * when replaying the log for a directory, we only insert names
1742  * for inodes that actually exist.  This means an fsync on a directory
1743  * does not implicitly fsync all the new files in it
1744  */
1745 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1746                                     struct btrfs_root *root,
1747                                     u64 dirid, u64 index,
1748                                     const struct fscrypt_str *name,
1749                                     struct btrfs_key *location)
1750 {
1751         struct inode *inode;
1752         struct inode *dir;
1753         int ret;
1754
1755         inode = read_one_inode(root, location->objectid);
1756         if (!inode)
1757                 return -ENOENT;
1758
1759         dir = read_one_inode(root, dirid);
1760         if (!dir) {
1761                 iput(inode);
1762                 return -EIO;
1763         }
1764
1765         ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1766                              1, index);
1767
1768         /* FIXME, put inode into FIXUP list */
1769
1770         iput(inode);
1771         iput(dir);
1772         return ret;
1773 }
1774
1775 static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans,
1776                                         struct btrfs_inode *dir,
1777                                         struct btrfs_path *path,
1778                                         struct btrfs_dir_item *dst_di,
1779                                         const struct btrfs_key *log_key,
1780                                         u8 log_flags,
1781                                         bool exists)
1782 {
1783         struct btrfs_key found_key;
1784
1785         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1786         /* The existing dentry points to the same inode, don't delete it. */
1787         if (found_key.objectid == log_key->objectid &&
1788             found_key.type == log_key->type &&
1789             found_key.offset == log_key->offset &&
1790             btrfs_dir_flags(path->nodes[0], dst_di) == log_flags)
1791                 return 1;
1792
1793         /*
1794          * Don't drop the conflicting directory entry if the inode for the new
1795          * entry doesn't exist.
1796          */
1797         if (!exists)
1798                 return 0;
1799
1800         return drop_one_dir_item(trans, path, dir, dst_di);
1801 }
1802
1803 /*
1804  * take a single entry in a log directory item and replay it into
1805  * the subvolume.
1806  *
1807  * if a conflicting item exists in the subdirectory already,
1808  * the inode it points to is unlinked and put into the link count
1809  * fix up tree.
1810  *
1811  * If a name from the log points to a file or directory that does
1812  * not exist in the FS, it is skipped.  fsyncs on directories
1813  * do not force down inodes inside that directory, just changes to the
1814  * names or unlinks in a directory.
1815  *
1816  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1817  * non-existing inode) and 1 if the name was replayed.
1818  */
1819 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1820                                     struct btrfs_root *root,
1821                                     struct btrfs_path *path,
1822                                     struct extent_buffer *eb,
1823                                     struct btrfs_dir_item *di,
1824                                     struct btrfs_key *key)
1825 {
1826         struct fscrypt_str name;
1827         struct btrfs_dir_item *dir_dst_di;
1828         struct btrfs_dir_item *index_dst_di;
1829         bool dir_dst_matches = false;
1830         bool index_dst_matches = false;
1831         struct btrfs_key log_key;
1832         struct btrfs_key search_key;
1833         struct inode *dir;
1834         u8 log_flags;
1835         bool exists;
1836         int ret;
1837         bool update_size = true;
1838         bool name_added = false;
1839
1840         dir = read_one_inode(root, key->objectid);
1841         if (!dir)
1842                 return -EIO;
1843
1844         ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
1845         if (ret)
1846                 goto out;
1847
1848         log_flags = btrfs_dir_flags(eb, di);
1849         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1850         ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1851         btrfs_release_path(path);
1852         if (ret < 0)
1853                 goto out;
1854         exists = (ret == 0);
1855         ret = 0;
1856
1857         dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1858                                            &name, 1);
1859         if (IS_ERR(dir_dst_di)) {
1860                 ret = PTR_ERR(dir_dst_di);
1861                 goto out;
1862         } else if (dir_dst_di) {
1863                 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
1864                                                    dir_dst_di, &log_key,
1865                                                    log_flags, exists);
1866                 if (ret < 0)
1867                         goto out;
1868                 dir_dst_matches = (ret == 1);
1869         }
1870
1871         btrfs_release_path(path);
1872
1873         index_dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1874                                                    key->objectid, key->offset,
1875                                                    &name, 1);
1876         if (IS_ERR(index_dst_di)) {
1877                 ret = PTR_ERR(index_dst_di);
1878                 goto out;
1879         } else if (index_dst_di) {
1880                 ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path,
1881                                                    index_dst_di, &log_key,
1882                                                    log_flags, exists);
1883                 if (ret < 0)
1884                         goto out;
1885                 index_dst_matches = (ret == 1);
1886         }
1887
1888         btrfs_release_path(path);
1889
1890         if (dir_dst_matches && index_dst_matches) {
1891                 ret = 0;
1892                 update_size = false;
1893                 goto out;
1894         }
1895
1896         /*
1897          * Check if the inode reference exists in the log for the given name,
1898          * inode and parent inode
1899          */
1900         search_key.objectid = log_key.objectid;
1901         search_key.type = BTRFS_INODE_REF_KEY;
1902         search_key.offset = key->objectid;
1903         ret = backref_in_log(root->log_root, &search_key, 0, &name);
1904         if (ret < 0) {
1905                 goto out;
1906         } else if (ret) {
1907                 /* The dentry will be added later. */
1908                 ret = 0;
1909                 update_size = false;
1910                 goto out;
1911         }
1912
1913         search_key.objectid = log_key.objectid;
1914         search_key.type = BTRFS_INODE_EXTREF_KEY;
1915         search_key.offset = key->objectid;
1916         ret = backref_in_log(root->log_root, &search_key, key->objectid, &name);
1917         if (ret < 0) {
1918                 goto out;
1919         } else if (ret) {
1920                 /* The dentry will be added later. */
1921                 ret = 0;
1922                 update_size = false;
1923                 goto out;
1924         }
1925         btrfs_release_path(path);
1926         ret = insert_one_name(trans, root, key->objectid, key->offset,
1927                               &name, &log_key);
1928         if (ret && ret != -ENOENT && ret != -EEXIST)
1929                 goto out;
1930         if (!ret)
1931                 name_added = true;
1932         update_size = false;
1933         ret = 0;
1934
1935 out:
1936         if (!ret && update_size) {
1937                 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2);
1938                 ret = btrfs_update_inode(trans, BTRFS_I(dir));
1939         }
1940         kfree(name.name);
1941         iput(dir);
1942         if (!ret && name_added)
1943                 ret = 1;
1944         return ret;
1945 }
1946
1947 /* Replay one dir item from a BTRFS_DIR_INDEX_KEY key. */
1948 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1949                                         struct btrfs_root *root,
1950                                         struct btrfs_path *path,
1951                                         struct extent_buffer *eb, int slot,
1952                                         struct btrfs_key *key)
1953 {
1954         int ret;
1955         struct btrfs_dir_item *di;
1956
1957         /* We only log dir index keys, which only contain a single dir item. */
1958         ASSERT(key->type == BTRFS_DIR_INDEX_KEY);
1959
1960         di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
1961         ret = replay_one_name(trans, root, path, eb, di, key);
1962         if (ret < 0)
1963                 return ret;
1964
1965         /*
1966          * If this entry refers to a non-directory (directories can not have a
1967          * link count > 1) and it was added in the transaction that was not
1968          * committed, make sure we fixup the link count of the inode the entry
1969          * points to. Otherwise something like the following would result in a
1970          * directory pointing to an inode with a wrong link that does not account
1971          * for this dir entry:
1972          *
1973          * mkdir testdir
1974          * touch testdir/foo
1975          * touch testdir/bar
1976          * sync
1977          *
1978          * ln testdir/bar testdir/bar_link
1979          * ln testdir/foo testdir/foo_link
1980          * xfs_io -c "fsync" testdir/bar
1981          *
1982          * <power failure>
1983          *
1984          * mount fs, log replay happens
1985          *
1986          * File foo would remain with a link count of 1 when it has two entries
1987          * pointing to it in the directory testdir. This would make it impossible
1988          * to ever delete the parent directory has it would result in stale
1989          * dentries that can never be deleted.
1990          */
1991         if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) {
1992                 struct btrfs_path *fixup_path;
1993                 struct btrfs_key di_key;
1994
1995                 fixup_path = btrfs_alloc_path();
1996                 if (!fixup_path)
1997                         return -ENOMEM;
1998
1999                 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2000                 ret = link_to_fixup_dir(trans, root, fixup_path, di_key.objectid);
2001                 btrfs_free_path(fixup_path);
2002         }
2003
2004         return ret;
2005 }
2006
2007 /*
2008  * directory replay has two parts.  There are the standard directory
2009  * items in the log copied from the subvolume, and range items
2010  * created in the log while the subvolume was logged.
2011  *
2012  * The range items tell us which parts of the key space the log
2013  * is authoritative for.  During replay, if a key in the subvolume
2014  * directory is in a logged range item, but not actually in the log
2015  * that means it was deleted from the directory before the fsync
2016  * and should be removed.
2017  */
2018 static noinline int find_dir_range(struct btrfs_root *root,
2019                                    struct btrfs_path *path,
2020                                    u64 dirid,
2021                                    u64 *start_ret, u64 *end_ret)
2022 {
2023         struct btrfs_key key;
2024         u64 found_end;
2025         struct btrfs_dir_log_item *item;
2026         int ret;
2027         int nritems;
2028
2029         if (*start_ret == (u64)-1)
2030                 return 1;
2031
2032         key.objectid = dirid;
2033         key.type = BTRFS_DIR_LOG_INDEX_KEY;
2034         key.offset = *start_ret;
2035
2036         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2037         if (ret < 0)
2038                 goto out;
2039         if (ret > 0) {
2040                 if (path->slots[0] == 0)
2041                         goto out;
2042                 path->slots[0]--;
2043         }
2044         if (ret != 0)
2045                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2046
2047         if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2048                 ret = 1;
2049                 goto next;
2050         }
2051         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2052                               struct btrfs_dir_log_item);
2053         found_end = btrfs_dir_log_end(path->nodes[0], item);
2054
2055         if (*start_ret >= key.offset && *start_ret <= found_end) {
2056                 ret = 0;
2057                 *start_ret = key.offset;
2058                 *end_ret = found_end;
2059                 goto out;
2060         }
2061         ret = 1;
2062 next:
2063         /* check the next slot in the tree to see if it is a valid item */
2064         nritems = btrfs_header_nritems(path->nodes[0]);
2065         path->slots[0]++;
2066         if (path->slots[0] >= nritems) {
2067                 ret = btrfs_next_leaf(root, path);
2068                 if (ret)
2069                         goto out;
2070         }
2071
2072         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2073
2074         if (key.type != BTRFS_DIR_LOG_INDEX_KEY || key.objectid != dirid) {
2075                 ret = 1;
2076                 goto out;
2077         }
2078         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2079                               struct btrfs_dir_log_item);
2080         found_end = btrfs_dir_log_end(path->nodes[0], item);
2081         *start_ret = key.offset;
2082         *end_ret = found_end;
2083         ret = 0;
2084 out:
2085         btrfs_release_path(path);
2086         return ret;
2087 }
2088
2089 /*
2090  * this looks for a given directory item in the log.  If the directory
2091  * item is not in the log, the item is removed and the inode it points
2092  * to is unlinked
2093  */
2094 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2095                                       struct btrfs_root *log,
2096                                       struct btrfs_path *path,
2097                                       struct btrfs_path *log_path,
2098                                       struct inode *dir,
2099                                       struct btrfs_key *dir_key)
2100 {
2101         struct btrfs_root *root = BTRFS_I(dir)->root;
2102         int ret;
2103         struct extent_buffer *eb;
2104         int slot;
2105         struct btrfs_dir_item *di;
2106         struct fscrypt_str name;
2107         struct inode *inode = NULL;
2108         struct btrfs_key location;
2109
2110         /*
2111          * Currently we only log dir index keys. Even if we replay a log created
2112          * by an older kernel that logged both dir index and dir item keys, all
2113          * we need to do is process the dir index keys, we (and our caller) can
2114          * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY).
2115          */
2116         ASSERT(dir_key->type == BTRFS_DIR_INDEX_KEY);
2117
2118         eb = path->nodes[0];
2119         slot = path->slots[0];
2120         di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2121         ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name);
2122         if (ret)
2123                 goto out;
2124
2125         if (log) {
2126                 struct btrfs_dir_item *log_di;
2127
2128                 log_di = btrfs_lookup_dir_index_item(trans, log, log_path,
2129                                                      dir_key->objectid,
2130                                                      dir_key->offset, &name, 0);
2131                 if (IS_ERR(log_di)) {
2132                         ret = PTR_ERR(log_di);
2133                         goto out;
2134                 } else if (log_di) {
2135                         /* The dentry exists in the log, we have nothing to do. */
2136                         ret = 0;
2137                         goto out;
2138                 }
2139         }
2140
2141         btrfs_dir_item_key_to_cpu(eb, di, &location);
2142         btrfs_release_path(path);
2143         btrfs_release_path(log_path);
2144         inode = read_one_inode(root, location.objectid);
2145         if (!inode) {
2146                 ret = -EIO;
2147                 goto out;
2148         }
2149
2150         ret = link_to_fixup_dir(trans, root, path, location.objectid);
2151         if (ret)
2152                 goto out;
2153
2154         inc_nlink(inode);
2155         ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode),
2156                                           &name);
2157         /*
2158          * Unlike dir item keys, dir index keys can only have one name (entry) in
2159          * them, as there are no key collisions since each key has a unique offset
2160          * (an index number), so we're done.
2161          */
2162 out:
2163         btrfs_release_path(path);
2164         btrfs_release_path(log_path);
2165         kfree(name.name);
2166         iput(inode);
2167         return ret;
2168 }
2169
2170 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2171                               struct btrfs_root *root,
2172                               struct btrfs_root *log,
2173                               struct btrfs_path *path,
2174                               const u64 ino)
2175 {
2176         struct btrfs_key search_key;
2177         struct btrfs_path *log_path;
2178         int i;
2179         int nritems;
2180         int ret;
2181
2182         log_path = btrfs_alloc_path();
2183         if (!log_path)
2184                 return -ENOMEM;
2185
2186         search_key.objectid = ino;
2187         search_key.type = BTRFS_XATTR_ITEM_KEY;
2188         search_key.offset = 0;
2189 again:
2190         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2191         if (ret < 0)
2192                 goto out;
2193 process_leaf:
2194         nritems = btrfs_header_nritems(path->nodes[0]);
2195         for (i = path->slots[0]; i < nritems; i++) {
2196                 struct btrfs_key key;
2197                 struct btrfs_dir_item *di;
2198                 struct btrfs_dir_item *log_di;
2199                 u32 total_size;
2200                 u32 cur;
2201
2202                 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2203                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2204                         ret = 0;
2205                         goto out;
2206                 }
2207
2208                 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2209                 total_size = btrfs_item_size(path->nodes[0], i);
2210                 cur = 0;
2211                 while (cur < total_size) {
2212                         u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2213                         u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2214                         u32 this_len = sizeof(*di) + name_len + data_len;
2215                         char *name;
2216
2217                         name = kmalloc(name_len, GFP_NOFS);
2218                         if (!name) {
2219                                 ret = -ENOMEM;
2220                                 goto out;
2221                         }
2222                         read_extent_buffer(path->nodes[0], name,
2223                                            (unsigned long)(di + 1), name_len);
2224
2225                         log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2226                                                     name, name_len, 0);
2227                         btrfs_release_path(log_path);
2228                         if (!log_di) {
2229                                 /* Doesn't exist in log tree, so delete it. */
2230                                 btrfs_release_path(path);
2231                                 di = btrfs_lookup_xattr(trans, root, path, ino,
2232                                                         name, name_len, -1);
2233                                 kfree(name);
2234                                 if (IS_ERR(di)) {
2235                                         ret = PTR_ERR(di);
2236                                         goto out;
2237                                 }
2238                                 ASSERT(di);
2239                                 ret = btrfs_delete_one_dir_name(trans, root,
2240                                                                 path, di);
2241                                 if (ret)
2242                                         goto out;
2243                                 btrfs_release_path(path);
2244                                 search_key = key;
2245                                 goto again;
2246                         }
2247                         kfree(name);
2248                         if (IS_ERR(log_di)) {
2249                                 ret = PTR_ERR(log_di);
2250                                 goto out;
2251                         }
2252                         cur += this_len;
2253                         di = (struct btrfs_dir_item *)((char *)di + this_len);
2254                 }
2255         }
2256         ret = btrfs_next_leaf(root, path);
2257         if (ret > 0)
2258                 ret = 0;
2259         else if (ret == 0)
2260                 goto process_leaf;
2261 out:
2262         btrfs_free_path(log_path);
2263         btrfs_release_path(path);
2264         return ret;
2265 }
2266
2267
2268 /*
2269  * deletion replay happens before we copy any new directory items
2270  * out of the log or out of backreferences from inodes.  It
2271  * scans the log to find ranges of keys that log is authoritative for,
2272  * and then scans the directory to find items in those ranges that are
2273  * not present in the log.
2274  *
2275  * Anything we don't find in the log is unlinked and removed from the
2276  * directory.
2277  */
2278 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2279                                        struct btrfs_root *root,
2280                                        struct btrfs_root *log,
2281                                        struct btrfs_path *path,
2282                                        u64 dirid, int del_all)
2283 {
2284         u64 range_start;
2285         u64 range_end;
2286         int ret = 0;
2287         struct btrfs_key dir_key;
2288         struct btrfs_key found_key;
2289         struct btrfs_path *log_path;
2290         struct inode *dir;
2291
2292         dir_key.objectid = dirid;
2293         dir_key.type = BTRFS_DIR_INDEX_KEY;
2294         log_path = btrfs_alloc_path();
2295         if (!log_path)
2296                 return -ENOMEM;
2297
2298         dir = read_one_inode(root, dirid);
2299         /* it isn't an error if the inode isn't there, that can happen
2300          * because we replay the deletes before we copy in the inode item
2301          * from the log
2302          */
2303         if (!dir) {
2304                 btrfs_free_path(log_path);
2305                 return 0;
2306         }
2307
2308         range_start = 0;
2309         range_end = 0;
2310         while (1) {
2311                 if (del_all)
2312                         range_end = (u64)-1;
2313                 else {
2314                         ret = find_dir_range(log, path, dirid,
2315                                              &range_start, &range_end);
2316                         if (ret < 0)
2317                                 goto out;
2318                         else if (ret > 0)
2319                                 break;
2320                 }
2321
2322                 dir_key.offset = range_start;
2323                 while (1) {
2324                         int nritems;
2325                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
2326                                                 0, 0);
2327                         if (ret < 0)
2328                                 goto out;
2329
2330                         nritems = btrfs_header_nritems(path->nodes[0]);
2331                         if (path->slots[0] >= nritems) {
2332                                 ret = btrfs_next_leaf(root, path);
2333                                 if (ret == 1)
2334                                         break;
2335                                 else if (ret < 0)
2336                                         goto out;
2337                         }
2338                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2339                                               path->slots[0]);
2340                         if (found_key.objectid != dirid ||
2341                             found_key.type != dir_key.type) {
2342                                 ret = 0;
2343                                 goto out;
2344                         }
2345
2346                         if (found_key.offset > range_end)
2347                                 break;
2348
2349                         ret = check_item_in_log(trans, log, path,
2350                                                 log_path, dir,
2351                                                 &found_key);
2352                         if (ret)
2353                                 goto out;
2354                         if (found_key.offset == (u64)-1)
2355                                 break;
2356                         dir_key.offset = found_key.offset + 1;
2357                 }
2358                 btrfs_release_path(path);
2359                 if (range_end == (u64)-1)
2360                         break;
2361                 range_start = range_end + 1;
2362         }
2363         ret = 0;
2364 out:
2365         btrfs_release_path(path);
2366         btrfs_free_path(log_path);
2367         iput(dir);
2368         return ret;
2369 }
2370
2371 /*
2372  * the process_func used to replay items from the log tree.  This
2373  * gets called in two different stages.  The first stage just looks
2374  * for inodes and makes sure they are all copied into the subvolume.
2375  *
2376  * The second stage copies all the other item types from the log into
2377  * the subvolume.  The two stage approach is slower, but gets rid of
2378  * lots of complexity around inodes referencing other inodes that exist
2379  * only in the log (references come from either directory items or inode
2380  * back refs).
2381  */
2382 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2383                              struct walk_control *wc, u64 gen, int level)
2384 {
2385         int nritems;
2386         struct btrfs_tree_parent_check check = {
2387                 .transid = gen,
2388                 .level = level
2389         };
2390         struct btrfs_path *path;
2391         struct btrfs_root *root = wc->replay_dest;
2392         struct btrfs_key key;
2393         int i;
2394         int ret;
2395
2396         ret = btrfs_read_extent_buffer(eb, &check);
2397         if (ret)
2398                 return ret;
2399
2400         level = btrfs_header_level(eb);
2401
2402         if (level != 0)
2403                 return 0;
2404
2405         path = btrfs_alloc_path();
2406         if (!path)
2407                 return -ENOMEM;
2408
2409         nritems = btrfs_header_nritems(eb);
2410         for (i = 0; i < nritems; i++) {
2411                 btrfs_item_key_to_cpu(eb, &key, i);
2412
2413                 /* inode keys are done during the first stage */
2414                 if (key.type == BTRFS_INODE_ITEM_KEY &&
2415                     wc->stage == LOG_WALK_REPLAY_INODES) {
2416                         struct btrfs_inode_item *inode_item;
2417                         u32 mode;
2418
2419                         inode_item = btrfs_item_ptr(eb, i,
2420                                             struct btrfs_inode_item);
2421                         /*
2422                          * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2423                          * and never got linked before the fsync, skip it, as
2424                          * replaying it is pointless since it would be deleted
2425                          * later. We skip logging tmpfiles, but it's always
2426                          * possible we are replaying a log created with a kernel
2427                          * that used to log tmpfiles.
2428                          */
2429                         if (btrfs_inode_nlink(eb, inode_item) == 0) {
2430                                 wc->ignore_cur_inode = true;
2431                                 continue;
2432                         } else {
2433                                 wc->ignore_cur_inode = false;
2434                         }
2435                         ret = replay_xattr_deletes(wc->trans, root, log,
2436                                                    path, key.objectid);
2437                         if (ret)
2438                                 break;
2439                         mode = btrfs_inode_mode(eb, inode_item);
2440                         if (S_ISDIR(mode)) {
2441                                 ret = replay_dir_deletes(wc->trans,
2442                                          root, log, path, key.objectid, 0);
2443                                 if (ret)
2444                                         break;
2445                         }
2446                         ret = overwrite_item(wc->trans, root, path,
2447                                              eb, i, &key);
2448                         if (ret)
2449                                 break;
2450
2451                         /*
2452                          * Before replaying extents, truncate the inode to its
2453                          * size. We need to do it now and not after log replay
2454                          * because before an fsync we can have prealloc extents
2455                          * added beyond the inode's i_size. If we did it after,
2456                          * through orphan cleanup for example, we would drop
2457                          * those prealloc extents just after replaying them.
2458                          */
2459                         if (S_ISREG(mode)) {
2460                                 struct btrfs_drop_extents_args drop_args = { 0 };
2461                                 struct inode *inode;
2462                                 u64 from;
2463
2464                                 inode = read_one_inode(root, key.objectid);
2465                                 if (!inode) {
2466                                         ret = -EIO;
2467                                         break;
2468                                 }
2469                                 from = ALIGN(i_size_read(inode),
2470                                              root->fs_info->sectorsize);
2471                                 drop_args.start = from;
2472                                 drop_args.end = (u64)-1;
2473                                 drop_args.drop_cache = true;
2474                                 ret = btrfs_drop_extents(wc->trans, root,
2475                                                          BTRFS_I(inode),
2476                                                          &drop_args);
2477                                 if (!ret) {
2478                                         inode_sub_bytes(inode,
2479                                                         drop_args.bytes_found);
2480                                         /* Update the inode's nbytes. */
2481                                         ret = btrfs_update_inode(wc->trans,
2482                                                                  BTRFS_I(inode));
2483                                 }
2484                                 iput(inode);
2485                                 if (ret)
2486                                         break;
2487                         }
2488
2489                         ret = link_to_fixup_dir(wc->trans, root,
2490                                                 path, key.objectid);
2491                         if (ret)
2492                                 break;
2493                 }
2494
2495                 if (wc->ignore_cur_inode)
2496                         continue;
2497
2498                 if (key.type == BTRFS_DIR_INDEX_KEY &&
2499                     wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2500                         ret = replay_one_dir_item(wc->trans, root, path,
2501                                                   eb, i, &key);
2502                         if (ret)
2503                                 break;
2504                 }
2505
2506                 if (wc->stage < LOG_WALK_REPLAY_ALL)
2507                         continue;
2508
2509                 /* these keys are simply copied */
2510                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2511                         ret = overwrite_item(wc->trans, root, path,
2512                                              eb, i, &key);
2513                         if (ret)
2514                                 break;
2515                 } else if (key.type == BTRFS_INODE_REF_KEY ||
2516                            key.type == BTRFS_INODE_EXTREF_KEY) {
2517                         ret = add_inode_ref(wc->trans, root, log, path,
2518                                             eb, i, &key);
2519                         if (ret && ret != -ENOENT)
2520                                 break;
2521                         ret = 0;
2522                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2523                         ret = replay_one_extent(wc->trans, root, path,
2524                                                 eb, i, &key);
2525                         if (ret)
2526                                 break;
2527                 }
2528                 /*
2529                  * We don't log BTRFS_DIR_ITEM_KEY keys anymore, only the
2530                  * BTRFS_DIR_INDEX_KEY items which we use to derive the
2531                  * BTRFS_DIR_ITEM_KEY items. If we are replaying a log from an
2532                  * older kernel with such keys, ignore them.
2533                  */
2534         }
2535         btrfs_free_path(path);
2536         return ret;
2537 }
2538
2539 /*
2540  * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2541  */
2542 static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2543 {
2544         struct btrfs_block_group *cache;
2545
2546         cache = btrfs_lookup_block_group(fs_info, start);
2547         if (!cache) {
2548                 btrfs_err(fs_info, "unable to find block group for %llu", start);
2549                 return;
2550         }
2551
2552         spin_lock(&cache->space_info->lock);
2553         spin_lock(&cache->lock);
2554         cache->reserved -= fs_info->nodesize;
2555         cache->space_info->bytes_reserved -= fs_info->nodesize;
2556         spin_unlock(&cache->lock);
2557         spin_unlock(&cache->space_info->lock);
2558
2559         btrfs_put_block_group(cache);
2560 }
2561
2562 static int clean_log_buffer(struct btrfs_trans_handle *trans,
2563                             struct extent_buffer *eb)
2564 {
2565         int ret;
2566
2567         btrfs_tree_lock(eb);
2568         btrfs_clear_buffer_dirty(trans, eb);
2569         wait_on_extent_buffer_writeback(eb);
2570         btrfs_tree_unlock(eb);
2571
2572         if (trans) {
2573                 ret = btrfs_pin_reserved_extent(trans, eb);
2574                 if (ret)
2575                         return ret;
2576         } else {
2577                 unaccount_log_buffer(eb->fs_info, eb->start);
2578         }
2579
2580         return 0;
2581 }
2582
2583 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2584                                    struct btrfs_root *root,
2585                                    struct btrfs_path *path, int *level,
2586                                    struct walk_control *wc)
2587 {
2588         struct btrfs_fs_info *fs_info = root->fs_info;
2589         u64 bytenr;
2590         u64 ptr_gen;
2591         struct extent_buffer *next;
2592         struct extent_buffer *cur;
2593         int ret = 0;
2594
2595         while (*level > 0) {
2596                 struct btrfs_tree_parent_check check = { 0 };
2597
2598                 cur = path->nodes[*level];
2599
2600                 WARN_ON(btrfs_header_level(cur) != *level);
2601
2602                 if (path->slots[*level] >=
2603                     btrfs_header_nritems(cur))
2604                         break;
2605
2606                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2607                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2608                 check.transid = ptr_gen;
2609                 check.level = *level - 1;
2610                 check.has_first_key = true;
2611                 btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]);
2612
2613                 next = btrfs_find_create_tree_block(fs_info, bytenr,
2614                                                     btrfs_header_owner(cur),
2615                                                     *level - 1);
2616                 if (IS_ERR(next))
2617                         return PTR_ERR(next);
2618
2619                 if (*level == 1) {
2620                         ret = wc->process_func(root, next, wc, ptr_gen,
2621                                                *level - 1);
2622                         if (ret) {
2623                                 free_extent_buffer(next);
2624                                 return ret;
2625                         }
2626
2627                         path->slots[*level]++;
2628                         if (wc->free) {
2629                                 ret = btrfs_read_extent_buffer(next, &check);
2630                                 if (ret) {
2631                                         free_extent_buffer(next);
2632                                         return ret;
2633                                 }
2634
2635                                 ret = clean_log_buffer(trans, next);
2636                                 if (ret) {
2637                                         free_extent_buffer(next);
2638                                         return ret;
2639                                 }
2640                         }
2641                         free_extent_buffer(next);
2642                         continue;
2643                 }
2644                 ret = btrfs_read_extent_buffer(next, &check);
2645                 if (ret) {
2646                         free_extent_buffer(next);
2647                         return ret;
2648                 }
2649
2650                 if (path->nodes[*level-1])
2651                         free_extent_buffer(path->nodes[*level-1]);
2652                 path->nodes[*level-1] = next;
2653                 *level = btrfs_header_level(next);
2654                 path->slots[*level] = 0;
2655                 cond_resched();
2656         }
2657         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2658
2659         cond_resched();
2660         return 0;
2661 }
2662
2663 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2664                                  struct btrfs_root *root,
2665                                  struct btrfs_path *path, int *level,
2666                                  struct walk_control *wc)
2667 {
2668         int i;
2669         int slot;
2670         int ret;
2671
2672         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2673                 slot = path->slots[i];
2674                 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2675                         path->slots[i]++;
2676                         *level = i;
2677                         WARN_ON(*level == 0);
2678                         return 0;
2679                 } else {
2680                         ret = wc->process_func(root, path->nodes[*level], wc,
2681                                  btrfs_header_generation(path->nodes[*level]),
2682                                  *level);
2683                         if (ret)
2684                                 return ret;
2685
2686                         if (wc->free) {
2687                                 ret = clean_log_buffer(trans, path->nodes[*level]);
2688                                 if (ret)
2689                                         return ret;
2690                         }
2691                         free_extent_buffer(path->nodes[*level]);
2692                         path->nodes[*level] = NULL;
2693                         *level = i + 1;
2694                 }
2695         }
2696         return 1;
2697 }
2698
2699 /*
2700  * drop the reference count on the tree rooted at 'snap'.  This traverses
2701  * the tree freeing any blocks that have a ref count of zero after being
2702  * decremented.
2703  */
2704 static int walk_log_tree(struct btrfs_trans_handle *trans,
2705                          struct btrfs_root *log, struct walk_control *wc)
2706 {
2707         int ret = 0;
2708         int wret;
2709         int level;
2710         struct btrfs_path *path;
2711         int orig_level;
2712
2713         path = btrfs_alloc_path();
2714         if (!path)
2715                 return -ENOMEM;
2716
2717         level = btrfs_header_level(log->node);
2718         orig_level = level;
2719         path->nodes[level] = log->node;
2720         atomic_inc(&log->node->refs);
2721         path->slots[level] = 0;
2722
2723         while (1) {
2724                 wret = walk_down_log_tree(trans, log, path, &level, wc);
2725                 if (wret > 0)
2726                         break;
2727                 if (wret < 0) {
2728                         ret = wret;
2729                         goto out;
2730                 }
2731
2732                 wret = walk_up_log_tree(trans, log, path, &level, wc);
2733                 if (wret > 0)
2734                         break;
2735                 if (wret < 0) {
2736                         ret = wret;
2737                         goto out;
2738                 }
2739         }
2740
2741         /* was the root node processed? if not, catch it here */
2742         if (path->nodes[orig_level]) {
2743                 ret = wc->process_func(log, path->nodes[orig_level], wc,
2744                          btrfs_header_generation(path->nodes[orig_level]),
2745                          orig_level);
2746                 if (ret)
2747                         goto out;
2748                 if (wc->free)
2749                         ret = clean_log_buffer(trans, path->nodes[orig_level]);
2750         }
2751
2752 out:
2753         btrfs_free_path(path);
2754         return ret;
2755 }
2756
2757 /*
2758  * helper function to update the item for a given subvolumes log root
2759  * in the tree of log roots
2760  */
2761 static int update_log_root(struct btrfs_trans_handle *trans,
2762                            struct btrfs_root *log,
2763                            struct btrfs_root_item *root_item)
2764 {
2765         struct btrfs_fs_info *fs_info = log->fs_info;
2766         int ret;
2767
2768         if (log->log_transid == 1) {
2769                 /* insert root item on the first sync */
2770                 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2771                                 &log->root_key, root_item);
2772         } else {
2773                 ret = btrfs_update_root(trans, fs_info->log_root_tree,
2774                                 &log->root_key, root_item);
2775         }
2776         return ret;
2777 }
2778
2779 static void wait_log_commit(struct btrfs_root *root, int transid)
2780 {
2781         DEFINE_WAIT(wait);
2782         int index = transid % 2;
2783
2784         /*
2785          * we only allow two pending log transactions at a time,
2786          * so we know that if ours is more than 2 older than the
2787          * current transaction, we're done
2788          */
2789         for (;;) {
2790                 prepare_to_wait(&root->log_commit_wait[index],
2791                                 &wait, TASK_UNINTERRUPTIBLE);
2792
2793                 if (!(root->log_transid_committed < transid &&
2794                       atomic_read(&root->log_commit[index])))
2795                         break;
2796
2797                 mutex_unlock(&root->log_mutex);
2798                 schedule();
2799                 mutex_lock(&root->log_mutex);
2800         }
2801         finish_wait(&root->log_commit_wait[index], &wait);
2802 }
2803
2804 static void wait_for_writer(struct btrfs_root *root)
2805 {
2806         DEFINE_WAIT(wait);
2807
2808         for (;;) {
2809                 prepare_to_wait(&root->log_writer_wait, &wait,
2810                                 TASK_UNINTERRUPTIBLE);
2811                 if (!atomic_read(&root->log_writers))
2812                         break;
2813
2814                 mutex_unlock(&root->log_mutex);
2815                 schedule();
2816                 mutex_lock(&root->log_mutex);
2817         }
2818         finish_wait(&root->log_writer_wait, &wait);
2819 }
2820
2821 void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode)
2822 {
2823         ctx->log_ret = 0;
2824         ctx->log_transid = 0;
2825         ctx->log_new_dentries = false;
2826         ctx->logging_new_name = false;
2827         ctx->logging_new_delayed_dentries = false;
2828         ctx->logged_before = false;
2829         ctx->inode = inode;
2830         INIT_LIST_HEAD(&ctx->list);
2831         INIT_LIST_HEAD(&ctx->ordered_extents);
2832         INIT_LIST_HEAD(&ctx->conflict_inodes);
2833         ctx->num_conflict_inodes = 0;
2834         ctx->logging_conflict_inodes = false;
2835         ctx->scratch_eb = NULL;
2836 }
2837
2838 void btrfs_init_log_ctx_scratch_eb(struct btrfs_log_ctx *ctx)
2839 {
2840         struct btrfs_inode *inode = BTRFS_I(ctx->inode);
2841
2842         if (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
2843             !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
2844                 return;
2845
2846         /*
2847          * Don't care about allocation failure. This is just for optimization,
2848          * if we fail to allocate here, we will try again later if needed.
2849          */
2850         ctx->scratch_eb = alloc_dummy_extent_buffer(inode->root->fs_info, 0);
2851 }
2852
2853 void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx)
2854 {
2855         struct btrfs_ordered_extent *ordered;
2856         struct btrfs_ordered_extent *tmp;
2857
2858         ASSERT(inode_is_locked(ctx->inode));
2859
2860         list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
2861                 list_del_init(&ordered->log_list);
2862                 btrfs_put_ordered_extent(ordered);
2863         }
2864 }
2865
2866
2867 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2868                                         struct btrfs_log_ctx *ctx)
2869 {
2870         mutex_lock(&root->log_mutex);
2871         list_del_init(&ctx->list);
2872         mutex_unlock(&root->log_mutex);
2873 }
2874
2875 /*
2876  * Invoked in log mutex context, or be sure there is no other task which
2877  * can access the list.
2878  */
2879 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2880                                              int index, int error)
2881 {
2882         struct btrfs_log_ctx *ctx;
2883         struct btrfs_log_ctx *safe;
2884
2885         list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2886                 list_del_init(&ctx->list);
2887                 ctx->log_ret = error;
2888         }
2889 }
2890
2891 /*
2892  * Sends a given tree log down to the disk and updates the super blocks to
2893  * record it.  When this call is done, you know that any inodes previously
2894  * logged are safely on disk only if it returns 0.
2895  *
2896  * Any other return value means you need to call btrfs_commit_transaction.
2897  * Some of the edge cases for fsyncing directories that have had unlinks
2898  * or renames done in the past mean that sometimes the only safe
2899  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2900  * that has happened.
2901  */
2902 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2903                    struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2904 {
2905         int index1;
2906         int index2;
2907         int mark;
2908         int ret;
2909         struct btrfs_fs_info *fs_info = root->fs_info;
2910         struct btrfs_root *log = root->log_root;
2911         struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2912         struct btrfs_root_item new_root_item;
2913         int log_transid = 0;
2914         struct btrfs_log_ctx root_log_ctx;
2915         struct blk_plug plug;
2916         u64 log_root_start;
2917         u64 log_root_level;
2918
2919         mutex_lock(&root->log_mutex);
2920         log_transid = ctx->log_transid;
2921         if (root->log_transid_committed >= log_transid) {
2922                 mutex_unlock(&root->log_mutex);
2923                 return ctx->log_ret;
2924         }
2925
2926         index1 = log_transid % 2;
2927         if (atomic_read(&root->log_commit[index1])) {
2928                 wait_log_commit(root, log_transid);
2929                 mutex_unlock(&root->log_mutex);
2930                 return ctx->log_ret;
2931         }
2932         ASSERT(log_transid == root->log_transid);
2933         atomic_set(&root->log_commit[index1], 1);
2934
2935         /* wait for previous tree log sync to complete */
2936         if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2937                 wait_log_commit(root, log_transid - 1);
2938
2939         while (1) {
2940                 int batch = atomic_read(&root->log_batch);
2941                 /* when we're on an ssd, just kick the log commit out */
2942                 if (!btrfs_test_opt(fs_info, SSD) &&
2943                     test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2944                         mutex_unlock(&root->log_mutex);
2945                         schedule_timeout_uninterruptible(1);
2946                         mutex_lock(&root->log_mutex);
2947                 }
2948                 wait_for_writer(root);
2949                 if (batch == atomic_read(&root->log_batch))
2950                         break;
2951         }
2952
2953         /* bail out if we need to do a full commit */
2954         if (btrfs_need_log_full_commit(trans)) {
2955                 ret = BTRFS_LOG_FORCE_COMMIT;
2956                 mutex_unlock(&root->log_mutex);
2957                 goto out;
2958         }
2959
2960         if (log_transid % 2 == 0)
2961                 mark = EXTENT_DIRTY;
2962         else
2963                 mark = EXTENT_NEW;
2964
2965         /* we start IO on  all the marked extents here, but we don't actually
2966          * wait for them until later.
2967          */
2968         blk_start_plug(&plug);
2969         ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
2970         /*
2971          * -EAGAIN happens when someone, e.g., a concurrent transaction
2972          *  commit, writes a dirty extent in this tree-log commit. This
2973          *  concurrent write will create a hole writing out the extents,
2974          *  and we cannot proceed on a zoned filesystem, requiring
2975          *  sequential writing. While we can bail out to a full commit
2976          *  here, but we can continue hoping the concurrent writing fills
2977          *  the hole.
2978          */
2979         if (ret == -EAGAIN && btrfs_is_zoned(fs_info))
2980                 ret = 0;
2981         if (ret) {
2982                 blk_finish_plug(&plug);
2983                 btrfs_set_log_full_commit(trans);
2984                 mutex_unlock(&root->log_mutex);
2985                 goto out;
2986         }
2987
2988         /*
2989          * We _must_ update under the root->log_mutex in order to make sure we
2990          * have a consistent view of the log root we are trying to commit at
2991          * this moment.
2992          *
2993          * We _must_ copy this into a local copy, because we are not holding the
2994          * log_root_tree->log_mutex yet.  This is important because when we
2995          * commit the log_root_tree we must have a consistent view of the
2996          * log_root_tree when we update the super block to point at the
2997          * log_root_tree bytenr.  If we update the log_root_tree here we'll race
2998          * with the commit and possibly point at the new block which we may not
2999          * have written out.
3000          */
3001         btrfs_set_root_node(&log->root_item, log->node);
3002         memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3003
3004         btrfs_set_root_log_transid(root, root->log_transid + 1);
3005         log->log_transid = root->log_transid;
3006         root->log_start_pid = 0;
3007         /*
3008          * IO has been started, blocks of the log tree have WRITTEN flag set
3009          * in their headers. new modifications of the log will be written to
3010          * new positions. so it's safe to allow log writers to go in.
3011          */
3012         mutex_unlock(&root->log_mutex);
3013
3014         if (btrfs_is_zoned(fs_info)) {
3015                 mutex_lock(&fs_info->tree_root->log_mutex);
3016                 if (!log_root_tree->node) {
3017                         ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
3018                         if (ret) {
3019                                 mutex_unlock(&fs_info->tree_root->log_mutex);
3020                                 blk_finish_plug(&plug);
3021                                 goto out;
3022                         }
3023                 }
3024                 mutex_unlock(&fs_info->tree_root->log_mutex);
3025         }
3026
3027         btrfs_init_log_ctx(&root_log_ctx, NULL);
3028
3029         mutex_lock(&log_root_tree->log_mutex);
3030
3031         index2 = log_root_tree->log_transid % 2;
3032         list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3033         root_log_ctx.log_transid = log_root_tree->log_transid;
3034
3035         /*
3036          * Now we are safe to update the log_root_tree because we're under the
3037          * log_mutex, and we're a current writer so we're holding the commit
3038          * open until we drop the log_mutex.
3039          */
3040         ret = update_log_root(trans, log, &new_root_item);
3041         if (ret) {
3042                 list_del_init(&root_log_ctx.list);
3043                 blk_finish_plug(&plug);
3044                 btrfs_set_log_full_commit(trans);
3045                 if (ret != -ENOSPC)
3046                         btrfs_err(fs_info,
3047                                   "failed to update log for root %llu ret %d",
3048                                   root->root_key.objectid, ret);
3049                 btrfs_wait_tree_log_extents(log, mark);
3050                 mutex_unlock(&log_root_tree->log_mutex);
3051                 goto out;
3052         }
3053
3054         if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3055                 blk_finish_plug(&plug);
3056                 list_del_init(&root_log_ctx.list);
3057                 mutex_unlock(&log_root_tree->log_mutex);
3058                 ret = root_log_ctx.log_ret;
3059                 goto out;
3060         }
3061
3062         if (atomic_read(&log_root_tree->log_commit[index2])) {
3063                 blk_finish_plug(&plug);
3064                 ret = btrfs_wait_tree_log_extents(log, mark);
3065                 wait_log_commit(log_root_tree,
3066                                 root_log_ctx.log_transid);
3067                 mutex_unlock(&log_root_tree->log_mutex);
3068                 if (!ret)
3069                         ret = root_log_ctx.log_ret;
3070                 goto out;
3071         }
3072         ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3073         atomic_set(&log_root_tree->log_commit[index2], 1);
3074
3075         if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3076                 wait_log_commit(log_root_tree,
3077                                 root_log_ctx.log_transid - 1);
3078         }
3079
3080         /*
3081          * now that we've moved on to the tree of log tree roots,
3082          * check the full commit flag again
3083          */
3084         if (btrfs_need_log_full_commit(trans)) {
3085                 blk_finish_plug(&plug);
3086                 btrfs_wait_tree_log_extents(log, mark);
3087                 mutex_unlock(&log_root_tree->log_mutex);
3088                 ret = BTRFS_LOG_FORCE_COMMIT;
3089                 goto out_wake_log_root;
3090         }
3091
3092         ret = btrfs_write_marked_extents(fs_info,
3093                                          &log_root_tree->dirty_log_pages,
3094                                          EXTENT_DIRTY | EXTENT_NEW);
3095         blk_finish_plug(&plug);
3096         /*
3097          * As described above, -EAGAIN indicates a hole in the extents. We
3098          * cannot wait for these write outs since the waiting cause a
3099          * deadlock. Bail out to the full commit instead.
3100          */
3101         if (ret == -EAGAIN && btrfs_is_zoned(fs_info)) {
3102                 btrfs_set_log_full_commit(trans);
3103                 btrfs_wait_tree_log_extents(log, mark);
3104                 mutex_unlock(&log_root_tree->log_mutex);
3105                 goto out_wake_log_root;
3106         } else if (ret) {
3107                 btrfs_set_log_full_commit(trans);
3108                 mutex_unlock(&log_root_tree->log_mutex);
3109                 goto out_wake_log_root;
3110         }
3111         ret = btrfs_wait_tree_log_extents(log, mark);
3112         if (!ret)
3113                 ret = btrfs_wait_tree_log_extents(log_root_tree,
3114                                                   EXTENT_NEW | EXTENT_DIRTY);
3115         if (ret) {
3116                 btrfs_set_log_full_commit(trans);
3117                 mutex_unlock(&log_root_tree->log_mutex);
3118                 goto out_wake_log_root;
3119         }
3120
3121         log_root_start = log_root_tree->node->start;
3122         log_root_level = btrfs_header_level(log_root_tree->node);
3123         log_root_tree->log_transid++;
3124         mutex_unlock(&log_root_tree->log_mutex);
3125
3126         /*
3127          * Here we are guaranteed that nobody is going to write the superblock
3128          * for the current transaction before us and that neither we do write
3129          * our superblock before the previous transaction finishes its commit
3130          * and writes its superblock, because:
3131          *
3132          * 1) We are holding a handle on the current transaction, so no body
3133          *    can commit it until we release the handle;
3134          *
3135          * 2) Before writing our superblock we acquire the tree_log_mutex, so
3136          *    if the previous transaction is still committing, and hasn't yet
3137          *    written its superblock, we wait for it to do it, because a
3138          *    transaction commit acquires the tree_log_mutex when the commit
3139          *    begins and releases it only after writing its superblock.
3140          */
3141         mutex_lock(&fs_info->tree_log_mutex);
3142
3143         /*
3144          * The previous transaction writeout phase could have failed, and thus
3145          * marked the fs in an error state.  We must not commit here, as we
3146          * could have updated our generation in the super_for_commit and
3147          * writing the super here would result in transid mismatches.  If there
3148          * is an error here just bail.
3149          */
3150         if (BTRFS_FS_ERROR(fs_info)) {
3151                 ret = -EIO;
3152                 btrfs_set_log_full_commit(trans);
3153                 btrfs_abort_transaction(trans, ret);
3154                 mutex_unlock(&fs_info->tree_log_mutex);
3155                 goto out_wake_log_root;
3156         }
3157
3158         btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start);
3159         btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level);
3160         ret = write_all_supers(fs_info, 1);
3161         mutex_unlock(&fs_info->tree_log_mutex);
3162         if (ret) {
3163                 btrfs_set_log_full_commit(trans);
3164                 btrfs_abort_transaction(trans, ret);
3165                 goto out_wake_log_root;
3166         }
3167
3168         /*
3169          * We know there can only be one task here, since we have not yet set
3170          * root->log_commit[index1] to 0 and any task attempting to sync the
3171          * log must wait for the previous log transaction to commit if it's
3172          * still in progress or wait for the current log transaction commit if
3173          * someone else already started it. We use <= and not < because the
3174          * first log transaction has an ID of 0.
3175          */
3176         ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid);
3177         btrfs_set_root_last_log_commit(root, log_transid);
3178
3179 out_wake_log_root:
3180         mutex_lock(&log_root_tree->log_mutex);
3181         btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3182
3183         log_root_tree->log_transid_committed++;
3184         atomic_set(&log_root_tree->log_commit[index2], 0);
3185         mutex_unlock(&log_root_tree->log_mutex);
3186
3187         /*
3188          * The barrier before waitqueue_active (in cond_wake_up) is needed so
3189          * all the updates above are seen by the woken threads. It might not be
3190          * necessary, but proving that seems to be hard.
3191          */
3192         cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3193 out:
3194         mutex_lock(&root->log_mutex);
3195         btrfs_remove_all_log_ctxs(root, index1, ret);
3196         root->log_transid_committed++;
3197         atomic_set(&root->log_commit[index1], 0);
3198         mutex_unlock(&root->log_mutex);
3199
3200         /*
3201          * The barrier before waitqueue_active (in cond_wake_up) is needed so
3202          * all the updates above are seen by the woken threads. It might not be
3203          * necessary, but proving that seems to be hard.
3204          */
3205         cond_wake_up(&root->log_commit_wait[index1]);
3206         return ret;
3207 }
3208
3209 static void free_log_tree(struct btrfs_trans_handle *trans,
3210                           struct btrfs_root *log)
3211 {
3212         int ret;
3213         struct walk_control wc = {
3214                 .free = 1,
3215                 .process_func = process_one_buffer
3216         };
3217
3218         if (log->node) {
3219                 ret = walk_log_tree(trans, log, &wc);
3220                 if (ret) {
3221                         /*
3222                          * We weren't able to traverse the entire log tree, the
3223                          * typical scenario is getting an -EIO when reading an
3224                          * extent buffer of the tree, due to a previous writeback
3225                          * failure of it.
3226                          */
3227                         set_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
3228                                 &log->fs_info->fs_state);
3229
3230                         /*
3231                          * Some extent buffers of the log tree may still be dirty
3232                          * and not yet written back to storage, because we may
3233                          * have updates to a log tree without syncing a log tree,
3234                          * such as during rename and link operations. So flush
3235                          * them out and wait for their writeback to complete, so
3236                          * that we properly cleanup their state and pages.
3237                          */
3238                         btrfs_write_marked_extents(log->fs_info,
3239                                                    &log->dirty_log_pages,
3240                                                    EXTENT_DIRTY | EXTENT_NEW);
3241                         btrfs_wait_tree_log_extents(log,
3242                                                     EXTENT_DIRTY | EXTENT_NEW);
3243
3244                         if (trans)
3245                                 btrfs_abort_transaction(trans, ret);
3246                         else
3247                                 btrfs_handle_fs_error(log->fs_info, ret, NULL);
3248                 }
3249         }
3250
3251         extent_io_tree_release(&log->dirty_log_pages);
3252         extent_io_tree_release(&log->log_csum_range);
3253
3254         btrfs_put_root(log);
3255 }
3256
3257 /*
3258  * free all the extents used by the tree log.  This should be called
3259  * at commit time of the full transaction
3260  */
3261 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3262 {
3263         if (root->log_root) {
3264                 free_log_tree(trans, root->log_root);
3265                 root->log_root = NULL;
3266                 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3267         }
3268         return 0;
3269 }
3270
3271 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3272                              struct btrfs_fs_info *fs_info)
3273 {
3274         if (fs_info->log_root_tree) {
3275                 free_log_tree(trans, fs_info->log_root_tree);
3276                 fs_info->log_root_tree = NULL;
3277                 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state);
3278         }
3279         return 0;
3280 }
3281
3282 /*
3283  * Check if an inode was logged in the current transaction. This correctly deals
3284  * with the case where the inode was logged but has a logged_trans of 0, which
3285  * happens if the inode is evicted and loaded again, as logged_trans is an in
3286  * memory only field (not persisted).
3287  *
3288  * Returns 1 if the inode was logged before in the transaction, 0 if it was not,
3289  * and < 0 on error.
3290  */
3291 static int inode_logged(const struct btrfs_trans_handle *trans,
3292                         struct btrfs_inode *inode,
3293                         struct btrfs_path *path_in)
3294 {
3295         struct btrfs_path *path = path_in;
3296         struct btrfs_key key;
3297         int ret;
3298
3299         if (inode->logged_trans == trans->transid)
3300                 return 1;
3301
3302         /*
3303          * If logged_trans is not 0, then we know the inode logged was not logged
3304          * in this transaction, so we can return false right away.
3305          */
3306         if (inode->logged_trans > 0)
3307                 return 0;
3308
3309         /*
3310          * If no log tree was created for this root in this transaction, then
3311          * the inode can not have been logged in this transaction. In that case
3312          * set logged_trans to anything greater than 0 and less than the current
3313          * transaction's ID, to avoid the search below in a future call in case
3314          * a log tree gets created after this.
3315          */
3316         if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) {
3317                 inode->logged_trans = trans->transid - 1;
3318                 return 0;
3319         }
3320
3321         /*
3322          * We have a log tree and the inode's logged_trans is 0. We can't tell
3323          * for sure if the inode was logged before in this transaction by looking
3324          * only at logged_trans. We could be pessimistic and assume it was, but
3325          * that can lead to unnecessarily logging an inode during rename and link
3326          * operations, and then further updating the log in followup rename and
3327          * link operations, specially if it's a directory, which adds latency
3328          * visible to applications doing a series of rename or link operations.
3329          *
3330          * A logged_trans of 0 here can mean several things:
3331          *
3332          * 1) The inode was never logged since the filesystem was mounted, and may
3333          *    or may have not been evicted and loaded again;
3334          *
3335          * 2) The inode was logged in a previous transaction, then evicted and
3336          *    then loaded again;
3337          *
3338          * 3) The inode was logged in the current transaction, then evicted and
3339          *    then loaded again.
3340          *
3341          * For cases 1) and 2) we don't want to return true, but we need to detect
3342          * case 3) and return true. So we do a search in the log root for the inode
3343          * item.
3344          */
3345         key.objectid = btrfs_ino(inode);
3346         key.type = BTRFS_INODE_ITEM_KEY;
3347         key.offset = 0;
3348
3349         if (!path) {
3350                 path = btrfs_alloc_path();
3351                 if (!path)
3352                         return -ENOMEM;
3353         }
3354
3355         ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
3356
3357         if (path_in)
3358                 btrfs_release_path(path);
3359         else
3360                 btrfs_free_path(path);
3361
3362         /*
3363          * Logging an inode always results in logging its inode item. So if we
3364          * did not find the item we know the inode was not logged for sure.
3365          */
3366         if (ret < 0) {
3367                 return ret;
3368         } else if (ret > 0) {
3369                 /*
3370                  * Set logged_trans to a value greater than 0 and less then the
3371                  * current transaction to avoid doing the search in future calls.
3372                  */
3373                 inode->logged_trans = trans->transid - 1;
3374                 return 0;
3375         }
3376
3377         /*
3378          * The inode was previously logged and then evicted, set logged_trans to
3379          * the current transacion's ID, to avoid future tree searches as long as
3380          * the inode is not evicted again.
3381          */
3382         inode->logged_trans = trans->transid;
3383
3384         /*
3385          * If it's a directory, then we must set last_dir_index_offset to the
3386          * maximum possible value, so that the next attempt to log the inode does
3387          * not skip checking if dir index keys found in modified subvolume tree
3388          * leaves have been logged before, otherwise it would result in attempts
3389          * to insert duplicate dir index keys in the log tree. This must be done
3390          * because last_dir_index_offset is an in-memory only field, not persisted
3391          * in the inode item or any other on-disk structure, so its value is lost
3392          * once the inode is evicted.
3393          */
3394         if (S_ISDIR(inode->vfs_inode.i_mode))
3395                 inode->last_dir_index_offset = (u64)-1;
3396
3397         return 1;
3398 }
3399
3400 /*
3401  * Delete a directory entry from the log if it exists.
3402  *
3403  * Returns < 0 on error
3404  *           1 if the entry does not exists
3405  *           0 if the entry existed and was successfully deleted
3406  */
3407 static int del_logged_dentry(struct btrfs_trans_handle *trans,
3408                              struct btrfs_root *log,
3409                              struct btrfs_path *path,
3410                              u64 dir_ino,
3411                              const struct fscrypt_str *name,
3412                              u64 index)
3413 {
3414         struct btrfs_dir_item *di;
3415
3416         /*
3417          * We only log dir index items of a directory, so we don't need to look
3418          * for dir item keys.
3419          */
3420         di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3421                                          index, name, -1);
3422         if (IS_ERR(di))
3423                 return PTR_ERR(di);
3424         else if (!di)
3425                 return 1;
3426
3427         /*
3428          * We do not need to update the size field of the directory's
3429          * inode item because on log replay we update the field to reflect
3430          * all existing entries in the directory (see overwrite_item()).
3431          */
3432         return btrfs_delete_one_dir_name(trans, log, path, di);
3433 }
3434
3435 /*
3436  * If both a file and directory are logged, and unlinks or renames are
3437  * mixed in, we have a few interesting corners:
3438  *
3439  * create file X in dir Y
3440  * link file X to X.link in dir Y
3441  * fsync file X
3442  * unlink file X but leave X.link
3443  * fsync dir Y
3444  *
3445  * After a crash we would expect only X.link to exist.  But file X
3446  * didn't get fsync'd again so the log has back refs for X and X.link.
3447  *
3448  * We solve this by removing directory entries and inode backrefs from the
3449  * log when a file that was logged in the current transaction is
3450  * unlinked.  Any later fsync will include the updated log entries, and
3451  * we'll be able to reconstruct the proper directory items from backrefs.
3452  *
3453  * This optimizations allows us to avoid relogging the entire inode
3454  * or the entire directory.
3455  */
3456 void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3457                                   struct btrfs_root *root,
3458                                   const struct fscrypt_str *name,
3459                                   struct btrfs_inode *dir, u64 index)
3460 {
3461         struct btrfs_path *path;
3462         int ret;
3463
3464         ret = inode_logged(trans, dir, NULL);
3465         if (ret == 0)
3466                 return;
3467         else if (ret < 0) {
3468                 btrfs_set_log_full_commit(trans);
3469                 return;
3470         }
3471
3472         ret = join_running_log_trans(root);
3473         if (ret)
3474                 return;
3475
3476         mutex_lock(&dir->log_mutex);
3477
3478         path = btrfs_alloc_path();
3479         if (!path) {
3480                 ret = -ENOMEM;
3481                 goto out_unlock;
3482         }
3483
3484         ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir),
3485                                 name, index);
3486         btrfs_free_path(path);
3487 out_unlock:
3488         mutex_unlock(&dir->log_mutex);
3489         if (ret < 0)
3490                 btrfs_set_log_full_commit(trans);
3491         btrfs_end_log_trans(root);
3492 }
3493
3494 /* see comments for btrfs_del_dir_entries_in_log */
3495 void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3496                                 struct btrfs_root *root,
3497                                 const struct fscrypt_str *name,
3498                                 struct btrfs_inode *inode, u64 dirid)
3499 {
3500         struct btrfs_root *log;
3501         u64 index;
3502         int ret;
3503
3504         ret = inode_logged(trans, inode, NULL);
3505         if (ret == 0)
3506                 return;
3507         else if (ret < 0) {
3508                 btrfs_set_log_full_commit(trans);
3509                 return;
3510         }
3511
3512         ret = join_running_log_trans(root);
3513         if (ret)
3514                 return;
3515         log = root->log_root;
3516         mutex_lock(&inode->log_mutex);
3517
3518         ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode),
3519                                   dirid, &index);
3520         mutex_unlock(&inode->log_mutex);
3521         if (ret < 0 && ret != -ENOENT)
3522                 btrfs_set_log_full_commit(trans);
3523         btrfs_end_log_trans(root);
3524 }
3525
3526 /*
3527  * creates a range item in the log for 'dirid'.  first_offset and
3528  * last_offset tell us which parts of the key space the log should
3529  * be considered authoritative for.
3530  */
3531 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3532                                        struct btrfs_root *log,
3533                                        struct btrfs_path *path,
3534                                        u64 dirid,
3535                                        u64 first_offset, u64 last_offset)
3536 {
3537         int ret;
3538         struct btrfs_key key;
3539         struct btrfs_dir_log_item *item;
3540
3541         key.objectid = dirid;
3542         key.offset = first_offset;
3543         key.type = BTRFS_DIR_LOG_INDEX_KEY;
3544         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3545         /*
3546          * -EEXIST is fine and can happen sporadically when we are logging a
3547          * directory and have concurrent insertions in the subvolume's tree for
3548          * items from other inodes and that result in pushing off some dir items
3549          * from one leaf to another in order to accommodate for the new items.
3550          * This results in logging the same dir index range key.
3551          */
3552         if (ret && ret != -EEXIST)
3553                 return ret;
3554
3555         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3556                               struct btrfs_dir_log_item);
3557         if (ret == -EEXIST) {
3558                 const u64 curr_end = btrfs_dir_log_end(path->nodes[0], item);
3559
3560                 /*
3561                  * btrfs_del_dir_entries_in_log() might have been called during
3562                  * an unlink between the initial insertion of this key and the
3563                  * current update, or we might be logging a single entry deletion
3564                  * during a rename, so set the new last_offset to the max value.
3565                  */
3566                 last_offset = max(last_offset, curr_end);
3567         }
3568         btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3569         btrfs_mark_buffer_dirty(trans, path->nodes[0]);
3570         btrfs_release_path(path);
3571         return 0;
3572 }
3573
3574 static int flush_dir_items_batch(struct btrfs_trans_handle *trans,
3575                                  struct btrfs_inode *inode,
3576                                  struct extent_buffer *src,
3577                                  struct btrfs_path *dst_path,
3578                                  int start_slot,
3579                                  int count)
3580 {
3581         struct btrfs_root *log = inode->root->log_root;
3582         char *ins_data = NULL;
3583         struct btrfs_item_batch batch;
3584         struct extent_buffer *dst;
3585         unsigned long src_offset;
3586         unsigned long dst_offset;
3587         u64 last_index;
3588         struct btrfs_key key;
3589         u32 item_size;
3590         int ret;
3591         int i;
3592
3593         ASSERT(count > 0);
3594         batch.nr = count;
3595
3596         if (count == 1) {
3597                 btrfs_item_key_to_cpu(src, &key, start_slot);
3598                 item_size = btrfs_item_size(src, start_slot);
3599                 batch.keys = &key;
3600                 batch.data_sizes = &item_size;
3601                 batch.total_data_size = item_size;
3602         } else {
3603                 struct btrfs_key *ins_keys;
3604                 u32 *ins_sizes;
3605
3606                 ins_data = kmalloc(count * sizeof(u32) +
3607                                    count * sizeof(struct btrfs_key), GFP_NOFS);
3608                 if (!ins_data)
3609                         return -ENOMEM;
3610
3611                 ins_sizes = (u32 *)ins_data;
3612                 ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32));
3613                 batch.keys = ins_keys;
3614                 batch.data_sizes = ins_sizes;
3615                 batch.total_data_size = 0;
3616
3617                 for (i = 0; i < count; i++) {
3618                         const int slot = start_slot + i;
3619
3620                         btrfs_item_key_to_cpu(src, &ins_keys[i], slot);
3621                         ins_sizes[i] = btrfs_item_size(src, slot);
3622                         batch.total_data_size += ins_sizes[i];
3623                 }
3624         }
3625
3626         ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
3627         if (ret)
3628                 goto out;
3629
3630         dst = dst_path->nodes[0];
3631         /*
3632          * Copy all the items in bulk, in a single copy operation. Item data is
3633          * organized such that it's placed at the end of a leaf and from right
3634          * to left. For example, the data for the second item ends at an offset
3635          * that matches the offset where the data for the first item starts, the
3636          * data for the third item ends at an offset that matches the offset
3637          * where the data of the second items starts, and so on.
3638          * Therefore our source and destination start offsets for copy match the
3639          * offsets of the last items (highest slots).
3640          */
3641         dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1);
3642         src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1);
3643         copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size);
3644         btrfs_release_path(dst_path);
3645
3646         last_index = batch.keys[count - 1].offset;
3647         ASSERT(last_index > inode->last_dir_index_offset);
3648
3649         /*
3650          * If for some unexpected reason the last item's index is not greater
3651          * than the last index we logged, warn and force a transaction commit.
3652          */
3653         if (WARN_ON(last_index <= inode->last_dir_index_offset))
3654                 ret = BTRFS_LOG_FORCE_COMMIT;
3655         else
3656                 inode->last_dir_index_offset = last_index;
3657
3658         if (btrfs_get_first_dir_index_to_log(inode) == 0)
3659                 btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset);
3660 out:
3661         kfree(ins_data);
3662
3663         return ret;
3664 }
3665
3666 static int clone_leaf(struct btrfs_path *path, struct btrfs_log_ctx *ctx)
3667 {
3668         const int slot = path->slots[0];
3669
3670         if (ctx->scratch_eb) {
3671                 copy_extent_buffer_full(ctx->scratch_eb, path->nodes[0]);
3672         } else {
3673                 ctx->scratch_eb = btrfs_clone_extent_buffer(path->nodes[0]);
3674                 if (!ctx->scratch_eb)
3675                         return -ENOMEM;
3676         }
3677
3678         btrfs_release_path(path);
3679         path->nodes[0] = ctx->scratch_eb;
3680         path->slots[0] = slot;
3681         /*
3682          * Add extra ref to scratch eb so that it is not freed when callers
3683          * release the path, so we can reuse it later if needed.
3684          */
3685         atomic_inc(&ctx->scratch_eb->refs);
3686
3687         return 0;
3688 }
3689
3690 static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
3691                                   struct btrfs_inode *inode,
3692                                   struct btrfs_path *path,
3693                                   struct btrfs_path *dst_path,
3694                                   struct btrfs_log_ctx *ctx,
3695                                   u64 *last_old_dentry_offset)
3696 {
3697         struct btrfs_root *log = inode->root->log_root;
3698         struct extent_buffer *src;
3699         const int nritems = btrfs_header_nritems(path->nodes[0]);
3700         const u64 ino = btrfs_ino(inode);
3701         bool last_found = false;
3702         int batch_start = 0;
3703         int batch_size = 0;
3704         int ret;
3705
3706         /*
3707          * We need to clone the leaf, release the read lock on it, and use the
3708          * clone before modifying the log tree. See the comment at copy_items()
3709          * about why we need to do this.
3710          */
3711         ret = clone_leaf(path, ctx);
3712         if (ret < 0)
3713                 return ret;
3714
3715         src = path->nodes[0];
3716
3717         for (int i = path->slots[0]; i < nritems; i++) {
3718                 struct btrfs_dir_item *di;
3719                 struct btrfs_key key;
3720                 int ret;
3721
3722                 btrfs_item_key_to_cpu(src, &key, i);
3723
3724                 if (key.objectid != ino || key.type != BTRFS_DIR_INDEX_KEY) {
3725                         last_found = true;
3726                         break;
3727                 }
3728
3729                 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3730
3731                 /*
3732                  * Skip ranges of items that consist only of dir item keys created
3733                  * in past transactions. However if we find a gap, we must log a
3734                  * dir index range item for that gap, so that index keys in that
3735                  * gap are deleted during log replay.
3736                  */
3737                 if (btrfs_dir_transid(src, di) < trans->transid) {
3738                         if (key.offset > *last_old_dentry_offset + 1) {
3739                                 ret = insert_dir_log_key(trans, log, dst_path,
3740                                                  ino, *last_old_dentry_offset + 1,
3741                                                  key.offset - 1);
3742                                 if (ret < 0)
3743                                         return ret;
3744                         }
3745
3746                         *last_old_dentry_offset = key.offset;
3747                         continue;
3748                 }
3749
3750                 /* If we logged this dir index item before, we can skip it. */
3751                 if (key.offset <= inode->last_dir_index_offset)
3752                         continue;
3753
3754                 /*
3755                  * We must make sure that when we log a directory entry, the
3756                  * corresponding inode, after log replay, has a matching link
3757                  * count. For example:
3758                  *
3759                  * touch foo
3760                  * mkdir mydir
3761                  * sync
3762                  * ln foo mydir/bar
3763                  * xfs_io -c "fsync" mydir
3764                  * <crash>
3765                  * <mount fs and log replay>
3766                  *
3767                  * Would result in a fsync log that when replayed, our file inode
3768                  * would have a link count of 1, but we get two directory entries
3769                  * pointing to the same inode. After removing one of the names,
3770                  * it would not be possible to remove the other name, which
3771                  * resulted always in stale file handle errors, and would not be
3772                  * possible to rmdir the parent directory, since its i_size could
3773                  * never be decremented to the value BTRFS_EMPTY_DIR_SIZE,
3774                  * resulting in -ENOTEMPTY errors.
3775                  */
3776                 if (!ctx->log_new_dentries) {
3777                         struct btrfs_key di_key;
3778
3779                         btrfs_dir_item_key_to_cpu(src, di, &di_key);
3780                         if (di_key.type != BTRFS_ROOT_ITEM_KEY)
3781                                 ctx->log_new_dentries = true;
3782                 }
3783
3784                 if (batch_size == 0)
3785                         batch_start = i;
3786                 batch_size++;
3787         }
3788
3789         if (batch_size > 0) {
3790                 int ret;
3791
3792                 ret = flush_dir_items_batch(trans, inode, src, dst_path,
3793                                             batch_start, batch_size);
3794                 if (ret < 0)
3795                         return ret;
3796         }
3797
3798         return last_found ? 1 : 0;
3799 }
3800
3801 /*
3802  * log all the items included in the current transaction for a given
3803  * directory.  This also creates the range items in the log tree required
3804  * to replay anything deleted before the fsync
3805  */
3806 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3807                           struct btrfs_inode *inode,
3808                           struct btrfs_path *path,
3809                           struct btrfs_path *dst_path,
3810                           struct btrfs_log_ctx *ctx,
3811                           u64 min_offset, u64 *last_offset_ret)
3812 {
3813         struct btrfs_key min_key;
3814         struct btrfs_root *root = inode->root;
3815         struct btrfs_root *log = root->log_root;
3816         int ret;
3817         u64 last_old_dentry_offset = min_offset - 1;
3818         u64 last_offset = (u64)-1;
3819         u64 ino = btrfs_ino(inode);
3820
3821         min_key.objectid = ino;
3822         min_key.type = BTRFS_DIR_INDEX_KEY;
3823         min_key.offset = min_offset;
3824
3825         ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3826
3827         /*
3828          * we didn't find anything from this transaction, see if there
3829          * is anything at all
3830          */
3831         if (ret != 0 || min_key.objectid != ino ||
3832             min_key.type != BTRFS_DIR_INDEX_KEY) {
3833                 min_key.objectid = ino;
3834                 min_key.type = BTRFS_DIR_INDEX_KEY;
3835                 min_key.offset = (u64)-1;
3836                 btrfs_release_path(path);
3837                 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3838                 if (ret < 0) {
3839                         btrfs_release_path(path);
3840                         return ret;
3841                 }
3842                 ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3843
3844                 /* if ret == 0 there are items for this type,
3845                  * create a range to tell us the last key of this type.
3846                  * otherwise, there are no items in this directory after
3847                  * *min_offset, and we create a range to indicate that.
3848                  */
3849                 if (ret == 0) {
3850                         struct btrfs_key tmp;
3851
3852                         btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3853                                               path->slots[0]);
3854                         if (tmp.type == BTRFS_DIR_INDEX_KEY)
3855                                 last_old_dentry_offset = tmp.offset;
3856                 } else if (ret > 0) {
3857                         ret = 0;
3858                 }
3859
3860                 goto done;
3861         }
3862
3863         /* go backward to find any previous key */
3864         ret = btrfs_previous_item(root, path, ino, BTRFS_DIR_INDEX_KEY);
3865         if (ret == 0) {
3866                 struct btrfs_key tmp;
3867
3868                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3869                 /*
3870                  * The dir index key before the first one we found that needs to
3871                  * be logged might be in a previous leaf, and there might be a
3872                  * gap between these keys, meaning that we had deletions that
3873                  * happened. So the key range item we log (key type
3874                  * BTRFS_DIR_LOG_INDEX_KEY) must cover a range that starts at the
3875                  * previous key's offset plus 1, so that those deletes are replayed.
3876                  */
3877                 if (tmp.type == BTRFS_DIR_INDEX_KEY)
3878                         last_old_dentry_offset = tmp.offset;
3879         } else if (ret < 0) {
3880                 goto done;
3881         }
3882
3883         btrfs_release_path(path);
3884
3885         /*
3886          * Find the first key from this transaction again or the one we were at
3887          * in the loop below in case we had to reschedule. We may be logging the
3888          * directory without holding its VFS lock, which happen when logging new
3889          * dentries (through log_new_dir_dentries()) or in some cases when we
3890          * need to log the parent directory of an inode. This means a dir index
3891          * key might be deleted from the inode's root, and therefore we may not
3892          * find it anymore. If we can't find it, just move to the next key. We
3893          * can not bail out and ignore, because if we do that we will simply
3894          * not log dir index keys that come after the one that was just deleted
3895          * and we can end up logging a dir index range that ends at (u64)-1
3896          * (@last_offset is initialized to that), resulting in removing dir
3897          * entries we should not remove at log replay time.
3898          */
3899 search:
3900         ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3901         if (ret > 0) {
3902                 ret = btrfs_next_item(root, path);
3903                 if (ret > 0) {
3904                         /* There are no more keys in the inode's root. */
3905                         ret = 0;
3906                         goto done;
3907                 }
3908         }
3909         if (ret < 0)
3910                 goto done;
3911
3912         /*
3913          * we have a block from this transaction, log every item in it
3914          * from our directory
3915          */
3916         while (1) {
3917                 ret = process_dir_items_leaf(trans, inode, path, dst_path, ctx,
3918                                              &last_old_dentry_offset);
3919                 if (ret != 0) {
3920                         if (ret > 0)
3921                                 ret = 0;
3922                         goto done;
3923                 }
3924                 path->slots[0] = btrfs_header_nritems(path->nodes[0]);
3925
3926                 /*
3927                  * look ahead to the next item and see if it is also
3928                  * from this directory and from this transaction
3929                  */
3930                 ret = btrfs_next_leaf(root, path);
3931                 if (ret) {
3932                         if (ret == 1) {
3933                                 last_offset = (u64)-1;
3934                                 ret = 0;
3935                         }
3936                         goto done;
3937                 }
3938                 btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]);
3939                 if (min_key.objectid != ino || min_key.type != BTRFS_DIR_INDEX_KEY) {
3940                         last_offset = (u64)-1;
3941                         goto done;
3942                 }
3943                 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3944                         /*
3945                          * The next leaf was not changed in the current transaction
3946                          * and has at least one dir index key.
3947                          * We check for the next key because there might have been
3948                          * one or more deletions between the last key we logged and
3949                          * that next key. So the key range item we log (key type
3950                          * BTRFS_DIR_LOG_INDEX_KEY) must end at the next key's
3951                          * offset minus 1, so that those deletes are replayed.
3952                          */
3953                         last_offset = min_key.offset - 1;
3954                         goto done;
3955                 }
3956                 if (need_resched()) {
3957                         btrfs_release_path(path);
3958                         cond_resched();
3959                         goto search;
3960                 }
3961         }
3962 done:
3963         btrfs_release_path(path);
3964         btrfs_release_path(dst_path);
3965
3966         if (ret == 0) {
3967                 *last_offset_ret = last_offset;
3968                 /*
3969                  * In case the leaf was changed in the current transaction but
3970                  * all its dir items are from a past transaction, the last item
3971                  * in the leaf is a dir item and there's no gap between that last
3972                  * dir item and the first one on the next leaf (which did not
3973                  * change in the current transaction), then we don't need to log
3974                  * a range, last_old_dentry_offset is == to last_offset.
3975                  */
3976                 ASSERT(last_old_dentry_offset <= last_offset);
3977                 if (last_old_dentry_offset < last_offset)
3978                         ret = insert_dir_log_key(trans, log, path, ino,
3979                                                  last_old_dentry_offset + 1,
3980                                                  last_offset);
3981         }
3982
3983         return ret;
3984 }
3985
3986 /*
3987  * If the inode was logged before and it was evicted, then its
3988  * last_dir_index_offset is (u64)-1, so we don't the value of the last index
3989  * key offset. If that's the case, search for it and update the inode. This
3990  * is to avoid lookups in the log tree every time we try to insert a dir index
3991  * key from a leaf changed in the current transaction, and to allow us to always
3992  * do batch insertions of dir index keys.
3993  */
3994 static int update_last_dir_index_offset(struct btrfs_inode *inode,
3995                                         struct btrfs_path *path,
3996                                         const struct btrfs_log_ctx *ctx)
3997 {
3998         const u64 ino = btrfs_ino(inode);
3999         struct btrfs_key key;
4000         int ret;
4001
4002         lockdep_assert_held(&inode->log_mutex);
4003
4004         if (inode->last_dir_index_offset != (u64)-1)
4005                 return 0;
4006
4007         if (!ctx->logged_before) {
4008                 inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4009                 return 0;
4010         }
4011
4012         key.objectid = ino;
4013         key.type = BTRFS_DIR_INDEX_KEY;
4014         key.offset = (u64)-1;
4015
4016         ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0);
4017         /*
4018          * An error happened or we actually have an index key with an offset
4019          * value of (u64)-1. Bail out, we're done.
4020          */
4021         if (ret <= 0)
4022                 goto out;
4023
4024         ret = 0;
4025         inode->last_dir_index_offset = BTRFS_DIR_START_INDEX - 1;
4026
4027         /*
4028          * No dir index items, bail out and leave last_dir_index_offset with
4029          * the value right before the first valid index value.
4030          */
4031         if (path->slots[0] == 0)
4032                 goto out;
4033
4034         /*
4035          * btrfs_search_slot() left us at one slot beyond the slot with the last
4036          * index key, or beyond the last key of the directory that is not an
4037          * index key. If we have an index key before, set last_dir_index_offset
4038          * to its offset value, otherwise leave it with a value right before the
4039          * first valid index value, as it means we have an empty directory.
4040          */
4041         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
4042         if (key.objectid == ino && key.type == BTRFS_DIR_INDEX_KEY)
4043                 inode->last_dir_index_offset = key.offset;
4044
4045 out:
4046         btrfs_release_path(path);
4047
4048         return ret;
4049 }
4050
4051 /*
4052  * logging directories is very similar to logging inodes, We find all the items
4053  * from the current transaction and write them to the log.
4054  *
4055  * The recovery code scans the directory in the subvolume, and if it finds a
4056  * key in the range logged that is not present in the log tree, then it means
4057  * that dir entry was unlinked during the transaction.
4058  *
4059  * In order for that scan to work, we must include one key smaller than
4060  * the smallest logged by this transaction and one key larger than the largest
4061  * key logged by this transaction.
4062  */
4063 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
4064                           struct btrfs_inode *inode,
4065                           struct btrfs_path *path,
4066                           struct btrfs_path *dst_path,
4067                           struct btrfs_log_ctx *ctx)
4068 {
4069         u64 min_key;
4070         u64 max_key;
4071         int ret;
4072
4073         ret = update_last_dir_index_offset(inode, path, ctx);
4074         if (ret)
4075                 return ret;
4076
4077         min_key = BTRFS_DIR_START_INDEX;
4078         max_key = 0;
4079
4080         while (1) {
4081                 ret = log_dir_items(trans, inode, path, dst_path,
4082                                 ctx, min_key, &max_key);
4083                 if (ret)
4084                         return ret;
4085                 if (max_key == (u64)-1)
4086                         break;
4087                 min_key = max_key + 1;
4088         }
4089
4090         return 0;
4091 }
4092
4093 /*
4094  * a helper function to drop items from the log before we relog an
4095  * inode.  max_key_type indicates the highest item type to remove.
4096  * This cannot be run for file data extents because it does not
4097  * free the extents they point to.
4098  */
4099 static int drop_inode_items(struct btrfs_trans_handle *trans,
4100                                   struct btrfs_root *log,
4101                                   struct btrfs_path *path,
4102                                   struct btrfs_inode *inode,
4103                                   int max_key_type)
4104 {
4105         int ret;
4106         struct btrfs_key key;
4107         struct btrfs_key found_key;
4108         int start_slot;
4109
4110         key.objectid = btrfs_ino(inode);
4111         key.type = max_key_type;
4112         key.offset = (u64)-1;
4113
4114         while (1) {
4115                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
4116                 if (ret < 0) {
4117                         break;
4118                 } else if (ret > 0) {
4119                         if (path->slots[0] == 0)
4120                                 break;
4121                         path->slots[0]--;
4122                 }
4123
4124                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
4125                                       path->slots[0]);
4126
4127                 if (found_key.objectid != key.objectid)
4128                         break;
4129
4130                 found_key.offset = 0;
4131                 found_key.type = 0;
4132                 ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot);
4133                 if (ret < 0)
4134                         break;
4135
4136                 ret = btrfs_del_items(trans, log, path, start_slot,
4137                                       path->slots[0] - start_slot + 1);
4138                 /*
4139                  * If start slot isn't 0 then we don't need to re-search, we've
4140                  * found the last guy with the objectid in this tree.
4141                  */
4142                 if (ret || start_slot != 0)
4143                         break;
4144                 btrfs_release_path(path);
4145         }
4146         btrfs_release_path(path);
4147         if (ret > 0)
4148                 ret = 0;
4149         return ret;
4150 }
4151
4152 static int truncate_inode_items(struct btrfs_trans_handle *trans,
4153                                 struct btrfs_root *log_root,
4154                                 struct btrfs_inode *inode,
4155                                 u64 new_size, u32 min_type)
4156 {
4157         struct btrfs_truncate_control control = {
4158                 .new_size = new_size,
4159                 .ino = btrfs_ino(inode),
4160                 .min_type = min_type,
4161                 .skip_ref_updates = true,
4162         };
4163
4164         return btrfs_truncate_inode_items(trans, log_root, &control);
4165 }
4166
4167 static void fill_inode_item(struct btrfs_trans_handle *trans,
4168                             struct extent_buffer *leaf,
4169                             struct btrfs_inode_item *item,
4170                             struct inode *inode, int log_inode_only,
4171                             u64 logged_isize)
4172 {
4173         struct btrfs_map_token token;
4174         u64 flags;
4175
4176         btrfs_init_map_token(&token, leaf);
4177
4178         if (log_inode_only) {
4179                 /* set the generation to zero so the recover code
4180                  * can tell the difference between an logging
4181                  * just to say 'this inode exists' and a logging
4182                  * to say 'update this inode with these values'
4183                  */
4184                 btrfs_set_token_inode_generation(&token, item, 0);
4185                 btrfs_set_token_inode_size(&token, item, logged_isize);
4186         } else {
4187                 btrfs_set_token_inode_generation(&token, item,
4188                                                  BTRFS_I(inode)->generation);
4189                 btrfs_set_token_inode_size(&token, item, inode->i_size);
4190         }
4191
4192         btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4193         btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4194         btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4195         btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4196
4197         btrfs_set_token_timespec_sec(&token, &item->atime,
4198                                      inode_get_atime_sec(inode));
4199         btrfs_set_token_timespec_nsec(&token, &item->atime,
4200                                       inode_get_atime_nsec(inode));
4201
4202         btrfs_set_token_timespec_sec(&token, &item->mtime,
4203                                      inode_get_mtime_sec(inode));
4204         btrfs_set_token_timespec_nsec(&token, &item->mtime,
4205                                       inode_get_mtime_nsec(inode));
4206
4207         btrfs_set_token_timespec_sec(&token, &item->ctime,
4208                                      inode_get_ctime_sec(inode));
4209         btrfs_set_token_timespec_nsec(&token, &item->ctime,
4210                                       inode_get_ctime_nsec(inode));
4211
4212         /*
4213          * We do not need to set the nbytes field, in fact during a fast fsync
4214          * its value may not even be correct, since a fast fsync does not wait
4215          * for ordered extent completion, which is where we update nbytes, it
4216          * only waits for writeback to complete. During log replay as we find
4217          * file extent items and replay them, we adjust the nbytes field of the
4218          * inode item in subvolume tree as needed (see overwrite_item()).
4219          */
4220
4221         btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
4222         btrfs_set_token_inode_transid(&token, item, trans->transid);
4223         btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
4224         flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4225                                           BTRFS_I(inode)->ro_flags);
4226         btrfs_set_token_inode_flags(&token, item, flags);
4227         btrfs_set_token_inode_block_group(&token, item, 0);
4228 }
4229
4230 static int log_inode_item(struct btrfs_trans_handle *trans,
4231                           struct btrfs_root *log, struct btrfs_path *path,
4232                           struct btrfs_inode *inode, bool inode_item_dropped)
4233 {
4234         struct btrfs_inode_item *inode_item;
4235         int ret;
4236
4237         /*
4238          * If we are doing a fast fsync and the inode was logged before in the
4239          * current transaction, then we know the inode was previously logged and
4240          * it exists in the log tree. For performance reasons, in this case use
4241          * btrfs_search_slot() directly with ins_len set to 0 so that we never
4242          * attempt a write lock on the leaf's parent, which adds unnecessary lock
4243          * contention in case there are concurrent fsyncs for other inodes of the
4244          * same subvolume. Using btrfs_insert_empty_item() when the inode item
4245          * already exists can also result in unnecessarily splitting a leaf.
4246          */
4247         if (!inode_item_dropped && inode->logged_trans == trans->transid) {
4248                 ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1);
4249                 ASSERT(ret <= 0);
4250                 if (ret > 0)
4251                         ret = -ENOENT;
4252         } else {
4253                 /*
4254                  * This means it is the first fsync in the current transaction,
4255                  * so the inode item is not in the log and we need to insert it.
4256                  * We can never get -EEXIST because we are only called for a fast
4257                  * fsync and in case an inode eviction happens after the inode was
4258                  * logged before in the current transaction, when we load again
4259                  * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime
4260                  * flags and set ->logged_trans to 0.
4261                  */
4262                 ret = btrfs_insert_empty_item(trans, log, path, &inode->location,
4263                                               sizeof(*inode_item));
4264                 ASSERT(ret != -EEXIST);
4265         }
4266         if (ret)
4267                 return ret;
4268         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4269                                     struct btrfs_inode_item);
4270         fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
4271                         0, 0);
4272         btrfs_release_path(path);
4273         return 0;
4274 }
4275
4276 static int log_csums(struct btrfs_trans_handle *trans,
4277                      struct btrfs_inode *inode,
4278                      struct btrfs_root *log_root,
4279                      struct btrfs_ordered_sum *sums)
4280 {
4281         const u64 lock_end = sums->logical + sums->len - 1;
4282         struct extent_state *cached_state = NULL;
4283         int ret;
4284
4285         /*
4286          * If this inode was not used for reflink operations in the current
4287          * transaction with new extents, then do the fast path, no need to
4288          * worry about logging checksum items with overlapping ranges.
4289          */
4290         if (inode->last_reflink_trans < trans->transid)
4291                 return btrfs_csum_file_blocks(trans, log_root, sums);
4292
4293         /*
4294          * Serialize logging for checksums. This is to avoid racing with the
4295          * same checksum being logged by another task that is logging another
4296          * file which happens to refer to the same extent as well. Such races
4297          * can leave checksum items in the log with overlapping ranges.
4298          */
4299         ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4300                           &cached_state);
4301         if (ret)
4302                 return ret;
4303         /*
4304          * Due to extent cloning, we might have logged a csum item that covers a
4305          * subrange of a cloned extent, and later we can end up logging a csum
4306          * item for a larger subrange of the same extent or the entire range.
4307          * This would leave csum items in the log tree that cover the same range
4308          * and break the searches for checksums in the log tree, resulting in
4309          * some checksums missing in the fs/subvolume tree. So just delete (or
4310          * trim and adjust) any existing csum items in the log for this range.
4311          */
4312         ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len);
4313         if (!ret)
4314                 ret = btrfs_csum_file_blocks(trans, log_root, sums);
4315
4316         unlock_extent(&log_root->log_csum_range, sums->logical, lock_end,
4317                       &cached_state);
4318
4319         return ret;
4320 }
4321
4322 static noinline int copy_items(struct btrfs_trans_handle *trans,
4323                                struct btrfs_inode *inode,
4324                                struct btrfs_path *dst_path,
4325                                struct btrfs_path *src_path,
4326                                int start_slot, int nr, int inode_only,
4327                                u64 logged_isize, struct btrfs_log_ctx *ctx)
4328 {
4329         struct btrfs_root *log = inode->root->log_root;
4330         struct btrfs_file_extent_item *extent;
4331         struct extent_buffer *src;
4332         int ret;
4333         struct btrfs_key *ins_keys;
4334         u32 *ins_sizes;
4335         struct btrfs_item_batch batch;
4336         char *ins_data;
4337         int dst_index;
4338         const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
4339         const u64 i_size = i_size_read(&inode->vfs_inode);
4340
4341         /*
4342          * To keep lockdep happy and avoid deadlocks, clone the source leaf and
4343          * use the clone. This is because otherwise we would be changing the log
4344          * tree, to insert items from the subvolume tree or insert csum items,
4345          * while holding a read lock on a leaf from the subvolume tree, which
4346          * creates a nasty lock dependency when COWing log tree nodes/leaves:
4347          *
4348          * 1) Modifying the log tree triggers an extent buffer allocation while
4349          *    holding a write lock on a parent extent buffer from the log tree.
4350          *    Allocating the pages for an extent buffer, or the extent buffer
4351          *    struct, can trigger inode eviction and finally the inode eviction
4352          *    will trigger a release/remove of a delayed node, which requires
4353          *    taking the delayed node's mutex;
4354          *
4355          * 2) Allocating a metadata extent for a log tree can trigger the async
4356          *    reclaim thread and make us wait for it to release enough space and
4357          *    unblock our reservation ticket. The reclaim thread can start
4358          *    flushing delayed items, and that in turn results in the need to
4359          *    lock delayed node mutexes and in the need to write lock extent
4360          *    buffers of a subvolume tree - all this while holding a write lock
4361          *    on the parent extent buffer in the log tree.
4362          *
4363          * So one task in scenario 1) running in parallel with another task in
4364          * scenario 2) could lead to a deadlock, one wanting to lock a delayed
4365          * node mutex while having a read lock on a leaf from the subvolume,
4366          * while the other is holding the delayed node's mutex and wants to
4367          * write lock the same subvolume leaf for flushing delayed items.
4368          */
4369         ret = clone_leaf(src_path, ctx);
4370         if (ret < 0)
4371                 return ret;
4372
4373         src = src_path->nodes[0];
4374
4375         ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4376                            nr * sizeof(u32), GFP_NOFS);
4377         if (!ins_data)
4378                 return -ENOMEM;
4379
4380         ins_sizes = (u32 *)ins_data;
4381         ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4382         batch.keys = ins_keys;
4383         batch.data_sizes = ins_sizes;
4384         batch.total_data_size = 0;
4385         batch.nr = 0;
4386
4387         dst_index = 0;
4388         for (int i = 0; i < nr; i++) {
4389                 const int src_slot = start_slot + i;
4390                 struct btrfs_root *csum_root;
4391                 struct btrfs_ordered_sum *sums;
4392                 struct btrfs_ordered_sum *sums_next;
4393                 LIST_HEAD(ordered_sums);
4394                 u64 disk_bytenr;
4395                 u64 disk_num_bytes;
4396                 u64 extent_offset;
4397                 u64 extent_num_bytes;
4398                 bool is_old_extent;
4399
4400                 btrfs_item_key_to_cpu(src, &ins_keys[dst_index], src_slot);
4401
4402                 if (ins_keys[dst_index].type != BTRFS_EXTENT_DATA_KEY)
4403                         goto add_to_batch;
4404
4405                 extent = btrfs_item_ptr(src, src_slot,
4406                                         struct btrfs_file_extent_item);
4407
4408                 is_old_extent = (btrfs_file_extent_generation(src, extent) <
4409                                  trans->transid);
4410
4411                 /*
4412                  * Don't copy extents from past generations. That would make us
4413                  * log a lot more metadata for common cases like doing only a
4414                  * few random writes into a file and then fsync it for the first
4415                  * time or after the full sync flag is set on the inode. We can
4416                  * get leaves full of extent items, most of which are from past
4417                  * generations, so we can skip them - as long as the inode has
4418                  * not been the target of a reflink operation in this transaction,
4419                  * as in that case it might have had file extent items with old
4420                  * generations copied into it. We also must always log prealloc
4421                  * extents that start at or beyond eof, otherwise we would lose
4422                  * them on log replay.
4423                  */
4424                 if (is_old_extent &&
4425                     ins_keys[dst_index].offset < i_size &&
4426                     inode->last_reflink_trans < trans->transid)
4427                         continue;
4428
4429                 if (skip_csum)
4430                         goto add_to_batch;
4431
4432                 /* Only regular extents have checksums. */
4433                 if (btrfs_file_extent_type(src, extent) != BTRFS_FILE_EXTENT_REG)
4434                         goto add_to_batch;
4435
4436                 /*
4437                  * If it's an extent created in a past transaction, then its
4438                  * checksums are already accessible from the committed csum tree,
4439                  * no need to log them.
4440                  */
4441                 if (is_old_extent)
4442                         goto add_to_batch;
4443
4444                 disk_bytenr = btrfs_file_extent_disk_bytenr(src, extent);
4445                 /* If it's an explicit hole, there are no checksums. */
4446                 if (disk_bytenr == 0)
4447                         goto add_to_batch;
4448
4449                 disk_num_bytes = btrfs_file_extent_disk_num_bytes(src, extent);
4450
4451                 if (btrfs_file_extent_compression(src, extent)) {
4452                         extent_offset = 0;
4453                         extent_num_bytes = disk_num_bytes;
4454                 } else {
4455                         extent_offset = btrfs_file_extent_offset(src, extent);
4456                         extent_num_bytes = btrfs_file_extent_num_bytes(src, extent);
4457                 }
4458
4459                 csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr);
4460                 disk_bytenr += extent_offset;
4461                 ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
4462                                               disk_bytenr + extent_num_bytes - 1,
4463                                               &ordered_sums, 0, false);
4464                 if (ret)
4465                         goto out;
4466
4467                 list_for_each_entry_safe(sums, sums_next, &ordered_sums, list) {
4468                         if (!ret)
4469                                 ret = log_csums(trans, inode, log, sums);
4470                         list_del(&sums->list);
4471                         kfree(sums);
4472                 }
4473                 if (ret)
4474                         goto out;
4475
4476 add_to_batch:
4477                 ins_sizes[dst_index] = btrfs_item_size(src, src_slot);
4478                 batch.total_data_size += ins_sizes[dst_index];
4479                 batch.nr++;
4480                 dst_index++;
4481         }
4482
4483         /*
4484          * We have a leaf full of old extent items that don't need to be logged,
4485          * so we don't need to do anything.
4486          */
4487         if (batch.nr == 0)
4488                 goto out;
4489
4490         ret = btrfs_insert_empty_items(trans, log, dst_path, &batch);
4491         if (ret)
4492                 goto out;
4493
4494         dst_index = 0;
4495         for (int i = 0; i < nr; i++) {
4496                 const int src_slot = start_slot + i;
4497                 const int dst_slot = dst_path->slots[0] + dst_index;
4498                 struct btrfs_key key;
4499                 unsigned long src_offset;
4500                 unsigned long dst_offset;
4501
4502                 /*
4503                  * We're done, all the remaining items in the source leaf
4504                  * correspond to old file extent items.
4505                  */
4506                 if (dst_index >= batch.nr)
4507                         break;
4508
4509                 btrfs_item_key_to_cpu(src, &key, src_slot);
4510
4511                 if (key.type != BTRFS_EXTENT_DATA_KEY)
4512                         goto copy_item;
4513
4514                 extent = btrfs_item_ptr(src, src_slot,
4515                                         struct btrfs_file_extent_item);
4516
4517                 /* See the comment in the previous loop, same logic. */
4518                 if (btrfs_file_extent_generation(src, extent) < trans->transid &&
4519                     key.offset < i_size &&
4520                     inode->last_reflink_trans < trans->transid)
4521                         continue;
4522
4523 copy_item:
4524                 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], dst_slot);
4525                 src_offset = btrfs_item_ptr_offset(src, src_slot);
4526
4527                 if (key.type == BTRFS_INODE_ITEM_KEY) {
4528                         struct btrfs_inode_item *inode_item;
4529
4530                         inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot,
4531                                                     struct btrfs_inode_item);
4532                         fill_inode_item(trans, dst_path->nodes[0], inode_item,
4533                                         &inode->vfs_inode,
4534                                         inode_only == LOG_INODE_EXISTS,
4535                                         logged_isize);
4536                 } else {
4537                         copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4538                                            src_offset, ins_sizes[dst_index]);
4539                 }
4540
4541                 dst_index++;
4542         }
4543
4544         btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]);
4545         btrfs_release_path(dst_path);
4546 out:
4547         kfree(ins_data);
4548
4549         return ret;
4550 }
4551
4552 static int extent_cmp(void *priv, const struct list_head *a,
4553                       const struct list_head *b)
4554 {
4555         const struct extent_map *em1, *em2;
4556
4557         em1 = list_entry(a, struct extent_map, list);
4558         em2 = list_entry(b, struct extent_map, list);
4559
4560         if (em1->start < em2->start)
4561                 return -1;
4562         else if (em1->start > em2->start)
4563                 return 1;
4564         return 0;
4565 }
4566
4567 static int log_extent_csums(struct btrfs_trans_handle *trans,
4568                             struct btrfs_inode *inode,
4569                             struct btrfs_root *log_root,
4570                             const struct extent_map *em,
4571                             struct btrfs_log_ctx *ctx)
4572 {
4573         struct btrfs_ordered_extent *ordered;
4574         struct btrfs_root *csum_root;
4575         u64 csum_offset;
4576         u64 csum_len;
4577         u64 mod_start = em->mod_start;
4578         u64 mod_len = em->mod_len;
4579         LIST_HEAD(ordered_sums);
4580         int ret = 0;
4581
4582         if (inode->flags & BTRFS_INODE_NODATASUM ||
4583             (em->flags & EXTENT_FLAG_PREALLOC) ||
4584             em->block_start == EXTENT_MAP_HOLE)
4585                 return 0;
4586
4587         list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4588                 const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4589                 const u64 mod_end = mod_start + mod_len;
4590                 struct btrfs_ordered_sum *sums;
4591
4592                 if (mod_len == 0)
4593                         break;
4594
4595                 if (ordered_end <= mod_start)
4596                         continue;
4597                 if (mod_end <= ordered->file_offset)
4598                         break;
4599
4600                 /*
4601                  * We are going to copy all the csums on this ordered extent, so
4602                  * go ahead and adjust mod_start and mod_len in case this ordered
4603                  * extent has already been logged.
4604                  */
4605                 if (ordered->file_offset > mod_start) {
4606                         if (ordered_end >= mod_end)
4607                                 mod_len = ordered->file_offset - mod_start;
4608                         /*
4609                          * If we have this case
4610                          *
4611                          * |--------- logged extent ---------|
4612                          *       |----- ordered extent ----|
4613                          *
4614                          * Just don't mess with mod_start and mod_len, we'll
4615                          * just end up logging more csums than we need and it
4616                          * will be ok.
4617                          */
4618                 } else {
4619                         if (ordered_end < mod_end) {
4620                                 mod_len = mod_end - ordered_end;
4621                                 mod_start = ordered_end;
4622                         } else {
4623                                 mod_len = 0;
4624                         }
4625                 }
4626
4627                 /*
4628                  * To keep us from looping for the above case of an ordered
4629                  * extent that falls inside of the logged extent.
4630                  */
4631                 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4632                         continue;
4633
4634                 list_for_each_entry(sums, &ordered->list, list) {
4635                         ret = log_csums(trans, inode, log_root, sums);
4636                         if (ret)
4637                                 return ret;
4638                 }
4639         }
4640
4641         /* We're done, found all csums in the ordered extents. */
4642         if (mod_len == 0)
4643                 return 0;
4644
4645         /* If we're compressed we have to save the entire range of csums. */
4646         if (extent_map_is_compressed(em)) {
4647                 csum_offset = 0;
4648                 csum_len = max(em->block_len, em->orig_block_len);
4649         } else {
4650                 csum_offset = mod_start - em->start;
4651                 csum_len = mod_len;
4652         }
4653
4654         /* block start is already adjusted for the file extent offset. */
4655         csum_root = btrfs_csum_root(trans->fs_info, em->block_start);
4656         ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset,
4657                                       em->block_start + csum_offset +
4658                                       csum_len - 1, &ordered_sums, 0, false);
4659         if (ret)
4660                 return ret;
4661
4662         while (!list_empty(&ordered_sums)) {
4663                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4664                                                    struct btrfs_ordered_sum,
4665                                                    list);
4666                 if (!ret)
4667                         ret = log_csums(trans, inode, log_root, sums);
4668                 list_del(&sums->list);
4669                 kfree(sums);
4670         }
4671
4672         return ret;
4673 }
4674
4675 static int log_one_extent(struct btrfs_trans_handle *trans,
4676                           struct btrfs_inode *inode,
4677                           const struct extent_map *em,
4678                           struct btrfs_path *path,
4679                           struct btrfs_log_ctx *ctx)
4680 {
4681         struct btrfs_drop_extents_args drop_args = { 0 };
4682         struct btrfs_root *log = inode->root->log_root;
4683         struct btrfs_file_extent_item fi = { 0 };
4684         struct extent_buffer *leaf;
4685         struct btrfs_key key;
4686         enum btrfs_compression_type compress_type;
4687         u64 extent_offset = em->start - em->orig_start;
4688         u64 block_len;
4689         int ret;
4690
4691         btrfs_set_stack_file_extent_generation(&fi, trans->transid);
4692         if (em->flags & EXTENT_FLAG_PREALLOC)
4693                 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
4694         else
4695                 btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
4696
4697         block_len = max(em->block_len, em->orig_block_len);
4698         compress_type = extent_map_compression(em);
4699         if (compress_type != BTRFS_COMPRESS_NONE) {
4700                 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
4701                 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4702         } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4703                 btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start -
4704                                                         extent_offset);
4705                 btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
4706         }
4707
4708         btrfs_set_stack_file_extent_offset(&fi, extent_offset);
4709         btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
4710         btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
4711         btrfs_set_stack_file_extent_compression(&fi, compress_type);
4712
4713         ret = log_extent_csums(trans, inode, log, em, ctx);
4714         if (ret)
4715                 return ret;
4716
4717         /*
4718          * If this is the first time we are logging the inode in the current
4719          * transaction, we can avoid btrfs_drop_extents(), which is expensive
4720          * because it does a deletion search, which always acquires write locks
4721          * for extent buffers at levels 2, 1 and 0. This not only wastes time
4722          * but also adds significant contention in a log tree, since log trees
4723          * are small, with a root at level 2 or 3 at most, due to their short
4724          * life span.
4725          */
4726         if (ctx->logged_before) {
4727                 drop_args.path = path;
4728                 drop_args.start = em->start;
4729                 drop_args.end = em->start + em->len;
4730                 drop_args.replace_extent = true;
4731                 drop_args.extent_item_size = sizeof(fi);
4732                 ret = btrfs_drop_extents(trans, log, inode, &drop_args);
4733                 if (ret)
4734                         return ret;
4735         }
4736
4737         if (!drop_args.extent_inserted) {
4738                 key.objectid = btrfs_ino(inode);
4739                 key.type = BTRFS_EXTENT_DATA_KEY;
4740                 key.offset = em->start;
4741
4742                 ret = btrfs_insert_empty_item(trans, log, path, &key,
4743                                               sizeof(fi));
4744                 if (ret)
4745                         return ret;
4746         }
4747         leaf = path->nodes[0];
4748         write_extent_buffer(leaf, &fi,
4749                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4750                             sizeof(fi));
4751         btrfs_mark_buffer_dirty(trans, leaf);
4752
4753         btrfs_release_path(path);
4754
4755         return ret;
4756 }
4757
4758 /*
4759  * Log all prealloc extents beyond the inode's i_size to make sure we do not
4760  * lose them after doing a full/fast fsync and replaying the log. We scan the
4761  * subvolume's root instead of iterating the inode's extent map tree because
4762  * otherwise we can log incorrect extent items based on extent map conversion.
4763  * That can happen due to the fact that extent maps are merged when they
4764  * are not in the extent map tree's list of modified extents.
4765  */
4766 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4767                                       struct btrfs_inode *inode,
4768                                       struct btrfs_path *path,
4769                                       struct btrfs_log_ctx *ctx)
4770 {
4771         struct btrfs_root *root = inode->root;
4772         struct btrfs_key key;
4773         const u64 i_size = i_size_read(&inode->vfs_inode);
4774         const u64 ino = btrfs_ino(inode);
4775         struct btrfs_path *dst_path = NULL;
4776         bool dropped_extents = false;
4777         u64 truncate_offset = i_size;
4778         struct extent_buffer *leaf;
4779         int slot;
4780         int ins_nr = 0;
4781         int start_slot = 0;
4782         int ret;
4783
4784         if (!(inode->flags & BTRFS_INODE_PREALLOC))
4785                 return 0;
4786
4787         key.objectid = ino;
4788         key.type = BTRFS_EXTENT_DATA_KEY;
4789         key.offset = i_size;
4790         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4791         if (ret < 0)
4792                 goto out;
4793
4794         /*
4795          * We must check if there is a prealloc extent that starts before the
4796          * i_size and crosses the i_size boundary. This is to ensure later we
4797          * truncate down to the end of that extent and not to the i_size, as
4798          * otherwise we end up losing part of the prealloc extent after a log
4799          * replay and with an implicit hole if there is another prealloc extent
4800          * that starts at an offset beyond i_size.
4801          */
4802         ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4803         if (ret < 0)
4804                 goto out;
4805
4806         if (ret == 0) {
4807                 struct btrfs_file_extent_item *ei;
4808
4809                 leaf = path->nodes[0];
4810                 slot = path->slots[0];
4811                 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4812
4813                 if (btrfs_file_extent_type(leaf, ei) ==
4814                     BTRFS_FILE_EXTENT_PREALLOC) {
4815                         u64 extent_end;
4816
4817                         btrfs_item_key_to_cpu(leaf, &key, slot);
4818                         extent_end = key.offset +
4819                                 btrfs_file_extent_num_bytes(leaf, ei);
4820
4821                         if (extent_end > i_size)
4822                                 truncate_offset = extent_end;
4823                 }
4824         } else {
4825                 ret = 0;
4826         }
4827
4828         while (true) {
4829                 leaf = path->nodes[0];
4830                 slot = path->slots[0];
4831
4832                 if (slot >= btrfs_header_nritems(leaf)) {
4833                         if (ins_nr > 0) {
4834                                 ret = copy_items(trans, inode, dst_path, path,
4835                                                  start_slot, ins_nr, 1, 0, ctx);
4836                                 if (ret < 0)
4837                                         goto out;
4838                                 ins_nr = 0;
4839                         }
4840                         ret = btrfs_next_leaf(root, path);
4841                         if (ret < 0)
4842                                 goto out;
4843                         if (ret > 0) {
4844                                 ret = 0;
4845                                 break;
4846                         }
4847                         continue;
4848                 }
4849
4850                 btrfs_item_key_to_cpu(leaf, &key, slot);
4851                 if (key.objectid > ino)
4852                         break;
4853                 if (WARN_ON_ONCE(key.objectid < ino) ||
4854                     key.type < BTRFS_EXTENT_DATA_KEY ||
4855                     key.offset < i_size) {
4856                         path->slots[0]++;
4857                         continue;
4858                 }
4859                 if (!dropped_extents) {
4860                         /*
4861                          * Avoid logging extent items logged in past fsync calls
4862                          * and leading to duplicate keys in the log tree.
4863                          */
4864                         ret = truncate_inode_items(trans, root->log_root, inode,
4865                                                    truncate_offset,
4866                                                    BTRFS_EXTENT_DATA_KEY);
4867                         if (ret)
4868                                 goto out;
4869                         dropped_extents = true;
4870                 }
4871                 if (ins_nr == 0)
4872                         start_slot = slot;
4873                 ins_nr++;
4874                 path->slots[0]++;
4875                 if (!dst_path) {
4876                         dst_path = btrfs_alloc_path();
4877                         if (!dst_path) {
4878                                 ret = -ENOMEM;
4879                                 goto out;
4880                         }
4881                 }
4882         }
4883         if (ins_nr > 0)
4884                 ret = copy_items(trans, inode, dst_path, path,
4885                                  start_slot, ins_nr, 1, 0, ctx);
4886 out:
4887         btrfs_release_path(path);
4888         btrfs_free_path(dst_path);
4889         return ret;
4890 }
4891
4892 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4893                                      struct btrfs_inode *inode,
4894                                      struct btrfs_path *path,
4895                                      struct btrfs_log_ctx *ctx)
4896 {
4897         struct btrfs_ordered_extent *ordered;
4898         struct btrfs_ordered_extent *tmp;
4899         struct extent_map *em, *n;
4900         LIST_HEAD(extents);
4901         struct extent_map_tree *tree = &inode->extent_tree;
4902         int ret = 0;
4903         int num = 0;
4904
4905         write_lock(&tree->lock);
4906
4907         list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4908                 list_del_init(&em->list);
4909                 /*
4910                  * Just an arbitrary number, this can be really CPU intensive
4911                  * once we start getting a lot of extents, and really once we
4912                  * have a bunch of extents we just want to commit since it will
4913                  * be faster.
4914                  */
4915                 if (++num > 32768) {
4916                         list_del_init(&tree->modified_extents);
4917                         ret = -EFBIG;
4918                         goto process;
4919                 }
4920
4921                 if (em->generation < trans->transid)
4922                         continue;
4923
4924                 /* We log prealloc extents beyond eof later. */
4925                 if ((em->flags & EXTENT_FLAG_PREALLOC) &&
4926                     em->start >= i_size_read(&inode->vfs_inode))
4927                         continue;
4928
4929                 /* Need a ref to keep it from getting evicted from cache */
4930                 refcount_inc(&em->refs);
4931                 em->flags |= EXTENT_FLAG_LOGGING;
4932                 list_add_tail(&em->list, &extents);
4933                 num++;
4934         }
4935
4936         list_sort(NULL, &extents, extent_cmp);
4937 process:
4938         while (!list_empty(&extents)) {
4939                 em = list_entry(extents.next, struct extent_map, list);
4940
4941                 list_del_init(&em->list);
4942
4943                 /*
4944                  * If we had an error we just need to delete everybody from our
4945                  * private list.
4946                  */
4947                 if (ret) {
4948                         clear_em_logging(tree, em);
4949                         free_extent_map(em);
4950                         continue;
4951                 }
4952
4953                 write_unlock(&tree->lock);
4954
4955                 ret = log_one_extent(trans, inode, em, path, ctx);
4956                 write_lock(&tree->lock);
4957                 clear_em_logging(tree, em);
4958                 free_extent_map(em);
4959         }
4960         WARN_ON(!list_empty(&extents));
4961         write_unlock(&tree->lock);
4962
4963         if (!ret)
4964                 ret = btrfs_log_prealloc_extents(trans, inode, path, ctx);
4965         if (ret)
4966                 return ret;
4967
4968         /*
4969          * We have logged all extents successfully, now make sure the commit of
4970          * the current transaction waits for the ordered extents to complete
4971          * before it commits and wipes out the log trees, otherwise we would
4972          * lose data if an ordered extents completes after the transaction
4973          * commits and a power failure happens after the transaction commit.
4974          */
4975         list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4976                 list_del_init(&ordered->log_list);
4977                 set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4978
4979                 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4980                         spin_lock_irq(&inode->ordered_tree_lock);
4981                         if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4982                                 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4983                                 atomic_inc(&trans->transaction->pending_ordered);
4984                         }
4985                         spin_unlock_irq(&inode->ordered_tree_lock);
4986                 }
4987                 btrfs_put_ordered_extent(ordered);
4988         }
4989
4990         return 0;
4991 }
4992
4993 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4994                              struct btrfs_path *path, u64 *size_ret)
4995 {
4996         struct btrfs_key key;
4997         int ret;
4998
4999         key.objectid = btrfs_ino(inode);
5000         key.type = BTRFS_INODE_ITEM_KEY;
5001         key.offset = 0;
5002
5003         ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
5004         if (ret < 0) {
5005                 return ret;
5006         } else if (ret > 0) {
5007                 *size_ret = 0;
5008         } else {
5009                 struct btrfs_inode_item *item;
5010
5011                 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5012                                       struct btrfs_inode_item);
5013                 *size_ret = btrfs_inode_size(path->nodes[0], item);
5014                 /*
5015                  * If the in-memory inode's i_size is smaller then the inode
5016                  * size stored in the btree, return the inode's i_size, so
5017                  * that we get a correct inode size after replaying the log
5018                  * when before a power failure we had a shrinking truncate
5019                  * followed by addition of a new name (rename / new hard link).
5020                  * Otherwise return the inode size from the btree, to avoid
5021                  * data loss when replaying a log due to previously doing a
5022                  * write that expands the inode's size and logging a new name
5023                  * immediately after.
5024                  */
5025                 if (*size_ret > inode->vfs_inode.i_size)
5026                         *size_ret = inode->vfs_inode.i_size;
5027         }
5028
5029         btrfs_release_path(path);
5030         return 0;
5031 }
5032
5033 /*
5034  * At the moment we always log all xattrs. This is to figure out at log replay
5035  * time which xattrs must have their deletion replayed. If a xattr is missing
5036  * in the log tree and exists in the fs/subvol tree, we delete it. This is
5037  * because if a xattr is deleted, the inode is fsynced and a power failure
5038  * happens, causing the log to be replayed the next time the fs is mounted,
5039  * we want the xattr to not exist anymore (same behaviour as other filesystems
5040  * with a journal, ext3/4, xfs, f2fs, etc).
5041  */
5042 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
5043                                 struct btrfs_inode *inode,
5044                                 struct btrfs_path *path,
5045                                 struct btrfs_path *dst_path,
5046                                 struct btrfs_log_ctx *ctx)
5047 {
5048         struct btrfs_root *root = inode->root;
5049         int ret;
5050         struct btrfs_key key;
5051         const u64 ino = btrfs_ino(inode);
5052         int ins_nr = 0;
5053         int start_slot = 0;
5054         bool found_xattrs = false;
5055
5056         if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
5057                 return 0;
5058
5059         key.objectid = ino;
5060         key.type = BTRFS_XATTR_ITEM_KEY;
5061         key.offset = 0;
5062
5063         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5064         if (ret < 0)
5065                 return ret;
5066
5067         while (true) {
5068                 int slot = path->slots[0];
5069                 struct extent_buffer *leaf = path->nodes[0];
5070                 int nritems = btrfs_header_nritems(leaf);
5071
5072                 if (slot >= nritems) {
5073                         if (ins_nr > 0) {
5074                                 ret = copy_items(trans, inode, dst_path, path,
5075                                                  start_slot, ins_nr, 1, 0, ctx);
5076                                 if (ret < 0)
5077                                         return ret;
5078                                 ins_nr = 0;
5079                         }
5080                         ret = btrfs_next_leaf(root, path);
5081                         if (ret < 0)
5082                                 return ret;
5083                         else if (ret > 0)
5084                                 break;
5085                         continue;
5086                 }
5087
5088                 btrfs_item_key_to_cpu(leaf, &key, slot);
5089                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
5090                         break;
5091
5092                 if (ins_nr == 0)
5093                         start_slot = slot;
5094                 ins_nr++;
5095                 path->slots[0]++;
5096                 found_xattrs = true;
5097                 cond_resched();
5098         }
5099         if (ins_nr > 0) {
5100                 ret = copy_items(trans, inode, dst_path, path,
5101                                  start_slot, ins_nr, 1, 0, ctx);
5102                 if (ret < 0)
5103                         return ret;
5104         }
5105
5106         if (!found_xattrs)
5107                 set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
5108
5109         return 0;
5110 }
5111
5112 /*
5113  * When using the NO_HOLES feature if we punched a hole that causes the
5114  * deletion of entire leafs or all the extent items of the first leaf (the one
5115  * that contains the inode item and references) we may end up not processing
5116  * any extents, because there are no leafs with a generation matching the
5117  * current transaction that have extent items for our inode. So we need to find
5118  * if any holes exist and then log them. We also need to log holes after any
5119  * truncate operation that changes the inode's size.
5120  */
5121 static int btrfs_log_holes(struct btrfs_trans_handle *trans,
5122                            struct btrfs_inode *inode,
5123                            struct btrfs_path *path)
5124 {
5125         struct btrfs_root *root = inode->root;
5126         struct btrfs_fs_info *fs_info = root->fs_info;
5127         struct btrfs_key key;
5128         const u64 ino = btrfs_ino(inode);
5129         const u64 i_size = i_size_read(&inode->vfs_inode);
5130         u64 prev_extent_end = 0;
5131         int ret;
5132
5133         if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
5134                 return 0;
5135
5136         key.objectid = ino;
5137         key.type = BTRFS_EXTENT_DATA_KEY;
5138         key.offset = 0;
5139
5140         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5141         if (ret < 0)
5142                 return ret;
5143
5144         while (true) {
5145                 struct extent_buffer *leaf = path->nodes[0];
5146
5147                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
5148                         ret = btrfs_next_leaf(root, path);
5149                         if (ret < 0)
5150                                 return ret;
5151                         if (ret > 0) {
5152                                 ret = 0;
5153                                 break;
5154                         }
5155                         leaf = path->nodes[0];
5156                 }
5157
5158                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5159                 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
5160                         break;
5161
5162                 /* We have a hole, log it. */
5163                 if (prev_extent_end < key.offset) {
5164                         const u64 hole_len = key.offset - prev_extent_end;
5165
5166                         /*
5167                          * Release the path to avoid deadlocks with other code
5168                          * paths that search the root while holding locks on
5169                          * leafs from the log root.
5170                          */
5171                         btrfs_release_path(path);
5172                         ret = btrfs_insert_hole_extent(trans, root->log_root,
5173                                                        ino, prev_extent_end,
5174                                                        hole_len);
5175                         if (ret < 0)
5176                                 return ret;
5177
5178                         /*
5179                          * Search for the same key again in the root. Since it's
5180                          * an extent item and we are holding the inode lock, the
5181                          * key must still exist. If it doesn't just emit warning
5182                          * and return an error to fall back to a transaction
5183                          * commit.
5184                          */
5185                         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5186                         if (ret < 0)
5187                                 return ret;
5188                         if (WARN_ON(ret > 0))
5189                                 return -ENOENT;
5190                         leaf = path->nodes[0];
5191                 }
5192
5193                 prev_extent_end = btrfs_file_extent_end(path);
5194                 path->slots[0]++;
5195                 cond_resched();
5196         }
5197
5198         if (prev_extent_end < i_size) {
5199                 u64 hole_len;
5200
5201                 btrfs_release_path(path);
5202                 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
5203                 ret = btrfs_insert_hole_extent(trans, root->log_root, ino,
5204                                                prev_extent_end, hole_len);
5205                 if (ret < 0)
5206                         return ret;
5207         }
5208
5209         return 0;
5210 }
5211
5212 /*
5213  * When we are logging a new inode X, check if it doesn't have a reference that
5214  * matches the reference from some other inode Y created in a past transaction
5215  * and that was renamed in the current transaction. If we don't do this, then at
5216  * log replay time we can lose inode Y (and all its files if it's a directory):
5217  *
5218  * mkdir /mnt/x
5219  * echo "hello world" > /mnt/x/foobar
5220  * sync
5221  * mv /mnt/x /mnt/y
5222  * mkdir /mnt/x                 # or touch /mnt/x
5223  * xfs_io -c fsync /mnt/x
5224  * <power fail>
5225  * mount fs, trigger log replay
5226  *
5227  * After the log replay procedure, we would lose the first directory and all its
5228  * files (file foobar).
5229  * For the case where inode Y is not a directory we simply end up losing it:
5230  *
5231  * echo "123" > /mnt/foo
5232  * sync
5233  * mv /mnt/foo /mnt/bar
5234  * echo "abc" > /mnt/foo
5235  * xfs_io -c fsync /mnt/foo
5236  * <power fail>
5237  *
5238  * We also need this for cases where a snapshot entry is replaced by some other
5239  * entry (file or directory) otherwise we end up with an unreplayable log due to
5240  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
5241  * if it were a regular entry:
5242  *
5243  * mkdir /mnt/x
5244  * btrfs subvolume snapshot /mnt /mnt/x/snap
5245  * btrfs subvolume delete /mnt/x/snap
5246  * rmdir /mnt/x
5247  * mkdir /mnt/x
5248  * fsync /mnt/x or fsync some new file inside it
5249  * <power fail>
5250  *
5251  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
5252  * the same transaction.
5253  */
5254 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
5255                                          const int slot,
5256                                          const struct btrfs_key *key,
5257                                          struct btrfs_inode *inode,
5258                                          u64 *other_ino, u64 *other_parent)
5259 {
5260         int ret;
5261         struct btrfs_path *search_path;
5262         char *name = NULL;
5263         u32 name_len = 0;
5264         u32 item_size = btrfs_item_size(eb, slot);
5265         u32 cur_offset = 0;
5266         unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
5267
5268         search_path = btrfs_alloc_path();
5269         if (!search_path)
5270                 return -ENOMEM;
5271         search_path->search_commit_root = 1;
5272         search_path->skip_locking = 1;
5273
5274         while (cur_offset < item_size) {
5275                 u64 parent;
5276                 u32 this_name_len;
5277                 u32 this_len;
5278                 unsigned long name_ptr;
5279                 struct btrfs_dir_item *di;
5280                 struct fscrypt_str name_str;
5281
5282                 if (key->type == BTRFS_INODE_REF_KEY) {
5283                         struct btrfs_inode_ref *iref;
5284
5285                         iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
5286                         parent = key->offset;
5287                         this_name_len = btrfs_inode_ref_name_len(eb, iref);
5288                         name_ptr = (unsigned long)(iref + 1);
5289                         this_len = sizeof(*iref) + this_name_len;
5290                 } else {
5291                         struct btrfs_inode_extref *extref;
5292
5293                         extref = (struct btrfs_inode_extref *)(ptr +
5294                                                                cur_offset);
5295                         parent = btrfs_inode_extref_parent(eb, extref);
5296                         this_name_len = btrfs_inode_extref_name_len(eb, extref);
5297                         name_ptr = (unsigned long)&extref->name;
5298                         this_len = sizeof(*extref) + this_name_len;
5299                 }
5300
5301                 if (this_name_len > name_len) {
5302                         char *new_name;
5303
5304                         new_name = krealloc(name, this_name_len, GFP_NOFS);
5305                         if (!new_name) {
5306                                 ret = -ENOMEM;
5307                                 goto out;
5308                         }
5309                         name_len = this_name_len;
5310                         name = new_name;
5311                 }
5312
5313                 read_extent_buffer(eb, name, name_ptr, this_name_len);
5314
5315                 name_str.name = name;
5316                 name_str.len = this_name_len;
5317                 di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
5318                                 parent, &name_str, 0);
5319                 if (di && !IS_ERR(di)) {
5320                         struct btrfs_key di_key;
5321
5322                         btrfs_dir_item_key_to_cpu(search_path->nodes[0],
5323                                                   di, &di_key);
5324                         if (di_key.type == BTRFS_INODE_ITEM_KEY) {
5325                                 if (di_key.objectid != key->objectid) {
5326                                         ret = 1;
5327                                         *other_ino = di_key.objectid;
5328                                         *other_parent = parent;
5329                                 } else {
5330                                         ret = 0;
5331                                 }
5332                         } else {
5333                                 ret = -EAGAIN;
5334                         }
5335                         goto out;
5336                 } else if (IS_ERR(di)) {
5337                         ret = PTR_ERR(di);
5338                         goto out;
5339                 }
5340                 btrfs_release_path(search_path);
5341
5342                 cur_offset += this_len;
5343         }
5344         ret = 0;
5345 out:
5346         btrfs_free_path(search_path);
5347         kfree(name);
5348         return ret;
5349 }
5350
5351 /*
5352  * Check if we need to log an inode. This is used in contexts where while
5353  * logging an inode we need to log another inode (either that it exists or in
5354  * full mode). This is used instead of btrfs_inode_in_log() because the later
5355  * requires the inode to be in the log and have the log transaction committed,
5356  * while here we do not care if the log transaction was already committed - our
5357  * caller will commit the log later - and we want to avoid logging an inode
5358  * multiple times when multiple tasks have joined the same log transaction.
5359  */
5360 static bool need_log_inode(const struct btrfs_trans_handle *trans,
5361                            struct btrfs_inode *inode)
5362 {
5363         /*
5364          * If a directory was not modified, no dentries added or removed, we can
5365          * and should avoid logging it.
5366          */
5367         if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid)
5368                 return false;
5369
5370         /*
5371          * If this inode does not have new/updated/deleted xattrs since the last
5372          * time it was logged and is flagged as logged in the current transaction,
5373          * we can skip logging it. As for new/deleted names, those are updated in
5374          * the log by link/unlink/rename operations.
5375          * In case the inode was logged and then evicted and reloaded, its
5376          * logged_trans will be 0, in which case we have to fully log it since
5377          * logged_trans is a transient field, not persisted.
5378          */
5379         if (inode_logged(trans, inode, NULL) == 1 &&
5380             !test_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags))
5381                 return false;
5382
5383         return true;
5384 }
5385
5386 struct btrfs_dir_list {
5387         u64 ino;
5388         struct list_head list;
5389 };
5390
5391 /*
5392  * Log the inodes of the new dentries of a directory.
5393  * See process_dir_items_leaf() for details about why it is needed.
5394  * This is a recursive operation - if an existing dentry corresponds to a
5395  * directory, that directory's new entries are logged too (same behaviour as
5396  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5397  * the dentries point to we do not acquire their VFS lock, otherwise lockdep
5398  * complains about the following circular lock dependency / possible deadlock:
5399  *
5400  *        CPU0                                        CPU1
5401  *        ----                                        ----
5402  * lock(&type->i_mutex_dir_key#3/2);
5403  *                                            lock(sb_internal#2);
5404  *                                            lock(&type->i_mutex_dir_key#3/2);
5405  * lock(&sb->s_type->i_mutex_key#14);
5406  *
5407  * Where sb_internal is the lock (a counter that works as a lock) acquired by
5408  * sb_start_intwrite() in btrfs_start_transaction().
5409  * Not acquiring the VFS lock of the inodes is still safe because:
5410  *
5411  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5412  *    that while logging the inode new references (names) are added or removed
5413  *    from the inode, leaving the logged inode item with a link count that does
5414  *    not match the number of logged inode reference items. This is fine because
5415  *    at log replay time we compute the real number of links and correct the
5416  *    link count in the inode item (see replay_one_buffer() and
5417  *    link_to_fixup_dir());
5418  *
5419  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5420  *    while logging the inode's items new index items (key type
5421  *    BTRFS_DIR_INDEX_KEY) are added to fs/subvol tree and the logged inode item
5422  *    has a size that doesn't match the sum of the lengths of all the logged
5423  *    names - this is ok, not a problem, because at log replay time we set the
5424  *    directory's i_size to the correct value (see replay_one_name() and
5425  *    overwrite_item()).
5426  */
5427 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5428                                 struct btrfs_inode *start_inode,
5429                                 struct btrfs_log_ctx *ctx)
5430 {
5431         struct btrfs_root *root = start_inode->root;
5432         struct btrfs_fs_info *fs_info = root->fs_info;
5433         struct btrfs_path *path;
5434         LIST_HEAD(dir_list);
5435         struct btrfs_dir_list *dir_elem;
5436         u64 ino = btrfs_ino(start_inode);
5437         struct btrfs_inode *curr_inode = start_inode;
5438         int ret = 0;
5439
5440         /*
5441          * If we are logging a new name, as part of a link or rename operation,
5442          * don't bother logging new dentries, as we just want to log the names
5443          * of an inode and that any new parents exist.
5444          */
5445         if (ctx->logging_new_name)
5446                 return 0;
5447
5448         path = btrfs_alloc_path();
5449         if (!path)
5450                 return -ENOMEM;
5451
5452         /* Pairs with btrfs_add_delayed_iput below. */
5453         ihold(&curr_inode->vfs_inode);
5454
5455         while (true) {
5456                 struct inode *vfs_inode;
5457                 struct btrfs_key key;
5458                 struct btrfs_key found_key;
5459                 u64 next_index;
5460                 bool continue_curr_inode = true;
5461                 int iter_ret;
5462
5463                 key.objectid = ino;
5464                 key.type = BTRFS_DIR_INDEX_KEY;
5465                 key.offset = btrfs_get_first_dir_index_to_log(curr_inode);
5466                 next_index = key.offset;
5467 again:
5468                 btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) {
5469                         struct extent_buffer *leaf = path->nodes[0];
5470                         struct btrfs_dir_item *di;
5471                         struct btrfs_key di_key;
5472                         struct inode *di_inode;
5473                         int log_mode = LOG_INODE_EXISTS;
5474                         int type;
5475
5476                         if (found_key.objectid != ino ||
5477                             found_key.type != BTRFS_DIR_INDEX_KEY) {
5478                                 continue_curr_inode = false;
5479                                 break;
5480                         }
5481
5482                         next_index = found_key.offset + 1;
5483
5484                         di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
5485                         type = btrfs_dir_ftype(leaf, di);
5486                         if (btrfs_dir_transid(leaf, di) < trans->transid)
5487                                 continue;
5488                         btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5489                         if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5490                                 continue;
5491
5492                         btrfs_release_path(path);
5493                         di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
5494                         if (IS_ERR(di_inode)) {
5495                                 ret = PTR_ERR(di_inode);
5496                                 goto out;
5497                         }
5498
5499                         if (!need_log_inode(trans, BTRFS_I(di_inode))) {
5500                                 btrfs_add_delayed_iput(BTRFS_I(di_inode));
5501                                 break;
5502                         }
5503
5504                         ctx->log_new_dentries = false;
5505                         if (type == BTRFS_FT_DIR)
5506                                 log_mode = LOG_INODE_ALL;
5507                         ret = btrfs_log_inode(trans, BTRFS_I(di_inode),
5508                                               log_mode, ctx);
5509                         btrfs_add_delayed_iput(BTRFS_I(di_inode));
5510                         if (ret)
5511                                 goto out;
5512                         if (ctx->log_new_dentries) {
5513                                 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5514                                 if (!dir_elem) {
5515                                         ret = -ENOMEM;
5516                                         goto out;
5517                                 }
5518                                 dir_elem->ino = di_key.objectid;
5519                                 list_add_tail(&dir_elem->list, &dir_list);
5520                         }
5521                         break;
5522                 }
5523
5524                 btrfs_release_path(path);
5525
5526                 if (iter_ret < 0) {
5527                         ret = iter_ret;
5528                         goto out;
5529                 } else if (iter_ret > 0) {
5530                         continue_curr_inode = false;
5531                 } else {
5532                         key = found_key;
5533                 }
5534
5535                 if (continue_curr_inode && key.offset < (u64)-1) {
5536                         key.offset++;
5537                         goto again;
5538                 }
5539
5540                 btrfs_set_first_dir_index_to_log(curr_inode, next_index);
5541
5542                 if (list_empty(&dir_list))
5543                         break;
5544
5545                 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, list);
5546                 ino = dir_elem->ino;
5547                 list_del(&dir_elem->list);
5548                 kfree(dir_elem);
5549
5550                 btrfs_add_delayed_iput(curr_inode);
5551                 curr_inode = NULL;
5552
5553                 vfs_inode = btrfs_iget(fs_info->sb, ino, root);
5554                 if (IS_ERR(vfs_inode)) {
5555                         ret = PTR_ERR(vfs_inode);
5556                         break;
5557                 }
5558                 curr_inode = BTRFS_I(vfs_inode);
5559         }
5560 out:
5561         btrfs_free_path(path);
5562         if (curr_inode)
5563                 btrfs_add_delayed_iput(curr_inode);
5564
5565         if (ret) {
5566                 struct btrfs_dir_list *next;
5567
5568                 list_for_each_entry_safe(dir_elem, next, &dir_list, list)
5569                         kfree(dir_elem);
5570         }
5571
5572         return ret;
5573 }
5574
5575 struct btrfs_ino_list {
5576         u64 ino;
5577         u64 parent;
5578         struct list_head list;
5579 };
5580
5581 static void free_conflicting_inodes(struct btrfs_log_ctx *ctx)
5582 {
5583         struct btrfs_ino_list *curr;
5584         struct btrfs_ino_list *next;
5585
5586         list_for_each_entry_safe(curr, next, &ctx->conflict_inodes, list) {
5587                 list_del(&curr->list);
5588                 kfree(curr);
5589         }
5590 }
5591
5592 static int conflicting_inode_is_dir(struct btrfs_root *root, u64 ino,
5593                                     struct btrfs_path *path)
5594 {
5595         struct btrfs_key key;
5596         int ret;
5597
5598         key.objectid = ino;
5599         key.type = BTRFS_INODE_ITEM_KEY;
5600         key.offset = 0;
5601
5602         path->search_commit_root = 1;
5603         path->skip_locking = 1;
5604
5605         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5606         if (WARN_ON_ONCE(ret > 0)) {
5607                 /*
5608                  * We have previously found the inode through the commit root
5609                  * so this should not happen. If it does, just error out and
5610                  * fallback to a transaction commit.
5611                  */
5612                 ret = -ENOENT;
5613         } else if (ret == 0) {
5614                 struct btrfs_inode_item *item;
5615
5616                 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5617                                       struct btrfs_inode_item);
5618                 if (S_ISDIR(btrfs_inode_mode(path->nodes[0], item)))
5619                         ret = 1;
5620         }
5621
5622         btrfs_release_path(path);
5623         path->search_commit_root = 0;
5624         path->skip_locking = 0;
5625
5626         return ret;
5627 }
5628
5629 static int add_conflicting_inode(struct btrfs_trans_handle *trans,
5630                                  struct btrfs_root *root,
5631                                  struct btrfs_path *path,
5632                                  u64 ino, u64 parent,
5633                                  struct btrfs_log_ctx *ctx)
5634 {
5635         struct btrfs_ino_list *ino_elem;
5636         struct inode *inode;
5637
5638         /*
5639          * It's rare to have a lot of conflicting inodes, in practice it is not
5640          * common to have more than 1 or 2. We don't want to collect too many,
5641          * as we could end up logging too many inodes (even if only in
5642          * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
5643          * commits.
5644          */
5645         if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
5646                 return BTRFS_LOG_FORCE_COMMIT;
5647
5648         inode = btrfs_iget(root->fs_info->sb, ino, root);
5649         /*
5650          * If the other inode that had a conflicting dir entry was deleted in
5651          * the current transaction then we either:
5652          *
5653          * 1) Log the parent directory (later after adding it to the list) if
5654          *    the inode is a directory. This is because it may be a deleted
5655          *    subvolume/snapshot or it may be a regular directory that had
5656          *    deleted subvolumes/snapshots (or subdirectories that had them),
5657          *    and at the moment we can't deal with dropping subvolumes/snapshots
5658          *    during log replay. So we just log the parent, which will result in
5659          *    a fallback to a transaction commit if we are dealing with those
5660          *    cases (last_unlink_trans will match the current transaction);
5661          *
5662          * 2) Do nothing if it's not a directory. During log replay we simply
5663          *    unlink the conflicting dentry from the parent directory and then
5664          *    add the dentry for our inode. Like this we can avoid logging the
5665          *    parent directory (and maybe fallback to a transaction commit in
5666          *    case it has a last_unlink_trans == trans->transid, due to moving
5667          *    some inode from it to some other directory).
5668          */
5669         if (IS_ERR(inode)) {
5670                 int ret = PTR_ERR(inode);
5671
5672                 if (ret != -ENOENT)
5673                         return ret;
5674
5675                 ret = conflicting_inode_is_dir(root, ino, path);
5676                 /* Not a directory or we got an error. */
5677                 if (ret <= 0)
5678                         return ret;
5679
5680                 /* Conflicting inode is a directory, so we'll log its parent. */
5681                 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5682                 if (!ino_elem)
5683                         return -ENOMEM;
5684                 ino_elem->ino = ino;
5685                 ino_elem->parent = parent;
5686                 list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5687                 ctx->num_conflict_inodes++;
5688
5689                 return 0;
5690         }
5691
5692         /*
5693          * If the inode was already logged skip it - otherwise we can hit an
5694          * infinite loop. Example:
5695          *
5696          * From the commit root (previous transaction) we have the following
5697          * inodes:
5698          *
5699          * inode 257 a directory
5700          * inode 258 with references "zz" and "zz_link" on inode 257
5701          * inode 259 with reference "a" on inode 257
5702          *
5703          * And in the current (uncommitted) transaction we have:
5704          *
5705          * inode 257 a directory, unchanged
5706          * inode 258 with references "a" and "a2" on inode 257
5707          * inode 259 with reference "zz_link" on inode 257
5708          * inode 261 with reference "zz" on inode 257
5709          *
5710          * When logging inode 261 the following infinite loop could
5711          * happen if we don't skip already logged inodes:
5712          *
5713          * - we detect inode 258 as a conflicting inode, with inode 261
5714          *   on reference "zz", and log it;
5715          *
5716          * - we detect inode 259 as a conflicting inode, with inode 258
5717          *   on reference "a", and log it;
5718          *
5719          * - we detect inode 258 as a conflicting inode, with inode 259
5720          *   on reference "zz_link", and log it - again! After this we
5721          *   repeat the above steps forever.
5722          *
5723          * Here we can use need_log_inode() because we only need to log the
5724          * inode in LOG_INODE_EXISTS mode and rename operations update the log,
5725          * so that the log ends up with the new name and without the old name.
5726          */
5727         if (!need_log_inode(trans, BTRFS_I(inode))) {
5728                 btrfs_add_delayed_iput(BTRFS_I(inode));
5729                 return 0;
5730         }
5731
5732         btrfs_add_delayed_iput(BTRFS_I(inode));
5733
5734         ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5735         if (!ino_elem)
5736                 return -ENOMEM;
5737         ino_elem->ino = ino;
5738         ino_elem->parent = parent;
5739         list_add_tail(&ino_elem->list, &ctx->conflict_inodes);
5740         ctx->num_conflict_inodes++;
5741
5742         return 0;
5743 }
5744
5745 static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
5746                                   struct btrfs_root *root,
5747                                   struct btrfs_log_ctx *ctx)
5748 {
5749         struct btrfs_fs_info *fs_info = root->fs_info;
5750         int ret = 0;
5751
5752         /*
5753          * Conflicting inodes are logged by the first call to btrfs_log_inode(),
5754          * otherwise we could have unbounded recursion of btrfs_log_inode()
5755          * calls. This check guarantees we can have only 1 level of recursion.
5756          */
5757         if (ctx->logging_conflict_inodes)
5758                 return 0;
5759
5760         ctx->logging_conflict_inodes = true;
5761
5762         /*
5763          * New conflicting inodes may be found and added to the list while we
5764          * are logging a conflicting inode, so keep iterating while the list is
5765          * not empty.
5766          */
5767         while (!list_empty(&ctx->conflict_inodes)) {
5768                 struct btrfs_ino_list *curr;
5769                 struct inode *inode;
5770                 u64 ino;
5771                 u64 parent;
5772
5773                 curr = list_first_entry(&ctx->conflict_inodes,
5774                                         struct btrfs_ino_list, list);
5775                 ino = curr->ino;
5776                 parent = curr->parent;
5777                 list_del(&curr->list);
5778                 kfree(curr);
5779
5780                 inode = btrfs_iget(fs_info->sb, ino, root);
5781                 /*
5782                  * If the other inode that had a conflicting dir entry was
5783                  * deleted in the current transaction, we need to log its parent
5784                  * directory. See the comment at add_conflicting_inode().
5785                  */
5786                 if (IS_ERR(inode)) {
5787                         ret = PTR_ERR(inode);
5788                         if (ret != -ENOENT)
5789                                 break;
5790
5791                         inode = btrfs_iget(fs_info->sb, parent, root);
5792                         if (IS_ERR(inode)) {
5793                                 ret = PTR_ERR(inode);
5794                                 break;
5795                         }
5796
5797                         /*
5798                          * Always log the directory, we cannot make this
5799                          * conditional on need_log_inode() because the directory
5800                          * might have been logged in LOG_INODE_EXISTS mode or
5801                          * the dir index of the conflicting inode is not in a
5802                          * dir index key range logged for the directory. So we
5803                          * must make sure the deletion is recorded.
5804                          */
5805                         ret = btrfs_log_inode(trans, BTRFS_I(inode),
5806                                               LOG_INODE_ALL, ctx);
5807                         btrfs_add_delayed_iput(BTRFS_I(inode));
5808                         if (ret)
5809                                 break;
5810                         continue;
5811                 }
5812
5813                 /*
5814                  * Here we can use need_log_inode() because we only need to log
5815                  * the inode in LOG_INODE_EXISTS mode and rename operations
5816                  * update the log, so that the log ends up with the new name and
5817                  * without the old name.
5818                  *
5819                  * We did this check at add_conflicting_inode(), but here we do
5820                  * it again because if some other task logged the inode after
5821                  * that, we can avoid doing it again.
5822                  */
5823                 if (!need_log_inode(trans, BTRFS_I(inode))) {
5824                         btrfs_add_delayed_iput(BTRFS_I(inode));
5825                         continue;
5826                 }
5827
5828                 /*
5829                  * We are safe logging the other inode without acquiring its
5830                  * lock as long as we log with the LOG_INODE_EXISTS mode. We
5831                  * are safe against concurrent renames of the other inode as
5832                  * well because during a rename we pin the log and update the
5833                  * log with the new name before we unpin it.
5834                  */
5835                 ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx);
5836                 btrfs_add_delayed_iput(BTRFS_I(inode));
5837                 if (ret)
5838                         break;
5839         }
5840
5841         ctx->logging_conflict_inodes = false;
5842         if (ret)
5843                 free_conflicting_inodes(ctx);
5844
5845         return ret;
5846 }
5847
5848 static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5849                                    struct btrfs_inode *inode,
5850                                    struct btrfs_key *min_key,
5851                                    const struct btrfs_key *max_key,
5852                                    struct btrfs_path *path,
5853                                    struct btrfs_path *dst_path,
5854                                    const u64 logged_isize,
5855                                    const int inode_only,
5856                                    struct btrfs_log_ctx *ctx,
5857                                    bool *need_log_inode_item)
5858 {
5859         const u64 i_size = i_size_read(&inode->vfs_inode);
5860         struct btrfs_root *root = inode->root;
5861         int ins_start_slot = 0;
5862         int ins_nr = 0;
5863         int ret;
5864
5865         while (1) {
5866                 ret = btrfs_search_forward(root, min_key, path, trans->transid);
5867                 if (ret < 0)
5868                         return ret;
5869                 if (ret > 0) {
5870                         ret = 0;
5871                         break;
5872                 }
5873 again:
5874                 /* Note, ins_nr might be > 0 here, cleanup outside the loop */
5875                 if (min_key->objectid != max_key->objectid)
5876                         break;
5877                 if (min_key->type > max_key->type)
5878                         break;
5879
5880                 if (min_key->type == BTRFS_INODE_ITEM_KEY) {
5881                         *need_log_inode_item = false;
5882                 } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5883                            min_key->offset >= i_size) {
5884                         /*
5885                          * Extents at and beyond eof are logged with
5886                          * btrfs_log_prealloc_extents().
5887                          * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5888                          * and no keys greater than that, so bail out.
5889                          */
5890                         break;
5891                 } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5892                             min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5893                            (inode->generation == trans->transid ||
5894                             ctx->logging_conflict_inodes)) {
5895                         u64 other_ino = 0;
5896                         u64 other_parent = 0;
5897
5898                         ret = btrfs_check_ref_name_override(path->nodes[0],
5899                                         path->slots[0], min_key, inode,
5900                                         &other_ino, &other_parent);
5901                         if (ret < 0) {
5902                                 return ret;
5903                         } else if (ret > 0 &&
5904                                    other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
5905                                 if (ins_nr > 0) {
5906                                         ins_nr++;
5907                                 } else {
5908                                         ins_nr = 1;
5909                                         ins_start_slot = path->slots[0];
5910                                 }
5911                                 ret = copy_items(trans, inode, dst_path, path,
5912                                                  ins_start_slot, ins_nr,
5913                                                  inode_only, logged_isize, ctx);
5914                                 if (ret < 0)
5915                                         return ret;
5916                                 ins_nr = 0;
5917
5918                                 btrfs_release_path(path);
5919                                 ret = add_conflicting_inode(trans, root, path,
5920                                                             other_ino,
5921                                                             other_parent, ctx);
5922                                 if (ret)
5923                                         return ret;
5924                                 goto next_key;
5925                         }
5926                 } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5927                         /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
5928                         if (ins_nr == 0)
5929                                 goto next_slot;
5930                         ret = copy_items(trans, inode, dst_path, path,
5931                                          ins_start_slot,
5932                                          ins_nr, inode_only, logged_isize, ctx);
5933                         if (ret < 0)
5934                                 return ret;
5935                         ins_nr = 0;
5936                         goto next_slot;
5937                 }
5938
5939                 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5940                         ins_nr++;
5941                         goto next_slot;
5942                 } else if (!ins_nr) {
5943                         ins_start_slot = path->slots[0];
5944                         ins_nr = 1;
5945                         goto next_slot;
5946                 }
5947
5948                 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5949                                  ins_nr, inode_only, logged_isize, ctx);
5950                 if (ret < 0)
5951                         return ret;
5952                 ins_nr = 1;
5953                 ins_start_slot = path->slots[0];
5954 next_slot:
5955                 path->slots[0]++;
5956                 if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5957                         btrfs_item_key_to_cpu(path->nodes[0], min_key,
5958                                               path->slots[0]);
5959                         goto again;
5960                 }
5961                 if (ins_nr) {
5962                         ret = copy_items(trans, inode, dst_path, path,
5963                                          ins_start_slot, ins_nr, inode_only,
5964                                          logged_isize, ctx);
5965                         if (ret < 0)
5966                                 return ret;
5967                         ins_nr = 0;
5968                 }
5969                 btrfs_release_path(path);
5970 next_key:
5971                 if (min_key->offset < (u64)-1) {
5972                         min_key->offset++;
5973                 } else if (min_key->type < max_key->type) {
5974                         min_key->type++;
5975                         min_key->offset = 0;
5976                 } else {
5977                         break;
5978                 }
5979
5980                 /*
5981                  * We may process many leaves full of items for our inode, so
5982                  * avoid monopolizing a cpu for too long by rescheduling while
5983                  * not holding locks on any tree.
5984                  */
5985                 cond_resched();
5986         }
5987         if (ins_nr) {
5988                 ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5989                                  ins_nr, inode_only, logged_isize, ctx);
5990                 if (ret)
5991                         return ret;
5992         }
5993
5994         if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
5995                 /*
5996                  * Release the path because otherwise we might attempt to double
5997                  * lock the same leaf with btrfs_log_prealloc_extents() below.
5998                  */
5999                 btrfs_release_path(path);
6000                 ret = btrfs_log_prealloc_extents(trans, inode, dst_path, ctx);
6001         }
6002
6003         return ret;
6004 }
6005
6006 static int insert_delayed_items_batch(struct btrfs_trans_handle *trans,
6007                                       struct btrfs_root *log,
6008                                       struct btrfs_path *path,
6009                                       const struct btrfs_item_batch *batch,
6010                                       const struct btrfs_delayed_item *first_item)
6011 {
6012         const struct btrfs_delayed_item *curr = first_item;
6013         int ret;
6014
6015         ret = btrfs_insert_empty_items(trans, log, path, batch);
6016         if (ret)
6017                 return ret;
6018
6019         for (int i = 0; i < batch->nr; i++) {
6020                 char *data_ptr;
6021
6022                 data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char);
6023                 write_extent_buffer(path->nodes[0], &curr->data,
6024                                     (unsigned long)data_ptr, curr->data_len);
6025                 curr = list_next_entry(curr, log_list);
6026                 path->slots[0]++;
6027         }
6028
6029         btrfs_release_path(path);
6030
6031         return 0;
6032 }
6033
6034 static int log_delayed_insertion_items(struct btrfs_trans_handle *trans,
6035                                        struct btrfs_inode *inode,
6036                                        struct btrfs_path *path,
6037                                        const struct list_head *delayed_ins_list,
6038                                        struct btrfs_log_ctx *ctx)
6039 {
6040         /* 195 (4095 bytes of keys and sizes) fits in a single 4K page. */
6041         const int max_batch_size = 195;
6042         const int leaf_data_size = BTRFS_LEAF_DATA_SIZE(trans->fs_info);
6043         const u64 ino = btrfs_ino(inode);
6044         struct btrfs_root *log = inode->root->log_root;
6045         struct btrfs_item_batch batch = {
6046                 .nr = 0,
6047                 .total_data_size = 0,
6048         };
6049         const struct btrfs_delayed_item *first = NULL;
6050         const struct btrfs_delayed_item *curr;
6051         char *ins_data;
6052         struct btrfs_key *ins_keys;
6053         u32 *ins_sizes;
6054         u64 curr_batch_size = 0;
6055         int batch_idx = 0;
6056         int ret;
6057
6058         /* We are adding dir index items to the log tree. */
6059         lockdep_assert_held(&inode->log_mutex);
6060
6061         /*
6062          * We collect delayed items before copying index keys from the subvolume
6063          * to the log tree. However just after we collected them, they may have
6064          * been flushed (all of them or just some of them), and therefore we
6065          * could have copied them from the subvolume tree to the log tree.
6066          * So find the first delayed item that was not yet logged (they are
6067          * sorted by index number).
6068          */
6069         list_for_each_entry(curr, delayed_ins_list, log_list) {
6070                 if (curr->index > inode->last_dir_index_offset) {
6071                         first = curr;
6072                         break;
6073                 }
6074         }
6075
6076         /* Empty list or all delayed items were already logged. */
6077         if (!first)
6078                 return 0;
6079
6080         ins_data = kmalloc(max_batch_size * sizeof(u32) +
6081                            max_batch_size * sizeof(struct btrfs_key), GFP_NOFS);
6082         if (!ins_data)
6083                 return -ENOMEM;
6084         ins_sizes = (u32 *)ins_data;
6085         batch.data_sizes = ins_sizes;
6086         ins_keys = (struct btrfs_key *)(ins_data + max_batch_size * sizeof(u32));
6087         batch.keys = ins_keys;
6088
6089         curr = first;
6090         while (!list_entry_is_head(curr, delayed_ins_list, log_list)) {
6091                 const u32 curr_size = curr->data_len + sizeof(struct btrfs_item);
6092
6093                 if (curr_batch_size + curr_size > leaf_data_size ||
6094                     batch.nr == max_batch_size) {
6095                         ret = insert_delayed_items_batch(trans, log, path,
6096                                                          &batch, first);
6097                         if (ret)
6098                                 goto out;
6099                         batch_idx = 0;
6100                         batch.nr = 0;
6101                         batch.total_data_size = 0;
6102                         curr_batch_size = 0;
6103                         first = curr;
6104                 }
6105
6106                 ins_sizes[batch_idx] = curr->data_len;
6107                 ins_keys[batch_idx].objectid = ino;
6108                 ins_keys[batch_idx].type = BTRFS_DIR_INDEX_KEY;
6109                 ins_keys[batch_idx].offset = curr->index;
6110                 curr_batch_size += curr_size;
6111                 batch.total_data_size += curr->data_len;
6112                 batch.nr++;
6113                 batch_idx++;
6114                 curr = list_next_entry(curr, log_list);
6115         }
6116
6117         ASSERT(batch.nr >= 1);
6118         ret = insert_delayed_items_batch(trans, log, path, &batch, first);
6119
6120         curr = list_last_entry(delayed_ins_list, struct btrfs_delayed_item,
6121                                log_list);
6122         inode->last_dir_index_offset = curr->index;
6123 out:
6124         kfree(ins_data);
6125
6126         return ret;
6127 }
6128
6129 static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
6130                                       struct btrfs_inode *inode,
6131                                       struct btrfs_path *path,
6132                                       const struct list_head *delayed_del_list,
6133                                       struct btrfs_log_ctx *ctx)
6134 {
6135         const u64 ino = btrfs_ino(inode);
6136         const struct btrfs_delayed_item *curr;
6137
6138         curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6139                                 log_list);
6140
6141         while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6142                 u64 first_dir_index = curr->index;
6143                 u64 last_dir_index;
6144                 const struct btrfs_delayed_item *next;
6145                 int ret;
6146
6147                 /*
6148                  * Find a range of consecutive dir index items to delete. Like
6149                  * this we log a single dir range item spanning several contiguous
6150                  * dir items instead of logging one range item per dir index item.
6151                  */
6152                 next = list_next_entry(curr, log_list);
6153                 while (!list_entry_is_head(next, delayed_del_list, log_list)) {
6154                         if (next->index != curr->index + 1)
6155                                 break;
6156                         curr = next;
6157                         next = list_next_entry(next, log_list);
6158                 }
6159
6160                 last_dir_index = curr->index;
6161                 ASSERT(last_dir_index >= first_dir_index);
6162
6163                 ret = insert_dir_log_key(trans, inode->root->log_root, path,
6164                                          ino, first_dir_index, last_dir_index);
6165                 if (ret)
6166                         return ret;
6167                 curr = list_next_entry(curr, log_list);
6168         }
6169
6170         return 0;
6171 }
6172
6173 static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
6174                                         struct btrfs_inode *inode,
6175                                         struct btrfs_path *path,
6176                                         struct btrfs_log_ctx *ctx,
6177                                         const struct list_head *delayed_del_list,
6178                                         const struct btrfs_delayed_item *first,
6179                                         const struct btrfs_delayed_item **last_ret)
6180 {
6181         const struct btrfs_delayed_item *next;
6182         struct extent_buffer *leaf = path->nodes[0];
6183         const int last_slot = btrfs_header_nritems(leaf) - 1;
6184         int slot = path->slots[0] + 1;
6185         const u64 ino = btrfs_ino(inode);
6186
6187         next = list_next_entry(first, log_list);
6188
6189         while (slot < last_slot &&
6190                !list_entry_is_head(next, delayed_del_list, log_list)) {
6191                 struct btrfs_key key;
6192
6193                 btrfs_item_key_to_cpu(leaf, &key, slot);
6194                 if (key.objectid != ino ||
6195                     key.type != BTRFS_DIR_INDEX_KEY ||
6196                     key.offset != next->index)
6197                         break;
6198
6199                 slot++;
6200                 *last_ret = next;
6201                 next = list_next_entry(next, log_list);
6202         }
6203
6204         return btrfs_del_items(trans, inode->root->log_root, path,
6205                                path->slots[0], slot - path->slots[0]);
6206 }
6207
6208 static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
6209                                              struct btrfs_inode *inode,
6210                                              struct btrfs_path *path,
6211                                              const struct list_head *delayed_del_list,
6212                                              struct btrfs_log_ctx *ctx)
6213 {
6214         struct btrfs_root *log = inode->root->log_root;
6215         const struct btrfs_delayed_item *curr;
6216         u64 last_range_start = 0;
6217         u64 last_range_end = 0;
6218         struct btrfs_key key;
6219
6220         key.objectid = btrfs_ino(inode);
6221         key.type = BTRFS_DIR_INDEX_KEY;
6222         curr = list_first_entry(delayed_del_list, struct btrfs_delayed_item,
6223                                 log_list);
6224
6225         while (!list_entry_is_head(curr, delayed_del_list, log_list)) {
6226                 const struct btrfs_delayed_item *last = curr;
6227                 u64 first_dir_index = curr->index;
6228                 u64 last_dir_index;
6229                 bool deleted_items = false;
6230                 int ret;
6231
6232                 key.offset = curr->index;
6233                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
6234                 if (ret < 0) {
6235                         return ret;
6236                 } else if (ret == 0) {
6237                         ret = batch_delete_dir_index_items(trans, inode, path, ctx,
6238                                                            delayed_del_list, curr,
6239                                                            &last);
6240                         if (ret)
6241                                 return ret;
6242                         deleted_items = true;
6243                 }
6244
6245                 btrfs_release_path(path);
6246
6247                 /*
6248                  * If we deleted items from the leaf, it means we have a range
6249                  * item logging their range, so no need to add one or update an
6250                  * existing one. Otherwise we have to log a dir range item.
6251                  */
6252                 if (deleted_items)
6253                         goto next_batch;
6254
6255                 last_dir_index = last->index;
6256                 ASSERT(last_dir_index >= first_dir_index);
6257                 /*
6258                  * If this range starts right after where the previous one ends,
6259                  * then we want to reuse the previous range item and change its
6260                  * end offset to the end of this range. This is just to minimize
6261                  * leaf space usage, by avoiding adding a new range item.
6262                  */
6263                 if (last_range_end != 0 && first_dir_index == last_range_end + 1)
6264                         first_dir_index = last_range_start;
6265
6266                 ret = insert_dir_log_key(trans, log, path, key.objectid,
6267                                          first_dir_index, last_dir_index);
6268                 if (ret)
6269                         return ret;
6270
6271                 last_range_start = first_dir_index;
6272                 last_range_end = last_dir_index;
6273 next_batch:
6274                 curr = list_next_entry(last, log_list);
6275         }
6276
6277         return 0;
6278 }
6279
6280 static int log_delayed_deletion_items(struct btrfs_trans_handle *trans,
6281                                       struct btrfs_inode *inode,
6282                                       struct btrfs_path *path,
6283                                       const struct list_head *delayed_del_list,
6284                                       struct btrfs_log_ctx *ctx)
6285 {
6286         /*
6287          * We are deleting dir index items from the log tree or adding range
6288          * items to it.
6289          */
6290         lockdep_assert_held(&inode->log_mutex);
6291
6292         if (list_empty(delayed_del_list))
6293                 return 0;
6294
6295         if (ctx->logged_before)
6296                 return log_delayed_deletions_incremental(trans, inode, path,
6297                                                          delayed_del_list, ctx);
6298
6299         return log_delayed_deletions_full(trans, inode, path, delayed_del_list,
6300                                           ctx);
6301 }
6302
6303 /*
6304  * Similar logic as for log_new_dir_dentries(), but it iterates over the delayed
6305  * items instead of the subvolume tree.
6306  */
6307 static int log_new_delayed_dentries(struct btrfs_trans_handle *trans,
6308                                     struct btrfs_inode *inode,
6309                                     const struct list_head *delayed_ins_list,
6310                                     struct btrfs_log_ctx *ctx)
6311 {
6312         const bool orig_log_new_dentries = ctx->log_new_dentries;
6313         struct btrfs_fs_info *fs_info = trans->fs_info;
6314         struct btrfs_delayed_item *item;
6315         int ret = 0;
6316
6317         /*
6318          * No need for the log mutex, plus to avoid potential deadlocks or
6319          * lockdep annotations due to nesting of delayed inode mutexes and log
6320          * mutexes.
6321          */
6322         lockdep_assert_not_held(&inode->log_mutex);
6323
6324         ASSERT(!ctx->logging_new_delayed_dentries);
6325         ctx->logging_new_delayed_dentries = true;
6326
6327         list_for_each_entry(item, delayed_ins_list, log_list) {
6328                 struct btrfs_dir_item *dir_item;
6329                 struct inode *di_inode;
6330                 struct btrfs_key key;
6331                 int log_mode = LOG_INODE_EXISTS;
6332
6333                 dir_item = (struct btrfs_dir_item *)item->data;
6334                 btrfs_disk_key_to_cpu(&key, &dir_item->location);
6335
6336                 if (key.type == BTRFS_ROOT_ITEM_KEY)
6337                         continue;
6338
6339                 di_inode = btrfs_iget(fs_info->sb, key.objectid, inode->root);
6340                 if (IS_ERR(di_inode)) {
6341                         ret = PTR_ERR(di_inode);
6342                         break;
6343                 }
6344
6345                 if (!need_log_inode(trans, BTRFS_I(di_inode))) {
6346                         btrfs_add_delayed_iput(BTRFS_I(di_inode));
6347                         continue;
6348                 }
6349
6350                 if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR)
6351                         log_mode = LOG_INODE_ALL;
6352
6353                 ctx->log_new_dentries = false;
6354                 ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx);
6355
6356                 if (!ret && ctx->log_new_dentries)
6357                         ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx);
6358
6359                 btrfs_add_delayed_iput(BTRFS_I(di_inode));
6360
6361                 if (ret)
6362                         break;
6363         }
6364
6365         ctx->log_new_dentries = orig_log_new_dentries;
6366         ctx->logging_new_delayed_dentries = false;
6367
6368         return ret;
6369 }
6370
6371 /* log a single inode in the tree log.
6372  * At least one parent directory for this inode must exist in the tree
6373  * or be logged already.
6374  *
6375  * Any items from this inode changed by the current transaction are copied
6376  * to the log tree.  An extra reference is taken on any extents in this
6377  * file, allowing us to avoid a whole pile of corner cases around logging
6378  * blocks that have been removed from the tree.
6379  *
6380  * See LOG_INODE_ALL and related defines for a description of what inode_only
6381  * does.
6382  *
6383  * This handles both files and directories.
6384  */
6385 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
6386                            struct btrfs_inode *inode,
6387                            int inode_only,
6388                            struct btrfs_log_ctx *ctx)
6389 {
6390         struct btrfs_path *path;
6391         struct btrfs_path *dst_path;
6392         struct btrfs_key min_key;
6393         struct btrfs_key max_key;
6394         struct btrfs_root *log = inode->root->log_root;
6395         int ret;
6396         bool fast_search = false;
6397         u64 ino = btrfs_ino(inode);
6398         struct extent_map_tree *em_tree = &inode->extent_tree;
6399         u64 logged_isize = 0;
6400         bool need_log_inode_item = true;
6401         bool xattrs_logged = false;
6402         bool inode_item_dropped = true;
6403         bool full_dir_logging = false;
6404         LIST_HEAD(delayed_ins_list);
6405         LIST_HEAD(delayed_del_list);
6406
6407         path = btrfs_alloc_path();
6408         if (!path)
6409                 return -ENOMEM;
6410         dst_path = btrfs_alloc_path();
6411         if (!dst_path) {
6412                 btrfs_free_path(path);
6413                 return -ENOMEM;
6414         }
6415
6416         min_key.objectid = ino;
6417         min_key.type = BTRFS_INODE_ITEM_KEY;
6418         min_key.offset = 0;
6419
6420         max_key.objectid = ino;
6421
6422
6423         /* today the code can only do partial logging of directories */
6424         if (S_ISDIR(inode->vfs_inode.i_mode) ||
6425             (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6426                        &inode->runtime_flags) &&
6427              inode_only >= LOG_INODE_EXISTS))
6428                 max_key.type = BTRFS_XATTR_ITEM_KEY;
6429         else
6430                 max_key.type = (u8)-1;
6431         max_key.offset = (u64)-1;
6432
6433         if (S_ISDIR(inode->vfs_inode.i_mode) && inode_only == LOG_INODE_ALL)
6434                 full_dir_logging = true;
6435
6436         /*
6437          * If we are logging a directory while we are logging dentries of the
6438          * delayed items of some other inode, then we need to flush the delayed
6439          * items of this directory and not log the delayed items directly. This
6440          * is to prevent more than one level of recursion into btrfs_log_inode()
6441          * by having something like this:
6442          *
6443          *     $ mkdir -p a/b/c/d/e/f/g/h/...
6444          *     $ xfs_io -c "fsync" a
6445          *
6446          * Where all directories in the path did not exist before and are
6447          * created in the current transaction.
6448          * So in such a case we directly log the delayed items of the main
6449          * directory ("a") without flushing them first, while for each of its
6450          * subdirectories we flush their delayed items before logging them.
6451          * This prevents a potential unbounded recursion like this:
6452          *
6453          * btrfs_log_inode()
6454          *   log_new_delayed_dentries()
6455          *      btrfs_log_inode()
6456          *        log_new_delayed_dentries()
6457          *          btrfs_log_inode()
6458          *            log_new_delayed_dentries()
6459          *              (...)
6460          *
6461          * We have thresholds for the maximum number of delayed items to have in
6462          * memory, and once they are hit, the items are flushed asynchronously.
6463          * However the limit is quite high, so lets prevent deep levels of
6464          * recursion to happen by limiting the maximum depth to be 1.
6465          */
6466         if (full_dir_logging && ctx->logging_new_delayed_dentries) {
6467                 ret = btrfs_commit_inode_delayed_items(trans, inode);
6468                 if (ret)
6469                         goto out;
6470         }
6471
6472         mutex_lock(&inode->log_mutex);
6473
6474         /*
6475          * For symlinks, we must always log their content, which is stored in an
6476          * inline extent, otherwise we could end up with an empty symlink after
6477          * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
6478          * one attempts to create an empty symlink).
6479          * We don't need to worry about flushing delalloc, because when we create
6480          * the inline extent when the symlink is created (we never have delalloc
6481          * for symlinks).
6482          */
6483         if (S_ISLNK(inode->vfs_inode.i_mode))
6484                 inode_only = LOG_INODE_ALL;
6485
6486         /*
6487          * Before logging the inode item, cache the value returned by
6488          * inode_logged(), because after that we have the need to figure out if
6489          * the inode was previously logged in this transaction.
6490          */
6491         ret = inode_logged(trans, inode, path);
6492         if (ret < 0)
6493                 goto out_unlock;
6494         ctx->logged_before = (ret == 1);
6495         ret = 0;
6496
6497         /*
6498          * This is for cases where logging a directory could result in losing a
6499          * a file after replaying the log. For example, if we move a file from a
6500          * directory A to a directory B, then fsync directory A, we have no way
6501          * to known the file was moved from A to B, so logging just A would
6502          * result in losing the file after a log replay.
6503          */
6504         if (full_dir_logging && inode->last_unlink_trans >= trans->transid) {
6505                 ret = BTRFS_LOG_FORCE_COMMIT;
6506                 goto out_unlock;
6507         }
6508
6509         /*
6510          * a brute force approach to making sure we get the most uptodate
6511          * copies of everything.
6512          */
6513         if (S_ISDIR(inode->vfs_inode.i_mode)) {
6514                 clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags);
6515                 if (ctx->logged_before)
6516                         ret = drop_inode_items(trans, log, path, inode,
6517                                                BTRFS_XATTR_ITEM_KEY);
6518         } else {
6519                 if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) {
6520                         /*
6521                          * Make sure the new inode item we write to the log has
6522                          * the same isize as the current one (if it exists).
6523                          * This is necessary to prevent data loss after log
6524                          * replay, and also to prevent doing a wrong expanding
6525                          * truncate - for e.g. create file, write 4K into offset
6526                          * 0, fsync, write 4K into offset 4096, add hard link,
6527                          * fsync some other file (to sync log), power fail - if
6528                          * we use the inode's current i_size, after log replay
6529                          * we get a 8Kb file, with the last 4Kb extent as a hole
6530                          * (zeroes), as if an expanding truncate happened,
6531                          * instead of getting a file of 4Kb only.
6532                          */
6533                         ret = logged_inode_size(log, inode, path, &logged_isize);
6534                         if (ret)
6535                                 goto out_unlock;
6536                 }
6537                 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6538                              &inode->runtime_flags)) {
6539                         if (inode_only == LOG_INODE_EXISTS) {
6540                                 max_key.type = BTRFS_XATTR_ITEM_KEY;
6541                                 if (ctx->logged_before)
6542                                         ret = drop_inode_items(trans, log, path,
6543                                                                inode, max_key.type);
6544                         } else {
6545                                 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
6546                                           &inode->runtime_flags);
6547                                 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6548                                           &inode->runtime_flags);
6549                                 if (ctx->logged_before)
6550                                         ret = truncate_inode_items(trans, log,
6551                                                                    inode, 0, 0);
6552                         }
6553                 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
6554                                               &inode->runtime_flags) ||
6555                            inode_only == LOG_INODE_EXISTS) {
6556                         if (inode_only == LOG_INODE_ALL)
6557                                 fast_search = true;
6558                         max_key.type = BTRFS_XATTR_ITEM_KEY;
6559                         if (ctx->logged_before)
6560                                 ret = drop_inode_items(trans, log, path, inode,
6561                                                        max_key.type);
6562                 } else {
6563                         if (inode_only == LOG_INODE_ALL)
6564                                 fast_search = true;
6565                         inode_item_dropped = false;
6566                         goto log_extents;
6567                 }
6568
6569         }
6570         if (ret)
6571                 goto out_unlock;
6572
6573         /*
6574          * If we are logging a directory in full mode, collect the delayed items
6575          * before iterating the subvolume tree, so that we don't miss any new
6576          * dir index items in case they get flushed while or right after we are
6577          * iterating the subvolume tree.
6578          */
6579         if (full_dir_logging && !ctx->logging_new_delayed_dentries)
6580                 btrfs_log_get_delayed_items(inode, &delayed_ins_list,
6581                                             &delayed_del_list);
6582
6583         ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
6584                                       path, dst_path, logged_isize,
6585                                       inode_only, ctx,
6586                                       &need_log_inode_item);
6587         if (ret)
6588                 goto out_unlock;
6589
6590         btrfs_release_path(path);
6591         btrfs_release_path(dst_path);
6592         ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
6593         if (ret)
6594                 goto out_unlock;
6595         xattrs_logged = true;
6596         if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
6597                 btrfs_release_path(path);
6598                 btrfs_release_path(dst_path);
6599                 ret = btrfs_log_holes(trans, inode, path);
6600                 if (ret)
6601                         goto out_unlock;
6602         }
6603 log_extents:
6604         btrfs_release_path(path);
6605         btrfs_release_path(dst_path);
6606         if (need_log_inode_item) {
6607                 ret = log_inode_item(trans, log, dst_path, inode, inode_item_dropped);
6608                 if (ret)
6609                         goto out_unlock;
6610                 /*
6611                  * If we are doing a fast fsync and the inode was logged before
6612                  * in this transaction, we don't need to log the xattrs because
6613                  * they were logged before. If xattrs were added, changed or
6614                  * deleted since the last time we logged the inode, then we have
6615                  * already logged them because the inode had the runtime flag
6616                  * BTRFS_INODE_COPY_EVERYTHING set.
6617                  */
6618                 if (!xattrs_logged && inode->logged_trans < trans->transid) {
6619                         ret = btrfs_log_all_xattrs(trans, inode, path, dst_path, ctx);
6620                         if (ret)
6621                                 goto out_unlock;
6622                         btrfs_release_path(path);
6623                 }
6624         }
6625         if (fast_search) {
6626                 ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx);
6627                 if (ret)
6628                         goto out_unlock;
6629         } else if (inode_only == LOG_INODE_ALL) {
6630                 struct extent_map *em, *n;
6631
6632                 write_lock(&em_tree->lock);
6633                 list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
6634                         list_del_init(&em->list);
6635                 write_unlock(&em_tree->lock);
6636         }
6637
6638         if (full_dir_logging) {
6639                 ret = log_directory_changes(trans, inode, path, dst_path, ctx);
6640                 if (ret)
6641                         goto out_unlock;
6642                 ret = log_delayed_insertion_items(trans, inode, path,
6643                                                   &delayed_ins_list, ctx);
6644                 if (ret)
6645                         goto out_unlock;
6646                 ret = log_delayed_deletion_items(trans, inode, path,
6647                                                  &delayed_del_list, ctx);
6648                 if (ret)
6649                         goto out_unlock;
6650         }
6651
6652         spin_lock(&inode->lock);
6653         inode->logged_trans = trans->transid;
6654         /*
6655          * Don't update last_log_commit if we logged that an inode exists.
6656          * We do this for three reasons:
6657          *
6658          * 1) We might have had buffered writes to this inode that were
6659          *    flushed and had their ordered extents completed in this
6660          *    transaction, but we did not previously log the inode with
6661          *    LOG_INODE_ALL. Later the inode was evicted and after that
6662          *    it was loaded again and this LOG_INODE_EXISTS log operation
6663          *    happened. We must make sure that if an explicit fsync against
6664          *    the inode is performed later, it logs the new extents, an
6665          *    updated inode item, etc, and syncs the log. The same logic
6666          *    applies to direct IO writes instead of buffered writes.
6667          *
6668          * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
6669          *    is logged with an i_size of 0 or whatever value was logged
6670          *    before. If later the i_size of the inode is increased by a
6671          *    truncate operation, the log is synced through an fsync of
6672          *    some other inode and then finally an explicit fsync against
6673          *    this inode is made, we must make sure this fsync logs the
6674          *    inode with the new i_size, the hole between old i_size and
6675          *    the new i_size, and syncs the log.
6676          *
6677          * 3) If we are logging that an ancestor inode exists as part of
6678          *    logging a new name from a link or rename operation, don't update
6679          *    its last_log_commit - otherwise if an explicit fsync is made
6680          *    against an ancestor, the fsync considers the inode in the log
6681          *    and doesn't sync the log, resulting in the ancestor missing after
6682          *    a power failure unless the log was synced as part of an fsync
6683          *    against any other unrelated inode.
6684          */
6685         if (inode_only != LOG_INODE_EXISTS)
6686                 inode->last_log_commit = inode->last_sub_trans;
6687         spin_unlock(&inode->lock);
6688
6689         /*
6690          * Reset the last_reflink_trans so that the next fsync does not need to
6691          * go through the slower path when logging extents and their checksums.
6692          */
6693         if (inode_only == LOG_INODE_ALL)
6694                 inode->last_reflink_trans = 0;
6695
6696 out_unlock:
6697         mutex_unlock(&inode->log_mutex);
6698 out:
6699         btrfs_free_path(path);
6700         btrfs_free_path(dst_path);
6701
6702         if (ret)
6703                 free_conflicting_inodes(ctx);
6704         else
6705                 ret = log_conflicting_inodes(trans, inode->root, ctx);
6706
6707         if (full_dir_logging && !ctx->logging_new_delayed_dentries) {
6708                 if (!ret)
6709                         ret = log_new_delayed_dentries(trans, inode,
6710                                                        &delayed_ins_list, ctx);
6711
6712                 btrfs_log_put_delayed_items(inode, &delayed_ins_list,
6713                                             &delayed_del_list);
6714         }
6715
6716         return ret;
6717 }
6718
6719 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
6720                                  struct btrfs_inode *inode,
6721                                  struct btrfs_log_ctx *ctx)
6722 {
6723         struct btrfs_fs_info *fs_info = trans->fs_info;
6724         int ret;
6725         struct btrfs_path *path;
6726         struct btrfs_key key;
6727         struct btrfs_root *root = inode->root;
6728         const u64 ino = btrfs_ino(inode);
6729
6730         path = btrfs_alloc_path();
6731         if (!path)
6732                 return -ENOMEM;
6733         path->skip_locking = 1;
6734         path->search_commit_root = 1;
6735
6736         key.objectid = ino;
6737         key.type = BTRFS_INODE_REF_KEY;
6738         key.offset = 0;
6739         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6740         if (ret < 0)
6741                 goto out;
6742
6743         while (true) {
6744                 struct extent_buffer *leaf = path->nodes[0];
6745                 int slot = path->slots[0];
6746                 u32 cur_offset = 0;
6747                 u32 item_size;
6748                 unsigned long ptr;
6749
6750                 if (slot >= btrfs_header_nritems(leaf)) {
6751                         ret = btrfs_next_leaf(root, path);
6752                         if (ret < 0)
6753                                 goto out;
6754                         else if (ret > 0)
6755                                 break;
6756                         continue;
6757                 }
6758
6759                 btrfs_item_key_to_cpu(leaf, &key, slot);
6760                 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
6761                 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
6762                         break;
6763
6764                 item_size = btrfs_item_size(leaf, slot);
6765                 ptr = btrfs_item_ptr_offset(leaf, slot);
6766                 while (cur_offset < item_size) {
6767                         struct btrfs_key inode_key;
6768                         struct inode *dir_inode;
6769
6770                         inode_key.type = BTRFS_INODE_ITEM_KEY;
6771                         inode_key.offset = 0;
6772
6773                         if (key.type == BTRFS_INODE_EXTREF_KEY) {
6774                                 struct btrfs_inode_extref *extref;
6775
6776                                 extref = (struct btrfs_inode_extref *)
6777                                         (ptr + cur_offset);
6778                                 inode_key.objectid = btrfs_inode_extref_parent(
6779                                         leaf, extref);
6780                                 cur_offset += sizeof(*extref);
6781                                 cur_offset += btrfs_inode_extref_name_len(leaf,
6782                                         extref);
6783                         } else {
6784                                 inode_key.objectid = key.offset;
6785                                 cur_offset = item_size;
6786                         }
6787
6788                         dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
6789                                                root);
6790                         /*
6791                          * If the parent inode was deleted, return an error to
6792                          * fallback to a transaction commit. This is to prevent
6793                          * getting an inode that was moved from one parent A to
6794                          * a parent B, got its former parent A deleted and then
6795                          * it got fsync'ed, from existing at both parents after
6796                          * a log replay (and the old parent still existing).
6797                          * Example:
6798                          *
6799                          * mkdir /mnt/A
6800                          * mkdir /mnt/B
6801                          * touch /mnt/B/bar
6802                          * sync
6803                          * mv /mnt/B/bar /mnt/A/bar
6804                          * mv -T /mnt/A /mnt/B
6805                          * fsync /mnt/B/bar
6806                          * <power fail>
6807                          *
6808                          * If we ignore the old parent B which got deleted,
6809                          * after a log replay we would have file bar linked
6810                          * at both parents and the old parent B would still
6811                          * exist.
6812                          */
6813                         if (IS_ERR(dir_inode)) {
6814                                 ret = PTR_ERR(dir_inode);
6815                                 goto out;
6816                         }
6817
6818                         if (!need_log_inode(trans, BTRFS_I(dir_inode))) {
6819                                 btrfs_add_delayed_iput(BTRFS_I(dir_inode));
6820                                 continue;
6821                         }
6822
6823                         ctx->log_new_dentries = false;
6824                         ret = btrfs_log_inode(trans, BTRFS_I(dir_inode),
6825                                               LOG_INODE_ALL, ctx);
6826                         if (!ret && ctx->log_new_dentries)
6827                                 ret = log_new_dir_dentries(trans,
6828                                                    BTRFS_I(dir_inode), ctx);
6829                         btrfs_add_delayed_iput(BTRFS_I(dir_inode));
6830                         if (ret)
6831                                 goto out;
6832                 }
6833                 path->slots[0]++;
6834         }
6835         ret = 0;
6836 out:
6837         btrfs_free_path(path);
6838         return ret;
6839 }
6840
6841 static int log_new_ancestors(struct btrfs_trans_handle *trans,
6842                              struct btrfs_root *root,
6843                              struct btrfs_path *path,
6844                              struct btrfs_log_ctx *ctx)
6845 {
6846         struct btrfs_key found_key;
6847
6848         btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
6849
6850         while (true) {
6851                 struct btrfs_fs_info *fs_info = root->fs_info;
6852                 struct extent_buffer *leaf;
6853                 int slot;
6854                 struct btrfs_key search_key;
6855                 struct inode *inode;
6856                 u64 ino;
6857                 int ret = 0;
6858
6859                 btrfs_release_path(path);
6860
6861                 ino = found_key.offset;
6862
6863                 search_key.objectid = found_key.offset;
6864                 search_key.type = BTRFS_INODE_ITEM_KEY;
6865                 search_key.offset = 0;
6866                 inode = btrfs_iget(fs_info->sb, ino, root);
6867                 if (IS_ERR(inode))
6868                         return PTR_ERR(inode);
6869
6870                 if (BTRFS_I(inode)->generation >= trans->transid &&
6871                     need_log_inode(trans, BTRFS_I(inode)))
6872                         ret = btrfs_log_inode(trans, BTRFS_I(inode),
6873                                               LOG_INODE_EXISTS, ctx);
6874                 btrfs_add_delayed_iput(BTRFS_I(inode));
6875                 if (ret)
6876                         return ret;
6877
6878                 if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
6879                         break;
6880
6881                 search_key.type = BTRFS_INODE_REF_KEY;
6882                 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6883                 if (ret < 0)
6884                         return ret;
6885
6886                 leaf = path->nodes[0];
6887                 slot = path->slots[0];
6888                 if (slot >= btrfs_header_nritems(leaf)) {
6889                         ret = btrfs_next_leaf(root, path);
6890                         if (ret < 0)
6891                                 return ret;
6892                         else if (ret > 0)
6893                                 return -ENOENT;
6894                         leaf = path->nodes[0];
6895                         slot = path->slots[0];
6896                 }
6897
6898                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6899                 if (found_key.objectid != search_key.objectid ||
6900                     found_key.type != BTRFS_INODE_REF_KEY)
6901                         return -ENOENT;
6902         }
6903         return 0;
6904 }
6905
6906 static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
6907                                   struct btrfs_inode *inode,
6908                                   struct dentry *parent,
6909                                   struct btrfs_log_ctx *ctx)
6910 {
6911         struct btrfs_root *root = inode->root;
6912         struct dentry *old_parent = NULL;
6913         struct super_block *sb = inode->vfs_inode.i_sb;
6914         int ret = 0;
6915
6916         while (true) {
6917                 if (!parent || d_really_is_negative(parent) ||
6918                     sb != parent->d_sb)
6919                         break;
6920
6921                 inode = BTRFS_I(d_inode(parent));
6922                 if (root != inode->root)
6923                         break;
6924
6925                 if (inode->generation >= trans->transid &&
6926                     need_log_inode(trans, inode)) {
6927                         ret = btrfs_log_inode(trans, inode,
6928                                               LOG_INODE_EXISTS, ctx);
6929                         if (ret)
6930                                 break;
6931                 }
6932                 if (IS_ROOT(parent))
6933                         break;
6934
6935                 parent = dget_parent(parent);
6936                 dput(old_parent);
6937                 old_parent = parent;
6938         }
6939         dput(old_parent);
6940
6941         return ret;
6942 }
6943
6944 static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6945                                  struct btrfs_inode *inode,
6946                                  struct dentry *parent,
6947                                  struct btrfs_log_ctx *ctx)
6948 {
6949         struct btrfs_root *root = inode->root;
6950         const u64 ino = btrfs_ino(inode);
6951         struct btrfs_path *path;
6952         struct btrfs_key search_key;
6953         int ret;
6954
6955         /*
6956          * For a single hard link case, go through a fast path that does not
6957          * need to iterate the fs/subvolume tree.
6958          */
6959         if (inode->vfs_inode.i_nlink < 2)
6960                 return log_new_ancestors_fast(trans, inode, parent, ctx);
6961
6962         path = btrfs_alloc_path();
6963         if (!path)
6964                 return -ENOMEM;
6965
6966         search_key.objectid = ino;
6967         search_key.type = BTRFS_INODE_REF_KEY;
6968         search_key.offset = 0;
6969 again:
6970         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6971         if (ret < 0)
6972                 goto out;
6973         if (ret == 0)
6974                 path->slots[0]++;
6975
6976         while (true) {
6977                 struct extent_buffer *leaf = path->nodes[0];
6978                 int slot = path->slots[0];
6979                 struct btrfs_key found_key;
6980
6981                 if (slot >= btrfs_header_nritems(leaf)) {
6982                         ret = btrfs_next_leaf(root, path);
6983                         if (ret < 0)
6984                                 goto out;
6985                         else if (ret > 0)
6986                                 break;
6987                         continue;
6988                 }
6989
6990                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
6991                 if (found_key.objectid != ino ||
6992                     found_key.type > BTRFS_INODE_EXTREF_KEY)
6993                         break;
6994
6995                 /*
6996                  * Don't deal with extended references because they are rare
6997                  * cases and too complex to deal with (we would need to keep
6998                  * track of which subitem we are processing for each item in
6999                  * this loop, etc). So just return some error to fallback to
7000                  * a transaction commit.
7001                  */
7002                 if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
7003                         ret = -EMLINK;
7004                         goto out;
7005                 }
7006
7007                 /*
7008                  * Logging ancestors needs to do more searches on the fs/subvol
7009                  * tree, so it releases the path as needed to avoid deadlocks.
7010                  * Keep track of the last inode ref key and resume from that key
7011                  * after logging all new ancestors for the current hard link.
7012                  */
7013                 memcpy(&search_key, &found_key, sizeof(search_key));
7014
7015                 ret = log_new_ancestors(trans, root, path, ctx);
7016                 if (ret)
7017                         goto out;
7018                 btrfs_release_path(path);
7019                 goto again;
7020         }
7021         ret = 0;
7022 out:
7023         btrfs_free_path(path);
7024         return ret;
7025 }
7026
7027 /*
7028  * helper function around btrfs_log_inode to make sure newly created
7029  * parent directories also end up in the log.  A minimal inode and backref
7030  * only logging is done of any parent directories that are older than
7031  * the last committed transaction
7032  */
7033 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
7034                                   struct btrfs_inode *inode,
7035                                   struct dentry *parent,
7036                                   int inode_only,
7037                                   struct btrfs_log_ctx *ctx)
7038 {
7039         struct btrfs_root *root = inode->root;
7040         struct btrfs_fs_info *fs_info = root->fs_info;
7041         int ret = 0;
7042         bool log_dentries = false;
7043
7044         if (btrfs_test_opt(fs_info, NOTREELOG)) {
7045                 ret = BTRFS_LOG_FORCE_COMMIT;
7046                 goto end_no_trans;
7047         }
7048
7049         if (btrfs_root_refs(&root->root_item) == 0) {
7050                 ret = BTRFS_LOG_FORCE_COMMIT;
7051                 goto end_no_trans;
7052         }
7053
7054         /*
7055          * Skip already logged inodes or inodes corresponding to tmpfiles
7056          * (since logging them is pointless, a link count of 0 means they
7057          * will never be accessible).
7058          */
7059         if ((btrfs_inode_in_log(inode, trans->transid) &&
7060              list_empty(&ctx->ordered_extents)) ||
7061             inode->vfs_inode.i_nlink == 0) {
7062                 ret = BTRFS_NO_LOG_SYNC;
7063                 goto end_no_trans;
7064         }
7065
7066         ret = start_log_trans(trans, root, ctx);
7067         if (ret)
7068                 goto end_no_trans;
7069
7070         ret = btrfs_log_inode(trans, inode, inode_only, ctx);
7071         if (ret)
7072                 goto end_trans;
7073
7074         /*
7075          * for regular files, if its inode is already on disk, we don't
7076          * have to worry about the parents at all.  This is because
7077          * we can use the last_unlink_trans field to record renames
7078          * and other fun in this file.
7079          */
7080         if (S_ISREG(inode->vfs_inode.i_mode) &&
7081             inode->generation < trans->transid &&
7082             inode->last_unlink_trans < trans->transid) {
7083                 ret = 0;
7084                 goto end_trans;
7085         }
7086
7087         if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries)
7088                 log_dentries = true;
7089
7090         /*
7091          * On unlink we must make sure all our current and old parent directory
7092          * inodes are fully logged. This is to prevent leaving dangling
7093          * directory index entries in directories that were our parents but are
7094          * not anymore. Not doing this results in old parent directory being
7095          * impossible to delete after log replay (rmdir will always fail with
7096          * error -ENOTEMPTY).
7097          *
7098          * Example 1:
7099          *
7100          * mkdir testdir
7101          * touch testdir/foo
7102          * ln testdir/foo testdir/bar
7103          * sync
7104          * unlink testdir/bar
7105          * xfs_io -c fsync testdir/foo
7106          * <power failure>
7107          * mount fs, triggers log replay
7108          *
7109          * If we don't log the parent directory (testdir), after log replay the
7110          * directory still has an entry pointing to the file inode using the bar
7111          * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
7112          * the file inode has a link count of 1.
7113          *
7114          * Example 2:
7115          *
7116          * mkdir testdir
7117          * touch foo
7118          * ln foo testdir/foo2
7119          * ln foo testdir/foo3
7120          * sync
7121          * unlink testdir/foo3
7122          * xfs_io -c fsync foo
7123          * <power failure>
7124          * mount fs, triggers log replay
7125          *
7126          * Similar as the first example, after log replay the parent directory
7127          * testdir still has an entry pointing to the inode file with name foo3
7128          * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
7129          * and has a link count of 2.
7130          */
7131         if (inode->last_unlink_trans >= trans->transid) {
7132                 ret = btrfs_log_all_parents(trans, inode, ctx);
7133                 if (ret)
7134                         goto end_trans;
7135         }
7136
7137         ret = log_all_new_ancestors(trans, inode, parent, ctx);
7138         if (ret)
7139                 goto end_trans;
7140
7141         if (log_dentries)
7142                 ret = log_new_dir_dentries(trans, inode, ctx);
7143         else
7144                 ret = 0;
7145 end_trans:
7146         if (ret < 0) {
7147                 btrfs_set_log_full_commit(trans);
7148                 ret = BTRFS_LOG_FORCE_COMMIT;
7149         }
7150
7151         if (ret)
7152                 btrfs_remove_log_ctx(root, ctx);
7153         btrfs_end_log_trans(root);
7154 end_no_trans:
7155         return ret;
7156 }
7157
7158 /*
7159  * it is not safe to log dentry if the chunk root has added new
7160  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
7161  * If this returns 1, you must commit the transaction to safely get your
7162  * data on disk.
7163  */
7164 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
7165                           struct dentry *dentry,
7166                           struct btrfs_log_ctx *ctx)
7167 {
7168         struct dentry *parent = dget_parent(dentry);
7169         int ret;
7170
7171         ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
7172                                      LOG_INODE_ALL, ctx);
7173         dput(parent);
7174
7175         return ret;
7176 }
7177
7178 /*
7179  * should be called during mount to recover any replay any log trees
7180  * from the FS
7181  */
7182 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
7183 {
7184         int ret;
7185         struct btrfs_path *path;
7186         struct btrfs_trans_handle *trans;
7187         struct btrfs_key key;
7188         struct btrfs_key found_key;
7189         struct btrfs_root *log;
7190         struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
7191         struct walk_control wc = {
7192                 .process_func = process_one_buffer,
7193                 .stage = LOG_WALK_PIN_ONLY,
7194         };
7195
7196         path = btrfs_alloc_path();
7197         if (!path)
7198                 return -ENOMEM;
7199
7200         set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7201
7202         trans = btrfs_start_transaction(fs_info->tree_root, 0);
7203         if (IS_ERR(trans)) {
7204                 ret = PTR_ERR(trans);
7205                 goto error;
7206         }
7207
7208         wc.trans = trans;
7209         wc.pin = 1;
7210
7211         ret = walk_log_tree(trans, log_root_tree, &wc);
7212         if (ret) {
7213                 btrfs_abort_transaction(trans, ret);
7214                 goto error;
7215         }
7216
7217 again:
7218         key.objectid = BTRFS_TREE_LOG_OBJECTID;
7219         key.offset = (u64)-1;
7220         key.type = BTRFS_ROOT_ITEM_KEY;
7221
7222         while (1) {
7223                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
7224
7225                 if (ret < 0) {
7226                         btrfs_abort_transaction(trans, ret);
7227                         goto error;
7228                 }
7229                 if (ret > 0) {
7230                         if (path->slots[0] == 0)
7231                                 break;
7232                         path->slots[0]--;
7233                 }
7234                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
7235                                       path->slots[0]);
7236                 btrfs_release_path(path);
7237                 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
7238                         break;
7239
7240                 log = btrfs_read_tree_root(log_root_tree, &found_key);
7241                 if (IS_ERR(log)) {
7242                         ret = PTR_ERR(log);
7243                         btrfs_abort_transaction(trans, ret);
7244                         goto error;
7245                 }
7246
7247                 wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
7248                                                    true);
7249                 if (IS_ERR(wc.replay_dest)) {
7250                         ret = PTR_ERR(wc.replay_dest);
7251
7252                         /*
7253                          * We didn't find the subvol, likely because it was
7254                          * deleted.  This is ok, simply skip this log and go to
7255                          * the next one.
7256                          *
7257                          * We need to exclude the root because we can't have
7258                          * other log replays overwriting this log as we'll read
7259                          * it back in a few more times.  This will keep our
7260                          * block from being modified, and we'll just bail for
7261                          * each subsequent pass.
7262                          */
7263                         if (ret == -ENOENT)
7264                                 ret = btrfs_pin_extent_for_log_replay(trans, log->node);
7265                         btrfs_put_root(log);
7266
7267                         if (!ret)
7268                                 goto next;
7269                         btrfs_abort_transaction(trans, ret);
7270                         goto error;
7271                 }
7272
7273                 wc.replay_dest->log_root = log;
7274                 ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
7275                 if (ret)
7276                         /* The loop needs to continue due to the root refs */
7277                         btrfs_abort_transaction(trans, ret);
7278                 else
7279                         ret = walk_log_tree(trans, log, &wc);
7280
7281                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
7282                         ret = fixup_inode_link_counts(trans, wc.replay_dest,
7283                                                       path);
7284                         if (ret)
7285                                 btrfs_abort_transaction(trans, ret);
7286                 }
7287
7288                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
7289                         struct btrfs_root *root = wc.replay_dest;
7290
7291                         btrfs_release_path(path);
7292
7293                         /*
7294                          * We have just replayed everything, and the highest
7295                          * objectid of fs roots probably has changed in case
7296                          * some inode_item's got replayed.
7297                          *
7298                          * root->objectid_mutex is not acquired as log replay
7299                          * could only happen during mount.
7300                          */
7301                         ret = btrfs_init_root_free_objectid(root);
7302                         if (ret)
7303                                 btrfs_abort_transaction(trans, ret);
7304                 }
7305
7306                 wc.replay_dest->log_root = NULL;
7307                 btrfs_put_root(wc.replay_dest);
7308                 btrfs_put_root(log);
7309
7310                 if (ret)
7311                         goto error;
7312 next:
7313                 if (found_key.offset == 0)
7314                         break;
7315                 key.offset = found_key.offset - 1;
7316         }
7317         btrfs_release_path(path);
7318
7319         /* step one is to pin it all, step two is to replay just inodes */
7320         if (wc.pin) {
7321                 wc.pin = 0;
7322                 wc.process_func = replay_one_buffer;
7323                 wc.stage = LOG_WALK_REPLAY_INODES;
7324                 goto again;
7325         }
7326         /* step three is to replay everything */
7327         if (wc.stage < LOG_WALK_REPLAY_ALL) {
7328                 wc.stage++;
7329                 goto again;
7330         }
7331
7332         btrfs_free_path(path);
7333
7334         /* step 4: commit the transaction, which also unpins the blocks */
7335         ret = btrfs_commit_transaction(trans);
7336         if (ret)
7337                 return ret;
7338
7339         log_root_tree->log_root = NULL;
7340         clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7341         btrfs_put_root(log_root_tree);
7342
7343         return 0;
7344 error:
7345         if (wc.trans)
7346                 btrfs_end_transaction(wc.trans);
7347         clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
7348         btrfs_free_path(path);
7349         return ret;
7350 }
7351
7352 /*
7353  * there are some corner cases where we want to force a full
7354  * commit instead of allowing a directory to be logged.
7355  *
7356  * They revolve around files there were unlinked from the directory, and
7357  * this function updates the parent directory so that a full commit is
7358  * properly done if it is fsync'd later after the unlinks are done.
7359  *
7360  * Must be called before the unlink operations (updates to the subvolume tree,
7361  * inodes, etc) are done.
7362  */
7363 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
7364                              struct btrfs_inode *dir, struct btrfs_inode *inode,
7365                              bool for_rename)
7366 {
7367         /*
7368          * when we're logging a file, if it hasn't been renamed
7369          * or unlinked, and its inode is fully committed on disk,
7370          * we don't have to worry about walking up the directory chain
7371          * to log its parents.
7372          *
7373          * So, we use the last_unlink_trans field to put this transid
7374          * into the file.  When the file is logged we check it and
7375          * don't log the parents if the file is fully on disk.
7376          */
7377         mutex_lock(&inode->log_mutex);
7378         inode->last_unlink_trans = trans->transid;
7379         mutex_unlock(&inode->log_mutex);
7380
7381         if (!for_rename)
7382                 return;
7383
7384         /*
7385          * If this directory was already logged, any new names will be logged
7386          * with btrfs_log_new_name() and old names will be deleted from the log
7387          * tree with btrfs_del_dir_entries_in_log() or with
7388          * btrfs_del_inode_ref_in_log().
7389          */
7390         if (inode_logged(trans, dir, NULL) == 1)
7391                 return;
7392
7393         /*
7394          * If the inode we're about to unlink was logged before, the log will be
7395          * properly updated with the new name with btrfs_log_new_name() and the
7396          * old name removed with btrfs_del_dir_entries_in_log() or with
7397          * btrfs_del_inode_ref_in_log().
7398          */
7399         if (inode_logged(trans, inode, NULL) == 1)
7400                 return;
7401
7402         /*
7403          * when renaming files across directories, if the directory
7404          * there we're unlinking from gets fsync'd later on, there's
7405          * no way to find the destination directory later and fsync it
7406          * properly.  So, we have to be conservative and force commits
7407          * so the new name gets discovered.
7408          */
7409         mutex_lock(&dir->log_mutex);
7410         dir->last_unlink_trans = trans->transid;
7411         mutex_unlock(&dir->log_mutex);
7412 }
7413
7414 /*
7415  * Make sure that if someone attempts to fsync the parent directory of a deleted
7416  * snapshot, it ends up triggering a transaction commit. This is to guarantee
7417  * that after replaying the log tree of the parent directory's root we will not
7418  * see the snapshot anymore and at log replay time we will not see any log tree
7419  * corresponding to the deleted snapshot's root, which could lead to replaying
7420  * it after replaying the log tree of the parent directory (which would replay
7421  * the snapshot delete operation).
7422  *
7423  * Must be called before the actual snapshot destroy operation (updates to the
7424  * parent root and tree of tree roots trees, etc) are done.
7425  */
7426 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
7427                                    struct btrfs_inode *dir)
7428 {
7429         mutex_lock(&dir->log_mutex);
7430         dir->last_unlink_trans = trans->transid;
7431         mutex_unlock(&dir->log_mutex);
7432 }
7433
7434 /*
7435  * Update the log after adding a new name for an inode.
7436  *
7437  * @trans:              Transaction handle.
7438  * @old_dentry:         The dentry associated with the old name and the old
7439  *                      parent directory.
7440  * @old_dir:            The inode of the previous parent directory for the case
7441  *                      of a rename. For a link operation, it must be NULL.
7442  * @old_dir_index:      The index number associated with the old name, meaningful
7443  *                      only for rename operations (when @old_dir is not NULL).
7444  *                      Ignored for link operations.
7445  * @parent:             The dentry associated with the directory under which the
7446  *                      new name is located.
7447  *
7448  * Call this after adding a new name for an inode, as a result of a link or
7449  * rename operation, and it will properly update the log to reflect the new name.
7450  */
7451 void btrfs_log_new_name(struct btrfs_trans_handle *trans,
7452                         struct dentry *old_dentry, struct btrfs_inode *old_dir,
7453                         u64 old_dir_index, struct dentry *parent)
7454 {
7455         struct btrfs_inode *inode = BTRFS_I(d_inode(old_dentry));
7456         struct btrfs_root *root = inode->root;
7457         struct btrfs_log_ctx ctx;
7458         bool log_pinned = false;
7459         int ret;
7460
7461         /*
7462          * this will force the logging code to walk the dentry chain
7463          * up for the file
7464          */
7465         if (!S_ISDIR(inode->vfs_inode.i_mode))
7466                 inode->last_unlink_trans = trans->transid;
7467
7468         /*
7469          * if this inode hasn't been logged and directory we're renaming it
7470          * from hasn't been logged, we don't need to log it
7471          */
7472         ret = inode_logged(trans, inode, NULL);
7473         if (ret < 0) {
7474                 goto out;
7475         } else if (ret == 0) {
7476                 if (!old_dir)
7477                         return;
7478                 /*
7479                  * If the inode was not logged and we are doing a rename (old_dir is not
7480                  * NULL), check if old_dir was logged - if it was not we can return and
7481                  * do nothing.
7482                  */
7483                 ret = inode_logged(trans, old_dir, NULL);
7484                 if (ret < 0)
7485                         goto out;
7486                 else if (ret == 0)
7487                         return;
7488         }
7489         ret = 0;
7490
7491         /*
7492          * If we are doing a rename (old_dir is not NULL) from a directory that
7493          * was previously logged, make sure that on log replay we get the old
7494          * dir entry deleted. This is needed because we will also log the new
7495          * name of the renamed inode, so we need to make sure that after log
7496          * replay we don't end up with both the new and old dir entries existing.
7497          */
7498         if (old_dir && old_dir->logged_trans == trans->transid) {
7499                 struct btrfs_root *log = old_dir->root->log_root;
7500                 struct btrfs_path *path;
7501                 struct fscrypt_name fname;
7502
7503                 ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX);
7504
7505                 ret = fscrypt_setup_filename(&old_dir->vfs_inode,
7506                                              &old_dentry->d_name, 0, &fname);
7507                 if (ret)
7508                         goto out;
7509                 /*
7510                  * We have two inodes to update in the log, the old directory and
7511                  * the inode that got renamed, so we must pin the log to prevent
7512                  * anyone from syncing the log until we have updated both inodes
7513                  * in the log.
7514                  */
7515                 ret = join_running_log_trans(root);
7516                 /*
7517                  * At least one of the inodes was logged before, so this should
7518                  * not fail, but if it does, it's not serious, just bail out and
7519                  * mark the log for a full commit.
7520                  */
7521                 if (WARN_ON_ONCE(ret < 0)) {
7522                         fscrypt_free_filename(&fname);
7523                         goto out;
7524                 }
7525
7526                 log_pinned = true;
7527
7528                 path = btrfs_alloc_path();
7529                 if (!path) {
7530                         ret = -ENOMEM;
7531                         fscrypt_free_filename(&fname);
7532                         goto out;
7533                 }
7534
7535                 /*
7536                  * Other concurrent task might be logging the old directory,
7537                  * as it can be triggered when logging other inode that had or
7538                  * still has a dentry in the old directory. We lock the old
7539                  * directory's log_mutex to ensure the deletion of the old
7540                  * name is persisted, because during directory logging we
7541                  * delete all BTRFS_DIR_LOG_INDEX_KEY keys and the deletion of
7542                  * the old name's dir index item is in the delayed items, so
7543                  * it could be missed by an in progress directory logging.
7544                  */
7545                 mutex_lock(&old_dir->log_mutex);
7546                 ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir),
7547                                         &fname.disk_name, old_dir_index);
7548                 if (ret > 0) {
7549                         /*
7550                          * The dentry does not exist in the log, so record its
7551                          * deletion.
7552                          */
7553                         btrfs_release_path(path);
7554                         ret = insert_dir_log_key(trans, log, path,
7555                                                  btrfs_ino(old_dir),
7556                                                  old_dir_index, old_dir_index);
7557                 }
7558                 mutex_unlock(&old_dir->log_mutex);
7559
7560                 btrfs_free_path(path);
7561                 fscrypt_free_filename(&fname);
7562                 if (ret < 0)
7563                         goto out;
7564         }
7565
7566         btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
7567         ctx.logging_new_name = true;
7568         btrfs_init_log_ctx_scratch_eb(&ctx);
7569         /*
7570          * We don't care about the return value. If we fail to log the new name
7571          * then we know the next attempt to sync the log will fallback to a full
7572          * transaction commit (due to a call to btrfs_set_log_full_commit()), so
7573          * we don't need to worry about getting a log committed that has an
7574          * inconsistent state after a rename operation.
7575          */
7576         btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
7577         free_extent_buffer(ctx.scratch_eb);
7578         ASSERT(list_empty(&ctx.conflict_inodes));
7579 out:
7580         /*
7581          * If an error happened mark the log for a full commit because it's not
7582          * consistent and up to date or we couldn't find out if one of the
7583          * inodes was logged before in this transaction. Do it before unpinning
7584          * the log, to avoid any races with someone else trying to commit it.
7585          */
7586         if (ret < 0)
7587                 btrfs_set_log_full_commit(trans);
7588         if (log_pinned)
7589                 btrfs_end_log_trans(root);
7590 }
7591