fs/btrfs/tree-log.c

   1 /*
   2  * Copyright (C) 2008 Oracle.  All rights reserved.
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License v2 as published by the Free Software Foundation.
   7  *
   8  * This program is distributed in the hope that it will be useful,
   9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU General Public
  14  * License along with this program; if not, write to the
  15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16  * Boston, MA 021110-1307, USA.
  17  */
  18
  19 #include <linux/sched.h>
  20 #include <linux/slab.h>
  21 #include <linux/blkdev.h>
  22 #include <linux/list_sort.h>
  23 #include "tree-log.h"
  24 #include "disk-io.h"
  25 #include "locking.h"
  26 #include "print-tree.h"
  27 #include "backref.h"
  28 #include "hash.h"
  29 #include "compression.h"
  30 #include "qgroup.h"
  31 #include "inode-map.h"
  32
  33 /* magic values for the inode_only field in btrfs_log_inode:
  34  *
  35  * LOG_INODE_ALL means to log everything
  36  * LOG_INODE_EXISTS means to log just enough to recreate the inode
  37  * during log replay
  38  */
  39 #define LOG_INODE_ALL 0
  40 #define LOG_INODE_EXISTS 1
  41 #define LOG_OTHER_INODE 2
  42
  43 /*
  44  * directory trouble cases
  45  *
  46  * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
  47  * log, we must force a full commit before doing an fsync of the directory
  48  * where the unlink was done.
  49  * ---> record transid of last unlink/rename per directory
  50  *
  51  * mkdir foo/some_dir
  52  * normal commit
  53  * rename foo/some_dir foo2/some_dir
  54  * mkdir foo/some_dir
  55  * fsync foo/some_dir/some_file
  56  *
  57  * The fsync above will unlink the original some_dir without recording
  58  * it in its new location (foo2).  After a crash, some_dir will be gone
  59  * unless the fsync of some_file forces a full commit
  60  *
  61  * 2) we must log any new names for any file or dir that is in the fsync
  62  * log. ---> check inode while renaming/linking.
  63  *
  64  * 2a) we must log any new names for any file or dir during rename
  65  * when the directory they are being removed from was logged.
  66  * ---> check inode and old parent dir during rename
  67  *
  68  *  2a is actually the more important variant.  With the extra logging
  69  *  a crash might unlink the old name without recreating the new one
  70  *
  71  * 3) after a crash, we must go through any directories with a link count
  72  * of zero and redo the rm -rf
  73  *
  74  * mkdir f1/foo
  75  * normal commit
  76  * rm -rf f1/foo
  77  * fsync(f1)
  78  *
  79  * The directory f1 was fully removed from the FS, but fsync was never
  80  * called on f1, only its parent dir.  After a crash the rm -rf must
  81  * be replayed.  This must be able to recurse down the entire
  82  * directory tree.  The inode link count fixup code takes care of the
  83  * ugly details.
  84  */
  85
  86 /*
  87  * stages for the tree walking.  The first
  88  * stage (0) is to only pin down the blocks we find
  89  * the second stage (1) is to make sure that all the inodes
  90  * we find in the log are created in the subvolume.
  91  *
  92  * The last stage is to deal with directories and links and extents
  93  * and all the other fun semantics
  94  */
  95 #define LOG_WALK_PIN_ONLY 0
  96 #define LOG_WALK_REPLAY_INODES 1
  97 #define LOG_WALK_REPLAY_DIR_INDEX 2
  98 #define LOG_WALK_REPLAY_ALL 3
  99
 100 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 101                            struct btrfs_root *root, struct btrfs_inode *inode,
 102                            int inode_only,
 103                            const loff_t start,
 104                            const loff_t end,
 105                            struct btrfs_log_ctx *ctx);
 106 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 107                              struct btrfs_root *root,
 108                              struct btrfs_path *path, u64 objectid);
 109 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
 110                                        struct btrfs_root *root,
 111                                        struct btrfs_root *log,
 112                                        struct btrfs_path *path,
 113                                        u64 dirid, int del_all);
 114
 115 /*
 116  * tree logging is a special write ahead log used to make sure that
 117  * fsyncs and O_SYNCs can happen without doing full tree commits.
 118  *
 119  * Full tree commits are expensive because they require commonly
 120  * modified blocks to be recowed, creating many dirty pages in the
 121  * extent tree an 4x-6x higher write load than ext3.
 122  *
 123  * Instead of doing a tree commit on every fsync, we use the
 124  * key ranges and transaction ids to find items for a given file or directory
 125  * that have changed in this transaction.  Those items are copied into
 126  * a special tree (one per subvolume root), that tree is written to disk
 127  * and then the fsync is considered complete.
 128  *
 129  * After a crash, items are copied out of the log-tree back into the
 130  * subvolume tree.  Any file data extents found are recorded in the extent
 131  * allocation tree, and the log-tree freed.
 132  *
 133  * The log tree is read three times, once to pin down all the extents it is
 134  * using in ram and once, once to create all the inodes logged in the tree
 135  * and once to do all the other items.
 136  */
 137
 138 /*
 139  * start a sub transaction and setup the log tree
 140  * this increments the log tree writer count to make the people
 141  * syncing the tree wait for us to finish
 142  */
 143 static int start_log_trans(struct btrfs_trans_handle *trans,
 144                            struct btrfs_root *root,
 145                            struct btrfs_log_ctx *ctx)
 146 {
 147         struct btrfs_fs_info *fs_info = root->fs_info;
 148         int ret = 0;
 149
 150         mutex_lock(&root->log_mutex);
 151
 152         if (root->log_root) {
 153                 if (btrfs_need_log_full_commit(fs_info, trans)) {
 154                         ret = -EAGAIN;
 155                         goto out;
 156                 }
 157
 158                 if (!root->log_start_pid) {
 159                         clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 160                         root->log_start_pid = current->pid;
 161                 } else if (root->log_start_pid != current->pid) {
 162                         set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 163                 }
 164         } else {
 165                 mutex_lock(&fs_info->tree_log_mutex);
 166                 if (!fs_info->log_root_tree)
 167                         ret = btrfs_init_log_root_tree(trans, fs_info);
 168                 mutex_unlock(&fs_info->tree_log_mutex);
 169                 if (ret)
 170                         goto out;
 171
 172                 ret = btrfs_add_log_tree(trans, root);
 173                 if (ret)
 174                         goto out;
 175
 176                 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
 177                 root->log_start_pid = current->pid;
 178         }
 179
 180         atomic_inc(&root->log_batch);
 181         atomic_inc(&root->log_writers);
 182         if (ctx) {
 183                 int index = root->log_transid % 2;
 184                 list_add_tail(&ctx->list, &root->log_ctxs[index]);
 185                 ctx->log_transid = root->log_transid;
 186         }
 187
 188 out:
 189         mutex_unlock(&root->log_mutex);
 190         return ret;
 191 }
 192
 193 /*
 194  * returns 0 if there was a log transaction running and we were able
 195  * to join, or returns -ENOENT if there were not transactions
 196  * in progress
 197  */
 198 static int join_running_log_trans(struct btrfs_root *root)
 199 {
 200         int ret = -ENOENT;
 201
 202         smp_mb();
 203         if (!root->log_root)
 204                 return -ENOENT;
 205
 206         mutex_lock(&root->log_mutex);
 207         if (root->log_root) {
 208                 ret = 0;
 209                 atomic_inc(&root->log_writers);
 210         }
 211         mutex_unlock(&root->log_mutex);
 212         return ret;
 213 }
 214
 215 /*
 216  * This either makes the current running log transaction wait
 217  * until you call btrfs_end_log_trans() or it makes any future
 218  * log transactions wait until you call btrfs_end_log_trans()
 219  */
 220 int btrfs_pin_log_trans(struct btrfs_root *root)
 221 {
 222         int ret = -ENOENT;
 223
 224         mutex_lock(&root->log_mutex);
 225         atomic_inc(&root->log_writers);
 226         mutex_unlock(&root->log_mutex);
 227         return ret;
 228 }
 229
 230 /*
 231  * indicate we're done making changes to the log tree
 232  * and wake up anyone waiting to do a sync
 233  */
 234 void btrfs_end_log_trans(struct btrfs_root *root)
 235 {
 236         if (atomic_dec_and_test(&root->log_writers)) {
 237                 /*
 238                  * Implicit memory barrier after atomic_dec_and_test
 239                  */
 240                 if (waitqueue_active(&root->log_writer_wait))
 241                         wake_up(&root->log_writer_wait);
 242         }
 243 }
 244
 245
 246 /*
 247  * the walk control struct is used to pass state down the chain when
 248  * processing the log tree.  The stage field tells us which part
 249  * of the log tree processing we are currently doing.  The others
 250  * are state fields used for that specific part
 251  */
 252 struct walk_control {
 253         /* should we free the extent on disk when done?  This is used
 254          * at transaction commit time while freeing a log tree
 255          */
 256         int free;
 257
 258         /* should we write out the extent buffer?  This is used
 259          * while flushing the log tree to disk during a sync
 260          */
 261         int write;
 262
 263         /* should we wait for the extent buffer io to finish?  Also used
 264          * while flushing the log tree to disk for a sync
 265          */
 266         int wait;
 267
 268         /* pin only walk, we record which extents on disk belong to the
 269          * log trees
 270          */
 271         int pin;
 272
 273         /* what stage of the replay code we're currently in */
 274         int stage;
 275
 276         /*
 277          * Ignore any items from the inode currently being processed. Needs
 278          * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
 279          * the LOG_WALK_REPLAY_INODES stage.
 280          */
 281         bool ignore_cur_inode;
 282
 283         /* the root we are currently replaying */
 284         struct btrfs_root *replay_dest;
 285
 286         /* the trans handle for the current replay */
 287         struct btrfs_trans_handle *trans;
 288
 289         /* the function that gets used to process blocks we find in the
 290          * tree.  Note the extent_buffer might not be up to date when it is
 291          * passed in, and it must be checked or read if you need the data
 292          * inside it
 293          */
 294         int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
 295                             struct walk_control *wc, u64 gen);
 296 };
 297
 298 /*
 299  * process_func used to pin down extents, write them or wait on them
 300  */
 301 static int process_one_buffer(struct btrfs_root *log,
 302                               struct extent_buffer *eb,
 303                               struct walk_control *wc, u64 gen)
 304 {
 305         struct btrfs_fs_info *fs_info = log->fs_info;
 306         int ret = 0;
 307
 308         /*
 309          * If this fs is mixed then we need to be able to process the leaves to
 310          * pin down any logged extents, so we have to read the block.
 311          */
 312         if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
 313                 ret = btrfs_read_buffer(eb, gen);
 314                 if (ret)
 315                         return ret;
 316         }
 317
 318         if (wc->pin)
 319                 ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
 320                                                       eb->len);
 321
 322         if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
 323                 if (wc->pin && btrfs_header_level(eb) == 0)
 324                         ret = btrfs_exclude_logged_extents(fs_info, eb);
 325                 if (wc->write)
 326                         btrfs_write_tree_block(eb);
 327                 if (wc->wait)
 328                         btrfs_wait_tree_block_writeback(eb);
 329         }
 330         return ret;
 331 }
 332
 333 /*
 334  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
 335  * to the src data we are copying out.
 336  *
 337  * root is the tree we are copying into, and path is a scratch
 338  * path for use in this function (it should be released on entry and
 339  * will be released on exit).
 340  *
 341  * If the key is already in the destination tree the existing item is
 342  * overwritten.  If the existing item isn't big enough, it is extended.
 343  * If it is too large, it is truncated.
 344  *
 345  * If the key isn't in the destination yet, a new item is inserted.
 346  */
 347 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 348                                    struct btrfs_root *root,
 349                                    struct btrfs_path *path,
 350                                    struct extent_buffer *eb, int slot,
 351                                    struct btrfs_key *key)
 352 {
 353         struct btrfs_fs_info *fs_info = root->fs_info;
 354         int ret;
 355         u32 item_size;
 356         u64 saved_i_size = 0;
 357         int save_old_i_size = 0;
 358         unsigned long src_ptr;
 359         unsigned long dst_ptr;
 360         int overwrite_root = 0;
 361         bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 362
 363         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 364                 overwrite_root = 1;
 365
 366         item_size = btrfs_item_size_nr(eb, slot);
 367         src_ptr = btrfs_item_ptr_offset(eb, slot);
 368
 369         /* look for the key in the destination tree */
 370         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
 371         if (ret < 0)
 372                 return ret;
 373
 374         if (ret == 0) {
 375                 char *src_copy;
 376                 char *dst_copy;
 377                 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
 378                                                   path->slots[0]);
 379                 if (dst_size != item_size)
 380                         goto insert;
 381
 382                 if (item_size == 0) {
 383                         btrfs_release_path(path);
 384                         return 0;
 385                 }
 386                 dst_copy = kmalloc(item_size, GFP_NOFS);
 387                 src_copy = kmalloc(item_size, GFP_NOFS);
 388                 if (!dst_copy || !src_copy) {
 389                         btrfs_release_path(path);
 390                         kfree(dst_copy);
 391                         kfree(src_copy);
 392                         return -ENOMEM;
 393                 }
 394
 395                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
 396
 397                 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 398                 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
 399                                    item_size);
 400                 ret = memcmp(dst_copy, src_copy, item_size);
 401
 402                 kfree(dst_copy);
 403                 kfree(src_copy);
 404                 /*
 405                  * they have the same contents, just return, this saves
 406                  * us from cowing blocks in the destination tree and doing
 407                  * extra writes that may not have been done by a previous
 408                  * sync
 409                  */
 410                 if (ret == 0) {
 411                         btrfs_release_path(path);
 412                         return 0;
 413                 }
 414
 415                 /*
 416                  * We need to load the old nbytes into the inode so when we
 417                  * replay the extents we've logged we get the right nbytes.
 418                  */
 419                 if (inode_item) {
 420                         struct btrfs_inode_item *item;
 421                         u64 nbytes;
 422                         u32 mode;
 423
 424                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 425                                               struct btrfs_inode_item);
 426                         nbytes = btrfs_inode_nbytes(path->nodes[0], item);
 427                         item = btrfs_item_ptr(eb, slot,
 428                                               struct btrfs_inode_item);
 429                         btrfs_set_inode_nbytes(eb, item, nbytes);
 430
 431                         /*
 432                          * If this is a directory we need to reset the i_size to
 433                          * 0 so that we can set it up properly when replaying
 434                          * the rest of the items in this log.
 435                          */
 436                         mode = btrfs_inode_mode(eb, item);
 437                         if (S_ISDIR(mode))
 438                                 btrfs_set_inode_size(eb, item, 0);
 439                 }
 440         } else if (inode_item) {
 441                 struct btrfs_inode_item *item;
 442                 u32 mode;
 443
 444                 /*
 445                  * New inode, set nbytes to 0 so that the nbytes comes out
 446                  * properly when we replay the extents.
 447                  */
 448                 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
 449                 btrfs_set_inode_nbytes(eb, item, 0);
 450
 451                 /*
 452                  * If this is a directory we need to reset the i_size to 0 so
 453                  * that we can set it up properly when replaying the rest of
 454                  * the items in this log.
 455                  */
 456                 mode = btrfs_inode_mode(eb, item);
 457                 if (S_ISDIR(mode))
 458                         btrfs_set_inode_size(eb, item, 0);
 459         }
 460 insert:
 461         btrfs_release_path(path);
 462         /* try to insert the key into the destination tree */
 463         path->skip_release_on_error = 1;
 464         ret = btrfs_insert_empty_item(trans, root, path,
 465                                       key, item_size);
 466         path->skip_release_on_error = 0;
 467
 468         /* make sure any existing item is the correct size */
 469         if (ret == -EEXIST || ret == -EOVERFLOW) {
 470                 u32 found_size;
 471                 found_size = btrfs_item_size_nr(path->nodes[0],
 472                                                 path->slots[0]);
 473                 if (found_size > item_size)
 474                         btrfs_truncate_item(fs_info, path, item_size, 1);
 475                 else if (found_size < item_size)
 476                         btrfs_extend_item(fs_info, path,
 477                                           item_size - found_size);
 478         } else if (ret) {
 479                 return ret;
 480         }
 481         dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
 482                                         path->slots[0]);
 483
 484         /* don't overwrite an existing inode if the generation number
 485          * was logged as zero.  This is done when the tree logging code
 486          * is just logging an inode to make sure it exists after recovery.
 487          *
 488          * Also, don't overwrite i_size on directories during replay.
 489          * log replay inserts and removes directory items based on the
 490          * state of the tree found in the subvolume, and i_size is modified
 491          * as it goes
 492          */
 493         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
 494                 struct btrfs_inode_item *src_item;
 495                 struct btrfs_inode_item *dst_item;
 496
 497                 src_item = (struct btrfs_inode_item *)src_ptr;
 498                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 499
 500                 if (btrfs_inode_generation(eb, src_item) == 0) {
 501                         struct extent_buffer *dst_eb = path->nodes[0];
 502                         const u64 ino_size = btrfs_inode_size(eb, src_item);
 503
 504                         /*
 505                          * For regular files an ino_size == 0 is used only when
 506                          * logging that an inode exists, as part of a directory
 507                          * fsync, and the inode wasn't fsynced before. In this
 508                          * case don't set the size of the inode in the fs/subvol
 509                          * tree, otherwise we would be throwing valid data away.
 510                          */
 511                         if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
 512                             S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
 513                             ino_size != 0) {
 514                                 struct btrfs_map_token token;
 515
 516                                 btrfs_init_map_token(&token);
 517                                 btrfs_set_token_inode_size(dst_eb, dst_item,
 518                                                            ino_size, &token);
 519                         }
 520                         goto no_copy;
 521                 }
 522
 523                 if (overwrite_root &&
 524                     S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
 525                     S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
 526                         save_old_i_size = 1;
 527                         saved_i_size = btrfs_inode_size(path->nodes[0],
 528                                                         dst_item);
 529                 }
 530         }
 531
 532         copy_extent_buffer(path->nodes[0], eb, dst_ptr,
 533                            src_ptr, item_size);
 534
 535         if (save_old_i_size) {
 536                 struct btrfs_inode_item *dst_item;
 537                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 538                 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
 539         }
 540
 541         /* make sure the generation is filled in */
 542         if (key->type == BTRFS_INODE_ITEM_KEY) {
 543                 struct btrfs_inode_item *dst_item;
 544                 dst_item = (struct btrfs_inode_item *)dst_ptr;
 545                 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
 546                         btrfs_set_inode_generation(path->nodes[0], dst_item,
 547                                                    trans->transid);
 548                 }
 549         }
 550 no_copy:
 551         btrfs_mark_buffer_dirty(path->nodes[0]);
 552         btrfs_release_path(path);
 553         return 0;
 554 }
 555
 556 /*
 557  * simple helper to read an inode off the disk from a given root
 558  * This can only be called for subvolume roots and not for the log
 559  */
 560 static noinline struct inode *read_one_inode(struct btrfs_root *root,
 561                                              u64 objectid)
 562 {
 563         struct btrfs_key key;
 564         struct inode *inode;
 565
 566         key.objectid = objectid;
 567         key.type = BTRFS_INODE_ITEM_KEY;
 568         key.offset = 0;
 569         inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
 570         if (IS_ERR(inode)) {
 571                 inode = NULL;
 572         } else if (is_bad_inode(inode)) {
 573                 iput(inode);
 574                 inode = NULL;
 575         }
 576         return inode;
 577 }
 578
 579 /* replays a single extent in 'eb' at 'slot' with 'key' into the
 580  * subvolume 'root'.  path is released on entry and should be released
 581  * on exit.
 582  *
 583  * extents in the log tree have not been allocated out of the extent
 584  * tree yet.  So, this completes the allocation, taking a reference
 585  * as required if the extent already exists or creating a new extent
 586  * if it isn't in the extent allocation tree yet.
 587  *
 588  * The extent is inserted into the file, dropping any existing extents
 589  * from the file that overlap the new one.
 590  */
 591 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 592                                       struct btrfs_root *root,
 593                                       struct btrfs_path *path,
 594                                       struct extent_buffer *eb, int slot,
 595                                       struct btrfs_key *key)
 596 {
 597         struct btrfs_fs_info *fs_info = root->fs_info;
 598         int found_type;
 599         u64 extent_end;
 600         u64 start = key->offset;
 601         u64 nbytes = 0;
 602         struct btrfs_file_extent_item *item;
 603         struct inode *inode = NULL;
 604         unsigned long size;
 605         int ret = 0;
 606
 607         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 608         found_type = btrfs_file_extent_type(eb, item);
 609
 610         if (found_type == BTRFS_FILE_EXTENT_REG ||
 611             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 612                 nbytes = btrfs_file_extent_num_bytes(eb, item);
 613                 extent_end = start + nbytes;
 614
 615                 /*
 616                  * We don't add to the inodes nbytes if we are prealloc or a
 617                  * hole.
 618                  */
 619                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
 620                         nbytes = 0;
 621         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 622                 size = btrfs_file_extent_ram_bytes(eb, item);
 623                 nbytes = btrfs_file_extent_ram_bytes(eb, item);
 624                 extent_end = ALIGN(start + size,
 625                                    fs_info->sectorsize);
 626         } else {
 627                 ret = 0;
 628                 goto out;
 629         }
 630
 631         inode = read_one_inode(root, key->objectid);
 632         if (!inode) {
 633                 ret = -EIO;
 634                 goto out;
 635         }
 636
 637         /*
 638          * first check to see if we already have this extent in the
 639          * file.  This must be done before the btrfs_drop_extents run
 640          * so we don't try to drop this extent.
 641          */
 642         ret = btrfs_lookup_file_extent(trans, root, path,
 643                         btrfs_ino(BTRFS_I(inode)), start, 0);
 644
 645         if (ret == 0 &&
 646             (found_type == BTRFS_FILE_EXTENT_REG ||
 647              found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
 648                 struct btrfs_file_extent_item cmp1;
 649                 struct btrfs_file_extent_item cmp2;
 650                 struct btrfs_file_extent_item *existing;
 651                 struct extent_buffer *leaf;
 652
 653                 leaf = path->nodes[0];
 654                 existing = btrfs_item_ptr(leaf, path->slots[0],
 655                                           struct btrfs_file_extent_item);
 656
 657                 read_extent_buffer(eb, &cmp1, (unsigned long)item,
 658                                    sizeof(cmp1));
 659                 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
 660                                    sizeof(cmp2));
 661
 662                 /*
 663                  * we already have a pointer to this exact extent,
 664                  * we don't have to do anything
 665                  */
 666                 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
 667                         btrfs_release_path(path);
 668                         goto out;
 669                 }
 670         }
 671         btrfs_release_path(path);
 672
 673         /* drop any overlapping extents */
 674         ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
 675         if (ret)
 676                 goto out;
 677
 678         if (found_type == BTRFS_FILE_EXTENT_REG ||
 679             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 680                 u64 offset;
 681                 unsigned long dest_offset;
 682                 struct btrfs_key ins;
 683
 684                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
 685                     btrfs_fs_incompat(fs_info, NO_HOLES))
 686                         goto update_inode;
 687
 688                 ret = btrfs_insert_empty_item(trans, root, path, key,
 689                                               sizeof(*item));
 690                 if (ret)
 691                         goto out;
 692                 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 693                                                     path->slots[0]);
 694                 copy_extent_buffer(path->nodes[0], eb, dest_offset,
 695                                 (unsigned long)item,  sizeof(*item));
 696
 697                 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 698                 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 699                 ins.type = BTRFS_EXTENT_ITEM_KEY;
 700                 offset = key->offset - btrfs_file_extent_offset(eb, item);
 701
 702                 /*
 703                  * Manually record dirty extent, as here we did a shallow
 704                  * file extent item copy and skip normal backref update,
 705                  * but modifying extent tree all by ourselves.
 706                  * So need to manually record dirty extent for qgroup,
 707                  * as the owner of the file extent changed from log tree
 708                  * (doesn't affect qgroup) to fs/file tree(affects qgroup)
 709                  */
 710                 ret = btrfs_qgroup_trace_extent(trans, fs_info,
 711                                 btrfs_file_extent_disk_bytenr(eb, item),
 712                                 btrfs_file_extent_disk_num_bytes(eb, item),
 713                                 GFP_NOFS);
 714                 if (ret < 0)
 715                         goto out;
 716
 717                 if (ins.objectid > 0) {
 718                         u64 csum_start;
 719                         u64 csum_end;
 720                         LIST_HEAD(ordered_sums);
 721                         /*
 722                          * is this extent already allocated in the extent
 723                          * allocation tree?  If so, just add a reference
 724                          */
 725                         ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
 726                                                 ins.offset);
 727                         if (ret == 0) {
 728                                 ret = btrfs_inc_extent_ref(trans, fs_info,
 729                                                 ins.objectid, ins.offset,
 730                                                 0, root->root_key.objectid,
 731                                                 key->objectid, offset);
 732                                 if (ret)
 733                                         goto out;
 734                         } else {
 735                                 /*
 736                                  * insert the extent pointer in the extent
 737                                  * allocation tree
 738                                  */
 739                                 ret = btrfs_alloc_logged_file_extent(trans,
 740                                                 fs_info,
 741                                                 root->root_key.objectid,
 742                                                 key->objectid, offset, &ins);
 743                                 if (ret)
 744                                         goto out;
 745                         }
 746                         btrfs_release_path(path);
 747
 748                         if (btrfs_file_extent_compression(eb, item)) {
 749                                 csum_start = ins.objectid;
 750                                 csum_end = csum_start + ins.offset;
 751                         } else {
 752                                 csum_start = ins.objectid +
 753                                         btrfs_file_extent_offset(eb, item);
 754                                 csum_end = csum_start +
 755                                         btrfs_file_extent_num_bytes(eb, item);
 756                         }
 757
 758                         ret = btrfs_lookup_csums_range(root->log_root,
 759                                                 csum_start, csum_end - 1,
 760                                                 &ordered_sums, 0);
 761                         if (ret)
 762                                 goto out;
 763                         /*
 764                          * Now delete all existing cums in the csum root that
 765                          * cover our range. We do this because we can have an
 766                          * extent that is completely referenced by one file
 767                          * extent item and partially referenced by another
 768                          * file extent item (like after using the clone or
 769                          * extent_same ioctls). In this case if we end up doing
 770                          * the replay of the one that partially references the
 771                          * extent first, and we do not do the csum deletion
 772                          * below, we can get 2 csum items in the csum tree that
 773                          * overlap each other. For example, imagine our log has
 774                          * the two following file extent items:
 775                          *
 776                          * key (257 EXTENT_DATA 409600)
 777                          *     extent data disk byte 12845056 nr 102400
 778                          *     extent data offset 20480 nr 20480 ram 102400
 779                          *
 780                          * key (257 EXTENT_DATA 819200)
 781                          *     extent data disk byte 12845056 nr 102400
 782                          *     extent data offset 0 nr 102400 ram 102400
 783                          *
 784                          * Where the second one fully references the 100K extent
 785                          * that starts at disk byte 12845056, and the log tree
 786                          * has a single csum item that covers the entire range
 787                          * of the extent:
 788                          *
 789                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 790                          *
 791                          * After the first file extent item is replayed, the
 792                          * csum tree gets the following csum item:
 793                          *
 794                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 795                          *
 796                          * Which covers the 20K sub-range starting at offset 20K
 797                          * of our extent. Now when we replay the second file
 798                          * extent item, if we do not delete existing csum items
 799                          * that cover any of its blocks, we end up getting two
 800                          * csum items in our csum tree that overlap each other:
 801                          *
 802                          * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
 803                          * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
 804                          *
 805                          * Which is a problem, because after this anyone trying
 806                          * to lookup up for the checksum of any block of our
 807                          * extent starting at an offset of 40K or higher, will
 808                          * end up looking at the second csum item only, which
 809                          * does not contain the checksum for any block starting
 810                          * at offset 40K or higher of our extent.
 811                          */
 812                         while (!list_empty(&ordered_sums)) {
 813                                 struct btrfs_ordered_sum *sums;
 814                                 sums = list_entry(ordered_sums.next,
 815                                                 struct btrfs_ordered_sum,
 816                                                 list);
 817                                 if (!ret)
 818                                         ret = btrfs_del_csums(trans, fs_info,
 819                                                               sums->bytenr,
 820                                                               sums->len);
 821                                 if (!ret)
 822                                         ret = btrfs_csum_file_blocks(trans,
 823                                                 fs_info->csum_root, sums);
 824                                 list_del(&sums->list);
 825                                 kfree(sums);
 826                         }
 827                         if (ret)
 828                                 goto out;
 829                 } else {
 830                         btrfs_release_path(path);
 831                 }
 832         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 833                 /* inline extents are easy, we just overwrite them */
 834                 ret = overwrite_item(trans, root, path, eb, slot, key);
 835                 if (ret)
 836                         goto out;
 837         }
 838
 839         inode_add_bytes(inode, nbytes);
 840 update_inode:
 841         ret = btrfs_update_inode(trans, root, inode);
 842 out:
 843         if (inode)
 844                 iput(inode);
 845         return ret;
 846 }
 847
 848 /*
 849  * when cleaning up conflicts between the directory names in the
 850  * subvolume, directory names in the log and directory names in the
 851  * inode back references, we may have to unlink inodes from directories.
 852  *
 853  * This is a helper function to do the unlink of a specific directory
 854  * item
 855  */
 856 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 857                                       struct btrfs_root *root,
 858                                       struct btrfs_path *path,
 859                                       struct btrfs_inode *dir,
 860                                       struct btrfs_dir_item *di)
 861 {
 862         struct btrfs_fs_info *fs_info = root->fs_info;
 863         struct inode *inode;
 864         char *name;
 865         int name_len;
 866         struct extent_buffer *leaf;
 867         struct btrfs_key location;
 868         int ret;
 869
 870         leaf = path->nodes[0];
 871
 872         btrfs_dir_item_key_to_cpu(leaf, di, &location);
 873         name_len = btrfs_dir_name_len(leaf, di);
 874         name = kmalloc(name_len, GFP_NOFS);
 875         if (!name)
 876                 return -ENOMEM;
 877
 878         read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
 879         btrfs_release_path(path);
 880
 881         inode = read_one_inode(root, location.objectid);
 882         if (!inode) {
 883                 ret = -EIO;
 884                 goto out;
 885         }
 886
 887         ret = link_to_fixup_dir(trans, root, path, location.objectid);
 888         if (ret)
 889                 goto out;
 890
 891         ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
 892                         name_len);
 893         if (ret)
 894                 goto out;
 895         else
 896                 ret = btrfs_run_delayed_items(trans, fs_info);
 897 out:
 898         kfree(name);
 899         iput(inode);
 900         return ret;
 901 }
 902
 903 /*
 904  * See if a given name and sequence number found in an inode back reference are
 905  * already in a directory and correctly point to this inode.
 906  *
 907  * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
 908  * exists.
 909  */
 910 static noinline int inode_in_dir(struct btrfs_root *root,
 911                                  struct btrfs_path *path,
 912                                  u64 dirid, u64 objectid, u64 index,
 913                                  const char *name, int name_len)
 914 {
 915         struct btrfs_dir_item *di;
 916         struct btrfs_key location;
 917         int ret = 0;
 918
 919         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
 920                                          index, name, name_len, 0);
 921         if (IS_ERR(di)) {
 922                 if (PTR_ERR(di) != -ENOENT)
 923                         ret = PTR_ERR(di);
 924                 goto out;
 925         } else if (di) {
 926                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 927                 if (location.objectid != objectid)
 928                         goto out;
 929         } else {
 930                 goto out;
 931         }
 932
 933         btrfs_release_path(path);
 934         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
 935         if (IS_ERR(di)) {
 936                 ret = PTR_ERR(di);
 937                 goto out;
 938         } else if (di) {
 939                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
 940                 if (location.objectid == objectid)
 941                         ret = 1;
 942         }
 943 out:
 944         btrfs_release_path(path);
 945         return ret;
 946 }
 947
 948 /*
 949  * helper function to check a log tree for a named back reference in
 950  * an inode.  This is used to decide if a back reference that is
 951  * found in the subvolume conflicts with what we find in the log.
 952  *
 953  * inode backreferences may have multiple refs in a single item,
 954  * during replay we process one reference at a time, and we don't
 955  * want to delete valid links to a file from the subvolume if that
 956  * link is also in the log.
 957  */
 958 static noinline int backref_in_log(struct btrfs_root *log,
 959                                    struct btrfs_key *key,
 960                                    u64 ref_objectid,
 961                                    const char *name, int namelen)
 962 {
 963         struct btrfs_path *path;
 964         struct btrfs_inode_ref *ref;
 965         unsigned long ptr;
 966         unsigned long ptr_end;
 967         unsigned long name_ptr;
 968         int found_name_len;
 969         int item_size;
 970         int ret;
 971         int match = 0;
 972
 973         path = btrfs_alloc_path();
 974         if (!path)
 975                 return -ENOMEM;
 976
 977         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
 978         if (ret != 0)
 979                 goto out;
 980
 981         ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
 982
 983         if (key->type == BTRFS_INODE_EXTREF_KEY) {
 984                 if (btrfs_find_name_in_ext_backref(path, ref_objectid,
 985                                                    name, namelen, NULL))
 986                         match = 1;
 987
 988                 goto out;
 989         }
 990
 991         item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 992         ptr_end = ptr + item_size;
 993         while (ptr < ptr_end) {
 994                 ref = (struct btrfs_inode_ref *)ptr;
 995                 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
 996                 if (found_name_len == namelen) {
 997                         name_ptr = (unsigned long)(ref + 1);
 998                         ret = memcmp_extent_buffer(path->nodes[0], name,
 999                                                    name_ptr, namelen);
1000                         if (ret == 0) {
1001                                 match = 1;
1002                                 goto out;
1003                         }
1004                 }
1005                 ptr = (unsigned long)(ref + 1) + found_name_len;
1006         }
1007 out:
1008         btrfs_free_path(path);
1009         return match;
1010 }
1011
1012 static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
1013                                   struct btrfs_root *root,
1014                                   struct btrfs_path *path,
1015                                   struct btrfs_root *log_root,
1016                                   struct btrfs_inode *dir,
1017                                   struct btrfs_inode *inode,
1018                                   u64 inode_objectid, u64 parent_objectid,
1019                                   u64 ref_index, char *name, int namelen,
1020                                   int *search_done)
1021 {
1022         struct btrfs_fs_info *fs_info = root->fs_info;
1023         int ret;
1024         char *victim_name;
1025         int victim_name_len;
1026         struct extent_buffer *leaf;
1027         struct btrfs_dir_item *di;
1028         struct btrfs_key search_key;
1029         struct btrfs_inode_extref *extref;
1030
1031 again:
1032         /* Search old style refs */
1033         search_key.objectid = inode_objectid;
1034         search_key.type = BTRFS_INODE_REF_KEY;
1035         search_key.offset = parent_objectid;
1036         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
1037         if (ret == 0) {
1038                 struct btrfs_inode_ref *victim_ref;
1039                 unsigned long ptr;
1040                 unsigned long ptr_end;
1041
1042                 leaf = path->nodes[0];
1043
1044                 /* are we trying to overwrite a back ref for the root directory
1045                  * if so, just jump out, we're done
1046                  */
1047                 if (search_key.objectid == search_key.offset)
1048                         return 1;
1049
1050                 /* check all the names in this back reference to see
1051                  * if they are in the log.  if so, we allow them to stay
1052                  * otherwise they must be unlinked as a conflict
1053                  */
1054                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1055                 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
1056                 while (ptr < ptr_end) {
1057                         victim_ref = (struct btrfs_inode_ref *)ptr;
1058                         victim_name_len = btrfs_inode_ref_name_len(leaf,
1059                                                                    victim_ref);
1060                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1061                         if (!victim_name)
1062                                 return -ENOMEM;
1063
1064                         read_extent_buffer(leaf, victim_name,
1065                                            (unsigned long)(victim_ref + 1),
1066                                            victim_name_len);
1067
1068                         if (!backref_in_log(log_root, &search_key,
1069                                             parent_objectid,
1070                                             victim_name,
1071                                             victim_name_len)) {
1072                                 inc_nlink(&inode->vfs_inode);
1073                                 btrfs_release_path(path);
1074
1075                                 ret = btrfs_unlink_inode(trans, root, dir, inode,
1076                                                 victim_name, victim_name_len);
1077                                 kfree(victim_name);
1078                                 if (ret)
1079                                         return ret;
1080                                 ret = btrfs_run_delayed_items(trans, fs_info);
1081                                 if (ret)
1082                                         return ret;
1083                                 *search_done = 1;
1084                                 goto again;
1085                         }
1086                         kfree(victim_name);
1087
1088                         ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
1089                 }
1090
1091                 /*
1092                  * NOTE: we have searched root tree and checked the
1093                  * corresponding ref, it does not need to check again.
1094                  */
1095                 *search_done = 1;
1096         }
1097         btrfs_release_path(path);
1098
1099         /* Same search but for extended refs */
1100         extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1101                                            inode_objectid, parent_objectid, 0,
1102                                            0);
1103         if (IS_ERR(extref)) {
1104                 return PTR_ERR(extref);
1105         } else if (extref) {
1106                 u32 item_size;
1107                 u32 cur_offset = 0;
1108                 unsigned long base;
1109                 struct inode *victim_parent;
1110
1111                 leaf = path->nodes[0];
1112
1113                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1114                 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1115
1116                 while (cur_offset < item_size) {
1117                         extref = (struct btrfs_inode_extref *)(base + cur_offset);
1118
1119                         victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1120
1121                         if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1122                                 goto next;
1123
1124                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
1125                         if (!victim_name)
1126                                 return -ENOMEM;
1127                         read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1128                                            victim_name_len);
1129
1130                         search_key.objectid = inode_objectid;
1131                         search_key.type = BTRFS_INODE_EXTREF_KEY;
1132                         search_key.offset = btrfs_extref_hash(parent_objectid,
1133                                                               victim_name,
1134                                                               victim_name_len);
1135                         ret = 0;
1136                         if (!backref_in_log(log_root, &search_key,
1137                                             parent_objectid, victim_name,
1138                                             victim_name_len)) {
1139                                 ret = -ENOENT;
1140                                 victim_parent = read_one_inode(root,
1141                                                 parent_objectid);
1142                                 if (victim_parent) {
1143                                         inc_nlink(&inode->vfs_inode);
1144                                         btrfs_release_path(path);
1145
1146                                         ret = btrfs_unlink_inode(trans, root,
1147                                                         BTRFS_I(victim_parent),
1148                                                         inode,
1149                                                         victim_name,
1150                                                         victim_name_len);
1151                                         if (!ret)
1152                                                 ret = btrfs_run_delayed_items(
1153                                                                   trans,
1154                                                                   fs_info);
1155                                 }
1156                                 iput(victim_parent);
1157                                 kfree(victim_name);
1158                                 if (ret)
1159                                         return ret;
1160                                 *search_done = 1;
1161                                 goto again;
1162                         }
1163                         kfree(victim_name);
1164 next:
1165                         cur_offset += victim_name_len + sizeof(*extref);
1166                 }
1167                 *search_done = 1;
1168         }
1169         btrfs_release_path(path);
1170
1171         /* look for a conflicting sequence number */
1172         di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
1173                                          ref_index, name, namelen, 0);
1174         if (IS_ERR(di)) {
1175                 if (PTR_ERR(di) != -ENOENT)
1176                         return PTR_ERR(di);
1177         } else if (di) {
1178                 ret = drop_one_dir_item(trans, root, path, dir, di);
1179                 if (ret)
1180                         return ret;
1181         }
1182         btrfs_release_path(path);
1183
1184         /* look for a conflicing name */
1185         di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
1186                                    name, namelen, 0);
1187         if (IS_ERR(di)) {
1188                 return PTR_ERR(di);
1189         } else if (di) {
1190                 ret = drop_one_dir_item(trans, root, path, dir, di);
1191                 if (ret)
1192                         return ret;
1193         }
1194         btrfs_release_path(path);
1195
1196         return 0;
1197 }
1198
1199 static int extref_get_fields(struct extent_buffer *eb, int slot,
1200                              unsigned long ref_ptr, u32 *namelen, char **name,
1201                              u64 *index, u64 *parent_objectid)
1202 {
1203         struct btrfs_inode_extref *extref;
1204
1205         extref = (struct btrfs_inode_extref *)ref_ptr;
1206
1207         *namelen = btrfs_inode_extref_name_len(eb, extref);
1208         if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name,
1209                                      *namelen))
1210                 return -EIO;
1211
1212         *name = kmalloc(*namelen, GFP_NOFS);
1213         if (*name == NULL)
1214                 return -ENOMEM;
1215
1216         read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1217                            *namelen);
1218
1219         *index = btrfs_inode_extref_index(eb, extref);
1220         if (parent_objectid)
1221                 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1222
1223         return 0;
1224 }
1225
1226 static int ref_get_fields(struct extent_buffer *eb, int slot,
1227                           unsigned long ref_ptr, u32 *namelen, char **name,
1228                           u64 *index)
1229 {
1230         struct btrfs_inode_ref *ref;
1231
1232         ref = (struct btrfs_inode_ref *)ref_ptr;
1233
1234         *namelen = btrfs_inode_ref_name_len(eb, ref);
1235         if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1),
1236                                      *namelen))
1237                 return -EIO;
1238
1239         *name = kmalloc(*namelen, GFP_NOFS);
1240         if (*name == NULL)
1241                 return -ENOMEM;
1242
1243         read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1244
1245         *index = btrfs_inode_ref_index(eb, ref);
1246
1247         return 0;
1248 }
1249
1250 /*
1251  * replay one inode back reference item found in the log tree.
1252  * eb, slot and key refer to the buffer and key found in the log tree.
1253  * root is the destination we are replaying into, and path is for temp
1254  * use by this function.  (it should be released on return).
1255  */
1256 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1257                                   struct btrfs_root *root,
1258                                   struct btrfs_root *log,
1259                                   struct btrfs_path *path,
1260                                   struct extent_buffer *eb, int slot,
1261                                   struct btrfs_key *key)
1262 {
1263         struct inode *dir = NULL;
1264         struct inode *inode = NULL;
1265         unsigned long ref_ptr;
1266         unsigned long ref_end;
1267         char *name = NULL;
1268         int namelen;
1269         int ret;
1270         int search_done = 0;
1271         int log_ref_ver = 0;
1272         u64 parent_objectid;
1273         u64 inode_objectid;
1274         u64 ref_index = 0;
1275         int ref_struct_size;
1276
1277         ref_ptr = btrfs_item_ptr_offset(eb, slot);
1278         ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1279
1280         if (key->type == BTRFS_INODE_EXTREF_KEY) {
1281                 struct btrfs_inode_extref *r;
1282
1283                 ref_struct_size = sizeof(struct btrfs_inode_extref);
1284                 log_ref_ver = 1;
1285                 r = (struct btrfs_inode_extref *)ref_ptr;
1286                 parent_objectid = btrfs_inode_extref_parent(eb, r);
1287         } else {
1288                 ref_struct_size = sizeof(struct btrfs_inode_ref);
1289                 parent_objectid = key->offset;
1290         }
1291         inode_objectid = key->objectid;
1292
1293         /*
1294          * it is possible that we didn't log all the parent directories
1295          * for a given inode.  If we don't find the dir, just don't
1296          * copy the back ref in.  The link count fixup code will take
1297          * care of the rest
1298          */
1299         dir = read_one_inode(root, parent_objectid);
1300         if (!dir) {
1301                 ret = -ENOENT;
1302                 goto out;
1303         }
1304
1305         inode = read_one_inode(root, inode_objectid);
1306         if (!inode) {
1307                 ret = -EIO;
1308                 goto out;
1309         }
1310
1311         while (ref_ptr < ref_end) {
1312                 if (log_ref_ver) {
1313                         ret = extref_get_fields(eb, slot, ref_ptr, &namelen,
1314                                           &name, &ref_index, &parent_objectid);
1315                         /*
1316                          * parent object can change from one array
1317                          * item to another.
1318                          */
1319                         if (!dir)
1320                                 dir = read_one_inode(root, parent_objectid);
1321                         if (!dir) {
1322                                 ret = -ENOENT;
1323                                 goto out;
1324                         }
1325                 } else {
1326                         ret = ref_get_fields(eb, slot, ref_ptr, &namelen,
1327                                              &name, &ref_index);
1328                 }
1329                 if (ret)
1330                         goto out;
1331
1332                 ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1333                                    btrfs_ino(BTRFS_I(inode)), ref_index,
1334                                    name, namelen);
1335                 if (ret < 0) {
1336                         goto out;
1337                 } else if (ret == 0) {
1338                         /*
1339                          * look for a conflicting back reference in the
1340                          * metadata. if we find one we have to unlink that name
1341                          * of the file before we add our new link.  Later on, we
1342                          * overwrite any existing back reference, and we don't
1343                          * want to create dangling pointers in the directory.
1344                          */
1345
1346                         if (!search_done) {
1347                                 ret = __add_inode_ref(trans, root, path, log,
1348                                                       BTRFS_I(dir),
1349                                                       BTRFS_I(inode),
1350                                                       inode_objectid,
1351                                                       parent_objectid,
1352                                                       ref_index, name, namelen,
1353                                                       &search_done);
1354                                 if (ret) {
1355                                         if (ret == 1)
1356                                                 ret = 0;
1357                                         goto out;
1358                                 }
1359                         }
1360
1361                         /* insert our name */
1362                         ret = btrfs_add_link(trans, BTRFS_I(dir),
1363                                         BTRFS_I(inode),
1364                                         name, namelen, 0, ref_index);
1365                         if (ret)
1366                                 goto out;
1367
1368                         btrfs_update_inode(trans, root, inode);
1369                 }
1370                 /* Else, ret == 1, we already have a perfect match, we're done. */
1371
1372                 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1373                 kfree(name);
1374                 name = NULL;
1375                 if (log_ref_ver) {
1376                         iput(dir);
1377                         dir = NULL;
1378                 }
1379         }
1380
1381         /* finally write the back reference in the inode */
1382         ret = overwrite_item(trans, root, path, eb, slot, key);
1383 out:
1384         btrfs_release_path(path);
1385         kfree(name);
1386         iput(dir);
1387         iput(inode);
1388         return ret;
1389 }
1390
1391 static int insert_orphan_item(struct btrfs_trans_handle *trans,
1392                               struct btrfs_root *root, u64 ino)
1393 {
1394         int ret;
1395
1396         ret = btrfs_insert_orphan_item(trans, root, ino);
1397         if (ret == -EEXIST)
1398                 ret = 0;
1399
1400         return ret;
1401 }
1402
1403 static int count_inode_extrefs(struct btrfs_root *root,
1404                 struct btrfs_inode *inode, struct btrfs_path *path)
1405 {
1406         int ret = 0;
1407         int name_len;
1408         unsigned int nlink = 0;
1409         u32 item_size;
1410         u32 cur_offset = 0;
1411         u64 inode_objectid = btrfs_ino(inode);
1412         u64 offset = 0;
1413         unsigned long ptr;
1414         struct btrfs_inode_extref *extref;
1415         struct extent_buffer *leaf;
1416
1417         while (1) {
1418                 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1419                                             &extref, &offset);
1420                 if (ret)
1421                         break;
1422
1423                 leaf = path->nodes[0];
1424                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1425                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1426                 cur_offset = 0;
1427
1428                 while (cur_offset < item_size) {
1429                         extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1430                         name_len = btrfs_inode_extref_name_len(leaf, extref);
1431
1432                         nlink++;
1433
1434                         cur_offset += name_len + sizeof(*extref);
1435                 }
1436
1437                 offset++;
1438                 btrfs_release_path(path);
1439         }
1440         btrfs_release_path(path);
1441
1442         if (ret < 0 && ret != -ENOENT)
1443                 return ret;
1444         return nlink;
1445 }
1446
1447 static int count_inode_refs(struct btrfs_root *root,
1448                         struct btrfs_inode *inode, struct btrfs_path *path)
1449 {
1450         int ret;
1451         struct btrfs_key key;
1452         unsigned int nlink = 0;
1453         unsigned long ptr;
1454         unsigned long ptr_end;
1455         int name_len;
1456         u64 ino = btrfs_ino(inode);
1457
1458         key.objectid = ino;
1459         key.type = BTRFS_INODE_REF_KEY;
1460         key.offset = (u64)-1;
1461
1462         while (1) {
1463                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1464                 if (ret < 0)
1465                         break;
1466                 if (ret > 0) {
1467                         if (path->slots[0] == 0)
1468                                 break;
1469                         path->slots[0]--;
1470                 }
1471 process_slot:
1472                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1473                                       path->slots[0]);
1474                 if (key.objectid != ino ||
1475                     key.type != BTRFS_INODE_REF_KEY)
1476                         break;
1477                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1478                 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1479                                                    path->slots[0]);
1480                 while (ptr < ptr_end) {
1481                         struct btrfs_inode_ref *ref;
1482
1483                         ref = (struct btrfs_inode_ref *)ptr;
1484                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1485                                                             ref);
1486                         ptr = (unsigned long)(ref + 1) + name_len;
1487                         nlink++;
1488                 }
1489
1490                 if (key.offset == 0)
1491                         break;
1492                 if (path->slots[0] > 0) {
1493                         path->slots[0]--;
1494                         goto process_slot;
1495                 }
1496                 key.offset--;
1497                 btrfs_release_path(path);
1498         }
1499         btrfs_release_path(path);
1500
1501         return nlink;
1502 }
1503
1504 /*
1505  * There are a few corners where the link count of the file can't
1506  * be properly maintained during replay.  So, instead of adding
1507  * lots of complexity to the log code, we just scan the backrefs
1508  * for any file that has been through replay.
1509  *
1510  * The scan will update the link count on the inode to reflect the
1511  * number of back refs found.  If it goes down to zero, the iput
1512  * will free the inode.
1513  */
1514 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1515                                            struct btrfs_root *root,
1516                                            struct inode *inode)
1517 {
1518         struct btrfs_path *path;
1519         int ret;
1520         u64 nlink = 0;
1521         u64 ino = btrfs_ino(BTRFS_I(inode));
1522
1523         path = btrfs_alloc_path();
1524         if (!path)
1525                 return -ENOMEM;
1526
1527         ret = count_inode_refs(root, BTRFS_I(inode), path);
1528         if (ret < 0)
1529                 goto out;
1530
1531         nlink = ret;
1532
1533         ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1534         if (ret < 0)
1535                 goto out;
1536
1537         nlink += ret;
1538
1539         ret = 0;
1540
1541         if (nlink != inode->i_nlink) {
1542                 set_nlink(inode, nlink);
1543                 btrfs_update_inode(trans, root, inode);
1544         }
1545         BTRFS_I(inode)->index_cnt = (u64)-1;
1546
1547         if (inode->i_nlink == 0) {
1548                 if (S_ISDIR(inode->i_mode)) {
1549                         ret = replay_dir_deletes(trans, root, NULL, path,
1550                                                  ino, 1);
1551                         if (ret)
1552                                 goto out;
1553                 }
1554                 ret = insert_orphan_item(trans, root, ino);
1555         }
1556
1557 out:
1558         btrfs_free_path(path);
1559         return ret;
1560 }
1561
1562 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1563                                             struct btrfs_root *root,
1564                                             struct btrfs_path *path)
1565 {
1566         int ret;
1567         struct btrfs_key key;
1568         struct inode *inode;
1569
1570         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1571         key.type = BTRFS_ORPHAN_ITEM_KEY;
1572         key.offset = (u64)-1;
1573         while (1) {
1574                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1575                 if (ret < 0)
1576                         break;
1577
1578                 if (ret == 1) {
1579                         ret = 0;
1580                         if (path->slots[0] == 0)
1581                                 break;
1582                         path->slots[0]--;
1583                 }
1584
1585                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1586                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1587                     key.type != BTRFS_ORPHAN_ITEM_KEY)
1588                         break;
1589
1590                 ret = btrfs_del_item(trans, root, path);
1591                 if (ret)
1592                         break;
1593
1594                 btrfs_release_path(path);
1595                 inode = read_one_inode(root, key.offset);
1596                 if (!inode) {
1597                         ret = -EIO;
1598                         break;
1599                 }
1600
1601                 ret = fixup_inode_link_count(trans, root, inode);
1602                 iput(inode);
1603                 if (ret)
1604                         break;
1605
1606                 /*
1607                  * fixup on a directory may create new entries,
1608                  * make sure we always look for the highset possible
1609                  * offset
1610                  */
1611                 key.offset = (u64)-1;
1612         }
1613         btrfs_release_path(path);
1614         return ret;
1615 }
1616
1617
1618 /*
1619  * record a given inode in the fixup dir so we can check its link
1620  * count when replay is done.  The link count is incremented here
1621  * so the inode won't go away until we check it
1622  */
1623 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1624                                       struct btrfs_root *root,
1625                                       struct btrfs_path *path,
1626                                       u64 objectid)
1627 {
1628         struct btrfs_key key;
1629         int ret = 0;
1630         struct inode *inode;
1631
1632         inode = read_one_inode(root, objectid);
1633         if (!inode)
1634                 return -EIO;
1635
1636         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1637         key.type = BTRFS_ORPHAN_ITEM_KEY;
1638         key.offset = objectid;
1639
1640         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1641
1642         btrfs_release_path(path);
1643         if (ret == 0) {
1644                 if (!inode->i_nlink)
1645                         set_nlink(inode, 1);
1646                 else
1647                         inc_nlink(inode);
1648                 ret = btrfs_update_inode(trans, root, inode);
1649         } else if (ret == -EEXIST) {
1650                 ret = 0;
1651         }
1652         iput(inode);
1653
1654         return ret;
1655 }
1656
1657 /*
1658  * when replaying the log for a directory, we only insert names
1659  * for inodes that actually exist.  This means an fsync on a directory
1660  * does not implicitly fsync all the new files in it
1661  */
1662 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1663                                     struct btrfs_root *root,
1664                                     u64 dirid, u64 index,
1665                                     char *name, int name_len,
1666                                     struct btrfs_key *location)
1667 {
1668         struct inode *inode;
1669         struct inode *dir;
1670         int ret;
1671
1672         inode = read_one_inode(root, location->objectid);
1673         if (!inode)
1674                 return -ENOENT;
1675
1676         dir = read_one_inode(root, dirid);
1677         if (!dir) {
1678                 iput(inode);
1679                 return -EIO;
1680         }
1681
1682         ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1683                         name_len, 1, index);
1684
1685         /* FIXME, put inode into FIXUP list */
1686
1687         iput(inode);
1688         iput(dir);
1689         return ret;
1690 }
1691
1692 /*
1693  * Return true if an inode reference exists in the log for the given name,
1694  * inode and parent inode.
1695  */
1696 static bool name_in_log_ref(struct btrfs_root *log_root,
1697                             const char *name, const int name_len,
1698                             const u64 dirid, const u64 ino)
1699 {
1700         struct btrfs_key search_key;
1701
1702         search_key.objectid = ino;
1703         search_key.type = BTRFS_INODE_REF_KEY;
1704         search_key.offset = dirid;
1705         if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1706                 return true;
1707
1708         search_key.type = BTRFS_INODE_EXTREF_KEY;
1709         search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1710         if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1711                 return true;
1712
1713         return false;
1714 }
1715
1716 /*
1717  * take a single entry in a log directory item and replay it into
1718  * the subvolume.
1719  *
1720  * if a conflicting item exists in the subdirectory already,
1721  * the inode it points to is unlinked and put into the link count
1722  * fix up tree.
1723  *
1724  * If a name from the log points to a file or directory that does
1725  * not exist in the FS, it is skipped.  fsyncs on directories
1726  * do not force down inodes inside that directory, just changes to the
1727  * names or unlinks in a directory.
1728  *
1729  * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1730  * non-existing inode) and 1 if the name was replayed.
1731  */
1732 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1733                                     struct btrfs_root *root,
1734                                     struct btrfs_path *path,
1735                                     struct extent_buffer *eb,
1736                                     struct btrfs_dir_item *di,
1737                                     struct btrfs_key *key)
1738 {
1739         char *name;
1740         int name_len;
1741         struct btrfs_dir_item *dst_di;
1742         struct btrfs_key found_key;
1743         struct btrfs_key log_key;
1744         struct inode *dir;
1745         u8 log_type;
1746         bool exists;
1747         int ret;
1748         bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1749         bool name_added = false;
1750
1751         dir = read_one_inode(root, key->objectid);
1752         if (!dir)
1753                 return -EIO;
1754
1755         name_len = btrfs_dir_name_len(eb, di);
1756         name = kmalloc(name_len, GFP_NOFS);
1757         if (!name) {
1758                 ret = -ENOMEM;
1759                 goto out;
1760         }
1761
1762         log_type = btrfs_dir_type(eb, di);
1763         read_extent_buffer(eb, name, (unsigned long)(di + 1),
1764                    name_len);
1765
1766         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1767         ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1768         btrfs_release_path(path);
1769         if (ret < 0)
1770                 goto out;
1771         exists = (ret == 0);
1772         ret = 0;
1773
1774         if (key->type == BTRFS_DIR_ITEM_KEY) {
1775                 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1776                                        name, name_len, 1);
1777         } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1778                 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1779                                                      key->objectid,
1780                                                      key->offset, name,
1781                                                      name_len, 1);
1782         } else {
1783                 /* Corruption */
1784                 ret = -EINVAL;
1785                 goto out;
1786         }
1787
1788         if (dst_di == ERR_PTR(-ENOENT))
1789                 dst_di = NULL;
1790
1791         if (IS_ERR(dst_di)) {
1792                 ret = PTR_ERR(dst_di);
1793                 goto out;
1794         } else if (!dst_di) {
1795                 /* we need a sequence number to insert, so we only
1796                  * do inserts for the BTRFS_DIR_INDEX_KEY types
1797                  */
1798                 if (key->type != BTRFS_DIR_INDEX_KEY)
1799                         goto out;
1800                 goto insert;
1801         }
1802
1803         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1804         /* the existing item matches the logged item */
1805         if (found_key.objectid == log_key.objectid &&
1806             found_key.type == log_key.type &&
1807             found_key.offset == log_key.offset &&
1808             btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1809                 update_size = false;
1810                 goto out;
1811         }
1812
1813         /*
1814          * don't drop the conflicting directory entry if the inode
1815          * for the new entry doesn't exist
1816          */
1817         if (!exists)
1818                 goto out;
1819
1820         ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
1821         if (ret)
1822                 goto out;
1823
1824         if (key->type == BTRFS_DIR_INDEX_KEY)
1825                 goto insert;
1826 out:
1827         btrfs_release_path(path);
1828         if (!ret && update_size) {
1829                 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
1830                 ret = btrfs_update_inode(trans, root, dir);
1831         }
1832         kfree(name);
1833         iput(dir);
1834         if (!ret && name_added)
1835                 ret = 1;
1836         return ret;
1837
1838 insert:
1839         if (name_in_log_ref(root->log_root, name, name_len,
1840                             key->objectid, log_key.objectid)) {
1841                 /* The dentry will be added later. */
1842                 ret = 0;
1843                 update_size = false;
1844                 goto out;
1845         }
1846         btrfs_release_path(path);
1847         ret = insert_one_name(trans, root, key->objectid, key->offset,
1848                               name, name_len, &log_key);
1849         if (ret && ret != -ENOENT && ret != -EEXIST)
1850                 goto out;
1851         if (!ret)
1852                 name_added = true;
1853         update_size = false;
1854         ret = 0;
1855         goto out;
1856 }
1857
1858 /*
1859  * find all the names in a directory item and reconcile them into
1860  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1861  * one name in a directory item, but the same code gets used for
1862  * both directory index types
1863  */
1864 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1865                                         struct btrfs_root *root,
1866                                         struct btrfs_path *path,
1867                                         struct extent_buffer *eb, int slot,
1868                                         struct btrfs_key *key)
1869 {
1870         struct btrfs_fs_info *fs_info = root->fs_info;
1871         int ret = 0;
1872         u32 item_size = btrfs_item_size_nr(eb, slot);
1873         struct btrfs_dir_item *di;
1874         int name_len;
1875         unsigned long ptr;
1876         unsigned long ptr_end;
1877         struct btrfs_path *fixup_path = NULL;
1878
1879         ptr = btrfs_item_ptr_offset(eb, slot);
1880         ptr_end = ptr + item_size;
1881         while (ptr < ptr_end) {
1882                 di = (struct btrfs_dir_item *)ptr;
1883                 if (verify_dir_item(fs_info, eb, slot, di))
1884                         return -EIO;
1885                 name_len = btrfs_dir_name_len(eb, di);
1886                 ret = replay_one_name(trans, root, path, eb, di, key);
1887                 if (ret < 0)
1888                         break;
1889                 ptr = (unsigned long)(di + 1);
1890                 ptr += name_len;
1891
1892                 /*
1893                  * If this entry refers to a non-directory (directories can not
1894                  * have a link count > 1) and it was added in the transaction
1895                  * that was not committed, make sure we fixup the link count of
1896                  * the inode it the entry points to. Otherwise something like
1897                  * the following would result in a directory pointing to an
1898                  * inode with a wrong link that does not account for this dir
1899                  * entry:
1900                  *
1901                  * mkdir testdir
1902                  * touch testdir/foo
1903                  * touch testdir/bar
1904                  * sync
1905                  *
1906                  * ln testdir/bar testdir/bar_link
1907                  * ln testdir/foo testdir/foo_link
1908                  * xfs_io -c "fsync" testdir/bar
1909                  *
1910                  * <power failure>
1911                  *
1912                  * mount fs, log replay happens
1913                  *
1914                  * File foo would remain with a link count of 1 when it has two
1915                  * entries pointing to it in the directory testdir. This would
1916                  * make it impossible to ever delete the parent directory has
1917                  * it would result in stale dentries that can never be deleted.
1918                  */
1919                 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
1920                         struct btrfs_key di_key;
1921
1922                         if (!fixup_path) {
1923                                 fixup_path = btrfs_alloc_path();
1924                                 if (!fixup_path) {
1925                                         ret = -ENOMEM;
1926                                         break;
1927                                 }
1928                         }
1929
1930                         btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1931                         ret = link_to_fixup_dir(trans, root, fixup_path,
1932                                                 di_key.objectid);
1933                         if (ret)
1934                                 break;
1935                 }
1936                 ret = 0;
1937         }
1938         btrfs_free_path(fixup_path);
1939         return ret;
1940 }
1941
1942 /*
1943  * directory replay has two parts.  There are the standard directory
1944  * items in the log copied from the subvolume, and range items
1945  * created in the log while the subvolume was logged.
1946  *
1947  * The range items tell us which parts of the key space the log
1948  * is authoritative for.  During replay, if a key in the subvolume
1949  * directory is in a logged range item, but not actually in the log
1950  * that means it was deleted from the directory before the fsync
1951  * and should be removed.
1952  */
1953 static noinline int find_dir_range(struct btrfs_root *root,
1954                                    struct btrfs_path *path,
1955                                    u64 dirid, int key_type,
1956                                    u64 *start_ret, u64 *end_ret)
1957 {
1958         struct btrfs_key key;
1959         u64 found_end;
1960         struct btrfs_dir_log_item *item;
1961         int ret;
1962         int nritems;
1963
1964         if (*start_ret == (u64)-1)
1965                 return 1;
1966
1967         key.objectid = dirid;
1968         key.type = key_type;
1969         key.offset = *start_ret;
1970
1971         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1972         if (ret < 0)
1973                 goto out;
1974         if (ret > 0) {
1975                 if (path->slots[0] == 0)
1976                         goto out;
1977                 path->slots[0]--;
1978         }
1979         if (ret != 0)
1980                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1981
1982         if (key.type != key_type || key.objectid != dirid) {
1983                 ret = 1;
1984                 goto next;
1985         }
1986         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1987                               struct btrfs_dir_log_item);
1988         found_end = btrfs_dir_log_end(path->nodes[0], item);
1989
1990         if (*start_ret >= key.offset && *start_ret <= found_end) {
1991                 ret = 0;
1992                 *start_ret = key.offset;
1993                 *end_ret = found_end;
1994                 goto out;
1995         }
1996         ret = 1;
1997 next:
1998         /* check the next slot in the tree to see if it is a valid item */
1999         nritems = btrfs_header_nritems(path->nodes[0]);
2000         path->slots[0]++;
2001         if (path->slots[0] >= nritems) {
2002                 ret = btrfs_next_leaf(root, path);
2003                 if (ret)
2004                         goto out;
2005         }
2006
2007         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2008
2009         if (key.type != key_type || key.objectid != dirid) {
2010                 ret = 1;
2011                 goto out;
2012         }
2013         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2014                               struct btrfs_dir_log_item);
2015         found_end = btrfs_dir_log_end(path->nodes[0], item);
2016         *start_ret = key.offset;
2017         *end_ret = found_end;
2018         ret = 0;
2019 out:
2020         btrfs_release_path(path);
2021         return ret;
2022 }
2023
2024 /*
2025  * this looks for a given directory item in the log.  If the directory
2026  * item is not in the log, the item is removed and the inode it points
2027  * to is unlinked
2028  */
2029 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2030                                       struct btrfs_root *root,
2031                                       struct btrfs_root *log,
2032                                       struct btrfs_path *path,
2033                                       struct btrfs_path *log_path,
2034                                       struct inode *dir,
2035                                       struct btrfs_key *dir_key)
2036 {
2037         struct btrfs_fs_info *fs_info = root->fs_info;
2038         int ret;
2039         struct extent_buffer *eb;
2040         int slot;
2041         u32 item_size;
2042         struct btrfs_dir_item *di;
2043         struct btrfs_dir_item *log_di;
2044         int name_len;
2045         unsigned long ptr;
2046         unsigned long ptr_end;
2047         char *name;
2048         struct inode *inode;
2049         struct btrfs_key location;
2050
2051 again:
2052         eb = path->nodes[0];
2053         slot = path->slots[0];
2054         item_size = btrfs_item_size_nr(eb, slot);
2055         ptr = btrfs_item_ptr_offset(eb, slot);
2056         ptr_end = ptr + item_size;
2057         while (ptr < ptr_end) {
2058                 di = (struct btrfs_dir_item *)ptr;
2059                 if (verify_dir_item(fs_info, eb, slot, di)) {
2060                         ret = -EIO;
2061                         goto out;
2062                 }
2063
2064                 name_len = btrfs_dir_name_len(eb, di);
2065                 name = kmalloc(name_len, GFP_NOFS);
2066                 if (!name) {
2067                         ret = -ENOMEM;
2068                         goto out;
2069                 }
2070                 read_extent_buffer(eb, name, (unsigned long)(di + 1),
2071                                   name_len);
2072                 log_di = NULL;
2073                 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2074                         log_di = btrfs_lookup_dir_item(trans, log, log_path,
2075                                                        dir_key->objectid,
2076                                                        name, name_len, 0);
2077                 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2078                         log_di = btrfs_lookup_dir_index_item(trans, log,
2079                                                      log_path,
2080                                                      dir_key->objectid,
2081                                                      dir_key->offset,
2082                                                      name, name_len, 0);
2083                 }
2084                 if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
2085                         btrfs_dir_item_key_to_cpu(eb, di, &location);
2086                         btrfs_release_path(path);
2087                         btrfs_release_path(log_path);
2088                         inode = read_one_inode(root, location.objectid);
2089                         if (!inode) {
2090                                 kfree(name);
2091                                 return -EIO;
2092                         }
2093
2094                         ret = link_to_fixup_dir(trans, root,
2095                                                 path, location.objectid);
2096                         if (ret) {
2097                                 kfree(name);
2098                                 iput(inode);
2099                                 goto out;
2100                         }
2101
2102                         inc_nlink(inode);
2103                         ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
2104                                         BTRFS_I(inode), name, name_len);
2105                         if (!ret)
2106                                 ret = btrfs_run_delayed_items(trans, fs_info);
2107                         kfree(name);
2108                         iput(inode);
2109                         if (ret)
2110                                 goto out;
2111
2112                         /* there might still be more names under this key
2113                          * check and repeat if required
2114                          */
2115                         ret = btrfs_search_slot(NULL, root, dir_key, path,
2116                                                 0, 0);
2117                         if (ret == 0)
2118                                 goto again;
2119                         ret = 0;
2120                         goto out;
2121                 } else if (IS_ERR(log_di)) {
2122                         kfree(name);
2123                         return PTR_ERR(log_di);
2124                 }
2125                 btrfs_release_path(log_path);
2126                 kfree(name);
2127
2128                 ptr = (unsigned long)(di + 1);
2129                 ptr += name_len;
2130         }
2131         ret = 0;
2132 out:
2133         btrfs_release_path(path);
2134         btrfs_release_path(log_path);
2135         return ret;
2136 }
2137
2138 static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2139                               struct btrfs_root *root,
2140                               struct btrfs_root *log,
2141                               struct btrfs_path *path,
2142                               const u64 ino)
2143 {
2144         struct btrfs_fs_info *fs_info = root->fs_info;
2145         struct btrfs_key search_key;
2146         struct btrfs_path *log_path;
2147         int i;
2148         int nritems;
2149         int ret;
2150
2151         log_path = btrfs_alloc_path();
2152         if (!log_path)
2153                 return -ENOMEM;
2154
2155         search_key.objectid = ino;
2156         search_key.type = BTRFS_XATTR_ITEM_KEY;
2157         search_key.offset = 0;
2158 again:
2159         ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2160         if (ret < 0)
2161                 goto out;
2162 process_leaf:
2163         nritems = btrfs_header_nritems(path->nodes[0]);
2164         for (i = path->slots[0]; i < nritems; i++) {
2165                 struct btrfs_key key;
2166                 struct btrfs_dir_item *di;
2167                 struct btrfs_dir_item *log_di;
2168                 u32 total_size;
2169                 u32 cur;
2170
2171                 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2172                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2173                         ret = 0;
2174                         goto out;
2175                 }
2176
2177                 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2178                 total_size = btrfs_item_size_nr(path->nodes[0], i);
2179                 cur = 0;
2180                 while (cur < total_size) {
2181                         u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2182                         u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2183                         u32 this_len = sizeof(*di) + name_len + data_len;
2184                         char *name;
2185
2186                         ret = verify_dir_item(fs_info, path->nodes[0], i, di);
2187                         if (ret) {
2188                                 ret = -EIO;
2189                                 goto out;
2190                         }
2191                         name = kmalloc(name_len, GFP_NOFS);
2192                         if (!name) {
2193                                 ret = -ENOMEM;
2194                                 goto out;
2195                         }
2196                         read_extent_buffer(path->nodes[0], name,
2197                                            (unsigned long)(di + 1), name_len);
2198
2199                         log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2200                                                     name, name_len, 0);
2201                         btrfs_release_path(log_path);
2202                         if (!log_di) {
2203                                 /* Doesn't exist in log tree, so delete it. */
2204                                 btrfs_release_path(path);
2205                                 di = btrfs_lookup_xattr(trans, root, path, ino,
2206                                                         name, name_len, -1);
2207                                 kfree(name);
2208                                 if (IS_ERR(di)) {
2209                                         ret = PTR_ERR(di);
2210                                         goto out;
2211                                 }
2212                                 ASSERT(di);
2213                                 ret = btrfs_delete_one_dir_name(trans, root,
2214                                                                 path, di);
2215                                 if (ret)
2216                                         goto out;
2217                                 btrfs_release_path(path);
2218                                 search_key = key;
2219                                 goto again;
2220                         }
2221                         kfree(name);
2222                         if (IS_ERR(log_di)) {
2223                                 ret = PTR_ERR(log_di);
2224                                 goto out;
2225                         }
2226                         cur += this_len;
2227                         di = (struct btrfs_dir_item *)((char *)di + this_len);
2228                 }
2229         }
2230         ret = btrfs_next_leaf(root, path);
2231         if (ret > 0)
2232                 ret = 0;
2233         else if (ret == 0)
2234                 goto process_leaf;
2235 out:
2236         btrfs_free_path(log_path);
2237         btrfs_release_path(path);
2238         return ret;
2239 }
2240
2241
2242 /*
2243  * deletion replay happens before we copy any new directory items
2244  * out of the log or out of backreferences from inodes.  It
2245  * scans the log to find ranges of keys that log is authoritative for,
2246  * and then scans the directory to find items in those ranges that are
2247  * not present in the log.
2248  *
2249  * Anything we don't find in the log is unlinked and removed from the
2250  * directory.
2251  */
2252 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2253                                        struct btrfs_root *root,
2254                                        struct btrfs_root *log,
2255                                        struct btrfs_path *path,
2256                                        u64 dirid, int del_all)
2257 {
2258         u64 range_start;
2259         u64 range_end;
2260         int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2261         int ret = 0;
2262         struct btrfs_key dir_key;
2263         struct btrfs_key found_key;
2264         struct btrfs_path *log_path;
2265         struct inode *dir;
2266
2267         dir_key.objectid = dirid;
2268         dir_key.type = BTRFS_DIR_ITEM_KEY;
2269         log_path = btrfs_alloc_path();
2270         if (!log_path)
2271                 return -ENOMEM;
2272
2273         dir = read_one_inode(root, dirid);
2274         /* it isn't an error if the inode isn't there, that can happen
2275          * because we replay the deletes before we copy in the inode item
2276          * from the log
2277          */
2278         if (!dir) {
2279                 btrfs_free_path(log_path);
2280                 return 0;
2281         }
2282 again:
2283         range_start = 0;
2284         range_end = 0;
2285         while (1) {
2286                 if (del_all)
2287                         range_end = (u64)-1;
2288                 else {
2289                         ret = find_dir_range(log, path, dirid, key_type,
2290                                              &range_start, &range_end);
2291                         if (ret < 0)
2292                                 goto out;
2293                         else if (ret > 0)
2294                                 break;
2295                 }
2296
2297                 dir_key.offset = range_start;
2298                 while (1) {
2299                         int nritems;
2300                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
2301                                                 0, 0);
2302                         if (ret < 0)
2303                                 goto out;
2304
2305                         nritems = btrfs_header_nritems(path->nodes[0]);
2306                         if (path->slots[0] >= nritems) {
2307                                 ret = btrfs_next_leaf(root, path);
2308                                 if (ret == 1)
2309                                         break;
2310                                 else if (ret < 0)
2311                                         goto out;
2312                         }
2313                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2314                                               path->slots[0]);
2315                         if (found_key.objectid != dirid ||
2316                             found_key.type != dir_key.type)
2317                                 goto next_type;
2318
2319                         if (found_key.offset > range_end)
2320                                 break;
2321
2322                         ret = check_item_in_log(trans, root, log, path,
2323                                                 log_path, dir,
2324                                                 &found_key);
2325                         if (ret)
2326                                 goto out;
2327                         if (found_key.offset == (u64)-1)
2328                                 break;
2329                         dir_key.offset = found_key.offset + 1;
2330                 }
2331                 btrfs_release_path(path);
2332                 if (range_end == (u64)-1)
2333                         break;
2334                 range_start = range_end + 1;
2335         }
2336
2337 next_type:
2338         ret = 0;
2339         if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2340                 key_type = BTRFS_DIR_LOG_INDEX_KEY;
2341                 dir_key.type = BTRFS_DIR_INDEX_KEY;
2342                 btrfs_release_path(path);
2343                 goto again;
2344         }
2345 out:
2346         btrfs_release_path(path);
2347         btrfs_free_path(log_path);
2348         iput(dir);
2349         return ret;
2350 }
2351
2352 /*
2353  * the process_func used to replay items from the log tree.  This
2354  * gets called in two different stages.  The first stage just looks
2355  * for inodes and makes sure they are all copied into the subvolume.
2356  *
2357  * The second stage copies all the other item types from the log into
2358  * the subvolume.  The two stage approach is slower, but gets rid of
2359  * lots of complexity around inodes referencing other inodes that exist
2360  * only in the log (references come from either directory items or inode
2361  * back refs).
2362  */
2363 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2364                              struct walk_control *wc, u64 gen)
2365 {
2366         int nritems;
2367         struct btrfs_path *path;
2368         struct btrfs_root *root = wc->replay_dest;
2369         struct btrfs_key key;
2370         int level;
2371         int i;
2372         int ret;
2373
2374         ret = btrfs_read_buffer(eb, gen);
2375         if (ret)
2376                 return ret;
2377
2378         level = btrfs_header_level(eb);
2379
2380         if (level != 0)
2381                 return 0;
2382
2383         path = btrfs_alloc_path();
2384         if (!path)
2385                 return -ENOMEM;
2386
2387         nritems = btrfs_header_nritems(eb);
2388         for (i = 0; i < nritems; i++) {
2389                 btrfs_item_key_to_cpu(eb, &key, i);
2390
2391                 /* inode keys are done during the first stage */
2392                 if (key.type == BTRFS_INODE_ITEM_KEY &&
2393                     wc->stage == LOG_WALK_REPLAY_INODES) {
2394                         struct btrfs_inode_item *inode_item;
2395                         u32 mode;
2396
2397                         inode_item = btrfs_item_ptr(eb, i,
2398                                             struct btrfs_inode_item);
2399                         /*
2400                          * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2401                          * and never got linked before the fsync, skip it, as
2402                          * replaying it is pointless since it would be deleted
2403                          * later. We skip logging tmpfiles, but it's always
2404                          * possible we are replaying a log created with a kernel
2405                          * that used to log tmpfiles.
2406                          */
2407                         if (btrfs_inode_nlink(eb, inode_item) == 0) {
2408                                 wc->ignore_cur_inode = true;
2409                                 continue;
2410                         } else {
2411                                 wc->ignore_cur_inode = false;
2412                         }
2413                         ret = replay_xattr_deletes(wc->trans, root, log,
2414                                                    path, key.objectid);
2415                         if (ret)
2416                                 break;
2417                         mode = btrfs_inode_mode(eb, inode_item);
2418                         if (S_ISDIR(mode)) {
2419                                 ret = replay_dir_deletes(wc->trans,
2420                                          root, log, path, key.objectid, 0);
2421                                 if (ret)
2422                                         break;
2423                         }
2424                         ret = overwrite_item(wc->trans, root, path,
2425                                              eb, i, &key);
2426                         if (ret)
2427                                 break;
2428
2429                         /*
2430                          * Before replaying extents, truncate the inode to its
2431                          * size. We need to do it now and not after log replay
2432                          * because before an fsync we can have prealloc extents
2433                          * added beyond the inode's i_size. If we did it after,
2434                          * through orphan cleanup for example, we would drop
2435                          * those prealloc extents just after replaying them.
2436                          */
2437                         if (S_ISREG(mode)) {
2438                                 struct inode *inode;
2439                                 u64 from;
2440
2441                                 inode = read_one_inode(root, key.objectid);
2442                                 if (!inode) {
2443                                         ret = -EIO;
2444                                         break;
2445                                 }
2446                                 from = ALIGN(i_size_read(inode),
2447                                              root->fs_info->sectorsize);
2448                                 ret = btrfs_drop_extents(wc->trans, root, inode,
2449                                                          from, (u64)-1, 1);
2450                                 if (!ret) {
2451                                         /* Update the inode's nbytes. */
2452                                         ret = btrfs_update_inode(wc->trans,
2453                                                                  root, inode);
2454                                 }
2455                                 iput(inode);
2456                                 if (ret)
2457                                         break;
2458                         }
2459
2460                         ret = link_to_fixup_dir(wc->trans, root,
2461                                                 path, key.objectid);
2462                         if (ret)
2463                                 break;
2464                 }
2465
2466                 if (wc->ignore_cur_inode)
2467                         continue;
2468
2469                 if (key.type == BTRFS_DIR_INDEX_KEY &&
2470                     wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2471                         ret = replay_one_dir_item(wc->trans, root, path,
2472                                                   eb, i, &key);
2473                         if (ret)
2474                                 break;
2475                 }
2476
2477                 if (wc->stage < LOG_WALK_REPLAY_ALL)
2478                         continue;
2479
2480                 /* these keys are simply copied */
2481                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2482                         ret = overwrite_item(wc->trans, root, path,
2483                                              eb, i, &key);
2484                         if (ret)
2485                                 break;
2486                 } else if (key.type == BTRFS_INODE_REF_KEY ||
2487                            key.type == BTRFS_INODE_EXTREF_KEY) {
2488                         ret = add_inode_ref(wc->trans, root, log, path,
2489                                             eb, i, &key);
2490                         if (ret && ret != -ENOENT)
2491                                 break;
2492                         ret = 0;
2493                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2494                         ret = replay_one_extent(wc->trans, root, path,
2495                                                 eb, i, &key);
2496                         if (ret)
2497                                 break;
2498                 } else if (key.type == BTRFS_DIR_ITEM_KEY) {
2499                         ret = replay_one_dir_item(wc->trans, root, path,
2500                                                   eb, i, &key);
2501                         if (ret)
2502                                 break;
2503                 }
2504         }
2505         btrfs_free_path(path);
2506         return ret;
2507 }
2508
2509 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2510                                    struct btrfs_root *root,
2511                                    struct btrfs_path *path, int *level,
2512                                    struct walk_control *wc)
2513 {
2514         struct btrfs_fs_info *fs_info = root->fs_info;
2515         u64 root_owner;
2516         u64 bytenr;
2517         u64 ptr_gen;
2518         struct extent_buffer *next;
2519         struct extent_buffer *cur;
2520         struct extent_buffer *parent;
2521         u32 blocksize;
2522         int ret = 0;
2523
2524         WARN_ON(*level < 0);
2525         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2526
2527         while (*level > 0) {
2528                 WARN_ON(*level < 0);
2529                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2530                 cur = path->nodes[*level];
2531
2532                 WARN_ON(btrfs_header_level(cur) != *level);
2533
2534                 if (path->slots[*level] >=
2535                     btrfs_header_nritems(cur))
2536                         break;
2537
2538                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2539                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2540                 blocksize = fs_info->nodesize;
2541
2542                 parent = path->nodes[*level];
2543                 root_owner = btrfs_header_owner(parent);
2544
2545                 next = btrfs_find_create_tree_block(fs_info, bytenr);
2546                 if (IS_ERR(next))
2547                         return PTR_ERR(next);
2548
2549                 if (*level == 1) {
2550                         ret = wc->process_func(root, next, wc, ptr_gen);
2551                         if (ret) {
2552                                 free_extent_buffer(next);
2553                                 return ret;
2554                         }
2555
2556                         path->slots[*level]++;
2557                         if (wc->free) {
2558                                 ret = btrfs_read_buffer(next, ptr_gen);
2559                                 if (ret) {
2560                                         free_extent_buffer(next);
2561                                         return ret;
2562                                 }
2563
2564                                 if (trans) {
2565                                         btrfs_tree_lock(next);
2566                                         btrfs_set_lock_blocking(next);
2567                                         clean_tree_block(fs_info, next);
2568                                         btrfs_wait_tree_block_writeback(next);
2569                                         btrfs_tree_unlock(next);
2570                                 } else {
2571                                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2572                                                 clear_extent_buffer_dirty(next);
2573                                 }
2574
2575                                 WARN_ON(root_owner !=
2576                                         BTRFS_TREE_LOG_OBJECTID);
2577                                 ret = btrfs_free_and_pin_reserved_extent(
2578                                                         fs_info, bytenr,
2579                                                         blocksize);
2580                                 if (ret) {
2581                                         free_extent_buffer(next);
2582                                         return ret;
2583                                 }
2584                         }
2585                         free_extent_buffer(next);
2586                         continue;
2587                 }
2588                 ret = btrfs_read_buffer(next, ptr_gen);
2589                 if (ret) {
2590                         free_extent_buffer(next);
2591                         return ret;
2592                 }
2593
2594                 WARN_ON(*level <= 0);
2595                 if (path->nodes[*level-1])
2596                         free_extent_buffer(path->nodes[*level-1]);
2597                 path->nodes[*level-1] = next;
2598                 *level = btrfs_header_level(next);
2599                 path->slots[*level] = 0;
2600                 cond_resched();
2601         }
2602         WARN_ON(*level < 0);
2603         WARN_ON(*level >= BTRFS_MAX_LEVEL);
2604
2605         path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2606
2607         cond_resched();
2608         return 0;
2609 }
2610
2611 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2612                                  struct btrfs_root *root,
2613                                  struct btrfs_path *path, int *level,
2614                                  struct walk_control *wc)
2615 {
2616         struct btrfs_fs_info *fs_info = root->fs_info;
2617         u64 root_owner;
2618         int i;
2619         int slot;
2620         int ret;
2621
2622         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2623                 slot = path->slots[i];
2624                 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2625                         path->slots[i]++;
2626                         *level = i;
2627                         WARN_ON(*level == 0);
2628                         return 0;
2629                 } else {
2630                         struct extent_buffer *parent;
2631                         if (path->nodes[*level] == root->node)
2632                                 parent = path->nodes[*level];
2633                         else
2634                                 parent = path->nodes[*level + 1];
2635
2636                         root_owner = btrfs_header_owner(parent);
2637                         ret = wc->process_func(root, path->nodes[*level], wc,
2638                                  btrfs_header_generation(path->nodes[*level]));
2639                         if (ret)
2640                                 return ret;
2641
2642                         if (wc->free) {
2643                                 struct extent_buffer *next;
2644
2645                                 next = path->nodes[*level];
2646
2647                                 if (trans) {
2648                                         btrfs_tree_lock(next);
2649                                         btrfs_set_lock_blocking(next);
2650                                         clean_tree_block(fs_info, next);
2651                                         btrfs_wait_tree_block_writeback(next);
2652                                         btrfs_tree_unlock(next);
2653                                 } else {
2654                                         if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2655                                                 clear_extent_buffer_dirty(next);
2656                                 }
2657
2658                                 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
2659                                 ret = btrfs_free_and_pin_reserved_extent(
2660                                                 fs_info,
2661                                                 path->nodes[*level]->start,
2662                                                 path->nodes[*level]->len);
2663                                 if (ret)
2664                                         return ret;
2665                         }
2666                         free_extent_buffer(path->nodes[*level]);
2667                         path->nodes[*level] = NULL;
2668                         *level = i + 1;
2669                 }
2670         }
2671         return 1;
2672 }
2673
2674 /*
2675  * drop the reference count on the tree rooted at 'snap'.  This traverses
2676  * the tree freeing any blocks that have a ref count of zero after being
2677  * decremented.
2678  */
2679 static int walk_log_tree(struct btrfs_trans_handle *trans,
2680                          struct btrfs_root *log, struct walk_control *wc)
2681 {
2682         struct btrfs_fs_info *fs_info = log->fs_info;
2683         int ret = 0;
2684         int wret;
2685         int level;
2686         struct btrfs_path *path;
2687         int orig_level;
2688
2689         path = btrfs_alloc_path();
2690         if (!path)
2691                 return -ENOMEM;
2692
2693         level = btrfs_header_level(log->node);
2694         orig_level = level;
2695         path->nodes[level] = log->node;
2696         extent_buffer_get(log->node);
2697         path->slots[level] = 0;
2698
2699         while (1) {
2700                 wret = walk_down_log_tree(trans, log, path, &level, wc);
2701                 if (wret > 0)
2702                         break;
2703                 if (wret < 0) {
2704                         ret = wret;
2705                         goto out;
2706                 }
2707
2708                 wret = walk_up_log_tree(trans, log, path, &level, wc);
2709                 if (wret > 0)
2710                         break;
2711                 if (wret < 0) {
2712                         ret = wret;
2713                         goto out;
2714                 }
2715         }
2716
2717         /* was the root node processed? if not, catch it here */
2718         if (path->nodes[orig_level]) {
2719                 ret = wc->process_func(log, path->nodes[orig_level], wc,
2720                          btrfs_header_generation(path->nodes[orig_level]));
2721                 if (ret)
2722                         goto out;
2723                 if (wc->free) {
2724                         struct extent_buffer *next;
2725
2726                         next = path->nodes[orig_level];
2727
2728                         if (trans) {
2729                                 btrfs_tree_lock(next);
2730                                 btrfs_set_lock_blocking(next);
2731                                 clean_tree_block(fs_info, next);
2732                                 btrfs_wait_tree_block_writeback(next);
2733                                 btrfs_tree_unlock(next);
2734                         } else {
2735                                 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2736                                         clear_extent_buffer_dirty(next);
2737                         }
2738
2739                         WARN_ON(log->root_key.objectid !=
2740                                 BTRFS_TREE_LOG_OBJECTID);
2741                         ret = btrfs_free_and_pin_reserved_extent(fs_info,
2742                                                         next->start, next->len);
2743                         if (ret)
2744                                 goto out;
2745                 }
2746         }
2747
2748 out:
2749         btrfs_free_path(path);
2750         return ret;
2751 }
2752
2753 /*
2754  * helper function to update the item for a given subvolumes log root
2755  * in the tree of log roots
2756  */
2757 static int update_log_root(struct btrfs_trans_handle *trans,
2758                            struct btrfs_root *log,
2759                            struct btrfs_root_item *root_item)
2760 {
2761         struct btrfs_fs_info *fs_info = log->fs_info;
2762         int ret;
2763
2764         if (log->log_transid == 1) {
2765                 /* insert root item on the first sync */
2766                 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2767                                 &log->root_key, root_item);
2768         } else {
2769                 ret = btrfs_update_root(trans, fs_info->log_root_tree,
2770                                 &log->root_key, root_item);
2771         }
2772         return ret;
2773 }
2774
2775 static void wait_log_commit(struct btrfs_root *root, int transid)
2776 {
2777         DEFINE_WAIT(wait);
2778         int index = transid % 2;
2779
2780         /*
2781          * we only allow two pending log transactions at a time,
2782          * so we know that if ours is more than 2 older than the
2783          * current transaction, we're done
2784          */
2785         do {
2786                 prepare_to_wait(&root->log_commit_wait[index],
2787                                 &wait, TASK_UNINTERRUPTIBLE);
2788                 mutex_unlock(&root->log_mutex);
2789
2790                 if (root->log_transid_committed < transid &&
2791                     atomic_read(&root->log_commit[index]))
2792                         schedule();
2793
2794                 finish_wait(&root->log_commit_wait[index], &wait);
2795                 mutex_lock(&root->log_mutex);
2796         } while (root->log_transid_committed < transid &&
2797                  atomic_read(&root->log_commit[index]));
2798 }
2799
2800 static void wait_for_writer(struct btrfs_root *root)
2801 {
2802         DEFINE_WAIT(wait);
2803
2804         while (atomic_read(&root->log_writers)) {
2805                 prepare_to_wait(&root->log_writer_wait,
2806                                 &wait, TASK_UNINTERRUPTIBLE);
2807                 mutex_unlock(&root->log_mutex);
2808                 if (atomic_read(&root->log_writers))
2809                         schedule();
2810                 finish_wait(&root->log_writer_wait, &wait);
2811                 mutex_lock(&root->log_mutex);
2812         }
2813 }
2814
2815 static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2816                                         struct btrfs_log_ctx *ctx)
2817 {
2818         if (!ctx)
2819                 return;
2820
2821         mutex_lock(&root->log_mutex);
2822         list_del_init(&ctx->list);
2823         mutex_unlock(&root->log_mutex);
2824 }
2825
2826 /*
2827  * Invoked in log mutex context, or be sure there is no other task which
2828  * can access the list.
2829  */
2830 static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
2831                                              int index, int error)
2832 {
2833         struct btrfs_log_ctx *ctx;
2834         struct btrfs_log_ctx *safe;
2835
2836         list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
2837                 list_del_init(&ctx->list);
2838                 ctx->log_ret = error;
2839         }
2840
2841         INIT_LIST_HEAD(&root->log_ctxs[index]);
2842 }
2843
2844 /*
2845  * btrfs_sync_log does sends a given tree log down to the disk and
2846  * updates the super blocks to record it.  When this call is done,
2847  * you know that any inodes previously logged are safely on disk only
2848  * if it returns 0.
2849  *
2850  * Any other return value means you need to call btrfs_commit_transaction.
2851  * Some of the edge cases for fsyncing directories that have had unlinks
2852  * or renames done in the past mean that sometimes the only safe
2853  * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
2854  * that has happened.
2855  */
2856 int btrfs_sync_log(struct btrfs_trans_handle *trans,
2857                    struct btrfs_root *root, struct btrfs_log_ctx *ctx)
2858 {
2859         int index1;
2860         int index2;
2861         int mark;
2862         int ret;
2863         struct btrfs_fs_info *fs_info = root->fs_info;
2864         struct btrfs_root *log = root->log_root;
2865         struct btrfs_root *log_root_tree = fs_info->log_root_tree;
2866         struct btrfs_root_item new_root_item;
2867         int log_transid = 0;
2868         struct btrfs_log_ctx root_log_ctx;
2869         struct blk_plug plug;
2870
2871         mutex_lock(&root->log_mutex);
2872         log_transid = ctx->log_transid;
2873         if (root->log_transid_committed >= log_transid) {
2874                 mutex_unlock(&root->log_mutex);
2875                 return ctx->log_ret;
2876         }
2877
2878         index1 = log_transid % 2;
2879         if (atomic_read(&root->log_commit[index1])) {
2880                 wait_log_commit(root, log_transid);
2881                 mutex_unlock(&root->log_mutex);
2882                 return ctx->log_ret;
2883         }
2884         ASSERT(log_transid == root->log_transid);
2885         atomic_set(&root->log_commit[index1], 1);
2886
2887         /* wait for previous tree log sync to complete */
2888         if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2889                 wait_log_commit(root, log_transid - 1);
2890
2891         while (1) {
2892                 int batch = atomic_read(&root->log_batch);
2893                 /* when we're on an ssd, just kick the log commit out */
2894                 if (!btrfs_test_opt(fs_info, SSD) &&
2895                     test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
2896                         mutex_unlock(&root->log_mutex);
2897                         schedule_timeout_uninterruptible(1);
2898                         mutex_lock(&root->log_mutex);
2899                 }
2900                 wait_for_writer(root);
2901                 if (batch == atomic_read(&root->log_batch))
2902                         break;
2903         }
2904
2905         /* bail out if we need to do a full commit */
2906         if (btrfs_need_log_full_commit(fs_info, trans)) {
2907                 ret = -EAGAIN;
2908                 btrfs_free_logged_extents(log, log_transid);
2909                 mutex_unlock(&root->log_mutex);
2910                 goto out;
2911         }
2912
2913         if (log_transid % 2 == 0)
2914                 mark = EXTENT_DIRTY;
2915         else
2916                 mark = EXTENT_NEW;
2917
2918         /* we start IO on  all the marked extents here, but we don't actually
2919          * wait for them until later.
2920          */
2921         blk_start_plug(&plug);
2922         ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
2923         if (ret) {
2924                 blk_finish_plug(&plug);
2925                 btrfs_abort_transaction(trans, ret);
2926                 btrfs_free_logged_extents(log, log_transid);
2927                 btrfs_set_log_full_commit(fs_info, trans);
2928                 mutex_unlock(&root->log_mutex);
2929                 goto out;
2930         }
2931
2932         /*
2933          * We _must_ update under the root->log_mutex in order to make sure we
2934          * have a consistent view of the log root we are trying to commit at
2935          * this moment.
2936          *
2937          * We _must_ copy this into a local copy, because we are not holding the
2938          * log_root_tree->log_mutex yet.  This is important because when we
2939          * commit the log_root_tree we must have a consistent view of the
2940          * log_root_tree when we update the super block to point at the
2941          * log_root_tree bytenr.  If we update the log_root_tree here we'll race
2942          * with the commit and possibly point at the new block which we may not
2943          * have written out.
2944          */
2945         btrfs_set_root_node(&log->root_item, log->node);
2946         memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
2947
2948         root->log_transid++;
2949         log->log_transid = root->log_transid;
2950         root->log_start_pid = 0;
2951         /*
2952          * IO has been started, blocks of the log tree have WRITTEN flag set
2953          * in their headers. new modifications of the log will be written to
2954          * new positions. so it's safe to allow log writers to go in.
2955          */
2956         mutex_unlock(&root->log_mutex);
2957
2958         btrfs_init_log_ctx(&root_log_ctx, NULL);
2959
2960         mutex_lock(&log_root_tree->log_mutex);
2961         atomic_inc(&log_root_tree->log_batch);
2962         atomic_inc(&log_root_tree->log_writers);
2963
2964         index2 = log_root_tree->log_transid % 2;
2965         list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
2966         root_log_ctx.log_transid = log_root_tree->log_transid;
2967
2968         mutex_unlock(&log_root_tree->log_mutex);
2969
2970         mutex_lock(&log_root_tree->log_mutex);
2971
2972         /*
2973          * Now we are safe to update the log_root_tree because we're under the
2974          * log_mutex, and we're a current writer so we're holding the commit
2975          * open until we drop the log_mutex.
2976          */
2977         ret = update_log_root(trans, log, &new_root_item);
2978
2979         if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2980                 /*
2981                  * Implicit memory barrier after atomic_dec_and_test
2982                  */
2983                 if (waitqueue_active(&log_root_tree->log_writer_wait))
2984                         wake_up(&log_root_tree->log_writer_wait);
2985         }
2986
2987         if (ret) {
2988                 if (!list_empty(&root_log_ctx.list))
2989                         list_del_init(&root_log_ctx.list);
2990
2991                 blk_finish_plug(&plug);
2992                 btrfs_set_log_full_commit(fs_info, trans);
2993
2994                 if (ret != -ENOSPC) {
2995                         btrfs_abort_transaction(trans, ret);
2996                         mutex_unlock(&log_root_tree->log_mutex);
2997                         goto out;
2998                 }
2999                 btrfs_wait_tree_log_extents(log, mark);
3000                 btrfs_free_logged_extents(log, log_transid);
3001                 mutex_unlock(&log_root_tree->log_mutex);
3002                 ret = -EAGAIN;
3003                 goto out;
3004         }
3005
3006         if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3007                 blk_finish_plug(&plug);
3008                 list_del_init(&root_log_ctx.list);
3009                 mutex_unlock(&log_root_tree->log_mutex);
3010                 ret = root_log_ctx.log_ret;
3011                 goto out;
3012         }
3013
3014         index2 = root_log_ctx.log_transid % 2;
3015         if (atomic_read(&log_root_tree->log_commit[index2])) {
3016                 blk_finish_plug(&plug);
3017                 ret = btrfs_wait_tree_log_extents(log, mark);
3018                 btrfs_wait_logged_extents(trans, log, log_transid);
3019                 wait_log_commit(log_root_tree,
3020                                 root_log_ctx.log_transid);
3021                 mutex_unlock(&log_root_tree->log_mutex);
3022                 if (!ret)
3023                         ret = root_log_ctx.log_ret;
3024                 goto out;
3025         }
3026         ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3027         atomic_set(&log_root_tree->log_commit[index2], 1);
3028
3029         if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3030                 wait_log_commit(log_root_tree,
3031                                 root_log_ctx.log_transid - 1);
3032         }
3033
3034         wait_for_writer(log_root_tree);
3035
3036         /*
3037          * now that we've moved on to the tree of log tree roots,
3038          * check the full commit flag again
3039          */
3040         if (btrfs_need_log_full_commit(fs_info, trans)) {
3041                 blk_finish_plug(&plug);
3042                 btrfs_wait_tree_log_extents(log, mark);
3043                 btrfs_free_logged_extents(log, log_transid);
3044                 mutex_unlock(&log_root_tree->log_mutex);
3045                 ret = -EAGAIN;
3046                 goto out_wake_log_root;
3047         }
3048
3049         ret = btrfs_write_marked_extents(fs_info,
3050                                          &log_root_tree->dirty_log_pages,
3051                                          EXTENT_DIRTY | EXTENT_NEW);
3052         blk_finish_plug(&plug);
3053         if (ret) {
3054                 btrfs_set_log_full_commit(fs_info, trans);
3055                 btrfs_abort_transaction(trans, ret);
3056                 btrfs_free_logged_extents(log, log_transid);
3057                 mutex_unlock(&log_root_tree->log_mutex);
3058                 goto out_wake_log_root;
3059         }
3060         ret = btrfs_wait_tree_log_extents(log, mark);
3061         if (!ret)
3062                 ret = btrfs_wait_tree_log_extents(log_root_tree,
3063                                                   EXTENT_NEW | EXTENT_DIRTY);
3064         if (ret) {
3065                 btrfs_set_log_full_commit(fs_info, trans);
3066                 btrfs_free_logged_extents(log, log_transid);
3067                 mutex_unlock(&log_root_tree->log_mutex);
3068                 goto out_wake_log_root;
3069         }
3070         btrfs_wait_logged_extents(trans, log, log_transid);
3071
3072         btrfs_set_super_log_root(fs_info->super_for_commit,
3073                                  log_root_tree->node->start);
3074         btrfs_set_super_log_root_level(fs_info->super_for_commit,
3075                                        btrfs_header_level(log_root_tree->node));
3076
3077         log_root_tree->log_transid++;
3078         mutex_unlock(&log_root_tree->log_mutex);
3079
3080         /*
3081          * nobody else is going to jump in and write the the ctree
3082          * super here because the log_commit atomic below is protecting
3083          * us.  We must be called with a transaction handle pinning
3084          * the running transaction open, so a full commit can't hop
3085          * in and cause problems either.
3086          */
3087         ret = write_all_supers(fs_info, 1);
3088         if (ret) {
3089                 btrfs_set_log_full_commit(fs_info, trans);
3090                 btrfs_abort_transaction(trans, ret);
3091                 goto out_wake_log_root;
3092         }
3093
3094         mutex_lock(&root->log_mutex);
3095         if (root->last_log_commit < log_transid)
3096                 root->last_log_commit = log_transid;
3097         mutex_unlock(&root->log_mutex);
3098
3099 out_wake_log_root:
3100         mutex_lock(&log_root_tree->log_mutex);
3101         btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3102
3103         log_root_tree->log_transid_committed++;
3104         atomic_set(&log_root_tree->log_commit[index2], 0);
3105         mutex_unlock(&log_root_tree->log_mutex);
3106
3107         /*
3108          * The barrier before waitqueue_active is needed so all the updates
3109          * above are seen by the woken threads. It might not be necessary, but
3110          * proving that seems to be hard.
3111          */
3112         smp_mb();
3113         if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
3114                 wake_up(&log_root_tree->log_commit_wait[index2]);
3115 out:
3116         mutex_lock(&root->log_mutex);
3117         btrfs_remove_all_log_ctxs(root, index1, ret);
3118         root->log_transid_committed++;
3119         atomic_set(&root->log_commit[index1], 0);
3120         mutex_unlock(&root->log_mutex);
3121
3122         /*
3123          * The barrier before waitqueue_active is needed so all the updates
3124          * above are seen by the woken threads. It might not be necessary, but
3125          * proving that seems to be hard.
3126          */
3127         smp_mb();
3128         if (waitqueue_active(&root->log_commit_wait[index1]))
3129                 wake_up(&root->log_commit_wait[index1]);
3130         return ret;
3131 }
3132
3133 static void free_log_tree(struct btrfs_trans_handle *trans,
3134                           struct btrfs_root *log)
3135 {
3136         int ret;
3137         u64 start;
3138         u64 end;
3139         struct walk_control wc = {
3140                 .free = 1,
3141                 .process_func = process_one_buffer
3142         };
3143
3144         ret = walk_log_tree(trans, log, &wc);
3145         if (ret) {
3146                 if (trans)
3147                         btrfs_abort_transaction(trans, ret);
3148                 else
3149                         btrfs_handle_fs_error(log->fs_info, ret, NULL);
3150         }
3151
3152         while (1) {
3153                 ret = find_first_extent_bit(&log->dirty_log_pages,
3154                                 0, &start, &end,
3155                                 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT,
3156                                 NULL);
3157                 if (ret)
3158                         break;
3159
3160                 clear_extent_bits(&log->dirty_log_pages, start, end,
3161                                   EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3162         }
3163
3164         /*
3165          * We may have short-circuited the log tree with the full commit logic
3166          * and left ordered extents on our list, so clear these out to keep us
3167          * from leaking inodes and memory.
3168          */
3169         btrfs_free_logged_extents(log, 0);
3170         btrfs_free_logged_extents(log, 1);
3171
3172         free_extent_buffer(log->node);
3173         kfree(log);
3174 }
3175
3176 /*
3177  * free all the extents used by the tree log.  This should be called
3178  * at commit time of the full transaction
3179  */
3180 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3181 {
3182         if (root->log_root) {
3183                 free_log_tree(trans, root->log_root);
3184                 root->log_root = NULL;
3185         }
3186         return 0;
3187 }
3188
3189 int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3190                              struct btrfs_fs_info *fs_info)
3191 {
3192         if (fs_info->log_root_tree) {
3193                 free_log_tree(trans, fs_info->log_root_tree);
3194                 fs_info->log_root_tree = NULL;
3195         }
3196         return 0;
3197 }
3198
3199 /*
3200  * Check if an inode was logged in the current transaction. We can't always rely
3201  * on an inode's logged_trans value, because it's an in-memory only field and
3202  * therefore not persisted. This means that its value is lost if the inode gets
3203  * evicted and loaded again from disk (in which case it has a value of 0, and
3204  * certainly it is smaller then any possible transaction ID), when that happens
3205  * the full_sync flag is set in the inode's runtime flags, so on that case we
3206  * assume eviction happened and ignore the logged_trans value, assuming the
3207  * worst case, that the inode was logged before in the current transaction.
3208  */
3209 static bool inode_logged(struct btrfs_trans_handle *trans,
3210                          struct btrfs_inode *inode)
3211 {
3212         if (inode->logged_trans == trans->transid)
3213                 return true;
3214
3215         if (inode->last_trans == trans->transid &&
3216             test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3217             !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3218                 return true;
3219
3220         return false;
3221 }
3222
3223 /*
3224  * If both a file and directory are logged, and unlinks or renames are
3225  * mixed in, we have a few interesting corners:
3226  *
3227  * create file X in dir Y
3228  * link file X to X.link in dir Y
3229  * fsync file X
3230  * unlink file X but leave X.link
3231  * fsync dir Y
3232  *
3233  * After a crash we would expect only X.link to exist.  But file X
3234  * didn't get fsync'd again so the log has back refs for X and X.link.
3235  *
3236  * We solve this by removing directory entries and inode backrefs from the
3237  * log when a file that was logged in the current transaction is
3238  * unlinked.  Any later fsync will include the updated log entries, and
3239  * we'll be able to reconstruct the proper directory items from backrefs.
3240  *
3241  * This optimizations allows us to avoid relogging the entire inode
3242  * or the entire directory.
3243  */
3244 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3245                                  struct btrfs_root *root,
3246                                  const char *name, int name_len,
3247                                  struct btrfs_inode *dir, u64 index)
3248 {
3249         struct btrfs_root *log;
3250         struct btrfs_dir_item *di;
3251         struct btrfs_path *path;
3252         int ret;
3253         int err = 0;
3254         int bytes_del = 0;
3255         u64 dir_ino = btrfs_ino(dir);
3256
3257         if (!inode_logged(trans, dir))
3258                 return 0;
3259
3260         ret = join_running_log_trans(root);
3261         if (ret)
3262                 return 0;
3263
3264         mutex_lock(&dir->log_mutex);
3265
3266         log = root->log_root;
3267         path = btrfs_alloc_path();
3268         if (!path) {
3269                 err = -ENOMEM;
3270                 goto out_unlock;
3271         }
3272
3273         di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3274                                    name, name_len, -1);
3275         if (IS_ERR(di)) {
3276                 err = PTR_ERR(di);
3277                 goto fail;
3278         }
3279         if (di) {
3280                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3281                 bytes_del += name_len;
3282                 if (ret) {
3283                         err = ret;
3284                         goto fail;
3285                 }
3286         }
3287         btrfs_release_path(path);
3288         di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3289                                          index, name, name_len, -1);
3290         if (IS_ERR(di)) {
3291                 err = PTR_ERR(di);
3292                 goto fail;
3293         }
3294         if (di) {
3295                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3296                 bytes_del += name_len;
3297                 if (ret) {
3298                         err = ret;
3299                         goto fail;
3300                 }
3301         }
3302
3303         /* update the directory size in the log to reflect the names
3304          * we have removed
3305          */
3306         if (bytes_del) {
3307                 struct btrfs_key key;
3308
3309                 key.objectid = dir_ino;
3310                 key.offset = 0;
3311                 key.type = BTRFS_INODE_ITEM_KEY;
3312                 btrfs_release_path(path);
3313
3314                 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3315                 if (ret < 0) {
3316                         err = ret;
3317                         goto fail;
3318                 }
3319                 if (ret == 0) {
3320                         struct btrfs_inode_item *item;
3321                         u64 i_size;
3322
3323                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3324                                               struct btrfs_inode_item);
3325                         i_size = btrfs_inode_size(path->nodes[0], item);
3326                         if (i_size > bytes_del)
3327                                 i_size -= bytes_del;
3328                         else
3329                                 i_size = 0;
3330                         btrfs_set_inode_size(path->nodes[0], item, i_size);
3331                         btrfs_mark_buffer_dirty(path->nodes[0]);
3332                 } else
3333                         ret = 0;
3334                 btrfs_release_path(path);
3335         }
3336 fail:
3337         btrfs_free_path(path);
3338 out_unlock:
3339         mutex_unlock(&dir->log_mutex);
3340         if (err == -ENOSPC) {
3341                 btrfs_set_log_full_commit(root->fs_info, trans);
3342                 err = 0;
3343         } else if (err < 0 && err != -ENOENT) {
3344                 /* ENOENT can be returned if the entry hasn't been fsynced yet */
3345                 btrfs_abort_transaction(trans, err);
3346         }
3347
3348         btrfs_end_log_trans(root);
3349
3350         return err;
3351 }
3352
3353 /* see comments for btrfs_del_dir_entries_in_log */
3354 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3355                                struct btrfs_root *root,
3356                                const char *name, int name_len,
3357                                struct btrfs_inode *inode, u64 dirid)
3358 {
3359         struct btrfs_fs_info *fs_info = root->fs_info;
3360         struct btrfs_root *log;
3361         u64 index;
3362         int ret;
3363
3364         if (!inode_logged(trans, inode))
3365                 return 0;
3366
3367         ret = join_running_log_trans(root);
3368         if (ret)
3369                 return 0;
3370         log = root->log_root;
3371         mutex_lock(&inode->log_mutex);
3372
3373         ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3374                                   dirid, &index);
3375         mutex_unlock(&inode->log_mutex);
3376         if (ret == -ENOSPC) {
3377                 btrfs_set_log_full_commit(fs_info, trans);
3378                 ret = 0;
3379         } else if (ret < 0 && ret != -ENOENT)
3380                 btrfs_abort_transaction(trans, ret);
3381         btrfs_end_log_trans(root);
3382
3383         return ret;
3384 }
3385
3386 /*
3387  * creates a range item in the log for 'dirid'.  first_offset and
3388  * last_offset tell us which parts of the key space the log should
3389  * be considered authoritative for.
3390  */
3391 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3392                                        struct btrfs_root *log,
3393                                        struct btrfs_path *path,
3394                                        int key_type, u64 dirid,
3395                                        u64 first_offset, u64 last_offset)
3396 {
3397         int ret;
3398         struct btrfs_key key;
3399         struct btrfs_dir_log_item *item;
3400
3401         key.objectid = dirid;
3402         key.offset = first_offset;
3403         if (key_type == BTRFS_DIR_ITEM_KEY)
3404                 key.type = BTRFS_DIR_LOG_ITEM_KEY;
3405         else
3406                 key.type = BTRFS_DIR_LOG_INDEX_KEY;
3407         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3408         if (ret)
3409                 return ret;
3410
3411         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3412                               struct btrfs_dir_log_item);
3413         btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3414         btrfs_mark_buffer_dirty(path->nodes[0]);
3415         btrfs_release_path(path);
3416         return 0;
3417 }
3418
3419 /*
3420  * log all the items included in the current transaction for a given
3421  * directory.  This also creates the range items in the log tree required
3422  * to replay anything deleted before the fsync
3423  */
3424 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3425                           struct btrfs_root *root, struct btrfs_inode *inode,
3426                           struct btrfs_path *path,
3427                           struct btrfs_path *dst_path, int key_type,
3428                           struct btrfs_log_ctx *ctx,
3429                           u64 min_offset, u64 *last_offset_ret)
3430 {
3431         struct btrfs_key min_key;
3432         struct btrfs_root *log = root->log_root;
3433         struct extent_buffer *src;
3434         int err = 0;
3435         int ret;
3436         int i;
3437         int nritems;
3438         u64 first_offset = min_offset;
3439         u64 last_offset = (u64)-1;
3440         u64 ino = btrfs_ino(inode);
3441
3442         log = root->log_root;
3443
3444         min_key.objectid = ino;
3445         min_key.type = key_type;
3446         min_key.offset = min_offset;
3447
3448         ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3449
3450         /*
3451          * we didn't find anything from this transaction, see if there
3452          * is anything at all
3453          */
3454         if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3455                 min_key.objectid = ino;
3456                 min_key.type = key_type;
3457                 min_key.offset = (u64)-1;
3458                 btrfs_release_path(path);
3459                 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3460                 if (ret < 0) {
3461                         btrfs_release_path(path);
3462                         return ret;
3463                 }
3464                 ret = btrfs_previous_item(root, path, ino, key_type);
3465
3466                 /* if ret == 0 there are items for this type,
3467                  * create a range to tell us the last key of this type.
3468                  * otherwise, there are no items in this directory after
3469                  * *min_offset, and we create a range to indicate that.
3470                  */
3471                 if (ret == 0) {
3472                         struct btrfs_key tmp;
3473                         btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3474                                               path->slots[0]);
3475                         if (key_type == tmp.type)
3476                                 first_offset = max(min_offset, tmp.offset) + 1;
3477                 }
3478                 goto done;
3479         }
3480
3481         /* go backward to find any previous key */
3482         ret = btrfs_previous_item(root, path, ino, key_type);
3483         if (ret == 0) {
3484                 struct btrfs_key tmp;
3485                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3486                 if (key_type == tmp.type) {
3487                         first_offset = tmp.offset;
3488                         ret = overwrite_item(trans, log, dst_path,
3489                                              path->nodes[0], path->slots[0],
3490                                              &tmp);
3491                         if (ret) {
3492                                 err = ret;
3493                                 goto done;
3494                         }
3495                 }
3496         }
3497         btrfs_release_path(path);
3498
3499         /*
3500          * Find the first key from this transaction again.  See the note for
3501          * log_new_dir_dentries, if we're logging a directory recursively we
3502          * won't be holding its i_mutex, which means we can modify the directory
3503          * while we're logging it.  If we remove an entry between our first
3504          * search and this search we'll not find the key again and can just
3505          * bail.
3506          */
3507 search:
3508         ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3509         if (ret != 0)
3510                 goto done;
3511
3512         /*
3513          * we have a block from this transaction, log every item in it
3514          * from our directory
3515          */
3516         while (1) {
3517                 struct btrfs_key tmp;
3518                 src = path->nodes[0];
3519                 nritems = btrfs_header_nritems(src);
3520                 for (i = path->slots[0]; i < nritems; i++) {
3521                         struct btrfs_dir_item *di;
3522
3523                         btrfs_item_key_to_cpu(src, &min_key, i);
3524
3525                         if (min_key.objectid != ino || min_key.type != key_type)
3526                                 goto done;
3527
3528                         if (need_resched()) {
3529                                 btrfs_release_path(path);
3530                                 cond_resched();
3531                                 goto search;
3532                         }
3533
3534                         ret = overwrite_item(trans, log, dst_path, src, i,
3535                                              &min_key);
3536                         if (ret) {
3537                                 err = ret;
3538                                 goto done;
3539                         }
3540
3541                         /*
3542                          * We must make sure that when we log a directory entry,
3543                          * the corresponding inode, after log replay, has a
3544                          * matching link count. For example:
3545                          *
3546                          * touch foo
3547                          * mkdir mydir
3548                          * sync
3549                          * ln foo mydir/bar
3550                          * xfs_io -c "fsync" mydir
3551                          * <crash>
3552                          * <mount fs and log replay>
3553                          *
3554                          * Would result in a fsync log that when replayed, our
3555                          * file inode would have a link count of 1, but we get
3556                          * two directory entries pointing to the same inode.
3557                          * After removing one of the names, it would not be
3558                          * possible to remove the other name, which resulted
3559                          * always in stale file handle errors, and would not
3560                          * be possible to rmdir the parent directory, since
3561                          * its i_size could never decrement to the value
3562                          * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3563                          */
3564                         di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3565                         btrfs_dir_item_key_to_cpu(src, di, &tmp);
3566                         if (ctx &&
3567                             (btrfs_dir_transid(src, di) == trans->transid ||
3568                              btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3569                             tmp.type != BTRFS_ROOT_ITEM_KEY)
3570                                 ctx->log_new_dentries = true;
3571                 }
3572                 path->slots[0] = nritems;
3573
3574                 /*
3575                  * look ahead to the next item and see if it is also
3576                  * from this directory and from this transaction
3577                  */
3578                 ret = btrfs_next_leaf(root, path);
3579                 if (ret) {
3580                         if (ret == 1)
3581                                 last_offset = (u64)-1;
3582                         else
3583                                 err = ret;
3584                         goto done;
3585                 }
3586                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3587                 if (tmp.objectid != ino || tmp.type != key_type) {
3588                         last_offset = (u64)-1;
3589                         goto done;
3590                 }
3591                 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3592                         ret = overwrite_item(trans, log, dst_path,
3593                                              path->nodes[0], path->slots[0],
3594                                              &tmp);
3595                         if (ret)
3596                                 err = ret;
3597                         else
3598                                 last_offset = tmp.offset;
3599                         goto done;
3600                 }
3601         }
3602 done:
3603         btrfs_release_path(path);
3604         btrfs_release_path(dst_path);
3605
3606         if (err == 0) {
3607                 *last_offset_ret = last_offset;
3608                 /*
3609                  * insert the log range keys to indicate where the log
3610                  * is valid
3611                  */
3612                 ret = insert_dir_log_key(trans, log, path, key_type,
3613                                          ino, first_offset, last_offset);
3614                 if (ret)
3615                         err = ret;
3616         }
3617         return err;
3618 }
3619
3620 /*
3621  * logging directories is very similar to logging inodes, We find all the items
3622  * from the current transaction and write them to the log.
3623  *
3624  * The recovery code scans the directory in the subvolume, and if it finds a
3625  * key in the range logged that is not present in the log tree, then it means
3626  * that dir entry was unlinked during the transaction.
3627  *
3628  * In order for that scan to work, we must include one key smaller than
3629  * the smallest logged by this transaction and one key larger than the largest
3630  * key logged by this transaction.
3631  */
3632 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3633                           struct btrfs_root *root, struct btrfs_inode *inode,
3634                           struct btrfs_path *path,
3635                           struct btrfs_path *dst_path,
3636                           struct btrfs_log_ctx *ctx)
3637 {
3638         u64 min_key;
3639         u64 max_key;
3640         int ret;
3641         int key_type = BTRFS_DIR_ITEM_KEY;
3642
3643 again:
3644         min_key = 0;
3645         max_key = 0;
3646         while (1) {
3647                 ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3648                                 ctx, min_key, &max_key);
3649                 if (ret)
3650                         return ret;
3651                 if (max_key == (u64)-1)
3652                         break;
3653                 min_key = max_key + 1;
3654         }
3655
3656         if (key_type == BTRFS_DIR_ITEM_KEY) {
3657                 key_type = BTRFS_DIR_INDEX_KEY;
3658                 goto again;
3659         }
3660         return 0;
3661 }
3662
3663 /*
3664  * a helper function to drop items from the log before we relog an
3665  * inode.  max_key_type indicates the highest item type to remove.
3666  * This cannot be run for file data extents because it does not
3667  * free the extents they point to.
3668  */
3669 static int drop_objectid_items(struct btrfs_trans_handle *trans,
3670                                   struct btrfs_root *log,
3671                                   struct btrfs_path *path,
3672                                   u64 objectid, int max_key_type)
3673 {
3674         int ret;
3675         struct btrfs_key key;
3676         struct btrfs_key found_key;
3677         int start_slot;
3678
3679         key.objectid = objectid;
3680         key.type = max_key_type;
3681         key.offset = (u64)-1;
3682
3683         while (1) {
3684                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3685                 BUG_ON(ret == 0); /* Logic error */
3686                 if (ret < 0)
3687                         break;
3688
3689                 if (path->slots[0] == 0)
3690                         break;
3691
3692                 path->slots[0]--;
3693                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3694                                       path->slots[0]);
3695
3696                 if (found_key.objectid != objectid)
3697                         break;
3698
3699                 found_key.offset = 0;
3700                 found_key.type = 0;
3701                 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3702                                        &start_slot);
3703
3704                 ret = btrfs_del_items(trans, log, path, start_slot,
3705                                       path->slots[0] - start_slot + 1);
3706                 /*
3707                  * If start slot isn't 0 then we don't need to re-search, we've
3708                  * found the last guy with the objectid in this tree.
3709                  */
3710                 if (ret || start_slot != 0)
3711                         break;
3712                 btrfs_release_path(path);
3713         }
3714         btrfs_release_path(path);
3715         if (ret > 0)
3716                 ret = 0;
3717         return ret;
3718 }
3719
3720 static void fill_inode_item(struct btrfs_trans_handle *trans,
3721                             struct extent_buffer *leaf,
3722                             struct btrfs_inode_item *item,
3723                             struct inode *inode, int log_inode_only,
3724                             u64 logged_isize)
3725 {
3726         struct btrfs_map_token token;
3727
3728         btrfs_init_map_token(&token);
3729
3730         if (log_inode_only) {
3731                 /* set the generation to zero so the recover code
3732                  * can tell the difference between an logging
3733                  * just to say 'this inode exists' and a logging
3734                  * to say 'update this inode with these values'
3735                  */
3736                 btrfs_set_token_inode_generation(leaf, item, 0, &token);
3737                 btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3738         } else {
3739                 btrfs_set_token_inode_generation(leaf, item,
3740                                                  BTRFS_I(inode)->generation,
3741                                                  &token);
3742                 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
3743         }
3744
3745         btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3746         btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3747         btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3748         btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3749
3750         btrfs_set_token_timespec_sec(leaf, &item->atime,
3751                                      inode->i_atime.tv_sec, &token);
3752         btrfs_set_token_timespec_nsec(leaf, &item->atime,
3753                                       inode->i_atime.tv_nsec, &token);
3754
3755         btrfs_set_token_timespec_sec(leaf, &item->mtime,
3756                                      inode->i_mtime.tv_sec, &token);
3757         btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3758                                       inode->i_mtime.tv_nsec, &token);
3759
3760         btrfs_set_token_timespec_sec(leaf, &item->ctime,
3761                                      inode->i_ctime.tv_sec, &token);
3762         btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3763                                       inode->i_ctime.tv_nsec, &token);
3764
3765         btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3766                                      &token);
3767
3768         btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3769         btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3770         btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3771         btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3772         btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3773 }
3774
3775 static int log_inode_item(struct btrfs_trans_handle *trans,
3776                           struct btrfs_root *log, struct btrfs_path *path,
3777                           struct btrfs_inode *inode)
3778 {
3779         struct btrfs_inode_item *inode_item;
3780         int ret;
3781
3782         ret = btrfs_insert_empty_item(trans, log, path,
3783                                       &inode->location, sizeof(*inode_item));
3784         if (ret && ret != -EEXIST)
3785                 return ret;
3786         inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3787                                     struct btrfs_inode_item);
3788         fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
3789                         0, 0);
3790         btrfs_release_path(path);
3791         return 0;
3792 }
3793
3794 static noinline int copy_items(struct btrfs_trans_handle *trans,
3795                                struct btrfs_inode *inode,
3796                                struct btrfs_path *dst_path,
3797                                struct btrfs_path *src_path,
3798                                int start_slot, int nr, int inode_only,
3799                                u64 logged_isize)
3800 {
3801         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
3802         unsigned long src_offset;
3803         unsigned long dst_offset;
3804         struct btrfs_root *log = inode->root->log_root;
3805         struct btrfs_file_extent_item *extent;
3806         struct btrfs_inode_item *inode_item;
3807         struct extent_buffer *src = src_path->nodes[0];
3808         int ret;
3809         struct btrfs_key *ins_keys;
3810         u32 *ins_sizes;
3811         char *ins_data;
3812         int i;
3813         struct list_head ordered_sums;
3814         int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
3815
3816         INIT_LIST_HEAD(&ordered_sums);
3817
3818         ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3819                            nr * sizeof(u32), GFP_NOFS);
3820         if (!ins_data)
3821                 return -ENOMEM;
3822
3823         ins_sizes = (u32 *)ins_data;
3824         ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3825
3826         for (i = 0; i < nr; i++) {
3827                 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
3828                 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
3829         }
3830         ret = btrfs_insert_empty_items(trans, log, dst_path,
3831                                        ins_keys, ins_sizes, nr);
3832         if (ret) {
3833                 kfree(ins_data);
3834                 return ret;
3835         }
3836
3837         for (i = 0; i < nr; i++, dst_path->slots[0]++) {
3838                 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
3839                                                    dst_path->slots[0]);
3840
3841                 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3842
3843                 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
3844                         inode_item = btrfs_item_ptr(dst_path->nodes[0],
3845                                                     dst_path->slots[0],
3846                                                     struct btrfs_inode_item);
3847                         fill_inode_item(trans, dst_path->nodes[0], inode_item,
3848                                         &inode->vfs_inode,
3849                                         inode_only == LOG_INODE_EXISTS,
3850                                         logged_isize);
3851                 } else {
3852                         copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3853                                            src_offset, ins_sizes[i]);
3854                 }
3855
3856                 /* take a reference on file data extents so that truncates
3857                  * or deletes of this inode don't have to relog the inode
3858                  * again
3859                  */
3860                 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
3861                     !skip_csum) {
3862                         int found_type;
3863                         extent = btrfs_item_ptr(src, start_slot + i,
3864                                                 struct btrfs_file_extent_item);
3865
3866                         if (btrfs_file_extent_generation(src, extent) < trans->transid)
3867                                 continue;
3868
3869                         found_type = btrfs_file_extent_type(src, extent);
3870                         if (found_type == BTRFS_FILE_EXTENT_REG) {
3871                                 u64 ds, dl, cs, cl;
3872                                 ds = btrfs_file_extent_disk_bytenr(src,
3873                                                                 extent);
3874                                 /* ds == 0 is a hole */
3875                                 if (ds == 0)
3876                                         continue;
3877
3878                                 dl = btrfs_file_extent_disk_num_bytes(src,
3879                                                                 extent);
3880                                 cs = btrfs_file_extent_offset(src, extent);
3881                                 cl = btrfs_file_extent_num_bytes(src,
3882                                                                 extent);
3883                                 if (btrfs_file_extent_compression(src,
3884                                                                   extent)) {
3885                                         cs = 0;
3886                                         cl = dl;
3887                                 }
3888
3889                                 ret = btrfs_lookup_csums_range(
3890                                                 fs_info->csum_root,
3891                                                 ds + cs, ds + cs + cl - 1,
3892                                                 &ordered_sums, 0);
3893                                 if (ret)
3894                                         break;
3895                         }
3896                 }
3897         }
3898
3899         btrfs_mark_buffer_dirty(dst_path->nodes[0]);
3900         btrfs_release_path(dst_path);
3901         kfree(ins_data);
3902
3903         /*
3904          * we have to do this after the loop above to avoid changing the
3905          * log tree while trying to change the log tree.
3906          */
3907         while (!list_empty(&ordered_sums)) {
3908                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
3909                                                    struct btrfs_ordered_sum,
3910                                                    list);
3911                 if (!ret)
3912                         ret = btrfs_csum_file_blocks(trans, log, sums);
3913                 list_del(&sums->list);
3914                 kfree(sums);
3915         }
3916
3917         return ret;
3918 }
3919
3920 static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3921 {
3922         struct extent_map *em1, *em2;
3923
3924         em1 = list_entry(a, struct extent_map, list);
3925         em2 = list_entry(b, struct extent_map, list);
3926
3927         if (em1->start < em2->start)
3928                 return -1;
3929         else if (em1->start > em2->start)
3930                 return 1;
3931         return 0;
3932 }
3933
3934 static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3935                                 struct inode *inode,
3936                                 struct btrfs_root *root,
3937                                 const struct extent_map *em,
3938                                 const struct list_head *logged_list,
3939                                 bool *ordered_io_error)
3940 {
3941         struct btrfs_fs_info *fs_info = root->fs_info;
3942         struct btrfs_ordered_extent *ordered;
3943         struct btrfs_root *log = root->log_root;
3944         u64 mod_start = em->mod_start;
3945         u64 mod_len = em->mod_len;
3946         const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3947         u64 csum_offset;
3948         u64 csum_len;
3949         LIST_HEAD(ordered_sums);
3950         int ret = 0;
3951
3952         *ordered_io_error = false;
3953
3954         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
3955             em->block_start == EXTENT_MAP_HOLE)
3956                 return 0;
3957
3958         /*
3959          * Wait far any ordered extent that covers our extent map. If it
3960          * finishes without an error, first check and see if our csums are on
3961          * our outstanding ordered extents.
3962          */
3963         list_for_each_entry(ordered, logged_list, log_list) {
3964                 struct btrfs_ordered_sum *sum;
3965
3966                 if (!mod_len)
3967                         break;
3968
3969                 if (ordered->file_offset + ordered->len <= mod_start ||
3970                     mod_start + mod_len <= ordered->file_offset)
3971                         continue;
3972
3973                 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
3974                     !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3975                     !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
3976                         const u64 start = ordered->file_offset;
3977                         const u64 end = ordered->file_offset + ordered->len - 1;
3978
3979                         WARN_ON(ordered->inode != inode);
3980                         filemap_fdatawrite_range(inode->i_mapping, start, end);
3981                 }
3982
3983                 wait_event(ordered->wait,
3984                            (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
3985                             test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3986
3987                 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3988                         /*
3989                          * Clear the AS_EIO/AS_ENOSPC flags from the inode's
3990                          * i_mapping flags, so that the next fsync won't get
3991                          * an outdated io error too.
3992                          */
3993                         filemap_check_errors(inode->i_mapping);
3994                         *ordered_io_error = true;
3995                         break;
3996                 }
3997                 /*
3998                  * We are going to copy all the csums on this ordered extent, so
3999                  * go ahead and adjust mod_start and mod_len in case this
4000                  * ordered extent has already been logged.
4001                  */
4002                 if (ordered->file_offset > mod_start) {
4003                         if (ordered->file_offset + ordered->len >=
4004                             mod_start + mod_len)
4005                                 mod_len = ordered->file_offset - mod_start;
4006                         /*
4007                          * If we have this case
4008                          *
4009                          * |--------- logged extent ---------|
4010                          *       |----- ordered extent ----|
4011                          *
4012                          * Just don't mess with mod_start and mod_len, we'll
4013                          * just end up logging more csums than we need and it
4014                          * will be ok.
4015                          */
4016                 } else {
4017                         if (ordered->file_offset + ordered->len <
4018                             mod_start + mod_len) {
4019                                 mod_len = (mod_start + mod_len) -
4020                                         (ordered->file_offset + ordered->len);
4021                                 mod_start = ordered->file_offset +
4022                                         ordered->len;
4023                         } else {
4024                                 mod_len = 0;
4025                         }
4026                 }
4027
4028                 if (skip_csum)
4029                         continue;
4030
4031                 /*
4032                  * To keep us from looping for the above case of an ordered
4033                  * extent that falls inside of the logged extent.
4034                  */
4035                 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
4036                                      &ordered->flags))
4037                         continue;
4038
4039                 list_for_each_entry(sum, &ordered->list, list) {
4040                         ret = btrfs_csum_file_blocks(trans, log, sum);
4041                         if (ret)
4042                                 break;
4043                 }
4044         }
4045
4046         if (*ordered_io_error || !mod_len || ret || skip_csum)
4047                 return ret;
4048
4049         if (em->compress_type) {
4050                 csum_offset = 0;
4051                 csum_len = max(em->block_len, em->orig_block_len);
4052         } else {
4053                 csum_offset = mod_start - em->start;
4054                 csum_len = mod_len;
4055         }
4056
4057         /* block start is already adjusted for the file extent offset. */
4058         ret = btrfs_lookup_csums_range(fs_info->csum_root,
4059                                        em->block_start + csum_offset,
4060                                        em->block_start + csum_offset +
4061                                        csum_len - 1, &ordered_sums, 0);
4062         if (ret)
4063                 return ret;
4064
4065         while (!list_empty(&ordered_sums)) {
4066                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4067                                                    struct btrfs_ordered_sum,
4068                                                    list);
4069                 if (!ret)
4070                         ret = btrfs_csum_file_blocks(trans, log, sums);
4071                 list_del(&sums->list);
4072                 kfree(sums);
4073         }
4074
4075         return ret;
4076 }
4077
4078 static int log_one_extent(struct btrfs_trans_handle *trans,
4079                           struct btrfs_inode *inode, struct btrfs_root *root,
4080                           const struct extent_map *em,
4081                           struct btrfs_path *path,
4082                           const struct list_head *logged_list,
4083                           struct btrfs_log_ctx *ctx)
4084 {
4085         struct btrfs_root *log = root->log_root;
4086         struct btrfs_file_extent_item *fi;
4087         struct extent_buffer *leaf;
4088         struct btrfs_map_token token;
4089         struct btrfs_key key;
4090         u64 extent_offset = em->start - em->orig_start;
4091         u64 block_len;
4092         int ret;
4093         int extent_inserted = 0;
4094         bool ordered_io_err = false;
4095
4096         ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em,
4097                         logged_list, &ordered_io_err);
4098         if (ret)
4099                 return ret;
4100
4101         if (ordered_io_err) {
4102                 ctx->io_err = -EIO;
4103                 return ctx->io_err;
4104         }
4105
4106         btrfs_init_map_token(&token);
4107
4108         ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
4109                                    em->start + em->len, NULL, 0, 1,
4110                                    sizeof(*fi), &extent_inserted);
4111         if (ret)
4112                 return ret;
4113
4114         if (!extent_inserted) {
4115                 key.objectid = btrfs_ino(inode);
4116                 key.type = BTRFS_EXTENT_DATA_KEY;
4117                 key.offset = em->start;
4118
4119                 ret = btrfs_insert_empty_item(trans, log, path, &key,
4120                                               sizeof(*fi));
4121                 if (ret)
4122                         return ret;
4123         }
4124         leaf = path->nodes[0];
4125         fi = btrfs_item_ptr(leaf, path->slots[0],
4126                             struct btrfs_file_extent_item);
4127
4128         btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
4129                                                &token);
4130         if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4131                 btrfs_set_token_file_extent_type(leaf, fi,
4132                                                  BTRFS_FILE_EXTENT_PREALLOC,
4133                                                  &token);
4134         else
4135                 btrfs_set_token_file_extent_type(leaf, fi,
4136                                                  BTRFS_FILE_EXTENT_REG,
4137                                                  &token);
4138
4139         block_len = max(em->block_len, em->orig_block_len);
4140         if (em->compress_type != BTRFS_COMPRESS_NONE) {
4141                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4142                                                         em->block_start,
4143                                                         &token);
4144                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4145                                                            &token);
4146         } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4147                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4148                                                         em->block_start -
4149                                                         extent_offset, &token);
4150                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4151                                                            &token);
4152         } else {
4153                 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
4154                 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
4155                                                            &token);
4156         }
4157
4158         btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
4159         btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
4160         btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
4161         btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
4162                                                 &token);
4163         btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
4164         btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
4165         btrfs_mark_buffer_dirty(leaf);
4166
4167         btrfs_release_path(path);
4168
4169         return ret;
4170 }
4171
4172 /*
4173  * Log all prealloc extents beyond the inode's i_size to make sure we do not
4174  * lose them after doing a fast fsync and replaying the log. We scan the
4175  * subvolume's root instead of iterating the inode's extent map tree because
4176  * otherwise we can log incorrect extent items based on extent map conversion.
4177  * That can happen due to the fact that extent maps are merged when they
4178  * are not in the extent map tree's list of modified extents.
4179  */
4180 static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4181                                       struct btrfs_inode *inode,
4182                                       struct btrfs_path *path)
4183 {
4184         struct btrfs_root *root = inode->root;
4185         struct btrfs_key key;
4186         const u64 i_size = i_size_read(&inode->vfs_inode);
4187         const u64 ino = btrfs_ino(inode);
4188         struct btrfs_path *dst_path = NULL;
4189         bool dropped_extents = false;
4190         u64 truncate_offset = i_size;
4191         struct extent_buffer *leaf;
4192         int slot;
4193         int ins_nr = 0;
4194         int start_slot;
4195         int ret;
4196
4197         if (!(inode->flags & BTRFS_INODE_PREALLOC))
4198                 return 0;
4199
4200         key.objectid = ino;
4201         key.type = BTRFS_EXTENT_DATA_KEY;
4202         key.offset = i_size;
4203         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4204         if (ret < 0)
4205                 goto out;
4206
4207         /*
4208          * We must check if there is a prealloc extent that starts before the
4209          * i_size and crosses the i_size boundary. This is to ensure later we
4210          * truncate down to the end of that extent and not to the i_size, as
4211          * otherwise we end up losing part of the prealloc extent after a log
4212          * replay and with an implicit hole if there is another prealloc extent
4213          * that starts at an offset beyond i_size.
4214          */
4215         ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4216         if (ret < 0)
4217                 goto out;
4218
4219         if (ret == 0) {
4220                 struct btrfs_file_extent_item *ei;
4221
4222                 leaf = path->nodes[0];
4223                 slot = path->slots[0];
4224                 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4225
4226                 if (btrfs_file_extent_type(leaf, ei) ==
4227                     BTRFS_FILE_EXTENT_PREALLOC) {
4228                         u64 extent_end;
4229
4230                         btrfs_item_key_to_cpu(leaf, &key, slot);
4231                         extent_end = key.offset +
4232                                 btrfs_file_extent_num_bytes(leaf, ei);
4233
4234                         if (extent_end > i_size)
4235                                 truncate_offset = extent_end;
4236                 }
4237         } else {
4238                 ret = 0;
4239         }
4240
4241         while (true) {
4242                 leaf = path->nodes[0];
4243                 slot = path->slots[0];
4244
4245                 if (slot >= btrfs_header_nritems(leaf)) {
4246                         if (ins_nr > 0) {
4247                                 ret = copy_items(trans, inode, dst_path, path,
4248                                                  start_slot, ins_nr, 1, 0);
4249                                 if (ret < 0)
4250                                         goto out;
4251                                 ins_nr = 0;
4252                         }
4253                         ret = btrfs_next_leaf(root, path);
4254                         if (ret < 0)
4255                                 goto out;
4256                         if (ret > 0) {
4257                                 ret = 0;
4258                                 break;
4259                         }
4260                         continue;
4261                 }
4262
4263                 btrfs_item_key_to_cpu(leaf, &key, slot);
4264                 if (key.objectid > ino)
4265                         break;
4266                 if (WARN_ON_ONCE(key.objectid < ino) ||
4267                     key.type < BTRFS_EXTENT_DATA_KEY ||
4268                     key.offset < i_size) {
4269                         path->slots[0]++;
4270                         continue;
4271                 }
4272                 if (!dropped_extents) {
4273                         /*
4274                          * Avoid logging extent items logged in past fsync calls
4275                          * and leading to duplicate keys in the log tree.
4276                          */
4277                         do {
4278                                 ret = btrfs_truncate_inode_items(trans,
4279                                                          root->log_root,
4280                                                          &inode->vfs_inode,
4281                                                          truncate_offset,
4282                                                          BTRFS_EXTENT_DATA_KEY);
4283                         } while (ret == -EAGAIN);
4284                         if (ret)
4285                                 goto out;
4286                         dropped_extents = true;
4287                 }
4288                 if (ins_nr == 0)
4289                         start_slot = slot;
4290                 ins_nr++;
4291                 path->slots[0]++;
4292                 if (!dst_path) {
4293                         dst_path = btrfs_alloc_path();
4294                         if (!dst_path) {
4295                                 ret = -ENOMEM;
4296                                 goto out;
4297                         }
4298                 }
4299         }
4300         if (ins_nr > 0) {
4301                 ret = copy_items(trans, inode, dst_path, path,
4302                                  start_slot, ins_nr, 1, 0);
4303                 if (ret > 0)
4304                         ret = 0;
4305         }
4306 out:
4307         btrfs_release_path(path);
4308         btrfs_free_path(dst_path);
4309         return ret;
4310 }
4311
4312 static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4313                                      struct btrfs_root *root,
4314                                      struct btrfs_inode *inode,
4315                                      struct btrfs_path *path,
4316                                      struct list_head *logged_list,
4317                                      struct btrfs_log_ctx *ctx,
4318                                      const u64 start,
4319                                      const u64 end)
4320 {
4321         struct extent_map *em, *n;
4322         struct list_head extents;
4323         struct extent_map_tree *tree = &inode->extent_tree;
4324         u64 logged_start, logged_end;
4325         u64 test_gen;
4326         int ret = 0;
4327         int num = 0;
4328
4329         INIT_LIST_HEAD(&extents);
4330
4331         write_lock(&tree->lock);
4332         test_gen = root->fs_info->last_trans_committed;
4333         logged_start = start;
4334         logged_end = end;
4335
4336         list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4337                 list_del_init(&em->list);
4338                 /*
4339                  * Just an arbitrary number, this can be really CPU intensive
4340                  * once we start getting a lot of extents, and really once we
4341                  * have a bunch of extents we just want to commit since it will
4342                  * be faster.
4343                  */
4344                 if (++num > 32768) {
4345                         list_del_init(&tree->modified_extents);
4346                         ret = -EFBIG;
4347                         goto process;
4348                 }
4349
4350                 if (em->generation <= test_gen)
4351                         continue;
4352
4353                 /* We log prealloc extents beyond eof later. */
4354                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4355                     em->start >= i_size_read(&inode->vfs_inode))
4356                         continue;
4357
4358                 if (em->start < logged_start)
4359                         logged_start = em->start;
4360                 if ((em->start + em->len - 1) > logged_end)
4361                         logged_end = em->start + em->len - 1;
4362
4363                 /* Need a ref to keep it from getting evicted from cache */
4364                 refcount_inc(&em->refs);
4365                 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4366                 list_add_tail(&em->list, &extents);
4367                 num++;
4368         }
4369
4370         list_sort(NULL, &extents, extent_cmp);
4371         btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end);
4372         /*
4373          * Some ordered extents started by fsync might have completed
4374          * before we could collect them into the list logged_list, which
4375          * means they're gone, not in our logged_list nor in the inode's
4376          * ordered tree. We want the application/user space to know an
4377          * error happened while attempting to persist file data so that
4378          * it can take proper action. If such error happened, we leave
4379          * without writing to the log tree and the fsync must report the
4380          * file data write error and not commit the current transaction.
4381          */
4382         ret = filemap_check_errors(inode->vfs_inode.i_mapping);
4383         if (ret)
4384                 ctx->io_err = ret;
4385 process:
4386         while (!list_empty(&extents)) {
4387                 em = list_entry(extents.next, struct extent_map, list);
4388
4389                 list_del_init(&em->list);
4390
4391                 /*
4392                  * If we had an error we just need to delete everybody from our
4393                  * private list.
4394                  */
4395                 if (ret) {
4396                         clear_em_logging(tree, em);
4397                         free_extent_map(em);
4398                         continue;
4399                 }
4400
4401                 write_unlock(&tree->lock);
4402
4403                 ret = log_one_extent(trans, inode, root, em, path, logged_list,
4404                                      ctx);
4405                 write_lock(&tree->lock);
4406                 clear_em_logging(tree, em);
4407                 free_extent_map(em);
4408         }
4409         WARN_ON(!list_empty(&extents));
4410         write_unlock(&tree->lock);
4411
4412         btrfs_release_path(path);
4413         if (!ret)
4414                 ret = btrfs_log_prealloc_extents(trans, inode, path);
4415
4416         return ret;
4417 }
4418
4419 static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
4420                              struct btrfs_path *path, u64 *size_ret)
4421 {
4422         struct btrfs_key key;
4423         int ret;
4424
4425         key.objectid = btrfs_ino(inode);
4426         key.type = BTRFS_INODE_ITEM_KEY;
4427         key.offset = 0;
4428
4429         ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4430         if (ret < 0) {
4431                 return ret;
4432         } else if (ret > 0) {
4433                 *size_ret = 0;
4434         } else {
4435                 struct btrfs_inode_item *item;
4436
4437                 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4438                                       struct btrfs_inode_item);
4439                 *size_ret = btrfs_inode_size(path->nodes[0], item);
4440                 /*
4441                  * If the in-memory inode's i_size is smaller then the inode
4442                  * size stored in the btree, return the inode's i_size, so
4443                  * that we get a correct inode size after replaying the log
4444                  * when before a power failure we had a shrinking truncate
4445                  * followed by addition of a new name (rename / new hard link).
4446                  * Otherwise return the inode size from the btree, to avoid
4447                  * data loss when replaying a log due to previously doing a
4448                  * write that expands the inode's size and logging a new name
4449                  * immediately after.
4450                  */
4451                 if (*size_ret > inode->vfs_inode.i_size)
4452                         *size_ret = inode->vfs_inode.i_size;
4453         }
4454
4455         btrfs_release_path(path);
4456         return 0;
4457 }
4458
4459 /*
4460  * At the moment we always log all xattrs. This is to figure out at log replay
4461  * time which xattrs must have their deletion replayed. If a xattr is missing
4462  * in the log tree and exists in the fs/subvol tree, we delete it. This is
4463  * because if a xattr is deleted, the inode is fsynced and a power failure
4464  * happens, causing the log to be replayed the next time the fs is mounted,
4465  * we want the xattr to not exist anymore (same behaviour as other filesystems
4466  * with a journal, ext3/4, xfs, f2fs, etc).
4467  */
4468 static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4469                                 struct btrfs_root *root,
4470                                 struct btrfs_inode *inode,
4471                                 struct btrfs_path *path,
4472                                 struct btrfs_path *dst_path)
4473 {
4474         int ret;
4475         struct btrfs_key key;
4476         const u64 ino = btrfs_ino(inode);
4477         int ins_nr = 0;
4478         int start_slot = 0;
4479
4480         key.objectid = ino;
4481         key.type = BTRFS_XATTR_ITEM_KEY;
4482         key.offset = 0;
4483
4484         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4485         if (ret < 0)
4486                 return ret;
4487
4488         while (true) {
4489                 int slot = path->slots[0];
4490                 struct extent_buffer *leaf = path->nodes[0];
4491                 int nritems = btrfs_header_nritems(leaf);
4492
4493                 if (slot >= nritems) {
4494                         if (ins_nr > 0) {
4495                                 ret = copy_items(trans, inode, dst_path, path,
4496                                                  start_slot, ins_nr, 1, 0);
4497                                 if (ret < 0)
4498                                         return ret;
4499                                 ins_nr = 0;
4500                         }
4501                         ret = btrfs_next_leaf(root, path);
4502                         if (ret < 0)
4503                                 return ret;
4504                         else if (ret > 0)
4505                                 break;
4506                         continue;
4507                 }
4508
4509                 btrfs_item_key_to_cpu(leaf, &key, slot);
4510                 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4511                         break;
4512
4513                 if (ins_nr == 0)
4514                         start_slot = slot;
4515                 ins_nr++;
4516                 path->slots[0]++;
4517                 cond_resched();
4518         }
4519         if (ins_nr > 0) {
4520                 ret = copy_items(trans, inode, dst_path, path,
4521                                  start_slot, ins_nr, 1, 0);
4522                 if (ret < 0)
4523                         return ret;
4524         }
4525
4526         return 0;
4527 }
4528
4529 /*
4530  * When using the NO_HOLES feature if we punched a hole that causes the
4531  * deletion of entire leafs or all the extent items of the first leaf (the one
4532  * that contains the inode item and references) we may end up not processing
4533  * any extents, because there are no leafs with a generation matching the
4534  * current transaction that have extent items for our inode. So we need to find
4535  * if any holes exist and then log them. We also need to log holes after any
4536  * truncate operation that changes the inode's size.
4537  */
4538 static int btrfs_log_holes(struct btrfs_trans_handle *trans,
4539                            struct btrfs_root *root,
4540                            struct btrfs_inode *inode,
4541                            struct btrfs_path *path)
4542 {
4543         struct btrfs_fs_info *fs_info = root->fs_info;
4544         struct btrfs_key key;
4545         const u64 ino = btrfs_ino(inode);
4546         const u64 i_size = i_size_read(&inode->vfs_inode);
4547         u64 prev_extent_end = 0;
4548         int ret;
4549
4550         if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
4551                 return 0;
4552
4553         key.objectid = ino;
4554         key.type = BTRFS_EXTENT_DATA_KEY;
4555         key.offset = 0;
4556
4557         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4558         if (ret < 0)
4559                 return ret;
4560
4561         while (true) {
4562                 struct btrfs_file_extent_item *extent;
4563                 struct extent_buffer *leaf = path->nodes[0];
4564                 u64 len;
4565
4566                 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
4567                         ret = btrfs_next_leaf(root, path);
4568                         if (ret < 0)
4569                                 return ret;
4570                         if (ret > 0) {
4571                                 ret = 0;
4572                                 break;
4573                         }
4574                         leaf = path->nodes[0];
4575                 }
4576
4577                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4578                 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
4579                         break;
4580
4581                 /* We have a hole, log it. */
4582                 if (prev_extent_end < key.offset) {
4583                         const u64 hole_len = key.offset - prev_extent_end;
4584
4585                         /*
4586                          * Release the path to avoid deadlocks with other code
4587                          * paths that search the root while holding locks on
4588                          * leafs from the log root.
4589                          */
4590                         btrfs_release_path(path);
4591                         ret = btrfs_insert_file_extent(trans, root->log_root,
4592                                                        ino, prev_extent_end, 0,
4593                                                        0, hole_len, 0, hole_len,
4594                                                        0, 0, 0);
4595                         if (ret < 0)
4596                                 return ret;
4597
4598                         /*
4599                          * Search for the same key again in the root. Since it's
4600                          * an extent item and we are holding the inode lock, the
4601                          * key must still exist. If it doesn't just emit warning
4602                          * and return an error to fall back to a transaction
4603                          * commit.
4604                          */
4605                         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4606                         if (ret < 0)
4607                                 return ret;
4608                         if (WARN_ON(ret > 0))
4609                                 return -ENOENT;
4610                         leaf = path->nodes[0];
4611                 }
4612
4613                 extent = btrfs_item_ptr(leaf, path->slots[0],
4614                                         struct btrfs_file_extent_item);
4615                 if (btrfs_file_extent_type(leaf, extent) ==
4616                     BTRFS_FILE_EXTENT_INLINE) {
4617                         len = btrfs_file_extent_ram_bytes(leaf, extent);
4618                         prev_extent_end = ALIGN(key.offset + len,
4619                                                 fs_info->sectorsize);
4620                 } else {
4621                         len = btrfs_file_extent_num_bytes(leaf, extent);
4622                         prev_extent_end = key.offset + len;
4623                 }
4624
4625                 path->slots[0]++;
4626                 cond_resched();
4627         }
4628
4629         if (prev_extent_end < i_size) {
4630                 u64 hole_len;
4631
4632                 btrfs_release_path(path);
4633                 hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
4634                 ret = btrfs_insert_file_extent(trans, root->log_root,
4635                                                ino, prev_extent_end, 0, 0,
4636                                                hole_len, 0, hole_len,
4637                                                0, 0, 0);
4638                 if (ret < 0)
4639                         return ret;
4640         }
4641
4642         return 0;
4643 }
4644
4645 /*
4646  * When we are logging a new inode X, check if it doesn't have a reference that
4647  * matches the reference from some other inode Y created in a past transaction
4648  * and that was renamed in the current transaction. If we don't do this, then at
4649  * log replay time we can lose inode Y (and all its files if it's a directory):
4650  *
4651  * mkdir /mnt/x
4652  * echo "hello world" > /mnt/x/foobar
4653  * sync
4654  * mv /mnt/x /mnt/y
4655  * mkdir /mnt/x                 # or touch /mnt/x
4656  * xfs_io -c fsync /mnt/x
4657  * <power fail>
4658  * mount fs, trigger log replay
4659  *
4660  * After the log replay procedure, we would lose the first directory and all its
4661  * files (file foobar).
4662  * For the case where inode Y is not a directory we simply end up losing it:
4663  *
4664  * echo "123" > /mnt/foo
4665  * sync
4666  * mv /mnt/foo /mnt/bar
4667  * echo "abc" > /mnt/foo
4668  * xfs_io -c fsync /mnt/foo
4669  * <power fail>
4670  *
4671  * We also need this for cases where a snapshot entry is replaced by some other
4672  * entry (file or directory) otherwise we end up with an unreplayable log due to
4673  * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
4674  * if it were a regular entry:
4675  *
4676  * mkdir /mnt/x
4677  * btrfs subvolume snapshot /mnt /mnt/x/snap
4678  * btrfs subvolume delete /mnt/x/snap
4679  * rmdir /mnt/x
4680  * mkdir /mnt/x
4681  * fsync /mnt/x or fsync some new file inside it
4682  * <power fail>
4683  *
4684  * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
4685  * the same transaction.
4686  */
4687 static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4688                                          const int slot,
4689                                          const struct btrfs_key *key,
4690                                          struct btrfs_inode *inode,
4691                                          u64 *other_ino)
4692 {
4693         int ret;
4694         struct btrfs_path *search_path;
4695         char *name = NULL;
4696         u32 name_len = 0;
4697         u32 item_size = btrfs_item_size_nr(eb, slot);
4698         u32 cur_offset = 0;
4699         unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
4700
4701         search_path = btrfs_alloc_path();
4702         if (!search_path)
4703                 return -ENOMEM;
4704         search_path->search_commit_root = 1;
4705         search_path->skip_locking = 1;
4706
4707         while (cur_offset < item_size) {
4708                 u64 parent;
4709                 u32 this_name_len;
4710                 u32 this_len;
4711                 unsigned long name_ptr;
4712                 struct btrfs_dir_item *di;
4713
4714                 if (key->type == BTRFS_INODE_REF_KEY) {
4715                         struct btrfs_inode_ref *iref;
4716
4717                         iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
4718                         parent = key->offset;
4719                         this_name_len = btrfs_inode_ref_name_len(eb, iref);
4720                         name_ptr = (unsigned long)(iref + 1);
4721                         this_len = sizeof(*iref) + this_name_len;
4722                 } else {
4723                         struct btrfs_inode_extref *extref;
4724
4725                         extref = (struct btrfs_inode_extref *)(ptr +
4726                                                                cur_offset);
4727                         parent = btrfs_inode_extref_parent(eb, extref);
4728                         this_name_len = btrfs_inode_extref_name_len(eb, extref);
4729                         name_ptr = (unsigned long)&extref->name;
4730                         this_len = sizeof(*extref) + this_name_len;
4731                 }
4732
4733                 ret = btrfs_is_name_len_valid(eb, slot, name_ptr,
4734                                               this_name_len);
4735                 if (!ret) {
4736                         ret = -EIO;
4737                         goto out;
4738                 }
4739                 if (this_name_len > name_len) {
4740                         char *new_name;
4741
4742                         new_name = krealloc(name, this_name_len, GFP_NOFS);
4743                         if (!new_name) {
4744                                 ret = -ENOMEM;
4745                                 goto out;
4746                         }
4747                         name_len = this_name_len;
4748                         name = new_name;
4749                 }
4750
4751                 read_extent_buffer(eb, name, name_ptr, this_name_len);
4752                 di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
4753                                 parent, name, this_name_len, 0);
4754                 if (di && !IS_ERR(di)) {
4755                         struct btrfs_key di_key;
4756
4757                         btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4758                                                   di, &di_key);
4759                         if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4760                                 ret = 1;
4761                                 *other_ino = di_key.objectid;
4762                         } else {
4763                                 ret = -EAGAIN;
4764                         }
4765                         goto out;
4766                 } else if (IS_ERR(di)) {
4767                         ret = PTR_ERR(di);
4768                         goto out;
4769                 }
4770                 btrfs_release_path(search_path);
4771
4772                 cur_offset += this_len;
4773         }
4774         ret = 0;
4775 out:
4776         btrfs_free_path(search_path);
4777         kfree(name);
4778         return ret;
4779 }
4780
4781 /* log a single inode in the tree log.
4782  * At least one parent directory for this inode must exist in the tree
4783  * or be logged already.
4784  *
4785  * Any items from this inode changed by the current transaction are copied
4786  * to the log tree.  An extra reference is taken on any extents in this
4787  * file, allowing us to avoid a whole pile of corner cases around logging
4788  * blocks that have been removed from the tree.
4789  *
4790  * See LOG_INODE_ALL and related defines for a description of what inode_only
4791  * does.
4792  *
4793  * This handles both files and directories.
4794  */
4795 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4796                            struct btrfs_root *root, struct btrfs_inode *inode,
4797                            int inode_only,
4798                            const loff_t start,
4799                            const loff_t end,
4800                            struct btrfs_log_ctx *ctx)
4801 {
4802         struct btrfs_fs_info *fs_info = root->fs_info;
4803         struct btrfs_path *path;
4804         struct btrfs_path *dst_path;
4805         struct btrfs_key min_key;
4806         struct btrfs_key max_key;
4807         struct btrfs_root *log = root->log_root;
4808         struct extent_buffer *src = NULL;
4809         LIST_HEAD(logged_list);
4810         int err = 0;
4811         int ret;
4812         int nritems;
4813         int ins_start_slot = 0;
4814         int ins_nr;
4815         bool fast_search = false;
4816         u64 ino = btrfs_ino(inode);
4817         struct extent_map_tree *em_tree = &inode->extent_tree;
4818         u64 logged_isize = 0;
4819         bool need_log_inode_item = true;
4820         bool xattrs_logged = false;
4821
4822         path = btrfs_alloc_path();
4823         if (!path)
4824                 return -ENOMEM;
4825         dst_path = btrfs_alloc_path();
4826         if (!dst_path) {
4827                 btrfs_free_path(path);
4828                 return -ENOMEM;
4829         }
4830
4831         min_key.objectid = ino;
4832         min_key.type = BTRFS_INODE_ITEM_KEY;
4833         min_key.offset = 0;
4834
4835         max_key.objectid = ino;
4836
4837
4838         /* today the code can only do partial logging of directories */
4839         if (S_ISDIR(inode->vfs_inode.i_mode) ||
4840             (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4841                        &inode->runtime_flags) &&
4842              inode_only >= LOG_INODE_EXISTS))
4843                 max_key.type = BTRFS_XATTR_ITEM_KEY;
4844         else
4845                 max_key.type = (u8)-1;
4846         max_key.offset = (u64)-1;
4847
4848         /*
4849          * Only run delayed items if we are a dir or a new file.
4850          * Otherwise commit the delayed inode only, which is needed in
4851          * order for the log replay code to mark inodes for link count
4852          * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
4853          */
4854         if (S_ISDIR(inode->vfs_inode.i_mode) ||
4855             inode->generation > fs_info->last_trans_committed)
4856                 ret = btrfs_commit_inode_delayed_items(trans, inode);
4857         else
4858                 ret = btrfs_commit_inode_delayed_inode(inode);
4859
4860         if (ret) {
4861                 btrfs_free_path(path);
4862                 btrfs_free_path(dst_path);
4863                 return ret;
4864         }
4865
4866         if (inode_only == LOG_OTHER_INODE) {
4867                 inode_only = LOG_INODE_EXISTS;
4868                 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
4869         } else {
4870                 mutex_lock(&inode->log_mutex);
4871         }
4872
4873         /*
4874          * For symlinks, we must always log their content, which is stored in an
4875          * inline extent, otherwise we could end up with an empty symlink after
4876          * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
4877          * one attempts to create an empty symlink).
4878          * We don't need to worry about flushing delalloc, because when we create
4879          * the inline extent when the symlink is created (we never have delalloc
4880          * for symlinks).
4881          */
4882         if (S_ISLNK(inode->vfs_inode.i_mode))
4883                 inode_only = LOG_INODE_ALL;
4884
4885         /*
4886          * a brute force approach to making sure we get the most uptodate
4887          * copies of everything.
4888          */
4889         if (S_ISDIR(inode->vfs_inode.i_mode)) {
4890                 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
4891
4892                 if (inode_only == LOG_INODE_EXISTS)
4893                         max_key_type = BTRFS_XATTR_ITEM_KEY;
4894                 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
4895         } else {
4896                 if (inode_only == LOG_INODE_EXISTS) {
4897                         /*
4898                          * Make sure the new inode item we write to the log has
4899                          * the same isize as the current one (if it exists).
4900                          * This is necessary to prevent data loss after log
4901                          * replay, and also to prevent doing a wrong expanding
4902                          * truncate - for e.g. create file, write 4K into offset
4903                          * 0, fsync, write 4K into offset 4096, add hard link,
4904                          * fsync some other file (to sync log), power fail - if
4905                          * we use the inode's current i_size, after log replay
4906                          * we get a 8Kb file, with the last 4Kb extent as a hole
4907                          * (zeroes), as if an expanding truncate happened,
4908                          * instead of getting a file of 4Kb only.
4909                          */
4910                         err = logged_inode_size(log, inode, path, &logged_isize);
4911                         if (err)
4912                                 goto out_unlock;
4913                 }
4914                 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4915                              &inode->runtime_flags)) {
4916                         if (inode_only == LOG_INODE_EXISTS) {
4917                                 max_key.type = BTRFS_XATTR_ITEM_KEY;
4918                                 ret = drop_objectid_items(trans, log, path, ino,
4919                                                           max_key.type);
4920                         } else {
4921                                 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4922                                           &inode->runtime_flags);
4923                                 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4924                                           &inode->runtime_flags);
4925                                 while(1) {
4926                                         ret = btrfs_truncate_inode_items(trans,
4927                                                 log, &inode->vfs_inode, 0, 0);
4928                                         if (ret != -EAGAIN)
4929                                                 break;
4930                                 }
4931                         }
4932                 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4933                                               &inode->runtime_flags) ||
4934                            inode_only == LOG_INODE_EXISTS) {
4935                         if (inode_only == LOG_INODE_ALL)
4936                                 fast_search = true;
4937                         max_key.type = BTRFS_XATTR_ITEM_KEY;
4938                         ret = drop_objectid_items(trans, log, path, ino,
4939                                                   max_key.type);
4940                 } else {
4941                         if (inode_only == LOG_INODE_ALL)
4942                                 fast_search = true;
4943                         goto log_extents;
4944                 }
4945
4946         }
4947         if (ret) {
4948                 err = ret;
4949                 goto out_unlock;
4950         }
4951
4952         while (1) {
4953                 ins_nr = 0;
4954                 ret = btrfs_search_forward(root, &min_key,
4955                                            path, trans->transid);
4956                 if (ret < 0) {
4957                         err = ret;
4958                         goto out_unlock;
4959                 }
4960                 if (ret != 0)
4961                         break;
4962 again:
4963                 /* note, ins_nr might be > 0 here, cleanup outside the loop */
4964                 if (min_key.objectid != ino)
4965                         break;
4966                 if (min_key.type > max_key.type)
4967                         break;
4968
4969                 if (min_key.type == BTRFS_INODE_ITEM_KEY)
4970                         need_log_inode_item = false;
4971
4972                 if ((min_key.type == BTRFS_INODE_REF_KEY ||
4973                      min_key.type == BTRFS_INODE_EXTREF_KEY) &&
4974                     inode->generation == trans->transid) {
4975                         u64 other_ino = 0;
4976
4977                         ret = btrfs_check_ref_name_override(path->nodes[0],
4978                                         path->slots[0], &min_key, inode,
4979                                         &other_ino);
4980                         if (ret < 0) {
4981                                 err = ret;
4982                                 goto out_unlock;
4983                         } else if (ret > 0 && ctx &&
4984                                    other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
4985                                 struct btrfs_key inode_key;
4986                                 struct inode *other_inode;
4987
4988                                 if (ins_nr > 0) {
4989                                         ins_nr++;
4990                                 } else {
4991                                         ins_nr = 1;
4992                                         ins_start_slot = path->slots[0];
4993                                 }
4994                                 ret = copy_items(trans, inode, dst_path, path,
4995                                                  ins_start_slot,
4996                                                  ins_nr, inode_only,
4997                                                  logged_isize);
4998                                 if (ret < 0) {
4999                                         err = ret;
5000                                         goto out_unlock;
5001                                 }
5002                                 ins_nr = 0;
5003                                 btrfs_release_path(path);
5004                                 inode_key.objectid = other_ino;
5005                                 inode_key.type = BTRFS_INODE_ITEM_KEY;
5006                                 inode_key.offset = 0;
5007                                 other_inode = btrfs_iget(fs_info->sb,
5008                                                          &inode_key, root,
5009                                                          NULL);
5010                                 /*
5011                                  * If the other inode that had a conflicting dir
5012                                  * entry was deleted in the current transaction,
5013                                  * we don't need to do more work nor fallback to
5014                                  * a transaction commit.
5015                                  */
5016                                 if (IS_ERR(other_inode) &&
5017                                     PTR_ERR(other_inode) == -ENOENT) {
5018                                         goto next_key;
5019                                 } else if (IS_ERR(other_inode)) {
5020                                         err = PTR_ERR(other_inode);
5021                                         goto out_unlock;
5022                                 }
5023                                 /*
5024                                  * We are safe logging the other inode without
5025                                  * acquiring its i_mutex as long as we log with
5026                                  * the LOG_INODE_EXISTS mode. We're safe against
5027                                  * concurrent renames of the other inode as well
5028                                  * because during a rename we pin the log and
5029                                  * update the log with the new name before we
5030                                  * unpin it.
5031                                  */
5032                                 err = btrfs_log_inode(trans, root,
5033                                                 BTRFS_I(other_inode),
5034                                                 LOG_OTHER_INODE, 0, LLONG_MAX,
5035                                                 ctx);
5036                                 btrfs_add_delayed_iput(other_inode);
5037                                 if (err)
5038                                         goto out_unlock;
5039                                 else
5040                                         goto next_key;
5041                         }
5042                 }
5043
5044                 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
5045                 if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
5046                         if (ins_nr == 0)
5047                                 goto next_slot;
5048                         ret = copy_items(trans, inode, dst_path, path,
5049                                          ins_start_slot,
5050                                          ins_nr, inode_only, logged_isize);
5051                         if (ret < 0) {
5052                                 err = ret;
5053                                 goto out_unlock;
5054                         }
5055                         ins_nr = 0;
5056                         goto next_slot;
5057                 }
5058
5059                 src = path->nodes[0];
5060                 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5061                         ins_nr++;
5062                         goto next_slot;
5063                 } else if (!ins_nr) {
5064                         ins_start_slot = path->slots[0];
5065                         ins_nr = 1;
5066                         goto next_slot;
5067                 }
5068
5069                 ret = copy_items(trans, inode, dst_path, path,
5070                                  ins_start_slot, ins_nr, inode_only,
5071                                  logged_isize);
5072                 if (ret < 0) {
5073                         err = ret;
5074                         goto out_unlock;
5075                 }
5076                 ins_nr = 1;
5077                 ins_start_slot = path->slots[0];
5078 next_slot:
5079
5080                 nritems = btrfs_header_nritems(path->nodes[0]);
5081                 path->slots[0]++;
5082                 if (path->slots[0] < nritems) {
5083                         btrfs_item_key_to_cpu(path->nodes[0], &min_key,
5084                                               path->slots[0]);
5085                         goto again;
5086                 }
5087                 if (ins_nr) {
5088                         ret = copy_items(trans, inode, dst_path, path,
5089                                          ins_start_slot,
5090                                          ins_nr, inode_only, logged_isize);
5091                         if (ret < 0) {
5092                                 err = ret;
5093                                 goto out_unlock;
5094                         }
5095                         ins_nr = 0;
5096                 }
5097                 btrfs_release_path(path);
5098 next_key:
5099                 if (min_key.offset < (u64)-1) {
5100                         min_key.offset++;
5101                 } else if (min_key.type < max_key.type) {
5102                         min_key.type++;
5103                         min_key.offset = 0;
5104                 } else {
5105                         break;
5106                 }
5107         }
5108         if (ins_nr) {
5109                 ret = copy_items(trans, inode, dst_path, path,
5110                                  ins_start_slot, ins_nr, inode_only,
5111                                  logged_isize);
5112                 if (ret < 0) {
5113                         err = ret;
5114                         goto out_unlock;
5115                 }
5116                 ins_nr = 0;
5117         }
5118
5119         btrfs_release_path(path);
5120         btrfs_release_path(dst_path);
5121         err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
5122         if (err)
5123                 goto out_unlock;
5124         xattrs_logged = true;
5125         if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5126                 btrfs_release_path(path);
5127                 btrfs_release_path(dst_path);
5128                 err = btrfs_log_holes(trans, root, inode, path);
5129                 if (err)
5130                         goto out_unlock;
5131         }
5132 log_extents:
5133         btrfs_release_path(path);
5134         btrfs_release_path(dst_path);
5135         if (need_log_inode_item) {
5136                 err = log_inode_item(trans, log, dst_path, inode);
5137                 if (!err && !xattrs_logged) {
5138                         err = btrfs_log_all_xattrs(trans, root, inode, path,
5139                                                    dst_path);
5140                         btrfs_release_path(path);
5141                 }
5142                 if (err)
5143                         goto out_unlock;
5144         }
5145         if (fast_search) {
5146                 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
5147                                                 &logged_list, ctx, start, end);
5148                 if (ret) {
5149                         err = ret;
5150                         goto out_unlock;
5151                 }
5152         } else if (inode_only == LOG_INODE_ALL) {
5153                 struct extent_map *em, *n;
5154
5155                 write_lock(&em_tree->lock);
5156                 /*
5157                  * We can't just remove every em if we're called for a ranged
5158                  * fsync - that is, one that doesn't cover the whole possible
5159                  * file range (0 to LLONG_MAX). This is because we can have
5160                  * em's that fall outside the range we're logging and therefore
5161                  * their ordered operations haven't completed yet
5162                  * (btrfs_finish_ordered_io() not invoked yet). This means we
5163                  * didn't get their respective file extent item in the fs/subvol
5164                  * tree yet, and need to let the next fast fsync (one which
5165                  * consults the list of modified extent maps) find the em so
5166                  * that it logs a matching file extent item and waits for the
5167                  * respective ordered operation to complete (if it's still
5168                  * running).
5169                  *
5170                  * Removing every em outside the range we're logging would make
5171                  * the next fast fsync not log their matching file extent items,
5172                  * therefore making us lose data after a log replay.
5173                  */
5174                 list_for_each_entry_safe(em, n, &em_tree->modified_extents,
5175                                          list) {
5176                         const u64 mod_end = em->mod_start + em->mod_len - 1;
5177
5178                         if (em->mod_start >= start && mod_end <= end)
5179                                 list_del_init(&em->list);
5180                 }
5181                 write_unlock(&em_tree->lock);
5182         }
5183
5184         if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
5185                 ret = log_directory_changes(trans, root, inode, path, dst_path,
5186                                         ctx);
5187                 if (ret) {
5188                         err = ret;
5189                         goto out_unlock;
5190                 }
5191         }
5192
5193         /*
5194          * Don't update last_log_commit if we logged that an inode exists after
5195          * it was loaded to memory (full_sync bit set).
5196          * This is to prevent data loss when we do a write to the inode, then
5197          * the inode gets evicted after all delalloc was flushed, then we log
5198          * it exists (due to a rename for example) and then fsync it. This last
5199          * fsync would do nothing (not logging the extents previously written).
5200          */
5201         spin_lock(&inode->lock);
5202         inode->logged_trans = trans->transid;
5203         if (inode_only != LOG_INODE_EXISTS ||
5204             !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5205                 inode->last_log_commit = inode->last_sub_trans;
5206         spin_unlock(&inode->lock);
5207 out_unlock:
5208         if (unlikely(err))
5209                 btrfs_put_logged_extents(&logged_list);
5210         else
5211                 btrfs_submit_logged_extents(&logged_list, log);
5212         mutex_unlock(&inode->log_mutex);
5213
5214         btrfs_free_path(path);
5215         btrfs_free_path(dst_path);
5216         return err;
5217 }
5218
5219 /*
5220  * Check if we must fallback to a transaction commit when logging an inode.
5221  * This must be called after logging the inode and is used only in the context
5222  * when fsyncing an inode requires the need to log some other inode - in which
5223  * case we can't lock the i_mutex of each other inode we need to log as that
5224  * can lead to deadlocks with concurrent fsync against other inodes (as we can
5225  * log inodes up or down in the hierarchy) or rename operations for example. So
5226  * we take the log_mutex of the inode after we have logged it and then check for
5227  * its last_unlink_trans value - this is safe because any task setting
5228  * last_unlink_trans must take the log_mutex and it must do this before it does
5229  * the actual unlink operation, so if we do this check before a concurrent task
5230  * sets last_unlink_trans it means we've logged a consistent version/state of
5231  * all the inode items, otherwise we are not sure and must do a transaction
5232  * commit (the concurrent task might have only updated last_unlink_trans before
5233  * we logged the inode or it might have also done the unlink).
5234  */
5235 static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
5236                                           struct btrfs_inode *inode)
5237 {
5238         struct btrfs_fs_info *fs_info = inode->root->fs_info;
5239         bool ret = false;
5240
5241         mutex_lock(&inode->log_mutex);
5242         if (inode->last_unlink_trans > fs_info->last_trans_committed) {
5243                 /*
5244                  * Make sure any commits to the log are forced to be full
5245                  * commits.
5246                  */
5247                 btrfs_set_log_full_commit(fs_info, trans);
5248                 ret = true;
5249         }
5250         mutex_unlock(&inode->log_mutex);
5251
5252         return ret;
5253 }
5254
5255 /*
5256  * follow the dentry parent pointers up the chain and see if any
5257  * of the directories in it require a full commit before they can
5258  * be logged.  Returns zero if nothing special needs to be done or 1 if
5259  * a full commit is required.
5260  */
5261 static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
5262                                                struct btrfs_inode *inode,
5263                                                struct dentry *parent,
5264                                                struct super_block *sb,
5265                                                u64 last_committed)
5266 {
5267         int ret = 0;
5268         struct dentry *old_parent = NULL;
5269
5270         /*
5271          * for regular files, if its inode is already on disk, we don't
5272          * have to worry about the parents at all.  This is because
5273          * we can use the last_unlink_trans field to record renames
5274          * and other fun in this file.
5275          */
5276         if (S_ISREG(inode->vfs_inode.i_mode) &&
5277             inode->generation <= last_committed &&
5278             inode->last_unlink_trans <= last_committed)
5279                 goto out;
5280
5281         if (!S_ISDIR(inode->vfs_inode.i_mode)) {
5282                 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5283                         goto out;
5284                 inode = BTRFS_I(d_inode(parent));
5285         }
5286
5287         while (1) {
5288                 if (btrfs_must_commit_transaction(trans, inode)) {
5289                         ret = 1;
5290                         break;
5291                 }
5292
5293                 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5294                         break;
5295
5296                 if (IS_ROOT(parent)) {
5297                         inode = BTRFS_I(d_inode(parent));
5298                         if (btrfs_must_commit_transaction(trans, inode))
5299                                 ret = 1;
5300                         break;
5301                 }
5302
5303                 parent = dget_parent(parent);
5304                 dput(old_parent);
5305                 old_parent = parent;
5306                 inode = BTRFS_I(d_inode(parent));
5307
5308         }
5309         dput(old_parent);
5310 out:
5311         return ret;
5312 }
5313
5314 struct btrfs_dir_list {
5315         u64 ino;
5316         struct list_head list;
5317 };
5318
5319 /*
5320  * Log the inodes of the new dentries of a directory. See log_dir_items() for
5321  * details about the why it is needed.
5322  * This is a recursive operation - if an existing dentry corresponds to a
5323  * directory, that directory's new entries are logged too (same behaviour as
5324  * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5325  * the dentries point to we do not lock their i_mutex, otherwise lockdep
5326  * complains about the following circular lock dependency / possible deadlock:
5327  *
5328  *        CPU0                                        CPU1
5329  *        ----                                        ----
5330  * lock(&type->i_mutex_dir_key#3/2);
5331  *                                            lock(sb_internal#2);
5332  *                                            lock(&type->i_mutex_dir_key#3/2);
5333  * lock(&sb->s_type->i_mutex_key#14);
5334  *
5335  * Where sb_internal is the lock (a counter that works as a lock) acquired by
5336  * sb_start_intwrite() in btrfs_start_transaction().
5337  * Not locking i_mutex of the inodes is still safe because:
5338  *
5339  * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5340  *    that while logging the inode new references (names) are added or removed
5341  *    from the inode, leaving the logged inode item with a link count that does
5342  *    not match the number of logged inode reference items. This is fine because
5343  *    at log replay time we compute the real number of links and correct the
5344  *    link count in the inode item (see replay_one_buffer() and
5345  *    link_to_fixup_dir());
5346  *
5347  * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5348  *    while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
5349  *    BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
5350  *    has a size that doesn't match the sum of the lengths of all the logged
5351  *    names. This does not result in a problem because if a dir_item key is
5352  *    logged but its matching dir_index key is not logged, at log replay time we
5353  *    don't use it to replay the respective name (see replay_one_name()). On the
5354  *    other hand if only the dir_index key ends up being logged, the respective
5355  *    name is added to the fs/subvol tree with both the dir_item and dir_index
5356  *    keys created (see replay_one_name()).
5357  *    The directory's inode item with a wrong i_size is not a problem as well,
5358  *    since we don't use it at log replay time to set the i_size in the inode
5359  *    item of the fs/subvol tree (see overwrite_item()).
5360  */
5361 static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5362                                 struct btrfs_root *root,
5363                                 struct btrfs_inode *start_inode,
5364                                 struct btrfs_log_ctx *ctx)
5365 {
5366         struct btrfs_fs_info *fs_info = root->fs_info;
5367         struct btrfs_root *log = root->log_root;
5368         struct btrfs_path *path;
5369         LIST_HEAD(dir_list);
5370         struct btrfs_dir_list *dir_elem;
5371         int ret = 0;
5372
5373         path = btrfs_alloc_path();
5374         if (!path)
5375                 return -ENOMEM;
5376
5377         dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5378         if (!dir_elem) {
5379                 btrfs_free_path(path);
5380                 return -ENOMEM;
5381         }
5382         dir_elem->ino = btrfs_ino(start_inode);
5383         list_add_tail(&dir_elem->list, &dir_list);
5384
5385         while (!list_empty(&dir_list)) {
5386                 struct extent_buffer *leaf;
5387                 struct btrfs_key min_key;
5388                 int nritems;
5389                 int i;
5390
5391                 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5392                                             list);
5393                 if (ret)
5394                         goto next_dir_inode;
5395
5396                 min_key.objectid = dir_elem->ino;
5397                 min_key.type = BTRFS_DIR_ITEM_KEY;
5398                 min_key.offset = 0;
5399 again:
5400                 btrfs_release_path(path);
5401                 ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5402                 if (ret < 0) {
5403                         goto next_dir_inode;
5404                 } else if (ret > 0) {
5405                         ret = 0;
5406                         goto next_dir_inode;
5407                 }
5408
5409 process_leaf:
5410                 leaf = path->nodes[0];
5411                 nritems = btrfs_header_nritems(leaf);
5412                 for (i = path->slots[0]; i < nritems; i++) {
5413                         struct btrfs_dir_item *di;
5414                         struct btrfs_key di_key;
5415                         struct inode *di_inode;
5416                         struct btrfs_dir_list *new_dir_elem;
5417                         int log_mode = LOG_INODE_EXISTS;
5418                         int type;
5419
5420                         btrfs_item_key_to_cpu(leaf, &min_key, i);
5421                         if (min_key.objectid != dir_elem->ino ||
5422                             min_key.type != BTRFS_DIR_ITEM_KEY)
5423                                 goto next_dir_inode;
5424
5425                         di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5426                         type = btrfs_dir_type(leaf, di);
5427                         if (btrfs_dir_transid(leaf, di) < trans->transid &&
5428                             type != BTRFS_FT_DIR)
5429                                 continue;
5430                         btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5431                         if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5432                                 continue;
5433
5434                         btrfs_release_path(path);
5435                         di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
5436                         if (IS_ERR(di_inode)) {
5437                                 ret = PTR_ERR(di_inode);
5438                                 goto next_dir_inode;
5439                         }
5440
5441                         if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
5442                                 btrfs_add_delayed_iput(di_inode);
5443                                 break;
5444                         }
5445
5446                         ctx->log_new_dentries = false;
5447                         if (type == BTRFS_FT_DIR)
5448                                 log_mode = LOG_INODE_ALL;
5449                         ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
5450                                               log_mode, 0, LLONG_MAX, ctx);
5451                         if (!ret &&
5452                             btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
5453                                 ret = 1;
5454                         btrfs_add_delayed_iput(di_inode);
5455                         if (ret)
5456                                 goto next_dir_inode;
5457                         if (ctx->log_new_dentries) {
5458                                 new_dir_elem = kmalloc(sizeof(*new_dir_elem),
5459                                                        GFP_NOFS);
5460                                 if (!new_dir_elem) {
5461                                         ret = -ENOMEM;
5462                                         goto next_dir_inode;
5463                                 }
5464                                 new_dir_elem->ino = di_key.objectid;
5465                                 list_add_tail(&new_dir_elem->list, &dir_list);
5466                         }
5467                         break;
5468                 }
5469                 if (i == nritems) {
5470                         ret = btrfs_next_leaf(log, path);
5471                         if (ret < 0) {
5472                                 goto next_dir_inode;
5473                         } else if (ret > 0) {
5474                                 ret = 0;
5475                                 goto next_dir_inode;
5476                         }
5477                         goto process_leaf;
5478                 }
5479                 if (min_key.offset < (u64)-1) {
5480                         min_key.offset++;
5481                         goto again;
5482                 }
5483 next_dir_inode:
5484                 list_del(&dir_elem->list);
5485                 kfree(dir_elem);
5486         }
5487
5488         btrfs_free_path(path);
5489         return ret;
5490 }
5491
5492 static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5493                                  struct btrfs_inode *inode,
5494                                  struct btrfs_log_ctx *ctx)
5495 {
5496         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
5497         int ret;
5498         struct btrfs_path *path;
5499         struct btrfs_key key;
5500         struct btrfs_root *root = inode->root;
5501         const u64 ino = btrfs_ino(inode);
5502
5503         path = btrfs_alloc_path();
5504         if (!path)
5505                 return -ENOMEM;
5506         path->skip_locking = 1;
5507         path->search_commit_root = 1;
5508
5509         key.objectid = ino;
5510         key.type = BTRFS_INODE_REF_KEY;
5511         key.offset = 0;
5512         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5513         if (ret < 0)
5514                 goto out;
5515
5516         while (true) {
5517                 struct extent_buffer *leaf = path->nodes[0];
5518                 int slot = path->slots[0];
5519                 u32 cur_offset = 0;
5520                 u32 item_size;
5521                 unsigned long ptr;
5522
5523                 if (slot >= btrfs_header_nritems(leaf)) {
5524                         ret = btrfs_next_leaf(root, path);
5525                         if (ret < 0)
5526                                 goto out;
5527                         else if (ret > 0)
5528                                 break;
5529                         continue;
5530                 }
5531
5532                 btrfs_item_key_to_cpu(leaf, &key, slot);
5533                 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5534                 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5535                         break;
5536
5537                 item_size = btrfs_item_size_nr(leaf, slot);
5538                 ptr = btrfs_item_ptr_offset(leaf, slot);
5539                 while (cur_offset < item_size) {
5540                         struct btrfs_key inode_key;
5541                         struct inode *dir_inode;
5542
5543                         inode_key.type = BTRFS_INODE_ITEM_KEY;
5544                         inode_key.offset = 0;
5545
5546                         if (key.type == BTRFS_INODE_EXTREF_KEY) {
5547                                 struct btrfs_inode_extref *extref;
5548
5549                                 extref = (struct btrfs_inode_extref *)
5550                                         (ptr + cur_offset);
5551                                 inode_key.objectid = btrfs_inode_extref_parent(
5552                                         leaf, extref);
5553                                 cur_offset += sizeof(*extref);
5554                                 cur_offset += btrfs_inode_extref_name_len(leaf,
5555                                         extref);
5556                         } else {
5557                                 inode_key.objectid = key.offset;
5558                                 cur_offset = item_size;
5559                         }
5560
5561                         dir_inode = btrfs_iget(fs_info->sb, &inode_key,
5562                                                root, NULL);
5563                         /*
5564                          * If the parent inode was deleted, return an error to
5565                          * fallback to a transaction commit. This is to prevent
5566                          * getting an inode that was moved from one parent A to
5567                          * a parent B, got its former parent A deleted and then
5568                          * it got fsync'ed, from existing at both parents after
5569                          * a log replay (and the old parent still existing).
5570                          * Example:
5571                          *
5572                          * mkdir /mnt/A
5573                          * mkdir /mnt/B
5574                          * touch /mnt/B/bar
5575                          * sync
5576                          * mv /mnt/B/bar /mnt/A/bar
5577                          * mv -T /mnt/A /mnt/B
5578                          * fsync /mnt/B/bar
5579                          * <power fail>
5580                          *
5581                          * If we ignore the old parent B which got deleted,
5582                          * after a log replay we would have file bar linked
5583                          * at both parents and the old parent B would still
5584                          * exist.
5585                          */
5586                         if (IS_ERR(dir_inode)) {
5587                                 ret = PTR_ERR(dir_inode);
5588                                 goto out;
5589                         }
5590
5591                         if (ctx)
5592                                 ctx->log_new_dentries = false;
5593                         ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
5594                                               LOG_INODE_ALL, 0, LLONG_MAX, ctx);
5595                         if (!ret &&
5596                             btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
5597                                 ret = 1;
5598                         if (!ret && ctx && ctx->log_new_dentries)
5599                                 ret = log_new_dir_dentries(trans, root,
5600                                                    BTRFS_I(dir_inode), ctx);
5601                         btrfs_add_delayed_iput(dir_inode);
5602                         if (ret)
5603                                 goto out;
5604                 }
5605                 path->slots[0]++;
5606         }
5607         ret = 0;
5608 out:
5609         btrfs_free_path(path);
5610         return ret;
5611 }
5612
5613 /*
5614  * helper function around btrfs_log_inode to make sure newly created
5615  * parent directories also end up in the log.  A minimal inode and backref
5616  * only logging is done of any parent directories that are older than
5617  * the last committed transaction
5618  */
5619 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
5620                                   struct btrfs_root *root,
5621                                   struct btrfs_inode *inode,
5622                                   struct dentry *parent,
5623                                   const loff_t start,
5624                                   const loff_t end,
5625                                   int exists_only,
5626                                   struct btrfs_log_ctx *ctx)
5627 {
5628         struct btrfs_fs_info *fs_info = root->fs_info;
5629         int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
5630         struct super_block *sb;
5631         struct dentry *old_parent = NULL;
5632         int ret = 0;
5633         u64 last_committed = fs_info->last_trans_committed;
5634         bool log_dentries = false;
5635         struct btrfs_inode *orig_inode = inode;
5636
5637         sb = inode->vfs_inode.i_sb;
5638
5639         if (btrfs_test_opt(fs_info, NOTREELOG)) {
5640                 ret = 1;
5641                 goto end_no_trans;
5642         }
5643
5644         /*
5645          * The prev transaction commit doesn't complete, we need do
5646          * full commit by ourselves.
5647          */
5648         if (fs_info->last_trans_log_full_commit >
5649             fs_info->last_trans_committed) {
5650                 ret = 1;
5651                 goto end_no_trans;
5652         }
5653
5654         if (root != inode->root || btrfs_root_refs(&root->root_item) == 0) {
5655                 ret = 1;
5656                 goto end_no_trans;
5657         }
5658
5659         ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
5660                         last_committed);
5661         if (ret)
5662                 goto end_no_trans;
5663
5664         /*
5665          * Skip already logged inodes or inodes corresponding to tmpfiles
5666          * (since logging them is pointless, a link count of 0 means they
5667          * will never be accessible).
5668          */
5669         if (btrfs_inode_in_log(inode, trans->transid) ||
5670             inode->vfs_inode.i_nlink == 0) {
5671                 ret = BTRFS_NO_LOG_SYNC;
5672                 goto end_no_trans;
5673         }
5674
5675         ret = start_log_trans(trans, root, ctx);
5676         if (ret)
5677                 goto end_no_trans;
5678
5679         ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
5680         if (ret)
5681                 goto end_trans;
5682
5683         /*
5684          * for regular files, if its inode is already on disk, we don't
5685          * have to worry about the parents at all.  This is because
5686          * we can use the last_unlink_trans field to record renames
5687          * and other fun in this file.
5688          */
5689         if (S_ISREG(inode->vfs_inode.i_mode) &&
5690             inode->generation <= last_committed &&
5691             inode->last_unlink_trans <= last_committed) {
5692                 ret = 0;
5693                 goto end_trans;
5694         }
5695
5696         if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
5697                 log_dentries = true;
5698
5699         /*
5700          * On unlink we must make sure all our current and old parent directory
5701          * inodes are fully logged. This is to prevent leaving dangling
5702          * directory index entries in directories that were our parents but are
5703          * not anymore. Not doing this results in old parent directory being
5704          * impossible to delete after log replay (rmdir will always fail with
5705          * error -ENOTEMPTY).
5706          *
5707          * Example 1:
5708          *
5709          * mkdir testdir
5710          * touch testdir/foo
5711          * ln testdir/foo testdir/bar
5712          * sync
5713          * unlink testdir/bar
5714          * xfs_io -c fsync testdir/foo
5715          * <power failure>
5716          * mount fs, triggers log replay
5717          *
5718          * If we don't log the parent directory (testdir), after log replay the
5719          * directory still has an entry pointing to the file inode using the bar
5720          * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
5721          * the file inode has a link count of 1.
5722          *
5723          * Example 2:
5724          *
5725          * mkdir testdir
5726          * touch foo
5727          * ln foo testdir/foo2
5728          * ln foo testdir/foo3
5729          * sync
5730          * unlink testdir/foo3
5731          * xfs_io -c fsync foo
5732          * <power failure>
5733          * mount fs, triggers log replay
5734          *
5735          * Similar as the first example, after log replay the parent directory
5736          * testdir still has an entry pointing to the inode file with name foo3
5737          * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
5738          * and has a link count of 2.
5739          */
5740         if (inode->last_unlink_trans > last_committed) {
5741                 ret = btrfs_log_all_parents(trans, orig_inode, ctx);
5742                 if (ret)
5743                         goto end_trans;
5744         }
5745
5746         /*
5747          * If a new hard link was added to the inode in the current transaction
5748          * and its link count is now greater than 1, we need to fallback to a
5749          * transaction commit, otherwise we can end up not logging all its new
5750          * parents for all the hard links. Here just from the dentry used to
5751          * fsync, we can not visit the ancestor inodes for all the other hard
5752          * links to figure out if any is new, so we fallback to a transaction
5753          * commit (instead of adding a lot of complexity of scanning a btree,
5754          * since this scenario is not a common use case).
5755          */
5756         if (inode->vfs_inode.i_nlink > 1 &&
5757             inode->last_link_trans > last_committed) {
5758                 ret = -EMLINK;
5759                 goto end_trans;
5760         }
5761
5762         while (1) {
5763                 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5764                         break;
5765
5766                 inode = BTRFS_I(d_inode(parent));
5767                 if (root != inode->root)
5768                         break;
5769
5770                 if (inode->generation > last_committed) {
5771                         ret = btrfs_log_inode(trans, root, inode,
5772                                         LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
5773                         if (ret)
5774                                 goto end_trans;
5775                 }
5776                 if (IS_ROOT(parent))
5777                         break;
5778
5779                 parent = dget_parent(parent);
5780                 dput(old_parent);
5781                 old_parent = parent;
5782         }
5783         if (log_dentries)
5784                 ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
5785         else
5786                 ret = 0;
5787 end_trans:
5788         dput(old_parent);
5789         if (ret < 0) {
5790                 btrfs_set_log_full_commit(fs_info, trans);
5791                 ret = 1;
5792         }
5793
5794         if (ret)
5795                 btrfs_remove_log_ctx(root, ctx);
5796         btrfs_end_log_trans(root);
5797 end_no_trans:
5798         return ret;
5799 }
5800
5801 /*
5802  * it is not safe to log dentry if the chunk root has added new
5803  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
5804  * If this returns 1, you must commit the transaction to safely get your
5805  * data on disk.
5806  */
5807 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
5808                           struct btrfs_root *root, struct dentry *dentry,
5809                           const loff_t start,
5810                           const loff_t end,
5811                           struct btrfs_log_ctx *ctx)
5812 {
5813         struct dentry *parent = dget_parent(dentry);
5814         int ret;
5815
5816         ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)),
5817                         parent, start, end, 0, ctx);
5818         dput(parent);
5819
5820         return ret;
5821 }
5822
5823 /*
5824  * should be called during mount to recover any replay any log trees
5825  * from the FS
5826  */
5827 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
5828 {
5829         int ret;
5830         struct btrfs_path *path;
5831         struct btrfs_trans_handle *trans;
5832         struct btrfs_key key;
5833         struct btrfs_key found_key;
5834         struct btrfs_key tmp_key;
5835         struct btrfs_root *log;
5836         struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
5837         struct walk_control wc = {
5838                 .process_func = process_one_buffer,
5839                 .stage = 0,
5840         };
5841
5842         path = btrfs_alloc_path();
5843         if (!path)
5844                 return -ENOMEM;
5845
5846         set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
5847
5848         trans = btrfs_start_transaction(fs_info->tree_root, 0);
5849         if (IS_ERR(trans)) {
5850                 ret = PTR_ERR(trans);
5851                 goto error;
5852         }
5853
5854         wc.trans = trans;
5855         wc.pin = 1;
5856
5857         ret = walk_log_tree(trans, log_root_tree, &wc);
5858         if (ret) {
5859                 btrfs_handle_fs_error(fs_info, ret,
5860                         "Failed to pin buffers while recovering log root tree.");
5861                 goto error;
5862         }
5863
5864 again:
5865         key.objectid = BTRFS_TREE_LOG_OBJECTID;
5866         key.offset = (u64)-1;
5867         key.type = BTRFS_ROOT_ITEM_KEY;
5868
5869         while (1) {
5870                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
5871
5872                 if (ret < 0) {
5873                         btrfs_handle_fs_error(fs_info, ret,
5874                                     "Couldn't find tree log root.");
5875                         goto error;
5876                 }
5877                 if (ret > 0) {
5878                         if (path->slots[0] == 0)
5879                                 break;
5880                         path->slots[0]--;
5881                 }
5882                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
5883                                       path->slots[0]);
5884                 btrfs_release_path(path);
5885                 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
5886                         break;
5887
5888                 log = btrfs_read_fs_root(log_root_tree, &found_key);
5889                 if (IS_ERR(log)) {
5890                         ret = PTR_ERR(log);
5891                         btrfs_handle_fs_error(fs_info, ret,
5892                                     "Couldn't read tree log root.");
5893                         goto error;
5894                 }
5895
5896                 tmp_key.objectid = found_key.offset;
5897                 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
5898                 tmp_key.offset = (u64)-1;
5899
5900                 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
5901                 if (IS_ERR(wc.replay_dest)) {
5902                         ret = PTR_ERR(wc.replay_dest);
5903
5904                         /*
5905                          * We didn't find the subvol, likely because it was
5906                          * deleted.  This is ok, simply skip this log and go to
5907                          * the next one.
5908                          *
5909                          * We need to exclude the root because we can't have
5910                          * other log replays overwriting this log as we'll read
5911                          * it back in a few more times.  This will keep our
5912                          * block from being modified, and we'll just bail for
5913                          * each subsequent pass.
5914                          */
5915                         if (ret == -ENOENT)
5916                                 ret = btrfs_pin_extent_for_log_replay(fs_info,
5917                                                         log->node->start,
5918                                                         log->node->len);
5919                         free_extent_buffer(log->node);
5920                         free_extent_buffer(log->commit_root);
5921                         kfree(log);
5922
5923                         if (!ret)
5924                                 goto next;
5925                         btrfs_handle_fs_error(fs_info, ret,
5926                                 "Couldn't read target root for tree log recovery.");
5927                         goto error;
5928                 }
5929
5930                 wc.replay_dest->log_root = log;
5931                 btrfs_record_root_in_trans(trans, wc.replay_dest);
5932                 ret = walk_log_tree(trans, log, &wc);
5933
5934                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
5935                         ret = fixup_inode_link_counts(trans, wc.replay_dest,
5936                                                       path);
5937                 }
5938
5939                 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
5940                         struct btrfs_root *root = wc.replay_dest;
5941
5942                         btrfs_release_path(path);
5943
5944                         /*
5945                          * We have just replayed everything, and the highest
5946                          * objectid of fs roots probably has changed in case
5947                          * some inode_item's got replayed.
5948                          *
5949                          * root->objectid_mutex is not acquired as log replay
5950                          * could only happen during mount.
5951                          */
5952                         ret = btrfs_find_highest_objectid(root,
5953                                                   &root->highest_objectid);
5954                 }
5955
5956                 wc.replay_dest->log_root = NULL;
5957                 free_extent_buffer(log->node);
5958                 free_extent_buffer(log->commit_root);
5959                 kfree(log);
5960
5961                 if (ret)
5962                         goto error;
5963 next:
5964                 if (found_key.offset == 0)
5965                         break;
5966                 key.offset = found_key.offset - 1;
5967         }
5968         btrfs_release_path(path);
5969
5970         /* step one is to pin it all, step two is to replay just inodes */
5971         if (wc.pin) {
5972                 wc.pin = 0;
5973                 wc.process_func = replay_one_buffer;
5974                 wc.stage = LOG_WALK_REPLAY_INODES;
5975                 goto again;
5976         }
5977         /* step three is to replay everything */
5978         if (wc.stage < LOG_WALK_REPLAY_ALL) {
5979                 wc.stage++;
5980                 goto again;
5981         }
5982
5983         btrfs_free_path(path);
5984
5985         /* step 4: commit the transaction, which also unpins the blocks */
5986         ret = btrfs_commit_transaction(trans);
5987         if (ret)
5988                 return ret;
5989
5990         free_extent_buffer(log_root_tree->node);
5991         log_root_tree->log_root = NULL;
5992         clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
5993         kfree(log_root_tree);
5994
5995         return 0;
5996 error:
5997         if (wc.trans)
5998                 btrfs_end_transaction(wc.trans);
5999         clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6000         btrfs_free_path(path);
6001         return ret;
6002 }
6003
6004 /*
6005  * there are some corner cases where we want to force a full
6006  * commit instead of allowing a directory to be logged.
6007  *
6008  * They revolve around files there were unlinked from the directory, and
6009  * this function updates the parent directory so that a full commit is
6010  * properly done if it is fsync'd later after the unlinks are done.
6011  *
6012  * Must be called before the unlink operations (updates to the subvolume tree,
6013  * inodes, etc) are done.
6014  */
6015 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
6016                              struct btrfs_inode *dir, struct btrfs_inode *inode,
6017                              int for_rename)
6018 {
6019         /*
6020          * when we're logging a file, if it hasn't been renamed
6021          * or unlinked, and its inode is fully committed on disk,
6022          * we don't have to worry about walking up the directory chain
6023          * to log its parents.
6024          *
6025          * So, we use the last_unlink_trans field to put this transid
6026          * into the file.  When the file is logged we check it and
6027          * don't log the parents if the file is fully on disk.
6028          */
6029         mutex_lock(&inode->log_mutex);
6030         inode->last_unlink_trans = trans->transid;
6031         mutex_unlock(&inode->log_mutex);
6032
6033         /*
6034          * if this directory was already logged any new
6035          * names for this file/dir will get recorded
6036          */
6037         if (dir->logged_trans == trans->transid)
6038                 return;
6039
6040         /*
6041          * if the inode we're about to unlink was logged,
6042          * the log will be properly updated for any new names
6043          */
6044         if (inode->logged_trans == trans->transid)
6045                 return;
6046
6047         /*
6048          * when renaming files across directories, if the directory
6049          * there we're unlinking from gets fsync'd later on, there's
6050          * no way to find the destination directory later and fsync it
6051          * properly.  So, we have to be conservative and force commits
6052          * so the new name gets discovered.
6053          */
6054         if (for_rename)
6055                 goto record;
6056
6057         /* we can safely do the unlink without any special recording */
6058         return;
6059
6060 record:
6061         mutex_lock(&dir->log_mutex);
6062         dir->last_unlink_trans = trans->transid;
6063         mutex_unlock(&dir->log_mutex);
6064 }
6065
6066 /*
6067  * Make sure that if someone attempts to fsync the parent directory of a deleted
6068  * snapshot, it ends up triggering a transaction commit. This is to guarantee
6069  * that after replaying the log tree of the parent directory's root we will not
6070  * see the snapshot anymore and at log replay time we will not see any log tree
6071  * corresponding to the deleted snapshot's root, which could lead to replaying
6072  * it after replaying the log tree of the parent directory (which would replay
6073  * the snapshot delete operation).
6074  *
6075  * Must be called before the actual snapshot destroy operation (updates to the
6076  * parent root and tree of tree roots trees, etc) are done.
6077  */
6078 void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
6079                                    struct btrfs_inode *dir)
6080 {
6081         mutex_lock(&dir->log_mutex);
6082         dir->last_unlink_trans = trans->transid;
6083         mutex_unlock(&dir->log_mutex);
6084 }
6085
6086 /*
6087  * Call this after adding a new name for a file and it will properly
6088  * update the log to reflect the new name.
6089  *
6090  * It will return zero if all goes well, and it will return 1 if a
6091  * full transaction commit is required.
6092  */
6093 int btrfs_log_new_name(struct btrfs_trans_handle *trans,
6094                         struct btrfs_inode *inode, struct btrfs_inode *old_dir,
6095                         struct dentry *parent)
6096 {
6097         struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6098         struct btrfs_root *root = inode->root;
6099
6100         /*
6101          * this will force the logging code to walk the dentry chain
6102          * up for the file
6103          */
6104         if (!S_ISDIR(inode->vfs_inode.i_mode))
6105                 inode->last_unlink_trans = trans->transid;
6106
6107         /*
6108          * if this inode hasn't been logged and directory we're renaming it
6109          * from hasn't been logged, we don't need to log it
6110          */
6111         if (inode->logged_trans <= fs_info->last_trans_committed &&
6112             (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
6113                 return 0;
6114
6115         return btrfs_log_inode_parent(trans, root, inode, parent, 0,
6116                                       LLONG_MAX, 1, NULL);
6117 }
6118