1 // SPDX-License-Identifier: GPL-2.0
2
3 /*
4  * fs/ext4/fast_commit.c
5  *
6  * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com>
7  *
8  * Ext4 fast commit routines.
9  */
10 #include "ext4.h"
11 #include "ext4_jbd2.h"
12 #include "ext4_extents.h"
13 #include "mballoc.h"
14
15 /*
16  * Ext4 Fast Commits
17  * -----------------
18  *
19  * Ext4 fast commits implement fine-grained journalling for Ext4.
20  *
21  * Fast commits are organized as a log of tag-length-value (TLV) structs (see
22  * struct ext4_fc_tl; an illustrative walk of the TLV stream is sketched just
23  * after this comment). Each TLV contains some delta that is replayed TLV by
24  * TLV during the recovery phase. For scenarios that currently lack replay
25  * code, fast commit falls back to full commits. Fast commits record deltas
26  * in one of the following three categories:
27  * (A) Directory entry updates:
28  *
29  * - EXT4_FC_TAG_UNLINK         - records directory entry unlink
30  * - EXT4_FC_TAG_LINK           - records directory entry link
31  * - EXT4_FC_TAG_CREAT          - records inode and directory entry creation
32  *
33  * (B) File specific data range updates:
34  *
35  * - EXT4_FC_TAG_ADD_RANGE      - records addition of new blocks to an inode
36  * - EXT4_FC_TAG_DEL_RANGE      - records deletion of blocks from an inode
37  *
38  * (C) Inode metadata (mtime / ctime etc):
39  *
40  * - EXT4_FC_TAG_INODE          - records the inode that should be replayed
41  *                                during recovery. Note that the iblocks field
42  *                                is not replayed but instead derived during
43  *                                replay.
44  * Commit Operation
45  * ----------------
46  * With fast commits, we maintain all the directory entry operations, in the
47  * order in which they are issued, in an in-memory queue. This queue is flushed
48  * to disk during the commit operation. We also maintain a list of inodes that
49  * need to be committed during a fast commit in another in-memory queue of
50  * inodes. During the commit operation, we commit in the following order:
51  *
52  * [1] Lock inodes for any further data updates by setting COMMITTING state
53  * [2] Submit data buffers of all the inodes
54  * [3] Wait for [2] to complete
55  * [4] Commit all the directory entry updates in the fast commit space
56  * [5] Commit all the changed inode structures
57  * [6] Write tail tag (this tag ensures the atomicity, please read the following
58  *     section for more details).
59  * [7] Wait for [4], [5] and [6] to complete.
60  *
61  * Every inode update must call ext4_fc_start_update() before starting the
62  * update. If such an update is ongoing, a fast commit waits for it to
63  * complete. The completion of such an update is marked by
64  * ext4_fc_stop_update().
65  *
66  * Fast Commit Ineligibility
67  * -------------------------
68  * Not all operations are supported by fast commits today (e.g. extended
69  * attributes). Fast commit ineligibility is marked by calling one of the
70  * two following functions:
71  *
72  * - ext4_fc_mark_ineligible(): This makes the next fast commit operation
73  *   fall back to a full commit. This is useful in case of transient errors.
74  *
75  * - ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() - These make all
76  *   the fast commits happening between ext4_fc_start_ineligible() and
77  *   ext4_fc_stop_ineligible(), plus the first fast commit after the call to
78  *   ext4_fc_stop_ineligible(), fall back to full commits. Making one more
79  *   fast commit fall back to a full commit after the stop call guarantees
80  *   that the fast commit ineligible operation contained within
81  *   ext4_fc_start_ineligible() and ext4_fc_stop_ineligible() is
82  *   followed by at least one full commit.
83  *
84  * Atomicity of commits
85  * --------------------
86  * In order to guarantee atomicity during the commit operation, fast commit
87  * uses the EXT4_FC_TAG_TAIL tag, which marks a fast commit as complete. The
88  * tail tag contains the CRC of the contents and the TID of the transaction
89  * after which this fast commit should be applied. Recovery code replays fast
90  * commit logs only if there's at least 1 valid tail present. For every fast
91  * commit operation, there is 1 tail. This means we may end up with multiple
92  * tails in the fast commit space. Here's an example:
93  *
94  * - Create a new file A and remove existing file B
95  * - fsync()
96  * - Append contents to file A
97  * - Truncate file A
98  * - fsync()
99  *
100  * The fast commit space at the end of above operations would look like this:
101  *      [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL]
102  *             |<---  Fast Commit 1   --->|<---      Fast Commit 2     ---->|
103  *
104  * Replay code should thus check for all the valid tails in the FC area.
105  *
106  * TODOs
107  * -----
108  * 1) Make fast commit atomic updates more fine-grained. Today, a fast commit
109  *    eligible update must be protected within ext4_fc_start_update() and
110  *    ext4_fc_stop_update(). These routines are called at a much higher level
111  *    than necessary. This can be made more fine-grained by combining them
112  *    with ext4_journal_start().
113  *
114  * 2) Same as above for ext4_fc_start_ineligible() and ext4_fc_stop_ineligible()
115  *
116  * 3) Handle more ineligible cases.
117  */
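/*
 * Editorial sketch (illustrative only, not part of this file's logic): one
 * way a reader might walk the TLV stream of a single fast commit block. It
 * assumes only the real struct ext4_fc_tl fields fc_tag and fc_len; the
 * value bytes follow the header immediately, and EXT4_FC_TAG_PAD fills any
 * unused space at the end of a block.
 *
 *	u8 *cur = block, *end = block + blocksize;
 *
 *	while (cur + sizeof(struct ext4_fc_tl) <= end) {
 *		struct ext4_fc_tl tl;
 *
 *		memcpy(&tl, cur, sizeof(tl));
 *		switch (le16_to_cpu(tl.fc_tag)) {
 *		case EXT4_FC_TAG_PAD:
 *			break;		// unused space, just skip over it
 *		case EXT4_FC_TAG_TAIL:
 *			// verify CRC/TID; this fast commit is complete
 *			break;
 *		default:
 *			// replay one delta (ADD_RANGE, CREAT, LINK, ...)
 *			break;
 *		}
 *		cur += sizeof(tl) + le16_to_cpu(tl.fc_len);
 *	}
 */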
118
119 #include <trace/events/ext4.h>
120 static struct kmem_cache *ext4_fc_dentry_cachep;
121
122 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
123 {
124         BUFFER_TRACE(bh, "");
125         if (uptodate) {
126                 ext4_debug("%s: Block %lld up-to-date",
127                            __func__, bh->b_blocknr);
128                 set_buffer_uptodate(bh);
129         } else {
130                 ext4_debug("%s: Block %lld not up-to-date",
131                            __func__, bh->b_blocknr);
132                 clear_buffer_uptodate(bh);
133         }
134
135         unlock_buffer(bh);
136 }
137
138 static inline void ext4_fc_reset_inode(struct inode *inode)
139 {
140         struct ext4_inode_info *ei = EXT4_I(inode);
141
142         ei->i_fc_lblk_start = 0;
143         ei->i_fc_lblk_len = 0;
144 }
145
146 void ext4_fc_init_inode(struct inode *inode)
147 {
148         struct ext4_inode_info *ei = EXT4_I(inode);
149
150         ext4_fc_reset_inode(inode);
151         ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
152         INIT_LIST_HEAD(&ei->i_fc_list);
153         init_waitqueue_head(&ei->i_fc_wait);
154         atomic_set(&ei->i_fc_updates, 0);
155 }
156
157 /* This function must be called with sbi->s_fc_lock held. */
158 static void ext4_fc_wait_committing_inode(struct inode *inode)
159 __releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
160 {
161         wait_queue_head_t *wq;
162         struct ext4_inode_info *ei = EXT4_I(inode);
163
164 #if (BITS_PER_LONG < 64)
165         DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
166                         EXT4_STATE_FC_COMMITTING);
167         wq = bit_waitqueue(&ei->i_state_flags,
168                                 EXT4_STATE_FC_COMMITTING);
169 #else
170         DEFINE_WAIT_BIT(wait, &ei->i_flags,
171                         EXT4_STATE_FC_COMMITTING);
172         wq = bit_waitqueue(&ei->i_flags,
173                                 EXT4_STATE_FC_COMMITTING);
174 #endif
175         lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
176         prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
177         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
178         schedule();
179         finish_wait(wq, &wait.wq_entry);
180 }
181
182 /*
183  * Inform Ext4's fast commit subsystem about the start of an inode update
184  *
185  * This function is called from high-level VFS callbacks before performing
186  * any inode update. It blocks if there's an ongoing fast commit on the
187  * inode in question.
188  */
189 void ext4_fc_start_update(struct inode *inode)
190 {
191         struct ext4_inode_info *ei = EXT4_I(inode);
192
193         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
194             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
195                 return;
196
197 restart:
198         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
199         if (list_empty(&ei->i_fc_list))
200                 goto out;
201
202         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
203                 ext4_fc_wait_committing_inode(inode);
204                 goto restart;
205         }
206 out:
207         atomic_inc(&ei->i_fc_updates);
208         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
209 }
210
211 /*
212  * Stop inode update and wake up waiting fast commits if any.
213  */
214 void ext4_fc_stop_update(struct inode *inode)
215 {
216         struct ext4_inode_info *ei = EXT4_I(inode);
217
218         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
219             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
220                 return;
221
222         if (atomic_dec_and_test(&ei->i_fc_updates))
223                 wake_up_all(&ei->i_fc_wait);
224 }
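/*
 * Editorial sketch (hypothetical call site, not code from this file): a
 * VFS-level inode update brackets its work with the two helpers above so
 * that a concurrent fast commit never sees a half-done update.
 *
 *	ext4_fc_start_update(inode);
 *	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 *	if (!IS_ERR(handle)) {
 *		// ... modify and dirty the inode ...
 *		ext4_journal_stop(handle);
 *	}
 *	ext4_fc_stop_update(inode);
 */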
225
226 /*
227  * Remove the inode from the fast commit list. If the inode is being
228  * committed, we wait until the inode commit is done.
229  */
230 void ext4_fc_del(struct inode *inode)
231 {
232         struct ext4_inode_info *ei = EXT4_I(inode);
233
234         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
235             (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY))
236                 return;
237
238 restart:
239         spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
240         if (list_empty(&ei->i_fc_list)) {
241                 spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
242                 return;
243         }
244
245         if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
246                 ext4_fc_wait_committing_inode(inode);
247                 goto restart;
248         }
249         list_del_init(&ei->i_fc_list);
250         spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
251 }
252
253 /*
254  * Mark the file system as fast commit ineligible. This means that the next
255  * commit operation will result in a full jbd2 commit.
256  */
257 void ext4_fc_mark_ineligible(struct super_block *sb, int reason)
258 {
259         struct ext4_sb_info *sbi = EXT4_SB(sb);
260
261         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
262             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
263                 return;
264
265         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
266         WARN_ON(reason >= EXT4_FC_REASON_MAX);
267         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
268 }
269
270 /*
271  * Start a fast commit ineligible update. Any commits that happen while
272  * such an operation is in progress fall back to full commits.
273  */
274 void ext4_fc_start_ineligible(struct super_block *sb, int reason)
275 {
276         struct ext4_sb_info *sbi = EXT4_SB(sb);
277
278         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
279             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
280                 return;
281
282         WARN_ON(reason >= EXT4_FC_REASON_MAX);
283         sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
284         atomic_inc(&sbi->s_fc_ineligible_updates);
285 }
286
287 /*
288  * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here
289  * to ensure that after stopping the ineligible update, at least one full
290  * commit takes place.
291  */
292 void ext4_fc_stop_ineligible(struct super_block *sb)
293 {
294         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
295             (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))
296                 return;
297
298         ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
299         atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates);
300 }
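/*
 * Editorial sketch (hypothetical caller): an operation that fast commit
 * cannot replay is bracketed like this; every commit inside the window,
 * plus the first one after it, then falls back to a full jbd2 commit.
 * EXT4_FC_REASON_SWAP_BOOT is just one example reason code.
 *
 *	ext4_fc_start_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT);
 *	// ... perform the non-replayable on-disk changes ...
 *	ext4_fc_stop_ineligible(sb);
 */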
301
302 static inline int ext4_fc_is_ineligible(struct super_block *sb)
303 {
304         return (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE) ||
305                 atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates));
306 }
307
308 /*
309  * Generic fast commit tracking function. If this is the first time we are
310  * called after a full commit, we initialize fast commit fields and then call
311  * __fc_track_fn() with update = 0. If we have already been called after a full
312  * commit, we pass update = 1. Based on that, the track function can determine
313  * if it needs to track a field for the first time or if it needs to just
314  * update the previously tracked value.
315  *
316  * If enqueue is set, this function enqueues the inode in the fast commit list.
317  */
318 static int ext4_fc_track_template(
319         handle_t *handle, struct inode *inode,
320         int (*__fc_track_fn)(struct inode *, void *, bool),
321         void *args, int enqueue)
322 {
323         bool update = false;
324         struct ext4_inode_info *ei = EXT4_I(inode);
325         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
326         tid_t tid = 0;
327         int ret;
328
329         if (!test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT) ||
330             (sbi->s_mount_state & EXT4_FC_REPLAY))
331                 return -EOPNOTSUPP;
332
333         if (ext4_fc_is_ineligible(inode->i_sb))
334                 return -EINVAL;
335
336         tid = handle->h_transaction->t_tid;
337         mutex_lock(&ei->i_fc_lock);
338         if (tid == ei->i_sync_tid) {
339                 update = true;
340         } else {
341                 ext4_fc_reset_inode(inode);
342                 ei->i_sync_tid = tid;
343         }
344         ret = __fc_track_fn(inode, args, update);
345         mutex_unlock(&ei->i_fc_lock);
346
347         if (!enqueue)
348                 return ret;
349
350         spin_lock(&sbi->s_fc_lock);
351         if (list_empty(&EXT4_I(inode)->i_fc_list))
352                 list_add_tail(&EXT4_I(inode)->i_fc_list,
353                                 (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING)) ?
354                                 &sbi->s_fc_q[FC_Q_STAGING] :
355                                 &sbi->s_fc_q[FC_Q_MAIN]);
356         spin_unlock(&sbi->s_fc_lock);
357
358         return ret;
359 }
360
361 struct __track_dentry_update_args {
362         struct dentry *dentry;
363         int op;
364 };
365
366 /* __track_fn for directory entry updates. Called with ei->i_fc_lock held. */
367 static int __track_dentry_update(struct inode *inode, void *arg, bool update)
368 {
369         struct ext4_fc_dentry_update *node;
370         struct ext4_inode_info *ei = EXT4_I(inode);
371         struct __track_dentry_update_args *dentry_update =
372                 (struct __track_dentry_update_args *)arg;
373         struct dentry *dentry = dentry_update->dentry;
374         struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
375
376         mutex_unlock(&ei->i_fc_lock);
377         node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
378         if (!node) {
379                 ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM);
380                 mutex_lock(&ei->i_fc_lock);
381                 return -ENOMEM;
382         }
383
384         node->fcd_op = dentry_update->op;
385         node->fcd_parent = dentry->d_parent->d_inode->i_ino;
386         node->fcd_ino = inode->i_ino;
387         if (dentry->d_name.len > DNAME_INLINE_LEN) {
388                 node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS);
389                 if (!node->fcd_name.name) {
390                         kmem_cache_free(ext4_fc_dentry_cachep, node);
391                         ext4_fc_mark_ineligible(inode->i_sb,
392                                 EXT4_FC_REASON_NOMEM);
393                         mutex_lock(&ei->i_fc_lock);
394                         return -ENOMEM;
395                 }
396                 memcpy((u8 *)node->fcd_name.name, dentry->d_name.name,
397                         dentry->d_name.len);
398         } else {
399                 memcpy(node->fcd_iname, dentry->d_name.name,
400                         dentry->d_name.len);
401                 node->fcd_name.name = node->fcd_iname;
402         }
403         node->fcd_name.len = dentry->d_name.len;
404
405         spin_lock(&sbi->s_fc_lock);
406         if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_COMMITTING))
407                 list_add_tail(&node->fcd_list,
408                                 &sbi->s_fc_dentry_q[FC_Q_STAGING]);
409         else
410                 list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]);
411         spin_unlock(&sbi->s_fc_lock);
412         mutex_lock(&ei->i_fc_lock);
413
414         return 0;
415 }
416
417 void __ext4_fc_track_unlink(handle_t *handle,
418                 struct inode *inode, struct dentry *dentry)
419 {
420         struct __track_dentry_update_args args;
421         int ret;
422
423         args.dentry = dentry;
424         args.op = EXT4_FC_TAG_UNLINK;
425
426         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
427                                         (void *)&args, 0);
428         trace_ext4_fc_track_unlink(inode, dentry, ret);
429 }
430
431 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry)
432 {
433         __ext4_fc_track_unlink(handle, d_inode(dentry), dentry);
434 }
435
436 void __ext4_fc_track_link(handle_t *handle,
437         struct inode *inode, struct dentry *dentry)
438 {
439         struct __track_dentry_update_args args;
440         int ret;
441
442         args.dentry = dentry;
443         args.op = EXT4_FC_TAG_LINK;
444
445         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
446                                         (void *)&args, 0);
447         trace_ext4_fc_track_link(inode, dentry, ret);
448 }
449
450 void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
451 {
452         __ext4_fc_track_link(handle, d_inode(dentry), dentry);
453 }
454
455 void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
456                           struct dentry *dentry)
457 {
458         struct __track_dentry_update_args args;
459         int ret;
460
461         args.dentry = dentry;
462         args.op = EXT4_FC_TAG_CREAT;
463
464         ret = ext4_fc_track_template(handle, inode, __track_dentry_update,
465                                         (void *)&args, 0);
466         trace_ext4_fc_track_create(inode, dentry, ret);
467 }
468
469 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
470 {
471         __ext4_fc_track_create(handle, d_inode(dentry), dentry);
472 }
473
474 /* __track_fn for inode tracking */
475 static int __track_inode(struct inode *inode, void *arg, bool update)
476 {
477         if (update)
478                 return -EEXIST;
479
480         EXT4_I(inode)->i_fc_lblk_len = 0;
481
482         return 0;
483 }
484
485 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
486 {
487         int ret;
488
489         if (S_ISDIR(inode->i_mode))
490                 return;
491
492         if (ext4_should_journal_data(inode)) {
493                 ext4_fc_mark_ineligible(inode->i_sb,
494                                         EXT4_FC_REASON_INODE_JOURNAL_DATA);
495                 return;
496         }
497
498         ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
499         trace_ext4_fc_track_inode(inode, ret);
500 }
501
502 struct __track_range_args {
503         ext4_lblk_t start, end;
504 };
505
506 /* __track_fn for tracking data updates */
507 static int __track_range(struct inode *inode, void *arg, bool update)
508 {
509         struct ext4_inode_info *ei = EXT4_I(inode);
510         ext4_lblk_t oldstart;
511         struct __track_range_args *__arg =
512                 (struct __track_range_args *)arg;
513
514         if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
515                 ext4_debug("Special inode %ld being modified\n", inode->i_ino);
516                 return -ECANCELED;
517         }
518
519         oldstart = ei->i_fc_lblk_start;
520
521         if (update && ei->i_fc_lblk_len > 0) {
522                 ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
523                 ei->i_fc_lblk_len =
524                         max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
525                                 ei->i_fc_lblk_start + 1;
526         } else {
527                 ei->i_fc_lblk_start = __arg->start;
528                 ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
529         }
530
531         return 0;
532 }
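/*
 * Editorial worked example for the merge in __track_range() above: with
 * blocks 10..14 already tracked (i_fc_lblk_start = 10, i_fc_lblk_len = 5),
 * an update covering blocks 12..20 gives start = min(10, 12) = 10 and
 * len = max(10 + 5 - 1, 20) - 10 + 1 = 11, i.e. blocks 10..20 inclusive.
 */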
533
534 void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
535                          ext4_lblk_t end)
536 {
537         struct __track_range_args args;
538         int ret;
539
540         if (S_ISDIR(inode->i_mode))
541                 return;
542
543         args.start = start;
544         args.end = end;
545
546         ret = ext4_fc_track_template(handle, inode,  __track_range, &args, 1);
547
548         trace_ext4_fc_track_range(inode, start, end, ret);
549 }
550
551 static void ext4_fc_submit_bh(struct super_block *sb)
552 {
553         int write_flags = REQ_SYNC;
554         struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh;
555
556         /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */
557         if (test_opt(sb, BARRIER))
558                 write_flags |= REQ_FUA | REQ_PREFLUSH;
559         lock_buffer(bh);
560         set_buffer_dirty(bh);
561         set_buffer_uptodate(bh);
562         bh->b_end_io = ext4_end_buffer_io_sync;
563         submit_bh(REQ_OP_WRITE, write_flags, bh);
564         EXT4_SB(sb)->s_fc_bh = NULL;
565 }
566
567 /* Ext4 commit path routines */
568
569 /* memzero and update CRC */
570 static void *ext4_fc_memzero(struct super_block *sb, void *dst, int len,
571                                 u32 *crc)
572 {
573         void *ret;
574
575         ret = memset(dst, 0, len);
576         if (crc)
577                 *crc = ext4_chksum(EXT4_SB(sb), *crc, dst, len);
578         return ret;
579 }
580
581 /*
582  * Allocate len bytes on a fast commit buffer.
583  *
584  * At commit time this function is used to manage fast commit
585  * block space. We never split a fast commit log entry across
586  * blocks. So this function makes sure that if there's not enough space
587  * in the current block, the remaining space in the current block is
588  * marked as unused by adding an EXT4_FC_TAG_PAD tag. In that case, a
589  * new block is requested from jbd2 and the CRC is updated to reflect
590  * the padding we added.
591  */
592 static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
593 {
594         struct ext4_fc_tl *tl;
595         struct ext4_sb_info *sbi = EXT4_SB(sb);
596         struct buffer_head *bh;
597         int bsize = sbi->s_journal->j_blocksize;
598         int ret, off = sbi->s_fc_bytes % bsize;
599         int pad_len;
600
601         /*
602          * After allocating len bytes, we should still have space at least
603          * for a zero-byte padding.
604          */
605         if (len + sizeof(struct ext4_fc_tl) > bsize)
606                 return NULL;
607
608         if (bsize - off - 1 > len + sizeof(struct ext4_fc_tl)) {
609                 /*
610                  * Only allocate from current buffer if we have enough space for
611                  * this request AND we have space to add a zero byte padding.
612                  */
613                 if (!sbi->s_fc_bh) {
614                         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
615                         if (ret)
616                                 return NULL;
617                         sbi->s_fc_bh = bh;
618                 }
619                 sbi->s_fc_bytes += len;
620                 return sbi->s_fc_bh->b_data + off;
621         }
622         /* Need to add PAD tag */
623         tl = (struct ext4_fc_tl *)(sbi->s_fc_bh->b_data + off);
624         tl->fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD);
625         pad_len = bsize - off - 1 - sizeof(struct ext4_fc_tl);
626         tl->fc_len = cpu_to_le16(pad_len);
627         if (crc)
628                 *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl));
629         if (pad_len > 0)
630                 ext4_fc_memzero(sb, tl + 1, pad_len, crc);
631         ext4_fc_submit_bh(sb);
632
633         ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh);
634         if (ret)
635                 return NULL;
636         sbi->s_fc_bh = bh;
637         sbi->s_fc_bytes = (sbi->s_fc_bytes / bsize + 1) * bsize + len;
638         return sbi->s_fc_bh->b_data;
639 }
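/*
 * Editorial worked example for the padding above: with a 4096 byte block,
 * off = 4080 and a request for len = 40, the remaining 15 bytes cannot hold
 * the request plus a zero-byte padding, so a PAD TLV with
 * fc_len = 4096 - 4080 - 1 - sizeof(struct ext4_fc_tl) marks the rest of
 * the block unused, that block is submitted, and the 40 bytes are served
 * from the start of a fresh jbd2 fast commit buffer.
 */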
640
641 /* memcpy to fc reserved space and update CRC */
642 static void *ext4_fc_memcpy(struct super_block *sb, void *dst, const void *src,
643                                 int len, u32 *crc)
644 {
645         if (crc)
646                 *crc = ext4_chksum(EXT4_SB(sb), *crc, src, len);
647         return memcpy(dst, src, len);
648 }
649
650 /*
651  * Complete a fast commit by writing tail tag.
652  *
653  * Writing the tail tag marks the end of a fast commit. In order to guarantee
654  * atomicity, after writing the tail tag, even if there's space remaining
655  * in the block, the next commit shouldn't use it. That's why the tail tag's
656  * length covers all of the remaining space in the block.
657  */
658 static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
659 {
660         struct ext4_sb_info *sbi = EXT4_SB(sb);
661         struct ext4_fc_tl tl;
662         struct ext4_fc_tail tail;
663         int off, bsize = sbi->s_journal->j_blocksize;
664         u8 *dst;
665
666         /*
667          * ext4_fc_reserve_space takes care of allocating an extra block if
668          * there's not enough space in this block to accommodate the tail.
669          */
670         dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(tail), &crc);
671         if (!dst)
672                 return -ENOSPC;
673
674         off = sbi->s_fc_bytes % bsize;
675
676         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL);
677         tl.fc_len = cpu_to_le16(bsize - off - 1 + sizeof(struct ext4_fc_tail));
678         sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize);
679
680         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), &crc);
681         dst += sizeof(tl);
682         tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
683         ext4_fc_memcpy(sb, dst, &tail.fc_tid, sizeof(tail.fc_tid), &crc);
684         dst += sizeof(tail.fc_tid);
685         tail.fc_crc = cpu_to_le32(crc);
686         ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL);
687
688         ext4_fc_submit_bh(sb);
689
690         return 0;
691 }
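/*
 * Editorial note on fc_len above: the tail's value length is stretched to
 * sizeof(tail) plus all remaining bytes of the block but the final one
 * (mirroring the zero byte that ext4_fc_reserve_space() always keeps
 * spare), so together with rounding s_fc_bytes up, the next commit is
 * forced onto a fresh block.
 */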
692
693 /*
694  * Adds tag, length and value, and updates the CRC. Returns true if the TLV
695  * was added, false if there's not enough space.
696  */
697 static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val,
698                            u32 *crc)
699 {
700         struct ext4_fc_tl tl;
701         u8 *dst;
702
703         dst = ext4_fc_reserve_space(sb, sizeof(tl) + len, crc);
704         if (!dst)
705                 return false;
706
707         tl.fc_tag = cpu_to_le16(tag);
708         tl.fc_len = cpu_to_le16(len);
709
710         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
711         ext4_fc_memcpy(sb, dst + sizeof(tl), val, len, crc);
712
713         return true;
714 }
715
716 /* Same as above, but adds a dentry TLV. */
717 static  bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag,
718                                         int parent_ino, int ino, int dlen,
719                                         const unsigned char *dname,
720                                         u32 *crc)
721 {
722         struct ext4_fc_dentry_info fcd;
723         struct ext4_fc_tl tl;
724         u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen,
725                                         crc);
726
727         if (!dst)
728                 return false;
729
730         fcd.fc_parent_ino = cpu_to_le32(parent_ino);
731         fcd.fc_ino = cpu_to_le32(ino);
732         tl.fc_tag = cpu_to_le16(tag);
733         tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen);
734         ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc);
735         dst += sizeof(tl);
736         ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc);
737         dst += sizeof(fcd);
738         ext4_fc_memcpy(sb, dst, dname, dlen, crc);
739         dst += dlen;
740
741         return true;
742 }
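/*
 * Editorial sketch of the on-disk dentry TLV written above:
 *
 *	+-----------------+---------------+--------+--------------------+
 *	| fc_tag | fc_len | fc_parent_ino | fc_ino | dname (dlen bytes) |
 *	+-----------------+---------------+--------+--------------------+
 *	|<-- ext4_fc_tl ->|<------- ext4_fc_dentry_info + name -------->|
 */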
743
744 /*
745  * Writes the inode to the fast commit space as an EXT4_FC_TAG_INODE TLV.
746  * Returns 0 on success, error on failure.
747  */
748 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
749 {
750         struct ext4_inode_info *ei = EXT4_I(inode);
751         int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
752         int ret;
753         struct ext4_iloc iloc;
754         struct ext4_fc_inode fc_inode;
755         struct ext4_fc_tl tl;
756         u8 *dst;
757
758         ret = ext4_get_inode_loc(inode, &iloc);
759         if (ret)
760                 return ret;
761
762         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
763                 inode_len += ei->i_extra_isize;
764
765         fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
766         tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
767         tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
768
769         ret = -ECANCELED;
770         dst = ext4_fc_reserve_space(inode->i_sb,
771                         sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
772         if (!dst)
773                 goto err;
774
775         if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
776                 goto err;
777         dst += sizeof(tl);
778         if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
779                 goto err;
780         dst += sizeof(fc_inode);
781         if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
782                                         inode_len, crc))
783                 goto err;
784         ret = 0;
785 err:
786         brelse(iloc.bh);
787         return ret;
788 }
789
790 /*
791  * Writes updated data ranges for the inode in question. Updates CRC.
792  * Returns 0 on success, error otherwise.
793  */
794 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
795 {
796         ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
797         struct ext4_inode_info *ei = EXT4_I(inode);
798         struct ext4_map_blocks map;
799         struct ext4_fc_add_range fc_ext;
800         struct ext4_fc_del_range lrange;
801         struct ext4_extent *ex;
802         int ret;
803
804         mutex_lock(&ei->i_fc_lock);
805         if (ei->i_fc_lblk_len == 0) {
806                 mutex_unlock(&ei->i_fc_lock);
807                 return 0;
808         }
809         old_blk_size = ei->i_fc_lblk_start;
810         new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
811         ei->i_fc_lblk_len = 0;
812         mutex_unlock(&ei->i_fc_lock);
813
814         cur_lblk_off = old_blk_size;
815         jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n",
816                   __func__, cur_lblk_off, new_blk_size, inode->i_ino);
817
818         while (cur_lblk_off <= new_blk_size) {
819                 map.m_lblk = cur_lblk_off;
820                 map.m_len = new_blk_size - cur_lblk_off + 1;
821                 ret = ext4_map_blocks(NULL, inode, &map, 0);
822                 if (ret < 0)
823                         return -ECANCELED;
824
825                 if (map.m_len == 0) {
826                         cur_lblk_off++;
827                         continue;
828                 }
829
830                 if (ret == 0) {
831                         lrange.fc_ino = cpu_to_le32(inode->i_ino);
832                         lrange.fc_lblk = cpu_to_le32(map.m_lblk);
833                         lrange.fc_len = cpu_to_le32(map.m_len);
834                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
835                                             sizeof(lrange), (u8 *)&lrange, crc))
836                                 return -ENOSPC;
837                 } else {
838                         unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
839                                 EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
840
841                         /* Limit the number of blocks in one extent */
842                         map.m_len = min(max, map.m_len);
843
844                         fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
845                         ex = (struct ext4_extent *)&fc_ext.fc_ex;
846                         ex->ee_block = cpu_to_le32(map.m_lblk);
847                         ex->ee_len = cpu_to_le16(map.m_len);
848                         ext4_ext_store_pblock(ex, map.m_pblk);
849                         if (map.m_flags & EXT4_MAP_UNWRITTEN)
850                                 ext4_ext_mark_unwritten(ex);
851                         else
852                                 ext4_ext_mark_initialized(ex);
853                         if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
854                                             sizeof(fc_ext), (u8 *)&fc_ext, crc))
855                                 return -ENOSPC;
856                 }
857
858                 cur_lblk_off += map.m_len;
859         }
860
861         return 0;
862 }
863
864
865 /* Submit data for all the fast commit inodes */
866 static int ext4_fc_submit_inode_data_all(journal_t *journal)
867 {
868         struct super_block *sb = (struct super_block *)(journal->j_private);
869         struct ext4_sb_info *sbi = EXT4_SB(sb);
870         struct ext4_inode_info *ei;
871         struct list_head *pos;
872         int ret = 0;
873
874         spin_lock(&sbi->s_fc_lock);
875         ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING);
876         list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
877                 ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
878                 ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
879                 while (atomic_read(&ei->i_fc_updates)) {
880                         DEFINE_WAIT(wait);
881
882                         prepare_to_wait(&ei->i_fc_wait, &wait,
883                                                 TASK_UNINTERRUPTIBLE);
884                         if (atomic_read(&ei->i_fc_updates)) {
885                                 spin_unlock(&sbi->s_fc_lock);
886                                 schedule();
887                                 spin_lock(&sbi->s_fc_lock);
888                         }
889                         finish_wait(&ei->i_fc_wait, &wait);
890                 }
891                 spin_unlock(&sbi->s_fc_lock);
892                 ret = jbd2_submit_inode_data(ei->jinode);
893                 if (ret)
894                         return ret;
895                 spin_lock(&sbi->s_fc_lock);
896         }
897         spin_unlock(&sbi->s_fc_lock);
898
899         return ret;
900 }
901
902 /* Wait for completion of data for all the fast commit inodes */
903 static int ext4_fc_wait_inode_data_all(journal_t *journal)
904 {
905         struct super_block *sb = (struct super_block *)(journal->j_private);
906         struct ext4_sb_info *sbi = EXT4_SB(sb);
907         struct ext4_inode_info *pos, *n;
908         int ret = 0;
909
910         spin_lock(&sbi->s_fc_lock);
911         list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
912                 if (!ext4_test_inode_state(&pos->vfs_inode,
913                                            EXT4_STATE_FC_COMMITTING))
914                         continue;
915                 spin_unlock(&sbi->s_fc_lock);
916
917                 ret = jbd2_wait_inode_data(journal, pos->jinode);
918                 if (ret)
919                         return ret;
920                 spin_lock(&sbi->s_fc_lock);
921         }
922         spin_unlock(&sbi->s_fc_lock);
923
924         return 0;
925 }
926
927 /* Commit all the directory entry updates */
928 static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
929 __acquires(&sbi->s_fc_lock)
930 __releases(&sbi->s_fc_lock)
931 {
932         struct super_block *sb = (struct super_block *)(journal->j_private);
933         struct ext4_sb_info *sbi = EXT4_SB(sb);
934         struct ext4_fc_dentry_update *fc_dentry;
935         struct inode *inode;
936         struct list_head *pos, *n, *fcd_pos, *fcd_n;
937         struct ext4_inode_info *ei;
938         int ret;
939
940         if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN]))
941                 return 0;
942         list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) {
943                 fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update,
944                                         fcd_list);
945                 if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
946                         spin_unlock(&sbi->s_fc_lock);
947                         if (!ext4_fc_add_dentry_tlv(
948                                 sb, fc_dentry->fcd_op,
949                                 fc_dentry->fcd_parent, fc_dentry->fcd_ino,
950                                 fc_dentry->fcd_name.len,
951                                 fc_dentry->fcd_name.name, crc)) {
952                                 ret = -ENOSPC;
953                                 goto lock_and_exit;
954                         }
955                         spin_lock(&sbi->s_fc_lock);
956                         continue;
957                 }
958
959                 inode = NULL;
960                 list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
961                         ei = list_entry(pos, struct ext4_inode_info, i_fc_list);
962                         if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) {
963                                 inode = &ei->vfs_inode;
964                                 break;
965                         }
966                 }
967                 /*
968                  * If we don't find the inode in our list, then it was deleted,
969                  * in which case we don't need to record its create tag.
970                  */
971                 if (!inode)
972                         continue;
973                 spin_unlock(&sbi->s_fc_lock);
974
975                 /*
976                  * We first write the inode and then the create dirent. This
977                  * allows the recovery code to create an unnamed inode first
978                  * and then link it to a directory entry. That lets us use
979                  * the namei.c routines almost as-is and simplifies
980                  * the recovery code.
981                  */
982                 ret = ext4_fc_write_inode(inode, crc);
983                 if (ret)
984                         goto lock_and_exit;
985
986                 ret = ext4_fc_write_inode_data(inode, crc);
987                 if (ret)
988                         goto lock_and_exit;
989
990                 if (!ext4_fc_add_dentry_tlv(
991                         sb, fc_dentry->fcd_op,
992                         fc_dentry->fcd_parent, fc_dentry->fcd_ino,
993                         fc_dentry->fcd_name.len,
994                         fc_dentry->fcd_name.name, crc)) {
995                         ret = -ENOSPC;
996                         goto lock_and_exit;
997                 }
998
999                 spin_lock(&sbi->s_fc_lock);
1000         }
1001         return 0;
1002 lock_and_exit:
1003         spin_lock(&sbi->s_fc_lock);
1004         return ret;
1005 }
1006
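/*
 * Editorial summary: this function performs steps [1]-[6] of the "Commit
 * Operation" list at the top of this file - mark inodes COMMITTING and
 * submit their data, wait for the data, then write the head tag (first
 * fast commit of a TID only), the dentry updates, the changed inodes and
 * finally the tail tag. Step [7], waiting for the fast commit buffers, is
 * done by the caller, ext4_fc_commit().
 */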
1007 static int ext4_fc_perform_commit(journal_t *journal)
1008 {
1009         struct super_block *sb = (struct super_block *)(journal->j_private);
1010         struct ext4_sb_info *sbi = EXT4_SB(sb);
1011         struct ext4_inode_info *iter;
1012         struct ext4_fc_head head;
1013         struct list_head *pos;
1014         struct inode *inode;
1015         struct blk_plug plug;
1016         int ret = 0;
1017         u32 crc = 0;
1018
1019         ret = ext4_fc_submit_inode_data_all(journal);
1020         if (ret)
1021                 return ret;
1022
1023         ret = ext4_fc_wait_inode_data_all(journal);
1024         if (ret)
1025                 return ret;
1026
1027         /*
1028          * If the file system device is different from the journal device, issue
1029          * a cache flush before we start writing fast commit blocks.
1030          */
1031         if (journal->j_fs_dev != journal->j_dev)
1032                 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS);
1033
1034         blk_start_plug(&plug);
1035         if (sbi->s_fc_bytes == 0) {
1036                 /*
1037                  * Add a head tag only if this is the first fast commit
1038                  * in this TID.
1039                  */
1040                 head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
1041                 head.fc_tid = cpu_to_le32(
1042                         sbi->s_journal->j_running_transaction->t_tid);
1043                 if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head),
1044                         (u8 *)&head, &crc)) {
1045                         ret = -ENOSPC;
1046                         goto out;
1047                 }
1048         }
1049
1050         spin_lock(&sbi->s_fc_lock);
1051         ret = ext4_fc_commit_dentry_updates(journal, &crc);
1052         if (ret) {
1053                 spin_unlock(&sbi->s_fc_lock);
1054                 goto out;
1055         }
1056
1057         list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) {
1058                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1059                 inode = &iter->vfs_inode;
1060                 if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
1061                         continue;
1062
1063                 spin_unlock(&sbi->s_fc_lock);
1064                 ret = ext4_fc_write_inode_data(inode, &crc);
1065                 if (ret)
1066                         goto out;
1067                 ret = ext4_fc_write_inode(inode, &crc);
1068                 if (ret)
1069                         goto out;
1070                 spin_lock(&sbi->s_fc_lock);
1071         }
1072         spin_unlock(&sbi->s_fc_lock);
1073
1074         ret = ext4_fc_write_tail(sb, crc);
1075
1076 out:
1077         blk_finish_plug(&plug);
1078         return ret;
1079 }
1080
1081 /*
1082  * The main commit entry point. Performs a fast commit for transaction
1083  * commit_tid if needed. If it's not possible to perform a fast commit
1084  * due to various reasons, we fall back to full commit. Returns 0
1085  * on success, error otherwise.
1086  */
1087 int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
1088 {
1089         struct super_block *sb = (struct super_block *)(journal->j_private);
1090         struct ext4_sb_info *sbi = EXT4_SB(sb);
1091         int nblks = 0, ret, bsize = journal->j_blocksize;
1092         int subtid = atomic_read(&sbi->s_fc_subtid);
1093         int reason = EXT4_FC_REASON_OK, fc_bufs_before = 0;
1094         ktime_t start_time, commit_time;
1095
1096         trace_ext4_fc_commit_start(sb);
1097
1098         start_time = ktime_get();
1099
1100         if (!test_opt2(sb, JOURNAL_FAST_COMMIT) ||
1101                 (ext4_fc_is_ineligible(sb))) {
1102                 reason = EXT4_FC_REASON_INELIGIBLE;
1103                 goto out;
1104         }
1105
1106 restart_fc:
1107         ret = jbd2_fc_begin_commit(journal, commit_tid);
1108         if (ret == -EALREADY) {
1109                 /* There was an ongoing commit, check if we need to restart */
1110                 if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
1111                         commit_tid > journal->j_commit_sequence)
1112                         goto restart_fc;
1113                 reason = EXT4_FC_REASON_ALREADY_COMMITTED;
1114                 goto out;
1115         } else if (ret) {
1116                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1117                 reason = EXT4_FC_REASON_FC_START_FAILED;
1118                 goto out;
1119         }
1120
1121         fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
1122         ret = ext4_fc_perform_commit(journal);
1123         if (ret < 0) {
1124                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1125                 reason = EXT4_FC_REASON_FC_FAILED;
1126                 goto out;
1127         }
1128         nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
1129         ret = jbd2_fc_wait_bufs(journal, nblks);
1130         if (ret < 0) {
1131                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1132                 reason = EXT4_FC_REASON_FC_FAILED;
1133                 goto out;
1134         }
1135         atomic_inc(&sbi->s_fc_subtid);
1136         jbd2_fc_end_commit(journal);
1137 out:
1138         /* Has any ineligible update happened since we started? */
1139         if (reason == EXT4_FC_REASON_OK && ext4_fc_is_ineligible(sb)) {
1140                 sbi->s_fc_stats.fc_ineligible_reason_count[EXT4_FC_COMMIT_FAILED]++;
1141                 reason = EXT4_FC_REASON_INELIGIBLE;
1142         }
1143
1144         spin_lock(&sbi->s_fc_lock);
1145         if (reason != EXT4_FC_REASON_OK &&
1146                 reason != EXT4_FC_REASON_ALREADY_COMMITTED) {
1147                 sbi->s_fc_stats.fc_ineligible_commits++;
1148         } else {
1149                 sbi->s_fc_stats.fc_num_commits++;
1150                 sbi->s_fc_stats.fc_numblks += nblks;
1151         }
1152         spin_unlock(&sbi->s_fc_lock);
1153         nblks = (reason == EXT4_FC_REASON_OK) ? nblks : 0;
1154         trace_ext4_fc_commit_stop(sb, nblks, reason);
1155         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1156         /*
1157          * weight the running average higher than the new commit time so we
1158          * don't react too strongly to swings in the commit time
1159          */
1160         if (likely(sbi->s_fc_avg_commit_time))
1161                 sbi->s_fc_avg_commit_time = (commit_time +
1162                                 sbi->s_fc_avg_commit_time * 3) / 4;
1163         else
1164                 sbi->s_fc_avg_commit_time = commit_time;
1165         jbd_debug(1,
1166                 "Fast commit ended with blks = %d, reason = %d, subtid - %d",
1167                 nblks, reason, subtid);
1168         if (reason == EXT4_FC_REASON_FC_FAILED)
1169                 return jbd2_fc_end_commit_fallback(journal);
1170         if (reason == EXT4_FC_REASON_FC_START_FAILED ||
1171                 reason == EXT4_FC_REASON_INELIGIBLE)
1172                 return jbd2_complete_transaction(journal, commit_tid);
1173         return 0;
1174 }
1175
1176 /*
1177  * Fast commit cleanup routine. This is called after every fast commit and
1178  * full commit. full is true if we are called after a full commit.
1179  */
1180 static void ext4_fc_cleanup(journal_t *journal, int full)
1181 {
1182         struct super_block *sb = journal->j_private;
1183         struct ext4_sb_info *sbi = EXT4_SB(sb);
1184         struct ext4_inode_info *iter;
1185         struct ext4_fc_dentry_update *fc_dentry;
1186         struct list_head *pos, *n;
1187
1188         if (full && sbi->s_fc_bh)
1189                 sbi->s_fc_bh = NULL;
1190
1191         jbd2_fc_release_bufs(journal);
1192
1193         spin_lock(&sbi->s_fc_lock);
1194         list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) {
1195                 iter = list_entry(pos, struct ext4_inode_info, i_fc_list);
1196                 list_del_init(&iter->i_fc_list);
1197                 ext4_clear_inode_state(&iter->vfs_inode,
1198                                        EXT4_STATE_FC_COMMITTING);
1199                 ext4_fc_reset_inode(&iter->vfs_inode);
1200                 /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
1201                 smp_mb();
1202 #if (BITS_PER_LONG < 64)
1203                 wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
1204 #else
1205                 wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
1206 #endif
1207         }
1208
1209         while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
1210                 fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
1211                                              struct ext4_fc_dentry_update,
1212                                              fcd_list);
1213                 list_del_init(&fc_dentry->fcd_list);
1214                 spin_unlock(&sbi->s_fc_lock);
1215
1216                 if (fc_dentry->fcd_name.name &&
1217                         fc_dentry->fcd_name.len > DNAME_INLINE_LEN)
1218                         kfree(fc_dentry->fcd_name.name);
1219                 kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
1220                 spin_lock(&sbi->s_fc_lock);
1221         }
1222
1223         list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
1224                                 &sbi->s_fc_dentry_q[FC_Q_MAIN]);
1225         list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
1226                                 &sbi->s_fc_q[FC_Q_MAIN]);
1227
1228         ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING);
1229         ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
1230
1231         if (full)
1232                 sbi->s_fc_bytes = 0;
1233         spin_unlock(&sbi->s_fc_lock);
1234         trace_ext4_fc_stats(sb);
1235 }
1236
1237 /* Ext4 Replay Path Routines */
1238
1239 /* Helper struct for dentry replay routines */
1240 struct dentry_info_args {
1241         int parent_ino, dname_len, ino, inode_len;
1242         char *dname;
1243 };
1244
1245 static inline void tl_to_darg(struct dentry_info_args *darg,
1246                               struct  ext4_fc_tl *tl, u8 *val)
1247 {
1248         struct ext4_fc_dentry_info fcd;
1249
1250         memcpy(&fcd, val, sizeof(fcd));
1251
1252         darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino);
1253         darg->ino = le32_to_cpu(fcd.fc_ino);
1254         darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname);
1255         darg->dname_len = le16_to_cpu(tl->fc_len) -
1256                 sizeof(struct ext4_fc_dentry_info);
1257 }
1258
1259 /* Unlink replay function */
1260 static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl,
1261                                  u8 *val)
1262 {
1263         struct inode *inode, *old_parent;
1264         struct qstr entry;
1265         struct dentry_info_args darg;
1266         int ret = 0;
1267
1268         tl_to_darg(&darg, tl, val);
1269
1270         trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino,
1271                         darg.parent_ino, darg.dname_len);
1272
1273         entry.name = darg.dname;
1274         entry.len = darg.dname_len;
1275         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1276
1277         if (IS_ERR(inode)) {
1278                 jbd_debug(1, "Inode %d not found", darg.ino);
1279                 return 0;
1280         }
1281
1282         old_parent = ext4_iget(sb, darg.parent_ino,
1283                                 EXT4_IGET_NORMAL);
1284         if (IS_ERR(old_parent)) {
1285                 jbd_debug(1, "Dir with inode %d not found", darg.parent_ino);
1286                 iput(inode);
1287                 return 0;
1288         }
1289
1290         ret = __ext4_unlink(NULL, old_parent, &entry, inode);
1291         /* -ENOENT is OK because the entry might not exist anymore. */
1292         if (ret == -ENOENT)
1293                 ret = 0;
1294         iput(old_parent);
1295         iput(inode);
1296         return ret;
1297 }
1298
1299 static int ext4_fc_replay_link_internal(struct super_block *sb,
1300                                 struct dentry_info_args *darg,
1301                                 struct inode *inode)
1302 {
1303         struct inode *dir = NULL;
1304         struct dentry *dentry_dir = NULL, *dentry_inode = NULL;
1305         struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len);
1306         int ret = 0;
1307
1308         dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL);
1309         if (IS_ERR(dir)) {
1310                 jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino);
1311                 dir = NULL;
1312                 goto out;
1313         }
1314
1315         dentry_dir = d_obtain_alias(dir);
1316         if (IS_ERR(dentry_dir)) {
1317                 jbd_debug(1, "Failed to obtain dentry");
1318                 dentry_dir = NULL;
1319                 goto out;
1320         }
1321
1322         dentry_inode = d_alloc(dentry_dir, &qstr_dname);
1323         if (!dentry_inode) {
1324                 jbd_debug(1, "Inode dentry not created.");
1325                 ret = -ENOMEM;
1326                 goto out;
1327         }
1328
1329         ret = __ext4_link(dir, inode, dentry_inode);
1330         /*
1331          * It's possible that the link already existed since the data blocks
1332          * for the dir in question got persisted before we crashed, OR because
1333          * we replayed this tag and crashed before the entire replay
1334          * could complete.
1335          */
1336         if (ret && ret != -EEXIST) {
1337                 jbd_debug(1, "Failed to link\n");
1338                 goto out;
1339         }
1340
1341         ret = 0;
1342 out:
1343         if (dentry_dir) {
1344                 d_drop(dentry_dir);
1345                 dput(dentry_dir);
1346         } else if (dir) {
1347                 iput(dir);
1348         }
1349         if (dentry_inode) {
1350                 d_drop(dentry_inode);
1351                 dput(dentry_inode);
1352         }
1353
1354         return ret;
1355 }
1356
1357 /* Link replay function */
1358 static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl,
1359                                u8 *val)
1360 {
1361         struct inode *inode;
1362         struct dentry_info_args darg;
1363         int ret = 0;
1364
1365         tl_to_darg(&darg, tl, val);
1366         trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino,
1367                         darg.parent_ino, darg.dname_len);
1368
1369         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1370         if (IS_ERR(inode)) {
1371                 jbd_debug(1, "Inode not found.");
1372                 return 0;
1373         }
1374
1375         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1376         iput(inode);
1377         return ret;
1378 }
1379
1380 /*
1381  * Record all the modified inodes during replay. We use this later to set up
1382  * block bitmaps correctly.
1383  */
1384 static int ext4_fc_record_modified_inode(struct super_block *sb, int ino)
1385 {
1386         struct ext4_fc_replay_state *state;
1387         int i;
1388
1389         state = &EXT4_SB(sb)->s_fc_replay_state;
1390         for (i = 0; i < state->fc_modified_inodes_used; i++)
1391                 if (state->fc_modified_inodes[i] == ino)
1392                         return 0;
1393         if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
1394                 int *fc_modified_inodes;
1395
1396                 fc_modified_inodes = krealloc(state->fc_modified_inodes,
1397                                 sizeof(int) * (state->fc_modified_inodes_size +
1398                                 EXT4_FC_REPLAY_REALLOC_INCREMENT),
1399                                 GFP_KERNEL);
1400                 if (!fc_modified_inodes)
1401                         return -ENOMEM;
1402                 state->fc_modified_inodes = fc_modified_inodes;
1403                 state->fc_modified_inodes_size +=
1404                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1405         }
1406         state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino;
1407         return 0;
1408 }
1409
1410 /*
1411  * Inode replay function
1412  */
1413 static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl,
1414                                 u8 *val)
1415 {
1416         struct ext4_fc_inode fc_inode;
1417         struct ext4_inode *raw_inode;
1418         struct ext4_inode *raw_fc_inode;
1419         struct inode *inode = NULL;
1420         struct ext4_iloc iloc;
1421         int inode_len, ino, ret, tag = le16_to_cpu(tl->fc_tag);
1422         struct ext4_extent_header *eh;
1423
1424         memcpy(&fc_inode, val, sizeof(fc_inode));
1425
1426         ino = le32_to_cpu(fc_inode.fc_ino);
1427         trace_ext4_fc_replay(sb, tag, ino, 0, 0);
1428
1429         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1430         if (!IS_ERR(inode)) {
1431                 ext4_ext_clear_bb(inode);
1432                 iput(inode);
1433         }
1434         inode = NULL;
1435
1436         ret = ext4_fc_record_modified_inode(sb, ino);
1437         if (ret)
1438                 goto out;
1439
1440         raw_fc_inode = (struct ext4_inode *)
1441                 (val + offsetof(struct ext4_fc_inode, fc_raw_inode));
1442         ret = ext4_get_fc_inode_loc(sb, ino, &iloc);
1443         if (ret)
1444                 goto out;
1445
1446         inode_len = le16_to_cpu(tl->fc_len) - sizeof(struct ext4_fc_inode);
1447         raw_inode = ext4_raw_inode(&iloc);
1448
1449         memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block));
1450         memcpy(&raw_inode->i_generation, &raw_fc_inode->i_generation,
1451                 inode_len - offsetof(struct ext4_inode, i_generation));
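        /*
         * Everything before i_block and everything from i_generation onwards
         * was copied above; i_block itself gets special treatment below: for
         * extent-mapped inodes we only ensure a valid extent header is
         * present, while inline data is copied verbatim.
         */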
1452         if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) {
1453                 eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]);
1454                 if (eh->eh_magic != EXT4_EXT_MAGIC) {
1455                         memset(eh, 0, sizeof(*eh));
1456                         eh->eh_magic = EXT4_EXT_MAGIC;
1457                         eh->eh_max = cpu_to_le16(
1458                                 (sizeof(raw_inode->i_block) -
1459                                  sizeof(struct ext4_extent_header))
1460                                  / sizeof(struct ext4_extent));
1461                 }
1462         } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) {
1463                 memcpy(raw_inode->i_block, raw_fc_inode->i_block,
1464                         sizeof(raw_inode->i_block));
1465         }
1466
1467         /* Immediately update the inode on disk. */
1468         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1469         if (ret)
1470                 goto out;
1471         ret = sync_dirty_buffer(iloc.bh);
1472         if (ret)
1473                 goto out;
1474         ret = ext4_mark_inode_used(sb, ino);
1475         if (ret)
1476                 goto out;
1477
1478         /* Given that we just wrote the inode on disk, this SHOULD succeed. */
1479         inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL);
1480         if (IS_ERR(inode)) {
1481                 jbd_debug(1, "Inode not found.");
1482                 return -EFSCORRUPTED;
1483         }
1484
1485         /*
1486          * Our allocator could have made different decisions than before
1487          * crashing. This should be fixed but until then, we recalculate
1488          * the inode's block count here.
1489          */
1490         ext4_ext_replay_set_iblocks(inode);
1491
1492         inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation);
1493         ext4_reset_inode_seed(inode);
1494
1495         ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode));
1496         ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh);
1497         sync_dirty_buffer(iloc.bh);
1498         brelse(iloc.bh);
1499 out:
1500         iput(inode);
1501         if (!ret)
1502                 blkdev_issue_flush(sb->s_bdev, GFP_KERNEL);
1503
1504         return 0;
1505 }
1506
1507 /*
1508  * Dentry create replay function.
1509  *
1510  * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE, which means the inode
1511  * for which we are trying to create a dentry here should already have been
1512  * replayed before we get here.
1513  */
1514 static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl,
1515                                  u8 *val)
1516 {
1517         int ret = 0;
1518         struct inode *inode = NULL;
1519         struct inode *dir = NULL;
1520         struct dentry_info_args darg;
1521
1522         tl_to_darg(&darg, tl, val);
1523
1524         trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
1525                         darg.parent_ino, darg.dname_len);
1526
1527         /* This takes care of updating the group descriptor and other metadata */
1528         ret = ext4_mark_inode_used(sb, darg.ino);
1529         if (ret)
1530                 goto out;
1531
1532         inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
1533         if (IS_ERR(inode)) {
1534                 jbd_debug(1, "inode %d not found.", darg.ino);
1535                 inode = NULL;
1536                 ret = -EINVAL;
1537                 goto out;
1538         }
1539
1540         if (S_ISDIR(inode->i_mode)) {
1541                 /*
1542                  * If we are creating a directory, we need to make sure that the
1543                  * dot and dot dot dirents are set up properly.
1544                  */
1545                 dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL);
1546                 if (IS_ERR(dir)) {
1547                         jbd_debug(1, "Dir %d not found.", darg.ino);
1548                         goto out;
1549                 }
1550                 ret = ext4_init_new_dir(NULL, dir, inode);
1551                 iput(dir);
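                /*
                 * A failure to initialize the new directory is not treated
                 * as fatal for the replay; skip linking it and carry on.
                 */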
1552                 if (ret) {
1553                         ret = 0;
1554                         goto out;
1555                 }
1556         }
1557         ret = ext4_fc_replay_link_internal(sb, &darg, inode);
1558         if (ret)
1559                 goto out;
1560         set_nlink(inode, 1);
1561         ext4_mark_inode_dirty(NULL, inode);
1562 out:
1563         if (inode)
1564                 iput(inode);
1565         return ret;
1566 }
1567
1568 /*
1569  * Record physical disk regions which are in use according to the fast
1570  * commit area and used by inodes during the replay phase. Our simple
1571  * replay-phase allocator excludes these regions from allocation.
1572  */
1573 int ext4_fc_record_regions(struct super_block *sb, int ino,
1574                 ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay)
1575 {
1576         struct ext4_fc_replay_state *state;
1577         struct ext4_fc_alloc_region *region;
1578
1579         state = &EXT4_SB(sb)->s_fc_replay_state;
1580         /*
1581          * During the replay phase, fc_regions_valid may not be the same
1582          * as fc_regions_used; update it when doing new additions.
1583          */
1584         if (replay && state->fc_regions_used != state->fc_regions_valid)
1585                 state->fc_regions_used = state->fc_regions_valid;
1586         if (state->fc_regions_used == state->fc_regions_size) {
1587                 struct ext4_fc_alloc_region *fc_regions;
1588
1589                 fc_regions = krealloc(state->fc_regions,
1590                                       sizeof(struct ext4_fc_alloc_region) *
1591                                       (state->fc_regions_size +
1592                                        EXT4_FC_REPLAY_REALLOC_INCREMENT),
1593                                       GFP_KERNEL);
1594                 if (!fc_regions)
1595                         return -ENOMEM;
1596                 state->fc_regions_size +=
1597                         EXT4_FC_REPLAY_REALLOC_INCREMENT;
1598                 state->fc_regions = fc_regions;
1599         }
1600         region = &state->fc_regions[state->fc_regions_used++];
1601         region->ino = ino;
1602         region->lblk = lblk;
1603         region->pblk = pblk;
1604         region->len = len;
1605
1606         if (replay)
1607                 state->fc_regions_valid++;
1608
1609         return 0;
1610 }
1611
1612 /* Replay add range tag */
1613 static int ext4_fc_replay_add_range(struct super_block *sb,
1614                                     struct ext4_fc_tl *tl, u8 *val)
1615 {
1616         struct ext4_fc_add_range fc_add_ex;
1617         struct ext4_extent newex, *ex;
1618         struct inode *inode;
1619         ext4_lblk_t start, cur;
1620         int remaining, len;
1621         ext4_fsblk_t start_pblk;
1622         struct ext4_map_blocks map;
1623         struct ext4_ext_path *path = NULL;
1624         int ret;
1625
1626         memcpy(&fc_add_ex, val, sizeof(fc_add_ex));
1627         ex = (struct ext4_extent *)&fc_add_ex.fc_ex;
1628
1629         trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE,
1630                 le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block),
1631                 ext4_ext_get_actual_len(ex));
1632
1633         inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL);
1634         if (IS_ERR(inode)) {
1635                 jbd_debug(1, "Inode not found.");
1636                 return 0;
1637         }
1638
1639         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1640         if (ret)
1641                 goto out;
1642
1643         start = le32_to_cpu(ex->ee_block);
1644         start_pblk = ext4_ext_pblock(ex);
1645         len = ext4_ext_get_actual_len(ex);
1646
1647         cur = start;
1648         remaining = len;
1649         jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n",
1650                   start, start_pblk, len, ext4_ext_is_unwritten(ex),
1651                   inode->i_ino);
1652
1653         while (remaining > 0) {
1654                 map.m_lblk = cur;
1655                 map.m_len = remaining;
1656                 map.m_pblk = 0;
1657                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1658
1659                 if (ret < 0)
1660                         goto out;
1661
1662                 if (ret == 0) {
1663                         /* Range is not mapped */
1664                         path = ext4_find_extent(inode, cur, NULL, 0);
1665                         if (IS_ERR(path))
1666                                 goto out;
1667                         memset(&newex, 0, sizeof(newex));
1668                         newex.ee_block = cpu_to_le32(cur);
1669                         ext4_ext_store_pblock(
1670                                 &newex, start_pblk + cur - start);
1671                         newex.ee_len = cpu_to_le16(map.m_len);
1672                         if (ext4_ext_is_unwritten(ex))
1673                                 ext4_ext_mark_unwritten(&newex);
1674                         down_write(&EXT4_I(inode)->i_data_sem);
1675                         ret = ext4_ext_insert_extent(
1676                                 NULL, inode, &path, &newex, 0);
1677                         up_write((&EXT4_I(inode)->i_data_sem));
1678                         ext4_ext_drop_refs(path);
1679                         kfree(path);
1680                         if (ret)
1681                                 goto out;
1682                         goto next;
1683                 }
1684
1685                 if (start_pblk + cur - start != map.m_pblk) {
1686                         /*
1687                          * Logical to physical mapping changed. This can happen
1688                          * if this range was removed and then reallocated to
1689                          * map to new physical blocks during a fast commit.
1690                          */
1691                         ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1692                                         ext4_ext_is_unwritten(ex),
1693                                         start_pblk + cur - start);
1694                         if (ret)
1695                                 goto out;
1696                         /*
1697                          * Mark the old blocks as free since they aren't used
1698                          * anymore. We maintain an array of all the modified
1699                          * inodes. In case these blocks are still used at either
1700                          * a different logical range in the same inode or in
1701                          * some different inode, we will mark them as allocated
1702                          * at the end of the FC replay using our array of
1703                          * modified inodes.
1704                          */
1705                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1706                         goto next;
1707                 }
1708
1709                 /* Range is mapped and needs a state change */
1710                 jbd_debug(1, "Converting from %ld to %d %lld",
1711                                 map.m_flags & EXT4_MAP_UNWRITTEN,
1712                         ext4_ext_is_unwritten(ex), map.m_pblk);
1713                 ret = ext4_ext_replay_update_ex(inode, cur, map.m_len,
1714                                         ext4_ext_is_unwritten(ex), map.m_pblk);
1715                 if (ret)
1716                         goto out;
1717                 /*
1718                  * We may have split the extent tree while toggling the state.
1719                  * Try to shrink the extent tree now.
1720                  */
1721                 ext4_ext_replay_shrink_inode(inode, start + len);
1722 next:
1723                 cur += map.m_len;
1724                 remaining -= map.m_len;
1725         }
1726         ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >>
1727                                         sb->s_blocksize_bits);
1728 out:
1729         iput(inode);
1730         return 0;
1731 }
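
/*
 * Worked example for ext4_fc_replay_add_range() (hypothetical numbers):
 * replaying ADD_RANGE for lblk 100, pblk 5000, len 8 where blocks 100-103
 * are already mapped at pblk 5000 and blocks 104-107 are unmapped:
 *
 *   iteration 1: ext4_map_blocks() returns 4 with m_pblk == 5000; the
 *                mapping matches, so only the unwritten state is (possibly)
 *                toggled via ext4_ext_replay_update_ex();
 *   iteration 2: ext4_map_blocks() returns 0, so a fresh extent
 *                (lblk 104, pblk 5004, len 4) is inserted.
 */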
1732
1733 /* Replay DEL_RANGE tag */
1734 static int
1735 ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl,
1736                          u8 *val)
1737 {
1738         struct inode *inode;
1739         struct ext4_fc_del_range lrange;
1740         struct ext4_map_blocks map;
1741         ext4_lblk_t cur, remaining;
1742         int ret;
1743
1744         memcpy(&lrange, val, sizeof(lrange));
1745         cur = le32_to_cpu(lrange.fc_lblk);
1746         remaining = le32_to_cpu(lrange.fc_len);
1747
1748         trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE,
1749                 le32_to_cpu(lrange.fc_ino), cur, remaining);
1750
1751         inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL);
1752         if (IS_ERR(inode)) {
1753                 jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino));
1754                 return 0;
1755         }
1756
1757         ret = ext4_fc_record_modified_inode(sb, inode->i_ino);
1758         if (ret)
1759                 goto out;
1760
1761         jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n",
1762                         inode->i_ino, le32_to_cpu(lrange.fc_lblk),
1763                         le32_to_cpu(lrange.fc_len));
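        /*
         * Walk the range and mark any currently mapped blocks as free in
         * the in-memory bitmaps; the extent tree itself is trimmed by
         * ext4_ext_remove_space() below.
         */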
1764         while (remaining > 0) {
1765                 map.m_lblk = cur;
1766                 map.m_len = remaining;
1767
1768                 ret = ext4_map_blocks(NULL, inode, &map, 0);
1769                 if (ret < 0)
1770                         goto out;
1771                 if (ret > 0) {
1772                         remaining -= ret;
1773                         cur += ret;
1774                         ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0);
1775                 } else {
1776                         remaining -= map.m_len;
1777                         cur += map.m_len;
1778                 }
1779         }
1780
1781         down_write(&EXT4_I(inode)->i_data_sem);
1782         ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk),
1783                                 le32_to_cpu(lrange.fc_lblk) +
1784                                 le32_to_cpu(lrange.fc_len) - 1);
1785         up_write(&EXT4_I(inode)->i_data_sem);
1786         if (ret)
1787                 goto out;
1788         ext4_ext_replay_shrink_inode(inode,
1789                 i_size_read(inode) >> sb->s_blocksize_bits);
1790         ext4_mark_inode_dirty(NULL, inode);
1791 out:
1792         iput(inode);
1793         return 0;
1794 }
1795
1796 static inline const char *tag2str(u16 tag)
1797 {
1798         switch (tag) {
1799         case EXT4_FC_TAG_LINK:
1800                 return "TAG_ADD_ENTRY";
1801         case EXT4_FC_TAG_UNLINK:
1802                 return "TAG_DEL_ENTRY";
1803         case EXT4_FC_TAG_ADD_RANGE:
1804                 return "TAG_ADD_RANGE";
1805         case EXT4_FC_TAG_CREAT:
1806                 return "TAG_CREAT_DENTRY";
1807         case EXT4_FC_TAG_DEL_RANGE:
1808                 return "TAG_DEL_RANGE";
1809         case EXT4_FC_TAG_INODE:
1810                 return "TAG_INODE";
1811         case EXT4_FC_TAG_PAD:
1812                 return "TAG_PAD";
1813         case EXT4_FC_TAG_TAIL:
1814                 return "TAG_TAIL";
1815         case EXT4_FC_TAG_HEAD:
1816                 return "TAG_HEAD";
1817         default:
1818                 return "TAG_ERROR";
1819         }
1820 }
1821
1822 static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
1823 {
1824         struct ext4_fc_replay_state *state;
1825         struct inode *inode;
1826         struct ext4_ext_path *path = NULL;
1827         struct ext4_map_blocks map;
1828         int i, ret, j;
1829         ext4_lblk_t cur, end;
1830
1831         state = &EXT4_SB(sb)->s_fc_replay_state;
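        /*
         * For every inode modified during replay, walk its extent tree and
         * mark both the tree's own index/leaf blocks and the mapped data
         * blocks as in use.
         */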
1832         for (i = 0; i < state->fc_modified_inodes_used; i++) {
1833                 inode = ext4_iget(sb, state->fc_modified_inodes[i],
1834                         EXT4_IGET_NORMAL);
1835                 if (IS_ERR(inode)) {
1836                         jbd_debug(1, "Inode %d not found.",
1837                                 state->fc_modified_inodes[i]);
1838                         continue;
1839                 }
1840                 cur = 0;
1841                 end = EXT_MAX_BLOCKS;
1842                 while (cur < end) {
1843                         map.m_lblk = cur;
1844                         map.m_len = end - cur;
1845
1846                         ret = ext4_map_blocks(NULL, inode, &map, 0);
1847                         if (ret < 0)
1848                                 break;
1849
1850                         if (ret > 0) {
1851                                 path = ext4_find_extent(inode, map.m_lblk, NULL, 0);
1852                                 if (!IS_ERR(path)) {
1853                                         for (j = 0; j < path->p_depth; j++)
1854                                                 ext4_mb_mark_bb(inode->i_sb,
1855                                                         path[j].p_block, 1, 1);
1856                                         ext4_ext_drop_refs(path);
1857                                         kfree(path);
1858                                 }
1859                                 cur += ret;
1860                                 ext4_mb_mark_bb(inode->i_sb, map.m_pblk,
1861                                                         map.m_len, 1);
1862                         } else {
1863                                 cur = cur + (map.m_len ? map.m_len : 1);
1864                         }
1865                 }
1866                 iput(inode);
1867         }
1868 }
1869
1870 /*
1871  * Check if block is in excluded regions for block allocation. The simple
1872  * allocator that runs during the replay phase calls this function to see
1873  * if it is okay to use a block.
1874  */
1875 bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk)
1876 {
1877         int i;
1878         struct ext4_fc_replay_state *state;
1879
1880         state = &EXT4_SB(sb)->s_fc_replay_state;
1881         for (i = 0; i < state->fc_regions_valid; i++) {
1882                 if (state->fc_regions[i].ino == 0 ||
1883                         state->fc_regions[i].len == 0)
1884                         continue;
1885                 if (blk >= state->fc_regions[i].pblk &&
1886                     blk < state->fc_regions[i].pblk + state->fc_regions[i].len)
1887                         return true;
1888         }
1889         return false;
1890 }
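
/*
 * A minimal sketch (illustrative only, not part of the ext4 sources) of
 * how a replay-phase allocator might consult the check above before
 * handing out a block; ext4_fc_replay_try_alloc_sketch() is a
 * hypothetical helper invented purely for illustration.
 */
static ext4_fsblk_t ext4_fc_replay_try_alloc_sketch(struct super_block *sb,
                                                    ext4_fsblk_t start,
                                                    ext4_fsblk_t end)
{
        ext4_fsblk_t blk;

        for (blk = start; blk < end; blk++) {
                /* Skip blocks recorded as in use by the fast commit log. */
                if (ext4_fc_replay_check_excluded(sb, blk))
                        continue;
                return blk;
        }
        return 0;       /* no usable block found in [start, end) */
}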
1891
1892 /* Cleanup function called after replay */
1893 void ext4_fc_replay_cleanup(struct super_block *sb)
1894 {
1895         struct ext4_sb_info *sbi = EXT4_SB(sb);
1896
1897         sbi->s_mount_state &= ~EXT4_FC_REPLAY;
1898         kfree(sbi->s_fc_replay_state.fc_regions);
1899         kfree(sbi->s_fc_replay_state.fc_modified_inodes);
1900 }
1901
1902 /*
1903  * Recovery Scan phase handler
1904  *
1905  * This function is called during the scan phase and is responsible
1906  * for doing the following things:
1907  * - Make sure the fast commit area has valid tags for replay
1908  * - Count number of tags that need to be replayed by the replay handler
1909  * - Verify CRC
1910  * - Create a list of excluded blocks for allocation during replay phase
1911  *
1912  * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is
1913  * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP
1914  * to indicate that scan has finished and JBD2 can now start replay phase.
1915  * It returns a negative error to indicate that there was an error. At the end
1916  * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set
1917  * to indicate the number of tags that need to be replayed during the replay phase.
1918  */
1919 static int ext4_fc_replay_scan(journal_t *journal,
1920                                 struct buffer_head *bh, int off,
1921                                 tid_t expected_tid)
1922 {
1923         struct super_block *sb = journal->j_private;
1924         struct ext4_sb_info *sbi = EXT4_SB(sb);
1925         struct ext4_fc_replay_state *state;
1926         int ret = JBD2_FC_REPLAY_CONTINUE;
1927         struct ext4_fc_add_range ext;
1928         struct ext4_fc_tl tl;
1929         struct ext4_fc_tail tail;
1930         __u8 *start, *end, *cur, *val;
1931         struct ext4_fc_head head;
1932         struct ext4_extent *ex;
1933
1934         state = &sbi->s_fc_replay_state;
1935
1936         start = (u8 *)bh->b_data;
1937         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
1938
1939         if (state->fc_replay_expected_off == 0) {
1940                 state->fc_cur_tag = 0;
1941                 state->fc_replay_num_tags = 0;
1942                 state->fc_crc = 0;
1943                 state->fc_regions = NULL;
1944                 state->fc_regions_valid = state->fc_regions_used =
1945                         state->fc_regions_size = 0;
1946                 /* Check if we can stop early */
1947                 if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag)
1948                         != EXT4_FC_TAG_HEAD)
1949                         return 0;
1950         }
1951
1952         if (off != state->fc_replay_expected_off) {
1953                 ret = -EFSCORRUPTED;
1954                 goto out_err;
1955         }
1956
1957         state->fc_replay_expected_off++;
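        /*
         * A fast commit block is a packed sequence of TLVs:
         *
         *   +--------+--------+-----------------+--------+--------+----
         *   | fc_tag | fc_len | fc_len bytes    | fc_tag | fc_len | ...
         *   +--------+--------+-----------------+--------+--------+----
         *
         * hence both this loop and the replay loop advance by
         * sizeof(tl) + le16_to_cpu(tl.fc_len) on each iteration.
         */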
1958         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
1959                 memcpy(&tl, cur, sizeof(tl));
1960                 val = cur + sizeof(tl);
1961                 jbd_debug(3, "Scan phase, tag:%s, blk %lld\n",
1962                           tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr);
1963                 switch (le16_to_cpu(tl.fc_tag)) {
1964                 case EXT4_FC_TAG_ADD_RANGE:
1965                         memcpy(&ext, val, sizeof(ext));
1966                         ex = (struct ext4_extent *)&ext.fc_ex;
1967                         ret = ext4_fc_record_regions(sb,
1968                                 le32_to_cpu(ext.fc_ino),
1969                                 le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex),
1970                                 ext4_ext_get_actual_len(ex), 0);
1971                         if (ret < 0)
1972                                 break;
1973                         ret = JBD2_FC_REPLAY_CONTINUE;
1974                         fallthrough;
1975                 case EXT4_FC_TAG_DEL_RANGE:
1976                 case EXT4_FC_TAG_LINK:
1977                 case EXT4_FC_TAG_UNLINK:
1978                 case EXT4_FC_TAG_CREAT:
1979                 case EXT4_FC_TAG_INODE:
1980                 case EXT4_FC_TAG_PAD:
1981                         state->fc_cur_tag++;
1982                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1983                                         sizeof(tl) + le16_to_cpu(tl.fc_len));
1984                         break;
1985                 case EXT4_FC_TAG_TAIL:
1986                         state->fc_cur_tag++;
1987                         memcpy(&tail, val, sizeof(tail));
1988                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
1989                                                 sizeof(tl) +
1990                                                 offsetof(struct ext4_fc_tail,
1991                                                 fc_crc));
1992                         if (le32_to_cpu(tail.fc_tid) == expected_tid &&
1993                                 le32_to_cpu(tail.fc_crc) == state->fc_crc) {
1994                                 state->fc_replay_num_tags = state->fc_cur_tag;
1995                                 state->fc_regions_valid =
1996                                         state->fc_regions_used;
1997                         } else {
1998                                 ret = state->fc_replay_num_tags ?
1999                                         JBD2_FC_REPLAY_STOP : -EFSBADCRC;
2000                         }
2001                         state->fc_crc = 0;
2002                         break;
2003                 case EXT4_FC_TAG_HEAD:
2004                         memcpy(&head, val, sizeof(head));
2005                         if (le32_to_cpu(head.fc_features) &
2006                                 ~EXT4_FC_SUPPORTED_FEATURES) {
2007                                 ret = -EOPNOTSUPP;
2008                                 break;
2009                         }
2010                         if (le32_to_cpu(head.fc_tid) != expected_tid) {
2011                                 ret = JBD2_FC_REPLAY_STOP;
2012                                 break;
2013                         }
2014                         state->fc_cur_tag++;
2015                         state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
2016                                             sizeof(tl) + le16_to_cpu(tl.fc_len));
2017                         break;
2018                 default:
2019                         ret = state->fc_replay_num_tags ?
2020                                 JBD2_FC_REPLAY_STOP : -ECANCELED;
2021                 }
2022                 if (ret < 0 || ret == JBD2_FC_REPLAY_STOP)
2023                         break;
2024         }
2025
2026 out_err:
2027         trace_ext4_fc_replay_scan(sb, ret, off);
2028         return ret;
2029 }
2030
2031 /*
2032  * Main recovery path entry point.
2033  * The meaning of the return codes is the same as above.
2034  */
2035 static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh,
2036                                 enum passtype pass, int off, tid_t expected_tid)
2037 {
2038         struct super_block *sb = journal->j_private;
2039         struct ext4_sb_info *sbi = EXT4_SB(sb);
2040         struct ext4_fc_tl tl;
2041         __u8 *start, *end, *cur, *val;
2042         int ret = JBD2_FC_REPLAY_CONTINUE;
2043         struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state;
2044         struct ext4_fc_tail tail;
2045
2046         if (pass == PASS_SCAN) {
2047                 state->fc_current_pass = PASS_SCAN;
2048                 return ext4_fc_replay_scan(journal, bh, off, expected_tid);
2049         }
2050
2051         if (state->fc_current_pass != pass) {
2052                 state->fc_current_pass = pass;
2053                 sbi->s_mount_state |= EXT4_FC_REPLAY;
2054         }
2055         if (!sbi->s_fc_replay_state.fc_replay_num_tags) {
2056                 jbd_debug(1, "Replay stops\n");
2057                 ext4_fc_set_bitmaps_and_counters(sb);
2058                 return 0;
2059         }
2060
2061 #ifdef CONFIG_EXT4_DEBUG
2062         if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) {
2063                 pr_warn("Dropping fc block %d because max_replay set\n", off);
2064                 return JBD2_FC_REPLAY_STOP;
2065         }
2066 #endif
2067
2068         start = (u8 *)bh->b_data;
2069         end = (__u8 *)bh->b_data + journal->j_blocksize - 1;
2070
2071         for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) {
2072                 memcpy(&tl, cur, sizeof(tl));
2073                 val = cur + sizeof(tl);
2074
2075                 if (state->fc_replay_num_tags == 0) {
2076                         ret = JBD2_FC_REPLAY_STOP;
2077                         ext4_fc_set_bitmaps_and_counters(sb);
2078                         break;
2079                 }
2080                 jbd_debug(3, "Replay phase, tag:%s\n",
2081                                 tag2str(le16_to_cpu(tl.fc_tag)));
2082                 state->fc_replay_num_tags--;
2083                 switch (le16_to_cpu(tl.fc_tag)) {
2084                 case EXT4_FC_TAG_LINK:
2085                         ret = ext4_fc_replay_link(sb, &tl, val);
2086                         break;
2087                 case EXT4_FC_TAG_UNLINK:
2088                         ret = ext4_fc_replay_unlink(sb, &tl, val);
2089                         break;
2090                 case EXT4_FC_TAG_ADD_RANGE:
2091                         ret = ext4_fc_replay_add_range(sb, &tl, val);
2092                         break;
2093                 case EXT4_FC_TAG_CREAT:
2094                         ret = ext4_fc_replay_create(sb, &tl, val);
2095                         break;
2096                 case EXT4_FC_TAG_DEL_RANGE:
2097                         ret = ext4_fc_replay_del_range(sb, &tl, val);
2098                         break;
2099                 case EXT4_FC_TAG_INODE:
2100                         ret = ext4_fc_replay_inode(sb, &tl, val);
2101                         break;
2102                 case EXT4_FC_TAG_PAD:
2103                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0,
2104                                              le16_to_cpu(tl.fc_len), 0);
2105                         break;
2106                 case EXT4_FC_TAG_TAIL:
2107                         trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0,
2108                                              le16_to_cpu(tl.fc_len), 0);
2109                         memcpy(&tail, val, sizeof(tail));
2110                         WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid);
2111                         break;
2112                 case EXT4_FC_TAG_HEAD:
2113                         break;
2114                 default:
2115                         trace_ext4_fc_replay(sb, le16_to_cpu(tl.fc_tag), 0,
2116                                              le16_to_cpu(tl.fc_len), 0);
2117                         ret = -ECANCELED;
2118                         break;
2119                 }
2120                 if (ret < 0)
2121                         break;
2122                 ret = JBD2_FC_REPLAY_CONTINUE;
2123         }
2124         return ret;
2125 }
2126
2127 void ext4_fc_init(struct super_block *sb, journal_t *journal)
2128 {
2129         /*
2130          * We set the replay callback even if fast commit is disabled, because
2131          * we could still have fast commit blocks that need to be replayed even
2132          * if fast commit has now been turned off.
2133          */
2134         journal->j_fc_replay_callback = ext4_fc_replay;
2135         if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
2136                 return;
2137         journal->j_fc_cleanup_callback = ext4_fc_cleanup;
2138 }
2139
2140 static const char *fc_ineligible_reasons[] = {
2141         "Extended attributes changed",
2142         "Cross rename",
2143         "Journal flag changed",
2144         "Insufficient memory",
2145         "Swap boot",
2146         "Resize",
2147         "Dir renamed",
2148         "Falloc range op",
2149         "Data journalling",
2150         "FC Commit Failed"
2151 };
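
/*
 * Example of what ext4_fc_info_show() below emits (all numbers made up):
 *
 *      fc stats:
 *      128 commits
 *      3 ineligible
 *      542 numblks
 *      1024us avg_commit_time
 *      Ineligible reasons:
 *      "Extended attributes changed": 1
 *      ...
 */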
2152
2153 int ext4_fc_info_show(struct seq_file *seq, void *v)
2154 {
2155         struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
2156         struct ext4_fc_stats *stats = &sbi->s_fc_stats;
2157         int i;
2158
2159         if (v != SEQ_START_TOKEN)
2160                 return 0;
2161
2162         seq_printf(seq,
2163                 "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
2164                    stats->fc_num_commits, stats->fc_ineligible_commits,
2165                    stats->fc_numblks,
2166                    div_u64(sbi->s_fc_avg_commit_time, 1000));
2167         seq_puts(seq, "Ineligible reasons:\n");
2168         for (i = 0; i < EXT4_FC_REASON_MAX; i++)
2169                 seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
2170                         stats->fc_ineligible_reason_count[i]);
2171
2172         return 0;
2173 }
2174
2175 int __init ext4_fc_init_dentry_cache(void)
2176 {
2177         ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
2178                                            SLAB_RECLAIM_ACCOUNT);
2179
2180         if (ext4_fc_dentry_cachep == NULL)
2181                 return -ENOMEM;
2182
2183         return 0;
2184 }
2185
2186 void ext4_fc_destroy_dentry_cache(void)
2187 {
2188         kmem_cache_destroy(ext4_fc_dentry_cachep);
2189 }