/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        struct buffer_head *orig_bh = bh->b_private;

        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
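        /*
         * b_private of this temporary buffer points back at the shadowed
         * metadata buffer (set up in jbd2_journal_write_metadata_buffer()).
         * Clear its shadow bit and wake anyone waiting for the journal
         * write-out of that buffer to finish.
         */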
        if (orig_bh) {
                clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
                smp_mb__after_atomic();
                wake_up_bit(&orig_bh->b_state, BH_Shadow);
        }
        unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (!trylock_page(page))
                goto nope;

        get_page(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        put_page(page);
        return;

nope:
        __brelse(bh);
}

static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
        struct commit_header *h;
        __u32 csum;

        if (!jbd2_journal_has_csum_v2or3(j))
                return;

        h = (struct commit_header *)(bh->b_data);
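        /*
         * Zero the checksum fields first: the commit block checksum is
         * computed over the whole block with those fields cleared.
         */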
        h->h_chksum_type = 0;
        h->h_chksum_size = 0;
        h->h_chksum[0] = 0;
        csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
        h->h_chksum[0] = cpu_to_be32(csum);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
                                        transaction_t *commit_transaction,
                                        struct buffer_head **cbh,
                                        __u32 crc32_sum)
{
        struct commit_header *tmp;
        struct buffer_head *bh;
        int ret;
        struct timespec64 now = current_kernel_time64();

        *cbh = NULL;

        if (is_journal_aborted(journal))
                return 0;

        bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
                                                JBD2_COMMIT_BLOCK);
        if (!bh)
                return 1;

        tmp = (struct commit_header *)bh->b_data;
        tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
        tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

        if (jbd2_has_feature_checksum(journal)) {
                tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
                tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
                tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
        }
        jbd2_commit_block_csum_set(journal, bh);

        BUFFER_TRACE(bh, "submit commit block");
        lock_buffer(bh);
        clear_buffer_dirty(bh);
        set_buffer_uptodate(bh);
        bh->b_end_io = journal_end_buffer_io_sync;
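
        /*
         * With barriers enabled and no async-commit feature, the commit
         * block is written with a preceding cache flush (REQ_PREFLUSH)
         * and FUA, so it reaches stable storage only after all earlier
         * journal blocks have.
         */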
        if (journal->j_flags & JBD2_BARRIER &&
            !jbd2_has_feature_async_commit(journal))
                ret = submit_bh(REQ_OP_WRITE,
                        REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
        else
                ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);

        *cbh = bh;
        return ret;
}

/*
 * This function, along with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
                                         struct buffer_head *bh)
{
        int ret = 0;

        clear_buffer_dirty(bh);
        wait_on_buffer(bh);

        if (unlikely(!buffer_uptodate(bh)))
                ret = -EIO;
        put_bh(bh);            /* One for getblk() */

        return ret;
}

/*
 * Write the filemap data using writepage() address_space_operations.
 * We don't do block allocation here even for delalloc. We don't
 * use writepages() because with delayed allocation we may be doing
 * block allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping,
                loff_t dirty_start, loff_t dirty_end)
{
        int ret;
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = mapping->nrpages * 2,
                .range_start = dirty_start,
                .range_end = dirty_end,
        };

        ret = generic_writepages(mapping, &wbc);
        return ret;
}

/*
 * Submit all the data buffers of the inode associated with the transaction
 * to disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * currently operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
                transaction_t *commit_transaction)
{
        struct jbd2_inode *jinode;
        int err, ret = 0;
        struct address_space *mapping;

        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                loff_t dirty_start = jinode->i_dirty_start;
                loff_t dirty_end = jinode->i_dirty_end;

                if (!(jinode->i_flags & JI_WRITE_DATA))
                        continue;
                mapping = jinode->i_vfs_inode->i_mapping;
                jinode->i_flags |= JI_COMMIT_RUNNING;
                spin_unlock(&journal->j_list_lock);
                /*
                 * Submit the inode data buffers. We use writepage instead of
                 * writepages because writepages can do block allocation with
                 * delalloc; we need to write only allocated blocks here.
                 */
                trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
                err = journal_submit_inode_data_buffers(mapping, dirty_start,
                                dirty_end);
                if (!ret)
                        ret = err;
                spin_lock(&journal->j_list_lock);
                J_ASSERT(jinode->i_transaction == commit_transaction);
                jinode->i_flags &= ~JI_COMMIT_RUNNING;
                smp_mb();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
        spin_unlock(&journal->j_list_lock);
        return ret;
}

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
                transaction_t *commit_transaction)
{
        struct jbd2_inode *jinode, *next_i;
        int err, ret = 0;

        /* For locking, see the comment in journal_submit_data_buffers() */
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                loff_t dirty_start = jinode->i_dirty_start;
                loff_t dirty_end = jinode->i_dirty_end;

                if (!(jinode->i_flags & JI_WAIT_DATA))
                        continue;
                jinode->i_flags |= JI_COMMIT_RUNNING;
                spin_unlock(&journal->j_list_lock);
                err = filemap_fdatawait_range_keep_errors(
                                jinode->i_vfs_inode->i_mapping, dirty_start,
                                dirty_end);
                if (!ret)
                        ret = err;
                spin_lock(&journal->j_list_lock);
                jinode->i_flags &= ~JI_COMMIT_RUNNING;
                smp_mb();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }

        /* Now refile inode to proper lists */
        list_for_each_entry_safe(jinode, next_i,
                                 &commit_transaction->t_inode_list, i_list) {
                list_del(&jinode->i_list);
                if (jinode->i_next_transaction) {
                        jinode->i_transaction = jinode->i_next_transaction;
                        jinode->i_next_transaction = NULL;
                        list_add(&jinode->i_list,
                                &jinode->i_transaction->t_inode_list);
                } else {
                        jinode->i_transaction = NULL;
                        jinode->i_dirty_start = 0;
                        jinode->i_dirty_end = 0;
                }
        }
        spin_unlock(&journal->j_list_lock);

        return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
        struct page *page = bh->b_page;
        char *addr;
        __u32 checksum;

        addr = kmap_atomic(page);
        checksum = crc32_be(crc32_sum,
                (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
        kunmap_atomic(addr);

        return checksum;
}

static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
                                   unsigned long long block)
{
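        /*
         * The low 32 bits of the block number always go in t_blocknr;
         * with the 64-bit feature enabled, the high 32 bits are stored
         * in t_blocknr_high as well.
         */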
        tag->t_blocknr = cpu_to_be32(block & (u32)~0);
        if (jbd2_has_feature_64bit(j))
                tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
                                    struct buffer_head *bh, __u32 sequence)
{
        journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
        struct page *page = bh->b_page;
        __u8 *addr;
        __u32 csum32;
        __be32 seq;

        if (!jbd2_journal_has_csum_v2or3(j))
                return;

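        /*
         * The tag checksum covers the commit sequence number followed by
         * the block contents, seeded with the journal's checksum seed.
         */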
        seq = cpu_to_be32(sequence);
        addr = kmap_atomic(page);
        csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
        csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
                             bh->b_size);
        kunmap_atomic(addr);

        if (jbd2_has_feature_csum3(j))
                tag3->t_checksum = cpu_to_be32(csum32);
        else
                tag->t_checksum = cpu_to_be16(csum32);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
        struct transaction_stats_s stats;
        transaction_t *commit_transaction;
        struct journal_head *jh;
        struct buffer_head *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
        int bufs;
        int flags;
        int err;
        unsigned long long blocknr;
        ktime_t start_time;
        u64 commit_time;
        char *tagp = NULL;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
        int i;
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;
        struct blk_plug plug;
        /* Tail of the journal */
        unsigned long first_block;
        tid_t first_tid;
        int update_tail;
        int csum_size = 0;
        LIST_HEAD(io_bufs);
        LIST_HEAD(log_bufs);

        if (jbd2_journal_has_csum_v2or3(journal))
                csum_size = sizeof(struct jbd2_journal_block_tail);

        /*
         * First job: lock down the current transaction and wait for
         * all outstanding updates to complete.
         */

        /* Do we need to erase the effects of a prior jbd2_journal_flush? */
        if (journal->j_flags & JBD2_FLUSHED) {
                jbd_debug(3, "super block updated\n");
                mutex_lock_io(&journal->j_checkpoint_mutex);
                /*
                 * We hold j_checkpoint_mutex so tail cannot change under us.
                 * We don't need any special data guarantees for writing sb
                 * since journal is empty and it is ok for write to be
                 * flushed only with transaction commit.
                 */
                jbd2_journal_update_sb_log_tail(journal,
                                                journal->j_tail_sequence,
                                                journal->j_tail,
                                                REQ_SYNC);
                mutex_unlock(&journal->j_checkpoint_mutex);
        } else {
                jbd_debug(3, "superblock not updated\n");
        }

        J_ASSERT(journal->j_running_transaction != NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);

        commit_transaction = journal->j_running_transaction;

        trace_jbd2_start_commit(journal, commit_transaction);
        jbd_debug(1, "JBD2: starting commit of transaction %d\n",
                        commit_transaction->t_tid);

        write_lock(&journal->j_state_lock);
        J_ASSERT(commit_transaction->t_state == T_RUNNING);
        commit_transaction->t_state = T_LOCKED;

        trace_jbd2_commit_locking(journal, commit_transaction);
        stats.run.rs_wait = commit_transaction->t_max_wait;
        stats.run.rs_request_delay = 0;
        stats.run.rs_locked = jiffies;
        if (commit_transaction->t_requested)
                stats.run.rs_request_delay =
                        jbd2_time_diff(commit_transaction->t_requested,
                                       stats.run.rs_locked);
        stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
                                              stats.run.rs_locked);

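        /*
         * Wait until t_updates drops to zero: each running handle holds a
         * count there, and we must drop our locks around schedule() so
         * those handles can complete and wake us.
         */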
        spin_lock(&commit_transaction->t_handle_lock);
        while (atomic_read(&commit_transaction->t_updates)) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_updates, &wait,
                                        TASK_UNINTERRUPTIBLE);
                if (atomic_read(&commit_transaction->t_updates)) {
                        spin_unlock(&commit_transaction->t_handle_lock);
                        write_unlock(&journal->j_state_lock);
                        schedule();
                        write_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);

        J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
                        journal->j_max_transaction_buffers);

        /*
         * First thing we are allowed to do is to discard any remaining
         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
         * that there are no such buffers: if a large filesystem
         * operation like a truncate needs to split itself over multiple
         * transactions, then it may try to do a jbd2_journal_restart() while
         * there are still BJ_Reserved buffers outstanding.  These must
         * be released cleanly from the current transaction.
         *
         * In this case, the filesystem must still reserve write access
         * again before modifying the buffer in the new transaction, but
         * we do not require it to remember exactly which old buffers it
         * has reserved.  This is consistent with the existing behaviour
         * that multiple jbd2_journal_get_write_access() calls to the same
         * buffer are perfectly permissible.
         */
        while (commit_transaction->t_reserved_list) {
                jh = commit_transaction->t_reserved_list;
                JBUFFER_TRACE(jh, "reserved, unused: refile");
                /*
                 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
                 * leave undo-committed data.
                 */
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);

                        jbd_lock_bh_state(bh);
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
                jbd2_journal_refile_buffer(journal, jh);
        }

        /*
         * Now try to drop any written-back buffers from the journal's
         * checkpoint lists.  We do this *before* commit because it potentially
         * frees some memory.
         */
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_clean_checkpoint_list(journal, false);
        spin_unlock(&journal->j_list_lock);

        jbd_debug(3, "JBD2: commit phase 1\n");

        /*
         * Clear the revoked flag to reflect that there are no revoked
         * buffers in the next transaction which is going to be started.
         */
        jbd2_clear_buffer_revoked_flags(journal);

        /*
         * Switch to a new revoke table.
         */
        jbd2_journal_switch_revoke_table(journal);

        /*
         * Reserved credits cannot be claimed anymore, free them
         */
        atomic_sub(atomic_read(&journal->j_reserved_credits),
                   &commit_transaction->t_outstanding_credits);

        trace_jbd2_commit_flushing(journal, commit_transaction);
        stats.run.rs_flushing = jiffies;
        stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
                                             stats.run.rs_flushing);

        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
        start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        write_unlock(&journal->j_state_lock);

        jbd_debug(3, "JBD2: commit phase 2a\n");

        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
        err = journal_submit_data_buffers(journal, commit_transaction);
        if (err)
                jbd2_journal_abort(journal, err);

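        /*
         * Plug the block layer so that the revoke record and metadata
         * writes queued below can be batched before being flushed to
         * the device.
         */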
        blk_start_plug(&plug);
        jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

        jbd_debug(3, "JBD2: commit phase 2b\n");

        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
        write_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_COMMIT;
        write_unlock(&journal->j_state_lock);

        trace_jbd2_commit_logging(journal, commit_transaction);
        stats.run.rs_logging = jiffies;
        stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
                                               stats.run.rs_logging);
        stats.run.rs_blocks =
                atomic_read(&commit_transaction->t_outstanding_credits);
        stats.run.rs_blocks_logged = 0;

        J_ASSERT(commit_transaction->t_nr_buffers <=
                 atomic_read(&commit_transaction->t_outstanding_credits));

        err = 0;
        bufs = 0;
        descriptor = NULL;
        while (commit_transaction->t_buffers) {

                /* Find the next buffer to be journaled... */

                jh = commit_transaction->t_buffers;

                /* If we're in abort mode, we just un-journal the buffer and
                   release it. */

                if (is_journal_aborted(journal)) {
                        clear_buffer_jbddirty(jh2bh(jh));
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        jbd2_buffer_abort_trigger(jh,
                                                  jh->b_frozen_data ?
                                                  jh->b_frozen_triggers :
                                                  jh->b_triggers);
                        jbd2_journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
                         * already allocated, even if we are now
                         * aborting. */
                        if (!commit_transaction->t_buffers)
                                goto start_journal_io;
                        continue;
                }

                /* Make sure we have a descriptor block in which to
                   record the metadata buffer. */

                if (!descriptor) {
                        J_ASSERT (bufs == 0);

                        jbd_debug(4, "JBD2: get descriptor\n");

                        descriptor = jbd2_journal_get_descriptor_buffer(
                                                        commit_transaction,
                                                        JBD2_DESCRIPTOR_BLOCK);
                        if (!descriptor) {
                                jbd2_journal_abort(journal, -EIO);
                                continue;
                        }

                        jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
                                (unsigned long long)descriptor->b_blocknr,
                                descriptor->b_data);
                        tagp = &descriptor->b_data[sizeof(journal_header_t)];
                        space_left = descriptor->b_size -
                                                sizeof(journal_header_t);
                        first_tag = 1;
                        set_buffer_jwrite(descriptor);
                        set_buffer_dirty(descriptor);
                        wbuf[bufs++] = descriptor;

                        /* Record it so that we can wait for IO
                           completion later */
                        BUFFER_TRACE(descriptor, "ph3: file as descriptor");
                        jbd2_file_log_bh(&log_bufs, descriptor);
                }

                /* Where is the buffer to be written? */

                err = jbd2_journal_next_log_block(journal, &blocknr);
                /* If the block mapping failed, just abandon the buffer
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
                        jbd2_journal_abort(journal, err);
                        continue;
                }

                /*
                 * start_this_handle() uses t_outstanding_credits to determine
                 * the free space in the log, but this counter is changed
                 * by jbd2_journal_next_log_block() also.
                 */
                atomic_dec(&commit_transaction->t_outstanding_credits);

                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the shadow pairing of buffers. */
                atomic_inc(&jh2bh(jh)->b_count);

                /*
                 * Make a temporary IO buffer with which to write it out
                 * (this will requeue the metadata buffer to BJ_Shadow).
                 */
                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = jbd2_journal_write_metadata_buffer(commit_transaction,
                                                jh, &wbuf[bufs], blocknr);
                if (flags < 0) {
                        jbd2_journal_abort(journal, flags);
                        continue;
                }
                jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

                /* Record the new block's tag in the current descriptor
                   buffer */

                tag_flag = 0;
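                /*
                 * Bit 0 of the value returned by
                 * jbd2_journal_write_metadata_buffer() is set when the
                 * buffer data had to be escaped because its first word
                 * matched the journal magic number.
                 */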
                if (flags & 1)
                        tag_flag |= JBD2_FLAG_ESCAPE;
                if (!first_tag)
                        tag_flag |= JBD2_FLAG_SAME_UUID;

                tag = (journal_block_tag_t *) tagp;
                write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
                tag->t_flags = cpu_to_be16(tag_flag);
                jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
                                        commit_transaction->t_tid);
                tagp += tag_bytes;
                space_left -= tag_bytes;
                bufs++;

                if (first_tag) {
                        memcpy (tagp, journal->j_uuid, 16);
                        tagp += 16;
                        space_left -= 16;
                        first_tag = 0;
                }

                /* If there's no more to do, or if the descriptor is full
                   (no room for another tag, a possible 16-byte UUID, and
                   the block tail checksum), let the IO rip! */

                if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < tag_bytes + 16 + csum_size) {

                        jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

                        /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

                        tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
start_journal_io:
                        if (descriptor)
                                jbd2_descriptor_block_csum_set(journal,
                                                        descriptor);

                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
                                /*
                                 * Compute checksum.
                                 */
                                if (jbd2_has_feature_checksum(journal)) {
                                        crc32_sum =
                                            jbd2_checksum_data(crc32_sum, bh);
                                }

                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
                                submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
                        }
                        cond_resched();

                        /* Force a new descriptor to be generated next
                           time round the loop. */
                        descriptor = NULL;
                        bufs = 0;
                }
        }

        err = journal_finish_inode_data_buffers(journal, commit_transaction);
        if (err) {
                printk(KERN_WARNING
                        "JBD2: Detected IO errors while flushing file data "
                       "on %s\n", journal->j_devname);
                if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
                        jbd2_journal_abort(journal, err);
                err = 0;
        }

        /*
         * Get current oldest transaction in the log before we issue flush
         * to the filesystem device. After the flush we can be sure that
         * blocks of all older transactions are checkpointed to persistent
         * storage and we will be safe to update journal start in the
         * superblock with the numbers we get here.
         */
        update_tail =
                jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

        write_lock(&journal->j_state_lock);
        if (update_tail) {
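                /*
                 * The log is circular: if the new tail block wrapped past
                 * the old one, account for the full journal length.
                 */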
                long freed = first_block - journal->j_tail;

                if (first_block < journal->j_tail)
                        freed += journal->j_last - journal->j_first;
                /* Update tail only if we free significant amount of space */
                if (freed < journal->j_maxlen / 4)
                        update_tail = 0;
        }
        J_ASSERT(commit_transaction->t_state == T_COMMIT);
        commit_transaction->t_state = T_COMMIT_DFLUSH;
        write_unlock(&journal->j_state_lock);

        /*
         * If the journal is not located on the file system device,
         * then we must flush the file system device before we issue
         * the commit record
         */
        if (commit_transaction->t_need_data_flush &&
            (journal->j_fs_dev != journal->j_dev) &&
            (journal->j_flags & JBD2_BARRIER))
                blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);

        /* Done it all: now write the commit record asynchronously. */
        if (jbd2_has_feature_async_commit(journal)) {
                err = journal_submit_commit_record(journal, commit_transaction,
                                                 &cbh, crc32_sum);
                if (err)
                        jbd2_journal_abort(journal, err);
        }

        blk_finish_plug(&plug);

        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the io_bufs list.

           Wait for the buffers in reverse order.  That way we are
           less likely to be woken up until all IOs have completed, and
           so we incur less scheduling load.
        */

        jbd_debug(3, "JBD2: commit phase 3\n");

        while (!list_empty(&io_bufs)) {
                struct buffer_head *bh = list_entry(io_bufs.prev,
                                                    struct buffer_head,
                                                    b_assoc_buffers);

                wait_on_buffer(bh);
                cond_resched();

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;
                jbd2_unfile_log_bh(bh);
                stats.run.rs_blocks_logged++;

                /*
                 * The list contains temporary buffer heads created by
                 * jbd2_journal_write_metadata_buffer().
                 */
                BUFFER_TRACE(bh, "dumping temporary bh");
                __brelse(bh);
                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
                free_buffer_head(bh);

                /* We also have to refile the corresponding shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
                clear_buffer_jwrite(bh);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));
                J_ASSERT_BH(bh, !buffer_shadow(bh));

                /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }

        J_ASSERT (commit_transaction->t_shadow_list == NULL);

        jbd_debug(3, "JBD2: commit phase 4\n");

        /* Here we wait for the revoke record and descriptor record buffers */
        while (!list_empty(&log_bufs)) {
                struct buffer_head *bh;

                bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
                wait_on_buffer(bh);
                cond_resched();

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                jbd2_unfile_log_bh(bh);
                stats.run.rs_blocks_logged++;
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }

        if (err)
                jbd2_journal_abort(journal, err);

        jbd_debug(3, "JBD2: commit phase 5\n");
        write_lock(&journal->j_state_lock);
        J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
        commit_transaction->t_state = T_COMMIT_JFLUSH;
        write_unlock(&journal->j_state_lock);

        if (!jbd2_has_feature_async_commit(journal)) {
                err = journal_submit_commit_record(journal, commit_transaction,
                                                &cbh, crc32_sum);
                if (err)
                        jbd2_journal_abort(journal, err);
        }
        if (cbh)
                err = journal_wait_on_commit_record(journal, cbh);
        stats.run.rs_blocks_logged++;
        if (jbd2_has_feature_async_commit(journal) &&
            journal->j_flags & JBD2_BARRIER) {
                blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
        }

        if (err)
                jbd2_journal_abort(journal, err);

        /*
         * Now disk caches for filesystem device are flushed so we are safe to
         * erase checkpointed transactions from the log by updating journal
         * superblock.
         */
        if (update_tail)
                jbd2_update_log_tail(journal, first_tid, first_block);

        /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

        jbd_debug(3, "JBD2: commit phase 6\n");

        J_ASSERT(list_empty(&commit_transaction->t_inode_list));
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_shadow_list == NULL);

restart_loop:
        /*
         * As there are other places (journal_unmap_buffer()) adding buffers
         * to this list we have to be careful and hold the j_list_lock.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;
                int try_to_free = 0;

                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                /*
                 * Get a reference so that bh cannot be freed before we are
                 * done with it.
                 */
                get_bh(bh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

                /*
                 * If there is undo-protected committed data against
                 * this buffer, then we can remove it now.  If it is a
                 * buffer needing such protection, the old frozen_data
                 * field now points to a committed version of the
                 * buffer, so rotate that field to the new committed
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
                 *
                 * We also know that the frozen data has already fired
                 * its triggers if they exist, so we can clear that too.
                 */
                if (jh->b_committed_data) {
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                                jh->b_frozen_triggers = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd2_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                        jh->b_frozen_triggers = NULL;
                }

                spin_lock(&journal->j_list_lock);
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
                        cp_transaction->t_chp_stats.cs_dropped++;
                        __jbd2_journal_remove_checkpoint(jh);
                }

                /* Only re-checkpoint the buffer_head if it is marked
                 * dirty.  If the buffer was added to the BJ_Forget list
                 * by jbd2_journal_forget, it may no longer be dirty and
                 * there's no point in keeping a checkpoint record for
                 * it. */

                /*
                 * A buffer which has been freed while still being journaled
                 * by a previous transaction gets refiled to BJ_Forget of the
                 * running transaction. If the just committed transaction
                 * contains "add to orphan" operation, we can completely
                 * invalidate the buffer now. We are rather thorough in that
                 * since the buffer may still be accessible when blocksize <
                 * pagesize and it is attached to the last partial page.
                 */
                if (buffer_freed(bh) && !jh->b_next_transaction) {
                        struct address_space *mapping;

                        clear_buffer_freed(bh);
                        clear_buffer_jbddirty(bh);

                        /*
                         * Block device buffers need to stay mapped all the
                         * time, so it is enough to clear buffer_jbddirty and
                         * buffer_freed bits. For the file mapping buffers (i.e.
                         * journalled data) we need to unmap the buffer and
                         * clear more bits. We also need to be careful about
                         * the check because the data page mapping can get
                         * cleared under our hands. Note that if mapping ==
                         * NULL, we don't need to make the buffer unmapped
                         * because the page is already detached from the
                         * mapping and buffers cannot get reused.
                         */
                        mapping = READ_ONCE(bh->b_page->mapping);
                        if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
                                clear_buffer_mapped(bh);
                                clear_buffer_new(bh);
                                clear_buffer_req(bh);
                                bh->b_bdev = NULL;
                        }
                }

                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __jbd2_journal_insert_checkpoint(jh, commit_transaction);
                        if (is_journal_aborted(journal))
                                clear_buffer_jbddirty(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
                        /*
                         * A buffer on the BJ_Forget list that is not jbddirty
                         * means it has been freed by this transaction and
                         * hence it could not have been reallocated until this
                         * transaction has committed. *BUT* it could be
                         * reallocated once we have written all the data to
                         * disk and before we process the buffer on BJ_Forget
                         * list.
                         */
                        if (!jh->b_next_transaction)
                                try_to_free = 1;
                }
                JBUFFER_TRACE(jh, "refile or unfile buffer");
                __jbd2_journal_refile_buffer(jh);
                jbd_unlock_bh_state(bh);
                if (try_to_free)
                        release_buffer_page(bh);        /* Drops bh reference */
                else
                        __brelse(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
        /*
         * This is a bit sleazy.  We use j_list_lock to protect transition
         * of a transaction into T_FINISHED state and calling
         * __jbd2_journal_drop_transaction(). Otherwise we could race with
         * other checkpointing code processing the transaction...
         */
        write_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /*
         * Now recheck if some buffers did not get attached to the transaction
         * while the lock was dropped...
         */
        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
                write_unlock(&journal->j_state_lock);
                goto restart_loop;
        }

        /* Add the transaction to the checkpoint list
         * __journal_remove_checkpoint() can not destroy transaction
         * under us because it is not marked as T_FINISHED yet */
        if (journal->j_checkpoint_transactions == NULL) {
                journal->j_checkpoint_transactions = commit_transaction;
                commit_transaction->t_cpnext = commit_transaction;
                commit_transaction->t_cpprev = commit_transaction;
        } else {
                commit_transaction->t_cpnext =
                        journal->j_checkpoint_transactions;
                commit_transaction->t_cpprev =
                        commit_transaction->t_cpnext->t_cpprev;
                commit_transaction->t_cpnext->t_cpprev =
                        commit_transaction;
                commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
        }
        spin_unlock(&journal->j_list_lock);

        /* Done with this transaction! */

        jbd_debug(3, "JBD2: commit phase 7\n");

        J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

        commit_transaction->t_start = jiffies;
        stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
                                              commit_transaction->t_start);

        /*
         * File the transaction statistics
         */
        stats.ts_tid = commit_transaction->t_tid;
        stats.run.rs_handle_count =
                atomic_read(&commit_transaction->t_handle_count);
        trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
                             commit_transaction->t_tid, &stats.run);
        stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;

        commit_transaction->t_state = T_COMMIT_CALLBACK;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

        /*
         * weight the commit time higher than the average time so we don't
         * react too strongly to vast changes in the commit time
         */
        if (likely(journal->j_average_commit_time))
                journal->j_average_commit_time = (commit_time +
                                journal->j_average_commit_time*3) / 4;
        else
                journal->j_average_commit_time = commit_time;

        write_unlock(&journal->j_state_lock);

        if (journal->j_commit_callback)
                journal->j_commit_callback(journal, commit_transaction);

        trace_jbd2_end_commit(journal, commit_transaction);
        jbd_debug(1, "JBD2: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);

        write_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        commit_transaction->t_state = T_FINISHED;
        /* Check if the transaction can be dropped now that we are finished */
        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __jbd2_journal_drop_transaction(journal, commit_transaction);
                jbd2_journal_free_transaction(commit_transaction);
        }
        spin_unlock(&journal->j_list_lock);
        write_unlock(&journal->j_state_lock);
        wake_up(&journal->j_wait_done_commit);

        /*
         * Calculate overall stats
         */
        spin_lock(&journal->j_history_lock);
        journal->j_stats.ts_tid++;
        journal->j_stats.ts_requested += stats.ts_requested;
        journal->j_stats.run.rs_wait += stats.run.rs_wait;
        journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
        journal->j_stats.run.rs_running += stats.run.rs_running;
        journal->j_stats.run.rs_locked += stats.run.rs_locked;
        journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
        journal->j_stats.run.rs_logging += stats.run.rs_logging;
        journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
        journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
        journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
        spin_unlock(&journal->j_history_lock);
}