fs/jbd2/commit.c

   1 // SPDX-License-Identifier: GPL-2.0+
   2 /*
   3  * linux/fs/jbd2/commit.c
   4  *
   5  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   6  *
   7  * Copyright 1998 Red Hat corp --- All Rights Reserved
   8  *
   9  * Journal commit routines for the generic filesystem journaling code;
  10  * part of the ext2fs journaling system.
  11  */
  12
  13 #include <linux/time.h>
  14 #include <linux/fs.h>
  15 #include <linux/jbd2.h>
  16 #include <linux/errno.h>
  17 #include <linux/slab.h>
  18 #include <linux/mm.h>
  19 #include <linux/pagemap.h>
  20 #include <linux/jiffies.h>
  21 #include <linux/crc32.h>
  22 #include <linux/writeback.h>
  23 #include <linux/backing-dev.h>
  24 #include <linux/bio.h>
  25 #include <linux/blkdev.h>
  26 #include <linux/bitops.h>
  27 #include <trace/events/jbd2.h>
  28
  29 /*
  30  * IO end handler for temporary buffer_heads handling writes to the journal.
  31  */
  32 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  33 {
  34         struct buffer_head *orig_bh = bh->b_private;
  35
  36         BUFFER_TRACE(bh, "");
  37         if (uptodate)
  38                 set_buffer_uptodate(bh);
  39         else
  40                 clear_buffer_uptodate(bh);
  41         if (orig_bh) {
  42                 clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
  43                 smp_mb__after_atomic();
  44                 wake_up_bit(&orig_bh->b_state, BH_Shadow);
  45         }
  46         unlock_buffer(bh);
  47 }
  48
  49 /*
  50  * When an ext4 file is truncated, it is possible that some pages are not
  51  * successfully freed, because they are attached to a committing transaction.
  52  * After the transaction commits, these pages are left on the LRU, with no
  53  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  54  * by the VM, but their apparent absence upsets the VM accounting, and it makes
  55  * the numbers in /proc/meminfo look odd.
  56  *
  57  * So here, we have a buffer which has just come off the forget list.  Look to
  58  * see if we can strip all buffers from the backing page.
  59  *
  60  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  61  * caller provided us with a ref against the buffer, and we drop that here.
  62  */
  63 static void release_buffer_page(struct buffer_head *bh)
  64 {
  65         struct page *page;
  66
  67         if (buffer_dirty(bh))
  68                 goto nope;
  69         if (atomic_read(&bh->b_count) != 1)
  70                 goto nope;
  71         page = bh->b_page;
  72         if (!page)
  73                 goto nope;
  74         if (page->mapping)
  75                 goto nope;
  76
  77         /* OK, it's a truncated page */
  78         if (!trylock_page(page))
  79                 goto nope;
  80
  81         get_page(page);
  82         __brelse(bh);
  83         try_to_free_buffers(page);
  84         unlock_page(page);
  85         put_page(page);
  86         return;
  87
  88 nope:
  89         __brelse(bh);
  90 }
  91
  92 static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
  93 {
  94         struct commit_header *h;
  95         __u32 csum;
  96
  97         if (!jbd2_journal_has_csum_v2or3(j))
  98                 return;
  99
 100         h = (struct commit_header *)(bh->b_data);
 101         h->h_chksum_type = 0;
 102         h->h_chksum_size = 0;
 103         h->h_chksum[0] = 0;
 104         csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
 105         h->h_chksum[0] = cpu_to_be32(csum);
 106 }
 107
 108 /*
 109  * Done it all: now submit the commit record.  We should have
 110  * cleaned up our previous buffers by now, so if we are in abort
 111  * mode we can now just skip the rest of the journal write
 112  * entirely.
 113  *
 114  * Returns 1 if the journal needs to be aborted or 0 on success
 115  */
 116 static int journal_submit_commit_record(journal_t *journal,
 117                                         transaction_t *commit_transaction,
 118                                         struct buffer_head **cbh,
 119                                         __u32 crc32_sum)
 120 {
 121         struct commit_header *tmp;
 122         struct buffer_head *bh;
 123         int ret;
 124         struct timespec64 now;
 125
 126         *cbh = NULL;
 127
 128         if (is_journal_aborted(journal))
 129                 return 0;
 130
 131         bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
 132                                                 JBD2_COMMIT_BLOCK);
 133         if (!bh)
 134                 return 1;
 135
 136         tmp = (struct commit_header *)bh->b_data;
 137         ktime_get_coarse_real_ts64(&now);
 138         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
 139         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 140
 141         if (jbd2_has_feature_checksum(journal)) {
 142                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
 143                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
 144                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
 145         }
 146         jbd2_commit_block_csum_set(journal, bh);
 147
 148         BUFFER_TRACE(bh, "submit commit block");
 149         lock_buffer(bh);
 150         clear_buffer_dirty(bh);
 151         set_buffer_uptodate(bh);
 152         bh->b_end_io = journal_end_buffer_io_sync;
 153
 154         if (journal->j_flags & JBD2_BARRIER &&
 155             !jbd2_has_feature_async_commit(journal))
 156                 ret = submit_bh(REQ_OP_WRITE,
 157                         REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
 158         else
 159                 ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
 160
 161         *cbh = bh;
 162         return ret;
 163 }
 164
 165 /*
 166  * This function along with journal_submit_commit_record
 167  * allows to write the commit record asynchronously.
 168  */
 169 static int journal_wait_on_commit_record(journal_t *journal,
 170                                          struct buffer_head *bh)
 171 {
 172         int ret = 0;
 173
 174         clear_buffer_dirty(bh);
 175         wait_on_buffer(bh);
 176
 177         if (unlikely(!buffer_uptodate(bh)))
 178                 ret = -EIO;
 179         put_bh(bh);            /* One for getblk() */
 180
 181         return ret;
 182 }
 183
 184 /*
 185  * write the filemap data using writepage() address_space_operations.
 186  * We don't do block allocation here even for delalloc. We don't
 187  * use writepages() because with dealyed allocation we may be doing
 188  * block allocation in writepages().
 189  */
 190 static int journal_submit_inode_data_buffers(struct address_space *mapping)
 191 {
 192         int ret;
 193         struct writeback_control wbc = {
 194                 .sync_mode =  WB_SYNC_ALL,
 195                 .nr_to_write = mapping->nrpages * 2,
 196                 .range_start = 0,
 197                 .range_end = i_size_read(mapping->host),
 198         };
 199
 200         ret = generic_writepages(mapping, &wbc);
 201         return ret;
 202 }
 203
 204 /*
 205  * Submit all the data buffers of inode associated with the transaction to
 206  * disk.
 207  *
 208  * We are in a committing transaction. Therefore no new inode can be added to
 209  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 210  * operate on from being released while we write out pages.
 211  */
 212 static int journal_submit_data_buffers(journal_t *journal,
 213                 transaction_t *commit_transaction)
 214 {
 215         struct jbd2_inode *jinode;
 216         int err, ret = 0;
 217         struct address_space *mapping;
 218
 219         spin_lock(&journal->j_list_lock);
 220         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 221                 if (!(jinode->i_flags & JI_WRITE_DATA))
 222                         continue;
 223                 mapping = jinode->i_vfs_inode->i_mapping;
 224                 jinode->i_flags |= JI_COMMIT_RUNNING;
 225                 spin_unlock(&journal->j_list_lock);
 226                 /*
 227                  * submit the inode data buffers. We use writepage
 228                  * instead of writepages. Because writepages can do
 229                  * block allocation  with delalloc. We need to write
 230                  * only allocated blocks here.
 231                  */
 232                 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
 233                 err = journal_submit_inode_data_buffers(mapping);
 234                 if (!ret)
 235                         ret = err;
 236                 spin_lock(&journal->j_list_lock);
 237                 J_ASSERT(jinode->i_transaction == commit_transaction);
 238                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
 239                 smp_mb();
 240                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 241         }
 242         spin_unlock(&journal->j_list_lock);
 243         return ret;
 244 }
 245
 246 /*
 247  * Wait for data submitted for writeout, refile inodes to proper
 248  * transaction if needed.
 249  *
 250  */
 251 static int journal_finish_inode_data_buffers(journal_t *journal,
 252                 transaction_t *commit_transaction)
 253 {
 254         struct jbd2_inode *jinode, *next_i;
 255         int err, ret = 0;
 256
 257         /* For locking, see the comment in journal_submit_data_buffers() */
 258         spin_lock(&journal->j_list_lock);
 259         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 260                 if (!(jinode->i_flags & JI_WAIT_DATA))
 261                         continue;
 262                 jinode->i_flags |= JI_COMMIT_RUNNING;
 263                 spin_unlock(&journal->j_list_lock);
 264                 err = filemap_fdatawait_keep_errors(
 265                                 jinode->i_vfs_inode->i_mapping);
 266                 if (!ret)
 267                         ret = err;
 268                 spin_lock(&journal->j_list_lock);
 269                 jinode->i_flags &= ~JI_COMMIT_RUNNING;
 270                 smp_mb();
 271                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 272         }
 273
 274         /* Now refile inode to proper lists */
 275         list_for_each_entry_safe(jinode, next_i,
 276                                  &commit_transaction->t_inode_list, i_list) {
 277                 list_del(&jinode->i_list);
 278                 if (jinode->i_next_transaction) {
 279                         jinode->i_transaction = jinode->i_next_transaction;
 280                         jinode->i_next_transaction = NULL;
 281                         list_add(&jinode->i_list,
 282                                 &jinode->i_transaction->t_inode_list);
 283                 } else {
 284                         jinode->i_transaction = NULL;
 285                 }
 286         }
 287         spin_unlock(&journal->j_list_lock);
 288
 289         return ret;
 290 }
 291
 292 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 293 {
 294         struct page *page = bh->b_page;
 295         char *addr;
 296         __u32 checksum;
 297
 298         addr = kmap_atomic(page);
 299         checksum = crc32_be(crc32_sum,
 300                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 301         kunmap_atomic(addr);
 302
 303         return checksum;
 304 }
 305
 306 static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
 307                                    unsigned long long block)
 308 {
 309         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 310         if (jbd2_has_feature_64bit(j))
 311                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 312 }
 313
 314 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
 315                                     struct buffer_head *bh, __u32 sequence)
 316 {
 317         journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
 318         struct page *page = bh->b_page;
 319         __u8 *addr;
 320         __u32 csum32;
 321         __be32 seq;
 322
 323         if (!jbd2_journal_has_csum_v2or3(j))
 324                 return;
 325
 326         seq = cpu_to_be32(sequence);
 327         addr = kmap_atomic(page);
 328         csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
 329         csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
 330                              bh->b_size);
 331         kunmap_atomic(addr);
 332
 333         if (jbd2_has_feature_csum3(j))
 334                 tag3->t_checksum = cpu_to_be32(csum32);
 335         else
 336                 tag->t_checksum = cpu_to_be16(csum32);
 337 }
 338 /*
 339  * jbd2_journal_commit_transaction
 340  *
 341  * The primary function for committing a transaction to the log.  This
 342  * function is called by the journal thread to begin a complete commit.
 343  */
 344 void jbd2_journal_commit_transaction(journal_t *journal)
 345 {
 346         struct transaction_stats_s stats;
 347         transaction_t *commit_transaction;
 348         struct journal_head *jh;
 349         struct buffer_head *descriptor;
 350         struct buffer_head **wbuf = journal->j_wbuf;
 351         int bufs;
 352         int flags;
 353         int err;
 354         unsigned long long blocknr;
 355         ktime_t start_time;
 356         u64 commit_time;
 357         char *tagp = NULL;
 358         journal_block_tag_t *tag = NULL;
 359         int space_left = 0;
 360         int first_tag = 0;
 361         int tag_flag;
 362         int i;
 363         int tag_bytes = journal_tag_bytes(journal);
 364         struct buffer_head *cbh = NULL; /* For transactional checksums */
 365         __u32 crc32_sum = ~0;
 366         struct blk_plug plug;
 367         /* Tail of the journal */
 368         unsigned long first_block;
 369         tid_t first_tid;
 370         int update_tail;
 371         int csum_size = 0;
 372         LIST_HEAD(io_bufs);
 373         LIST_HEAD(log_bufs);
 374
 375         if (jbd2_journal_has_csum_v2or3(journal))
 376                 csum_size = sizeof(struct jbd2_journal_block_tail);
 377
 378         /*
 379          * First job: lock down the current transaction and wait for
 380          * all outstanding updates to complete.
 381          */
 382
 383         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
 384         if (journal->j_flags & JBD2_FLUSHED) {
 385                 jbd_debug(3, "super block updated\n");
 386                 mutex_lock_io(&journal->j_checkpoint_mutex);
 387                 /*
 388                  * We hold j_checkpoint_mutex so tail cannot change under us.
 389                  * We don't need any special data guarantees for writing sb
 390                  * since journal is empty and it is ok for write to be
 391                  * flushed only with transaction commit.
 392                  */
 393                 jbd2_journal_update_sb_log_tail(journal,
 394                                                 journal->j_tail_sequence,
 395                                                 journal->j_tail,
 396                                                 REQ_SYNC);
 397                 mutex_unlock(&journal->j_checkpoint_mutex);
 398         } else {
 399                 jbd_debug(3, "superblock not updated\n");
 400         }
 401
 402         J_ASSERT(journal->j_running_transaction != NULL);
 403         J_ASSERT(journal->j_committing_transaction == NULL);
 404
 405         commit_transaction = journal->j_running_transaction;
 406
 407         trace_jbd2_start_commit(journal, commit_transaction);
 408         jbd_debug(1, "JBD2: starting commit of transaction %d\n",
 409                         commit_transaction->t_tid);
 410
 411         write_lock(&journal->j_state_lock);
 412         J_ASSERT(commit_transaction->t_state == T_RUNNING);
 413         commit_transaction->t_state = T_LOCKED;
 414
 415         trace_jbd2_commit_locking(journal, commit_transaction);
 416         stats.run.rs_wait = commit_transaction->t_max_wait;
 417         stats.run.rs_request_delay = 0;
 418         stats.run.rs_locked = jiffies;
 419         if (commit_transaction->t_requested)
 420                 stats.run.rs_request_delay =
 421                         jbd2_time_diff(commit_transaction->t_requested,
 422                                        stats.run.rs_locked);
 423         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 424                                               stats.run.rs_locked);
 425
 426         spin_lock(&commit_transaction->t_handle_lock);
 427         while (atomic_read(&commit_transaction->t_updates)) {
 428                 DEFINE_WAIT(wait);
 429
 430                 prepare_to_wait(&journal->j_wait_updates, &wait,
 431                                         TASK_UNINTERRUPTIBLE);
 432                 if (atomic_read(&commit_transaction->t_updates)) {
 433                         spin_unlock(&commit_transaction->t_handle_lock);
 434                         write_unlock(&journal->j_state_lock);
 435                         schedule();
 436                         write_lock(&journal->j_state_lock);
 437                         spin_lock(&commit_transaction->t_handle_lock);
 438                 }
 439                 finish_wait(&journal->j_wait_updates, &wait);
 440         }
 441         spin_unlock(&commit_transaction->t_handle_lock);
 442         commit_transaction->t_state = T_SWITCH;
 443         write_unlock(&journal->j_state_lock);
 444
 445         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
 446                         journal->j_max_transaction_buffers);
 447
 448         /*
 449          * First thing we are allowed to do is to discard any remaining
 450          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 451          * that there are no such buffers: if a large filesystem
 452          * operation like a truncate needs to split itself over multiple
 453          * transactions, then it may try to do a jbd2_journal_restart() while
 454          * there are still BJ_Reserved buffers outstanding.  These must
 455          * be released cleanly from the current transaction.
 456          *
 457          * In this case, the filesystem must still reserve write access
 458          * again before modifying the buffer in the new transaction, but
 459          * we do not require it to remember exactly which old buffers it
 460          * has reserved.  This is consistent with the existing behaviour
 461          * that multiple jbd2_journal_get_write_access() calls to the same
 462          * buffer are perfectly permissible.
 463          */
 464         while (commit_transaction->t_reserved_list) {
 465                 jh = commit_transaction->t_reserved_list;
 466                 JBUFFER_TRACE(jh, "reserved, unused: refile");
 467                 /*
 468                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 469                  * leave undo-committed data.
 470                  */
 471                 if (jh->b_committed_data) {
 472                         struct buffer_head *bh = jh2bh(jh);
 473
 474                         jbd_lock_bh_state(bh);
 475                         jbd2_free(jh->b_committed_data, bh->b_size);
 476                         jh->b_committed_data = NULL;
 477                         jbd_unlock_bh_state(bh);
 478                 }
 479                 jbd2_journal_refile_buffer(journal, jh);
 480         }
 481
 482         /*
 483          * Now try to drop any written-back buffers from the journal's
 484          * checkpoint lists.  We do this *before* commit because it potentially
 485          * frees some memory
 486          */
 487         spin_lock(&journal->j_list_lock);
 488         __jbd2_journal_clean_checkpoint_list(journal, false);
 489         spin_unlock(&journal->j_list_lock);
 490
 491         jbd_debug(3, "JBD2: commit phase 1\n");
 492
 493         /*
 494          * Clear revoked flag to reflect there is no revoked buffers
 495          * in the next transaction which is going to be started.
 496          */
 497         jbd2_clear_buffer_revoked_flags(journal);
 498
 499         /*
 500          * Switch to a new revoke table.
 501          */
 502         jbd2_journal_switch_revoke_table(journal);
 503
 504         /*
 505          * Reserved credits cannot be claimed anymore, free them
 506          */
 507         atomic_sub(atomic_read(&journal->j_reserved_credits),
 508                    &commit_transaction->t_outstanding_credits);
 509
 510         write_lock(&journal->j_state_lock);
 511         trace_jbd2_commit_flushing(journal, commit_transaction);
 512         stats.run.rs_flushing = jiffies;
 513         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
 514                                              stats.run.rs_flushing);
 515
 516         commit_transaction->t_state = T_FLUSH;
 517         journal->j_committing_transaction = commit_transaction;
 518         journal->j_running_transaction = NULL;
 519         start_time = ktime_get();
 520         commit_transaction->t_log_start = journal->j_head;
 521         wake_up(&journal->j_wait_transaction_locked);
 522         write_unlock(&journal->j_state_lock);
 523
 524         jbd_debug(3, "JBD2: commit phase 2a\n");
 525
 526         /*
 527          * Now start flushing things to disk, in the order they appear
 528          * on the transaction lists.  Data blocks go first.
 529          */
 530         err = journal_submit_data_buffers(journal, commit_transaction);
 531         if (err)
 532                 jbd2_journal_abort(journal, err);
 533
 534         blk_start_plug(&plug);
 535         jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
 536
 537         jbd_debug(3, "JBD2: commit phase 2b\n");
 538
 539         /*
 540          * Way to go: we have now written out all of the data for a
 541          * transaction!  Now comes the tricky part: we need to write out
 542          * metadata.  Loop over the transaction's entire buffer list:
 543          */
 544         write_lock(&journal->j_state_lock);
 545         commit_transaction->t_state = T_COMMIT;
 546         write_unlock(&journal->j_state_lock);
 547
 548         trace_jbd2_commit_logging(journal, commit_transaction);
 549         stats.run.rs_logging = jiffies;
 550         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
 551                                                stats.run.rs_logging);
 552         stats.run.rs_blocks =
 553                 atomic_read(&commit_transaction->t_outstanding_credits);
 554         stats.run.rs_blocks_logged = 0;
 555
 556         J_ASSERT(commit_transaction->t_nr_buffers <=
 557                  atomic_read(&commit_transaction->t_outstanding_credits));
 558
 559         err = 0;
 560         bufs = 0;
 561         descriptor = NULL;
 562         while (commit_transaction->t_buffers) {
 563
 564                 /* Find the next buffer to be journaled... */
 565
 566                 jh = commit_transaction->t_buffers;
 567
 568                 /* If we're in abort mode, we just un-journal the buffer and
 569                    release it. */
 570
 571                 if (is_journal_aborted(journal)) {
 572                         clear_buffer_jbddirty(jh2bh(jh));
 573                         JBUFFER_TRACE(jh, "journal is aborting: refile");
 574                         jbd2_buffer_abort_trigger(jh,
 575                                                   jh->b_frozen_data ?
 576                                                   jh->b_frozen_triggers :
 577                                                   jh->b_triggers);
 578                         jbd2_journal_refile_buffer(journal, jh);
 579                         /* If that was the last one, we need to clean up
 580                          * any descriptor buffers which may have been
 581                          * already allocated, even if we are now
 582                          * aborting. */
 583                         if (!commit_transaction->t_buffers)
 584                                 goto start_journal_io;
 585                         continue;
 586                 }
 587
 588                 /* Make sure we have a descriptor block in which to
 589                    record the metadata buffer. */
 590
 591                 if (!descriptor) {
 592                         J_ASSERT (bufs == 0);
 593
 594                         jbd_debug(4, "JBD2: get descriptor\n");
 595
 596                         descriptor = jbd2_journal_get_descriptor_buffer(
 597                                                         commit_transaction,
 598                                                         JBD2_DESCRIPTOR_BLOCK);
 599                         if (!descriptor) {
 600                                 jbd2_journal_abort(journal, -EIO);
 601                                 continue;
 602                         }
 603
 604                         jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
 605                                 (unsigned long long)descriptor->b_blocknr,
 606                                 descriptor->b_data);
 607                         tagp = &descriptor->b_data[sizeof(journal_header_t)];
 608                         space_left = descriptor->b_size -
 609                                                 sizeof(journal_header_t);
 610                         first_tag = 1;
 611                         set_buffer_jwrite(descriptor);
 612                         set_buffer_dirty(descriptor);
 613                         wbuf[bufs++] = descriptor;
 614
 615                         /* Record it so that we can wait for IO
 616                            completion later */
 617                         BUFFER_TRACE(descriptor, "ph3: file as descriptor");
 618                         jbd2_file_log_bh(&log_bufs, descriptor);
 619                 }
 620
 621                 /* Where is the buffer to be written? */
 622
 623                 err = jbd2_journal_next_log_block(journal, &blocknr);
 624                 /* If the block mapping failed, just abandon the buffer
 625                    and repeat this loop: we'll fall into the
 626                    refile-on-abort condition above. */
 627                 if (err) {
 628                         jbd2_journal_abort(journal, err);
 629                         continue;
 630                 }
 631
 632                 /*
 633                  * start_this_handle() uses t_outstanding_credits to determine
 634                  * the free space in the log, but this counter is changed
 635                  * by jbd2_journal_next_log_block() also.
 636                  */
 637                 atomic_dec(&commit_transaction->t_outstanding_credits);
 638
 639                 /* Bump b_count to prevent truncate from stumbling over
 640                    the shadowed buffer!  @@@ This can go if we ever get
 641                    rid of the shadow pairing of buffers. */
 642                 atomic_inc(&jh2bh(jh)->b_count);
 643
 644                 /*
 645                  * Make a temporary IO buffer with which to write it out
 646                  * (this will requeue the metadata buffer to BJ_Shadow).
 647                  */
 648                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 649                 JBUFFER_TRACE(jh, "ph3: write metadata");
 650                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 651                                                 jh, &wbuf[bufs], blocknr);
 652                 if (flags < 0) {
 653                         jbd2_journal_abort(journal, flags);
 654                         continue;
 655                 }
 656                 jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
 657
 658                 /* Record the new block's tag in the current descriptor
 659                    buffer */
 660
 661                 tag_flag = 0;
 662                 if (flags & 1)
 663                         tag_flag |= JBD2_FLAG_ESCAPE;
 664                 if (!first_tag)
 665                         tag_flag |= JBD2_FLAG_SAME_UUID;
 666
 667                 tag = (journal_block_tag_t *) tagp;
 668                 write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
 669                 tag->t_flags = cpu_to_be16(tag_flag);
 670                 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
 671                                         commit_transaction->t_tid);
 672                 tagp += tag_bytes;
 673                 space_left -= tag_bytes;
 674                 bufs++;
 675
 676                 if (first_tag) {
 677                         memcpy (tagp, journal->j_uuid, 16);
 678                         tagp += 16;
 679                         space_left -= 16;
 680                         first_tag = 0;
 681                 }
 682
 683                 /* If there's no more to do, or if the descriptor is full,
 684                    let the IO rip! */
 685
 686                 if (bufs == journal->j_wbufsize ||
 687                     commit_transaction->t_buffers == NULL ||
 688                     space_left < tag_bytes + 16 + csum_size) {
 689
 690                         jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
 691
 692                         /* Write an end-of-descriptor marker before
 693                            submitting the IOs.  "tag" still points to
 694                            the last tag we set up. */
 695
 696                         tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
 697 start_journal_io:
 698                         if (descriptor)
 699                                 jbd2_descriptor_block_csum_set(journal,
 700                                                         descriptor);
 701
 702                         for (i = 0; i < bufs; i++) {
 703                                 struct buffer_head *bh = wbuf[i];
 704                                 /*
 705                                  * Compute checksum.
 706                                  */
 707                                 if (jbd2_has_feature_checksum(journal)) {
 708                                         crc32_sum =
 709                                             jbd2_checksum_data(crc32_sum, bh);
 710                                 }
 711
 712                                 lock_buffer(bh);
 713                                 clear_buffer_dirty(bh);
 714                                 set_buffer_uptodate(bh);
 715                                 bh->b_end_io = journal_end_buffer_io_sync;
 716                                 submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
 717                         }
 718                         cond_resched();
 719                         stats.run.rs_blocks_logged += bufs;
 720
 721                         /* Force a new descriptor to be generated next
 722                            time round the loop. */
 723                         descriptor = NULL;
 724                         bufs = 0;
 725                 }
 726         }
 727
 728         err = journal_finish_inode_data_buffers(journal, commit_transaction);
 729         if (err) {
 730                 printk(KERN_WARNING
 731                         "JBD2: Detected IO errors while flushing file data "
 732                        "on %s\n", journal->j_devname);
 733                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
 734                         jbd2_journal_abort(journal, err);
 735                 err = 0;
 736         }
 737
 738         /*
 739          * Get current oldest transaction in the log before we issue flush
 740          * to the filesystem device. After the flush we can be sure that
 741          * blocks of all older transactions are checkpointed to persistent
 742          * storage and we will be safe to update journal start in the
 743          * superblock with the numbers we get here.
 744          */
 745         update_tail =
 746                 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
 747
 748         write_lock(&journal->j_state_lock);
 749         if (update_tail) {
 750                 long freed = first_block - journal->j_tail;
 751
 752                 if (first_block < journal->j_tail)
 753                         freed += journal->j_last - journal->j_first;
 754                 /* Update tail only if we free significant amount of space */
 755                 if (freed < journal->j_maxlen / 4)
 756                         update_tail = 0;
 757         }
 758         J_ASSERT(commit_transaction->t_state == T_COMMIT);
 759         commit_transaction->t_state = T_COMMIT_DFLUSH;
 760         write_unlock(&journal->j_state_lock);
 761
 762         /*
 763          * If the journal is not located on the file system device,
 764          * then we must flush the file system device before we issue
 765          * the commit record
 766          */
 767         if (commit_transaction->t_need_data_flush &&
 768             (journal->j_fs_dev != journal->j_dev) &&
 769             (journal->j_flags & JBD2_BARRIER))
 770                 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
 771
 772         /* Done it all: now write the commit record asynchronously. */
 773         if (jbd2_has_feature_async_commit(journal)) {
 774                 err = journal_submit_commit_record(journal, commit_transaction,
 775                                                  &cbh, crc32_sum);
 776                 if (err)
 777                         __jbd2_journal_abort_hard(journal);
 778         }
 779
 780         blk_finish_plug(&plug);
 781
 782         /* Lo and behold: we have just managed to send a transaction to
 783            the log.  Before we can commit it, wait for the IO so far to
 784            complete.  Control buffers being written are on the
 785            transaction's t_log_list queue, and metadata buffers are on
 786            the io_bufs list.
 787
 788            Wait for the buffers in reverse order.  That way we are
 789            less likely to be woken up until all IOs have completed, and
 790            so we incur less scheduling load.
 791         */
 792
 793         jbd_debug(3, "JBD2: commit phase 3\n");
 794
 795         while (!list_empty(&io_bufs)) {
 796                 struct buffer_head *bh = list_entry(io_bufs.prev,
 797                                                     struct buffer_head,
 798                                                     b_assoc_buffers);
 799
 800                 wait_on_buffer(bh);
 801                 cond_resched();
 802
 803                 if (unlikely(!buffer_uptodate(bh)))
 804                         err = -EIO;
 805                 jbd2_unfile_log_bh(bh);
 806
 807                 /*
 808                  * The list contains temporary buffer heads created by
 809                  * jbd2_journal_write_metadata_buffer().
 810                  */
 811                 BUFFER_TRACE(bh, "dumping temporary bh");
 812                 __brelse(bh);
 813                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 814                 free_buffer_head(bh);
 815
 816                 /* We also have to refile the corresponding shadowed buffer */
 817                 jh = commit_transaction->t_shadow_list->b_tprev;
 818                 bh = jh2bh(jh);
 819                 clear_buffer_jwrite(bh);
 820                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
 821                 J_ASSERT_BH(bh, !buffer_shadow(bh));
 822
 823                 /* The metadata is now released for reuse, but we need
 824                    to remember it against this transaction so that when
 825                    we finally commit, we can do any checkpointing
 826                    required. */
 827                 JBUFFER_TRACE(jh, "file as BJ_Forget");
 828                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 829                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
 830                 __brelse(bh);
 831         }
 832
 833         J_ASSERT (commit_transaction->t_shadow_list == NULL);
 834
 835         jbd_debug(3, "JBD2: commit phase 4\n");
 836
 837         /* Here we wait for the revoke record and descriptor record buffers */
 838         while (!list_empty(&log_bufs)) {
 839                 struct buffer_head *bh;
 840
 841                 bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
 842                 wait_on_buffer(bh);
 843                 cond_resched();
 844
 845                 if (unlikely(!buffer_uptodate(bh)))
 846                         err = -EIO;
 847
 848                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 849                 clear_buffer_jwrite(bh);
 850                 jbd2_unfile_log_bh(bh);
 851                 __brelse(bh);           /* One for getblk */
 852                 /* AKPM: bforget here */
 853         }
 854
 855         if (err)
 856                 jbd2_journal_abort(journal, err);
 857
 858         jbd_debug(3, "JBD2: commit phase 5\n");
 859         write_lock(&journal->j_state_lock);
 860         J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
 861         commit_transaction->t_state = T_COMMIT_JFLUSH;
 862         write_unlock(&journal->j_state_lock);
 863
 864         if (!jbd2_has_feature_async_commit(journal)) {
 865                 err = journal_submit_commit_record(journal, commit_transaction,
 866                                                 &cbh, crc32_sum);
 867                 if (err)
 868                         __jbd2_journal_abort_hard(journal);
 869         }
 870         if (cbh)
 871                 err = journal_wait_on_commit_record(journal, cbh);
 872         if (jbd2_has_feature_async_commit(journal) &&
 873             journal->j_flags & JBD2_BARRIER) {
 874                 blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
 875         }
 876
 877         if (err)
 878                 jbd2_journal_abort(journal, err);
 879
 880         /*
 881          * Now disk caches for filesystem device are flushed so we are safe to
 882          * erase checkpointed transactions from the log by updating journal
 883          * superblock.
 884          */
 885         if (update_tail)
 886                 jbd2_update_log_tail(journal, first_tid, first_block);
 887
 888         /* End of a transaction!  Finally, we can do checkpoint
 889            processing: any buffers committed as a result of this
 890            transaction can be removed from any checkpoint list it was on
 891            before. */
 892
 893         jbd_debug(3, "JBD2: commit phase 6\n");
 894
 895         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 896         J_ASSERT(commit_transaction->t_buffers == NULL);
 897         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 898         J_ASSERT(commit_transaction->t_shadow_list == NULL);
 899
 900 restart_loop:
 901         /*
 902          * As there are other places (journal_unmap_buffer()) adding buffers
 903          * to this list we have to be careful and hold the j_list_lock.
 904          */
 905         spin_lock(&journal->j_list_lock);
 906         while (commit_transaction->t_forget) {
 907                 transaction_t *cp_transaction;
 908                 struct buffer_head *bh;
 909                 int try_to_free = 0;
 910
 911                 jh = commit_transaction->t_forget;
 912                 spin_unlock(&journal->j_list_lock);
 913                 bh = jh2bh(jh);
 914                 /*
 915                  * Get a reference so that bh cannot be freed before we are
 916                  * done with it.
 917                  */
 918                 get_bh(bh);
 919                 jbd_lock_bh_state(bh);
 920                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
 921
 922                 /*
 923                  * If there is undo-protected committed data against
 924                  * this buffer, then we can remove it now.  If it is a
 925                  * buffer needing such protection, the old frozen_data
 926                  * field now points to a committed version of the
 927                  * buffer, so rotate that field to the new committed
 928                  * data.
 929                  *
 930                  * Otherwise, we can just throw away the frozen data now.
 931                  *
 932                  * We also know that the frozen data has already fired
 933                  * its triggers if they exist, so we can clear that too.
 934                  */
 935                 if (jh->b_committed_data) {
 936                         jbd2_free(jh->b_committed_data, bh->b_size);
 937                         jh->b_committed_data = NULL;
 938                         if (jh->b_frozen_data) {
 939                                 jh->b_committed_data = jh->b_frozen_data;
 940                                 jh->b_frozen_data = NULL;
 941                                 jh->b_frozen_triggers = NULL;
 942                         }
 943                 } else if (jh->b_frozen_data) {
 944                         jbd2_free(jh->b_frozen_data, bh->b_size);
 945                         jh->b_frozen_data = NULL;
 946                         jh->b_frozen_triggers = NULL;
 947                 }
 948
 949                 spin_lock(&journal->j_list_lock);
 950                 cp_transaction = jh->b_cp_transaction;
 951                 if (cp_transaction) {
 952                         JBUFFER_TRACE(jh, "remove from old cp transaction");
 953                         cp_transaction->t_chp_stats.cs_dropped++;
 954                         __jbd2_journal_remove_checkpoint(jh);
 955                 }
 956
 957                 /* Only re-checkpoint the buffer_head if it is marked
 958                  * dirty.  If the buffer was added to the BJ_Forget list
 959                  * by jbd2_journal_forget, it may no longer be dirty and
 960                  * there's no point in keeping a checkpoint record for
 961                  * it. */
 962
 963                 /*
 964                 * A buffer which has been freed while still being journaled by
 965                 * a previous transaction.
 966                 */
 967                 if (buffer_freed(bh)) {
 968                         /*
 969                          * If the running transaction is the one containing
 970                          * "add to orphan" operation (b_next_transaction !=
 971                          * NULL), we have to wait for that transaction to
 972                          * commit before we can really get rid of the buffer.
 973                          * So just clear b_modified to not confuse transaction
 974                          * credit accounting and refile the buffer to
 975                          * BJ_Forget of the running transaction. If the just
 976                          * committed transaction contains "add to orphan"
 977                          * operation, we can completely invalidate the buffer
 978                          * now. We are rather thorough in that since the
 979                          * buffer may be still accessible when blocksize <
 980                          * pagesize and it is attached to the last partial
 981                          * page.
 982                          */
 983                         jh->b_modified = 0;
 984                         if (!jh->b_next_transaction) {
 985                                 clear_buffer_freed(bh);
 986                                 clear_buffer_jbddirty(bh);
 987                                 clear_buffer_mapped(bh);
 988                                 clear_buffer_new(bh);
 989                                 clear_buffer_req(bh);
 990                                 bh->b_bdev = NULL;
 991                         }
 992                 }
 993
 994                 if (buffer_jbddirty(bh)) {
 995                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
 996                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
 997                         if (is_journal_aborted(journal))
 998                                 clear_buffer_jbddirty(bh);
 999                 } else {
1000                         J_ASSERT_BH(bh, !buffer_dirty(bh));
1001                         /*
1002                          * The buffer on BJ_Forget list and not jbddirty means
1003                          * it has been freed by this transaction and hence it
1004                          * could not have been reallocated until this
1005                          * transaction has committed. *BUT* it could be
1006                          * reallocated once we have written all the data to
1007                          * disk and before we process the buffer on BJ_Forget
1008                          * list.
1009                          */
1010                         if (!jh->b_next_transaction)
1011                                 try_to_free = 1;
1012                 }
1013                 JBUFFER_TRACE(jh, "refile or unfile buffer");
1014                 __jbd2_journal_refile_buffer(jh);
1015                 jbd_unlock_bh_state(bh);
1016                 if (try_to_free)
1017                         release_buffer_page(bh);        /* Drops bh reference */
1018                 else
1019                         __brelse(bh);
1020                 cond_resched_lock(&journal->j_list_lock);
1021         }
1022         spin_unlock(&journal->j_list_lock);
1023         /*
1024          * This is a bit sleazy.  We use j_list_lock to protect transition
1025          * of a transaction into T_FINISHED state and calling
1026          * __jbd2_journal_drop_transaction(). Otherwise we could race with
1027          * other checkpointing code processing the transaction...
1028          */
1029         write_lock(&journal->j_state_lock);
1030         spin_lock(&journal->j_list_lock);
1031         /*
1032          * Now recheck if some buffers did not get attached to the transaction
1033          * while the lock was dropped...
1034          */
1035         if (commit_transaction->t_forget) {
1036                 spin_unlock(&journal->j_list_lock);
1037                 write_unlock(&journal->j_state_lock);
1038                 goto restart_loop;
1039         }
1040
1041         /* Add the transaction to the checkpoint list.
1042          * __jbd2_journal_remove_checkpoint() cannot destroy the transaction
1043          * under us because it is not marked as T_FINISHED yet. */
1044         if (journal->j_checkpoint_transactions == NULL) {
1045                 journal->j_checkpoint_transactions = commit_transaction;
1046                 commit_transaction->t_cpnext = commit_transaction;
1047                 commit_transaction->t_cpprev = commit_transaction;
1048         } else {
1049                 commit_transaction->t_cpnext =
1050                         journal->j_checkpoint_transactions;
1051                 commit_transaction->t_cpprev =
1052                         commit_transaction->t_cpnext->t_cpprev;
1053                 commit_transaction->t_cpnext->t_cpprev =
1054                         commit_transaction;
1055                 commit_transaction->t_cpprev->t_cpnext =
1056                                 commit_transaction;
1057         }
1058         spin_unlock(&journal->j_list_lock);
1059
1060         /* Done with this transaction! */
1061
1062         jbd_debug(3, "JBD2: commit phase 7\n");
1063
1064         J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1065
1066         commit_transaction->t_start = jiffies;
1067         stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1068                                               commit_transaction->t_start);
1069
1070         /*
1071          * File the transaction statistics
1072          */
1073         stats.ts_tid = commit_transaction->t_tid;
1074         stats.run.rs_handle_count =
1075                 atomic_read(&commit_transaction->t_handle_count);
1076         trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1077                              commit_transaction->t_tid, &stats.run);
1078         stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1079
1080         commit_transaction->t_state = T_COMMIT_CALLBACK;
1081         J_ASSERT(commit_transaction == journal->j_committing_transaction);
1082         journal->j_commit_sequence = commit_transaction->t_tid;
1083         journal->j_committing_transaction = NULL;
1084         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1085
1086         /*
1087          * weight the commit time higher than the average time so we don't
1088          * react too strongly to vast changes in the commit time
1089          */
1090         if (likely(journal->j_average_commit_time))
1091                 journal->j_average_commit_time = (commit_time +
1092                                 journal->j_average_commit_time*3) / 4;
1093         else
1094                 journal->j_average_commit_time = commit_time;
1095
1096         write_unlock(&journal->j_state_lock);
1097
1098         if (journal->j_commit_callback)
1099                 journal->j_commit_callback(journal, commit_transaction);
1100
1101         trace_jbd2_end_commit(journal, commit_transaction);
1102         jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1103                   journal->j_commit_sequence, journal->j_tail_sequence);
1104
1105         write_lock(&journal->j_state_lock);
1106         spin_lock(&journal->j_list_lock);
1107         commit_transaction->t_state = T_FINISHED;
1108         /* Check if the transaction can be dropped now that we are finished */
1109         if (commit_transaction->t_checkpoint_list == NULL &&
1110             commit_transaction->t_checkpoint_io_list == NULL) {
1111                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1112                 jbd2_journal_free_transaction(commit_transaction);
1113         }
1114         spin_unlock(&journal->j_list_lock);
1115         write_unlock(&journal->j_state_lock);
1116         wake_up(&journal->j_wait_done_commit);
1117
1118         /*
1119          * Calculate overall stats
1120          */
1121         spin_lock(&journal->j_history_lock);
1122         journal->j_stats.ts_tid++;
1123         journal->j_stats.ts_requested += stats.ts_requested;
1124         journal->j_stats.run.rs_wait += stats.run.rs_wait;
1125         journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1126         journal->j_stats.run.rs_running += stats.run.rs_running;
1127         journal->j_stats.run.rs_locked += stats.run.rs_locked;
1128         journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1129         journal->j_stats.run.rs_logging += stats.run.rs_logging;
1130         journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1131         journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1132         journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1133         spin_unlock(&journal->j_history_lock);
1134 }