From: Sasha Levin Date: Mon, 2 Aug 2021 22:03:22 +0000 (-0400) Subject: Fixes for 5.4 X-Git-Tag: v4.4.278~15 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=80a86cca18f0e3a14afa36dc94b862585a4b741c;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 5.4 Signed-off-by: Sasha Levin --- diff --git a/queue-5.4/btrfs-delete-duplicated-words-other-fixes-in-comment.patch b/queue-5.4/btrfs-delete-duplicated-words-other-fixes-in-comment.patch new file mode 100644 index 00000000000..1a425635d1d --- /dev/null +++ b/queue-5.4/btrfs-delete-duplicated-words-other-fixes-in-comment.patch @@ -0,0 +1,131 @@ +From 7b436b4e8de8b8f0b89199957d21ebd79364ef2d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 4 Aug 2020 19:48:34 -0700 +Subject: btrfs: delete duplicated words + other fixes in comments + +From: Randy Dunlap + +[ Upstream commit 260db43cd2f556677f6ae818ba09f997eed81004 ] + +Delete repeated words in fs/btrfs/. +{to, the, a, and old} +and change "into 2 part" to "into 2 parts". + +Reviewed-by: Nikolay Borisov +Signed-off-by: Randy Dunlap +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/block-group.c | 2 +- + fs/btrfs/ctree.c | 2 +- + fs/btrfs/disk-io.c | 2 +- + fs/btrfs/extent_io.c | 2 +- + fs/btrfs/free-space-cache.c | 2 +- + fs/btrfs/qgroup.c | 2 +- + fs/btrfs/tree-log.c | 4 ++-- + 7 files changed, 8 insertions(+), 8 deletions(-) + +diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c +index a352c1704042..e98d6ea35ea8 100644 +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -2637,7 +2637,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) + * finished yet (no block group item in the extent tree + * yet, etc). If this is the case, wait for all free + * space endio workers to finish and retry. This is a +- * a very rare case so no need for a more efficient and ++ * very rare case so no need for a more efficient and + * complex approach. + */ + if (ret == -ENOENT) { +diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c +index ab69e3563b12..dac30b00d14b 100644 +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -5232,7 +5232,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + slot--; + /* + * check this node pointer against the min_trans parameters. +- * If it is too old, old, skip to the next one. ++ * If it is too old, skip to the next one. + */ + while (slot < nritems) { + u64 gen; +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index e6aa94a583e9..1d28333bb798 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -2816,7 +2816,7 @@ int open_ctree(struct super_block *sb, + } + + /* +- * Verify the type first, if that or the the checksum value are ++ * Verify the type first, if that or the checksum value are + * corrupted, we'll find out + */ + csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data); +diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c +index eca3abc1a7cd..9108a73423f7 100644 +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -3152,7 +3152,7 @@ static int __do_readpage(struct extent_io_tree *tree, + + /* + * If we have a file range that points to a compressed extent +- * and it's followed by a consecutive file range that points to ++ * and it's followed by a consecutive file range that points + * to the same compressed extent (possibly with a different + * offset and/or length, so it either points to the whole extent + * or only part of it), we must make sure we do not submit a +diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c +index 23f59d463e24..d2d32fed8f2e 100644 +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -1339,7 +1339,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, + + /* + * at this point the pages are under IO and we're happy, +- * The caller is responsible for waiting on them and updating the ++ * The caller is responsible for waiting on them and updating + * the cache and the inode + */ + io_ctl->entries = entries; +diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c +index cd8e81c02f63..837bd5e29c8a 100644 +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -2262,7 +2262,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, + * Update qgroup rfer/excl counters. + * Rfer update is easy, codes can explain themselves. + * +- * Excl update is tricky, the update is split into 2 part. ++ * Excl update is tricky, the update is split into 2 parts. + * Part 1: Possible exclusive <-> sharing detect: + * | A | !A | + * ------------------------------------- +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index afc6731bb692..dcbdd0ebea83 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -4923,7 +4923,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, + * Check the inode's logged_trans only instead of + * btrfs_inode_in_log(). This is because the last_log_commit of + * the inode is not updated when we only log that it exists and +- * and it has the full sync bit set (see btrfs_log_inode()). ++ * it has the full sync bit set (see btrfs_log_inode()). + */ + if (BTRFS_I(inode)->logged_trans == trans->transid) { + spin_unlock(&BTRFS_I(inode)->lock); +@@ -6426,7 +6426,7 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, + * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT + * otherwise. + * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to +- * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, ++ * sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, + * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be + * committed (without attempting to sync the log). + */ +-- +2.30.2 + diff --git a/queue-5.4/btrfs-do-not-commit-logs-and-transactions-during-lin.patch b/queue-5.4/btrfs-do-not-commit-logs-and-transactions-during-lin.patch new file mode 100644 index 00000000000..49e17d820ca --- /dev/null +++ b/queue-5.4/btrfs-do-not-commit-logs-and-transactions-during-lin.patch @@ -0,0 +1,492 @@ +From aa2ae7be22586b1cb4ef74bd325ec887ba5d1252 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 11 Aug 2020 12:43:48 +0100 +Subject: btrfs: do not commit logs and transactions during link and rename + operations + +From: Filipe Manana + +[ Upstream commit 75b463d2b47aef96fe1dc3e0237629963034764b ] + +Since commit d4682ba03ef618 ("Btrfs: sync log after logging new name") we +started to commit logs, and fallback to transaction commits when we failed +to log the new names or commit the logs, after link and rename operations +when the target inodes (or their parents) were previously logged in the +current transaction. This was to avoid losing directories despite an +explicit fsync on them when they are ancestors of some inode that got a +new named logged, due to a link or rename operation. However that adds the +cost of starting IO and waiting for it to complete, which can cause higher +latencies for applications. + +Instead of doing that, just make sure that when we log a new name for an +inode we don't mark any of its ancestors as logged, so that if any one +does an fsync against any of them, without doing any other change on them, +the fsync commits the log. This way we only pay the cost of a log commit +(or a transaction commit if something goes wrong or a new block group was +created) if the application explicitly asks to fsync any of the parent +directories. + +Using dbench, which mixes several filesystems operations including renames, +revealed some significant latency gains. The following script that uses +dbench was used to test this: + + #!/bin/bash + + DEV=/dev/nvme0n1 + MNT=/mnt/btrfs + MOUNT_OPTIONS="-o ssd -o space_cache=v2" + MKFS_OPTIONS="-m single -d single" + THREADS=16 + + echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor + mkfs.btrfs -f $MKFS_OPTIONS $DEV + mount $MOUNT_OPTIONS $DEV $MNT + + dbench -t 300 -D $MNT $THREADS + + umount $MNT + +The test was run on bare metal, no virtualization, on a box with 12 cores +(Intel i7-8700), 64Gb of RAM and using a NVMe device, with a kernel +configuration that is the default of typical distributions (debian in this +case), without debug options enabled (kasan, kmemleak, slub debug, debug +of page allocations, lock debugging, etc). + +Results before this patch: + + Operation Count AvgLat MaxLat + ---------------------------------------- + NTCreateX 10750455 0.011 155.088 + Close 7896674 0.001 0.243 + Rename 455222 2.158 1101.947 + Unlink 2171189 0.067 121.638 + Deltree 256 2.425 7.816 + Mkdir 128 0.002 0.003 + Qpathinfo 9744323 0.006 21.370 + Qfileinfo 1707092 0.001 0.146 + Qfsinfo 1786756 0.001 11.228 + Sfileinfo 875612 0.003 21.263 + Find 3767281 0.025 9.617 + WriteX 5356924 0.011 211.390 + ReadX 16852694 0.003 9.442 + LockX 35008 0.002 0.119 + UnlockX 35008 0.001 0.138 + Flush 753458 4.252 1102.249 + +Throughput 1128.35 MB/sec 16 clients 16 procs max_latency=1102.255 ms + +Results after this patch: + +16 clients, after + + Operation Count AvgLat MaxLat + ---------------------------------------- + NTCreateX 11471098 0.012 448.281 + Close 8426396 0.001 0.925 + Rename 485746 0.123 267.183 + Unlink 2316477 0.080 63.433 + Deltree 288 2.830 11.144 + Mkdir 144 0.003 0.010 + Qpathinfo 10397420 0.006 10.288 + Qfileinfo 1822039 0.001 0.169 + Qfsinfo 1906497 0.002 14.039 + Sfileinfo 934433 0.004 2.438 + Find 4019879 0.026 10.200 + WriteX 5718932 0.011 200.985 + ReadX 17981671 0.003 10.036 + LockX 37352 0.002 0.076 + UnlockX 37352 0.001 0.109 + Flush 804018 5.015 778.033 + +Throughput 1201.98 MB/sec 16 clients 16 procs max_latency=778.036 ms +(+6.5% throughput, -29.4% max latency, -75.8% rename latency) + +Test case generic/498 from fstests tests the scenario that the previously +mentioned commit fixed. + +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/inode.c | 115 +++++--------------------------------------- + fs/btrfs/tree-log.c | 100 +++++++++++++++++--------------------- + fs/btrfs/tree-log.h | 14 ++---- + 3 files changed, 60 insertions(+), 169 deletions(-) + +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 025b02e9799f..8959d011aafa 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -6992,7 +6992,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + drop_inode = 1; + } else { + struct dentry *parent = dentry->d_parent; +- int ret; + + err = btrfs_update_inode(trans, root, inode); + if (err) +@@ -7007,12 +7006,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + goto fail; + } + d_instantiate(dentry, inode); +- ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, +- true, NULL); +- if (ret == BTRFS_NEED_TRANS_COMMIT) { +- err = btrfs_commit_transaction(trans); +- trans = NULL; +- } ++ btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); + } + + fail: +@@ -9699,27 +9693,19 @@ static int btrfs_rename_exchange(struct inode *old_dir, + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec64 ctime = current_time(old_inode); +- struct dentry *parent; + u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); + u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); + u64 old_idx = 0; + u64 new_idx = 0; + int ret; ++ int ret2; + bool root_log_pinned = false; + bool dest_log_pinned = false; +- struct btrfs_log_ctx ctx_root; +- struct btrfs_log_ctx ctx_dest; +- bool sync_log_root = false; +- bool sync_log_dest = false; +- bool commit_transaction = false; + + /* we only allow rename subvolume link between subvolumes */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + return -EXDEV; + +- btrfs_init_log_ctx(&ctx_root, old_inode); +- btrfs_init_log_ctx(&ctx_dest, new_inode); +- + /* close the race window with snapshot create/destroy ioctl */ + if (old_ino == BTRFS_FIRST_FREE_OBJECTID || + new_ino == BTRFS_FIRST_FREE_OBJECTID) +@@ -9861,30 +9847,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, + BTRFS_I(new_inode)->dir_index = new_idx; + + if (root_log_pinned) { +- parent = new_dentry->d_parent; +- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), +- BTRFS_I(old_dir), parent, +- false, &ctx_root); +- if (ret == BTRFS_NEED_LOG_SYNC) +- sync_log_root = true; +- else if (ret == BTRFS_NEED_TRANS_COMMIT) +- commit_transaction = true; +- ret = 0; ++ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), ++ new_dentry->d_parent); + btrfs_end_log_trans(root); + root_log_pinned = false; + } + if (dest_log_pinned) { +- if (!commit_transaction) { +- parent = old_dentry->d_parent; +- ret = btrfs_log_new_name(trans, BTRFS_I(new_inode), +- BTRFS_I(new_dir), parent, +- false, &ctx_dest); +- if (ret == BTRFS_NEED_LOG_SYNC) +- sync_log_dest = true; +- else if (ret == BTRFS_NEED_TRANS_COMMIT) +- commit_transaction = true; +- ret = 0; +- } ++ btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), ++ old_dentry->d_parent); + btrfs_end_log_trans(dest); + dest_log_pinned = false; + } +@@ -9917,46 +9887,13 @@ static int btrfs_rename_exchange(struct inode *old_dir, + dest_log_pinned = false; + } + } +- if (!ret && sync_log_root && !commit_transaction) { +- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, +- &ctx_root); +- if (ret) +- commit_transaction = true; +- } +- if (!ret && sync_log_dest && !commit_transaction) { +- ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root, +- &ctx_dest); +- if (ret) +- commit_transaction = true; +- } +- if (commit_transaction) { +- /* +- * We may have set commit_transaction when logging the new name +- * in the destination root, in which case we left the source +- * root context in the list of log contextes. So make sure we +- * remove it to avoid invalid memory accesses, since the context +- * was allocated in our stack frame. +- */ +- if (sync_log_root) { +- mutex_lock(&root->log_mutex); +- list_del_init(&ctx_root.list); +- mutex_unlock(&root->log_mutex); +- } +- ret = btrfs_commit_transaction(trans); +- } else { +- int ret2; +- +- ret2 = btrfs_end_transaction(trans); +- ret = ret ? ret : ret2; +- } ++ ret2 = btrfs_end_transaction(trans); ++ ret = ret ? ret : ret2; + out_notrans: + if (new_ino == BTRFS_FIRST_FREE_OBJECTID || + old_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&fs_info->subvol_sem); + +- ASSERT(list_empty(&ctx_root.list)); +- ASSERT(list_empty(&ctx_dest.list)); +- + return ret; + } + +@@ -10024,11 +9961,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *old_inode = d_inode(old_dentry); + u64 index = 0; + int ret; ++ int ret2; + u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); + bool log_pinned = false; +- struct btrfs_log_ctx ctx; +- bool sync_log = false; +- bool commit_transaction = false; + + if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return -EPERM; +@@ -10178,17 +10113,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + BTRFS_I(old_inode)->dir_index = index; + + if (log_pinned) { +- struct dentry *parent = new_dentry->d_parent; +- +- btrfs_init_log_ctx(&ctx, old_inode); +- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), +- BTRFS_I(old_dir), parent, +- false, &ctx); +- if (ret == BTRFS_NEED_LOG_SYNC) +- sync_log = true; +- else if (ret == BTRFS_NEED_TRANS_COMMIT) +- commit_transaction = true; +- ret = 0; ++ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), ++ new_dentry->d_parent); + btrfs_end_log_trans(root); + log_pinned = false; + } +@@ -10225,23 +10151,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + btrfs_end_log_trans(root); + log_pinned = false; + } +- if (!ret && sync_log) { +- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx); +- if (ret) +- commit_transaction = true; +- } else if (sync_log) { +- mutex_lock(&root->log_mutex); +- list_del(&ctx.list); +- mutex_unlock(&root->log_mutex); +- } +- if (commit_transaction) { +- ret = btrfs_commit_transaction(trans); +- } else { +- int ret2; +- +- ret2 = btrfs_end_transaction(trans); +- ret = ret ? ret : ret2; +- } ++ ret2 = btrfs_end_transaction(trans); ++ ret = ret ? ret : ret2; + out_notrans: + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + up_read(&fs_info->subvol_sem); +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index dcbdd0ebea83..53607156b008 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -174,7 +174,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, + + atomic_inc(&root->log_batch); + atomic_inc(&root->log_writers); +- if (ctx) { ++ if (ctx && !ctx->logging_new_name) { + int index = root->log_transid % 2; + list_add_tail(&ctx->list, &root->log_ctxs[index]); + ctx->log_transid = root->log_transid; +@@ -5379,19 +5379,34 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, + } + + /* +- * Don't update last_log_commit if we logged that an inode exists after +- * it was loaded to memory (full_sync bit set). +- * This is to prevent data loss when we do a write to the inode, then +- * the inode gets evicted after all delalloc was flushed, then we log +- * it exists (due to a rename for example) and then fsync it. This last +- * fsync would do nothing (not logging the extents previously written). ++ * If we are logging that an ancestor inode exists as part of logging a ++ * new name from a link or rename operation, don't mark the inode as ++ * logged - otherwise if an explicit fsync is made against an ancestor, ++ * the fsync considers the inode in the log and doesn't sync the log, ++ * resulting in the ancestor missing after a power failure unless the ++ * log was synced as part of an fsync against any other unrelated inode. ++ * So keep it simple for this case and just don't flag the ancestors as ++ * logged. + */ +- spin_lock(&inode->lock); +- inode->logged_trans = trans->transid; +- if (inode_only != LOG_INODE_EXISTS || +- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) +- inode->last_log_commit = inode->last_sub_trans; +- spin_unlock(&inode->lock); ++ if (!ctx || ++ !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && ++ &inode->vfs_inode != ctx->inode)) { ++ spin_lock(&inode->lock); ++ inode->logged_trans = trans->transid; ++ /* ++ * Don't update last_log_commit if we logged that an inode exists ++ * after it was loaded to memory (full_sync bit set). ++ * This is to prevent data loss when we do a write to the inode, ++ * then the inode gets evicted after all delalloc was flushed, ++ * then we log it exists (due to a rename for example) and then ++ * fsync it. This last fsync would do nothing (not logging the ++ * extents previously written). ++ */ ++ if (inode_only != LOG_INODE_EXISTS || ++ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) ++ inode->last_log_commit = inode->last_sub_trans; ++ spin_unlock(&inode->lock); ++ } + out_unlock: + mutex_unlock(&inode->log_mutex); + +@@ -6417,26 +6432,13 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, + /* + * Call this after adding a new name for a file and it will properly + * update the log to reflect the new name. +- * +- * @ctx can not be NULL when @sync_log is false, and should be NULL when it's +- * true (because it's not used). +- * +- * Return value depends on whether @sync_log is true or false. +- * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be +- * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT +- * otherwise. +- * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to +- * sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, +- * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be +- * committed (without attempting to sync the log). + */ +-int btrfs_log_new_name(struct btrfs_trans_handle *trans, ++void btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, struct btrfs_inode *old_dir, +- struct dentry *parent, +- bool sync_log, struct btrfs_log_ctx *ctx) ++ struct dentry *parent) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +- int ret; ++ struct btrfs_log_ctx ctx; + + /* + * this will force the logging code to walk the dentry chain +@@ -6451,34 +6453,18 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, + */ + if (inode->logged_trans <= fs_info->last_trans_committed && + (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) +- return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT : +- BTRFS_DONT_NEED_LOG_SYNC; +- +- if (sync_log) { +- struct btrfs_log_ctx ctx2; +- +- btrfs_init_log_ctx(&ctx2, &inode->vfs_inode); +- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, +- LOG_INODE_EXISTS, &ctx2); +- if (ret == BTRFS_NO_LOG_SYNC) +- return BTRFS_DONT_NEED_TRANS_COMMIT; +- else if (ret) +- return BTRFS_NEED_TRANS_COMMIT; +- +- ret = btrfs_sync_log(trans, inode->root, &ctx2); +- if (ret) +- return BTRFS_NEED_TRANS_COMMIT; +- return BTRFS_DONT_NEED_TRANS_COMMIT; +- } +- +- ASSERT(ctx); +- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, +- LOG_INODE_EXISTS, ctx); +- if (ret == BTRFS_NO_LOG_SYNC) +- return BTRFS_DONT_NEED_LOG_SYNC; +- else if (ret) +- return BTRFS_NEED_TRANS_COMMIT; ++ return; + +- return BTRFS_NEED_LOG_SYNC; ++ btrfs_init_log_ctx(&ctx, &inode->vfs_inode); ++ ctx.logging_new_name = true; ++ /* ++ * We don't care about the return value. If we fail to log the new name ++ * then we know the next attempt to sync the log will fallback to a full ++ * transaction commit (due to a call to btrfs_set_log_full_commit()), so ++ * we don't need to worry about getting a log committed that has an ++ * inconsistent state after a rename operation. ++ */ ++ btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, ++ LOG_INODE_EXISTS, &ctx); + } + +diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h +index 132e43d29034..ddfc6789d9bf 100644 +--- a/fs/btrfs/tree-log.h ++++ b/fs/btrfs/tree-log.h +@@ -16,6 +16,7 @@ struct btrfs_log_ctx { + int log_ret; + int log_transid; + bool log_new_dentries; ++ bool logging_new_name; + struct inode *inode; + struct list_head list; + }; +@@ -26,6 +27,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, + ctx->log_ret = 0; + ctx->log_transid = 0; + ctx->log_new_dentries = false; ++ ctx->logging_new_name = false; + ctx->inode = inode; + INIT_LIST_HEAD(&ctx->list); + } +@@ -67,16 +69,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, + int for_rename); + void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir); +-/* Return values for btrfs_log_new_name() */ +-enum { +- BTRFS_DONT_NEED_TRANS_COMMIT, +- BTRFS_NEED_TRANS_COMMIT, +- BTRFS_DONT_NEED_LOG_SYNC, +- BTRFS_NEED_LOG_SYNC, +-}; +-int btrfs_log_new_name(struct btrfs_trans_handle *trans, ++void btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, struct btrfs_inode *old_dir, +- struct dentry *parent, +- bool sync_log, struct btrfs_log_ctx *ctx); ++ struct dentry *parent); + + #endif +-- +2.30.2 + diff --git a/queue-5.4/btrfs-fix-lost-inode-on-log-replay-after-mix-of-fsyn.patch b/queue-5.4/btrfs-fix-lost-inode-on-log-replay-after-mix-of-fsyn.patch new file mode 100644 index 00000000000..5e2b26ab6a4 --- /dev/null +++ b/queue-5.4/btrfs-fix-lost-inode-on-log-replay-after-mix-of-fsyn.patch @@ -0,0 +1,106 @@ +From e85781ce25731c67341997be8e4a90bc3cfdc1cd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 27 Jul 2021 11:24:43 +0100 +Subject: btrfs: fix lost inode on log replay after mix of fsync, rename and + inode eviction + +From: Filipe Manana + +[ Upstream commit ecc64fab7d49c678e70bd4c35fe64d2ab3e3d212 ] + +When checking if we need to log the new name of a renamed inode, we are +checking if the inode and its parent inode have been logged before, and if +not we don't log the new name. The check however is buggy, as it directly +compares the logged_trans field of the inodes versus the ID of the current +transaction. The problem is that logged_trans is a transient field, only +stored in memory and never persisted in the inode item, so if an inode +was logged before, evicted and reloaded, its logged_trans field is set to +a value of 0, meaning the check will return false and the new name of the +renamed inode is not logged. If the old parent directory was previously +fsynced and we deleted the logged directory entries corresponding to the +old name, we end up with a log that when replayed will delete the renamed +inode. + +The following example triggers the problem: + + $ mkfs.btrfs -f /dev/sdc + $ mount /dev/sdc /mnt + + $ mkdir /mnt/A + $ mkdir /mnt/B + $ echo -n "hello world" > /mnt/A/foo + + $ sync + + # Add some new file to A and fsync directory A. + $ touch /mnt/A/bar + $ xfs_io -c "fsync" /mnt/A + + # Now trigger inode eviction. We are only interested in triggering + # eviction for the inode of directory A. + $ echo 2 > /proc/sys/vm/drop_caches + + # Move foo from directory A to directory B. + # This deletes the directory entries for foo in A from the log, and + # does not add the new name for foo in directory B to the log, because + # logged_trans of A is 0, which is less than the current transaction ID. + $ mv /mnt/A/foo /mnt/B/foo + + # Now make an fsync to anything except A, B or any file inside them, + # like for example create a file at the root directory and fsync this + # new file. This syncs the log that contains all the changes done by + # previous rename operation. + $ touch /mnt/baz + $ xfs_io -c "fsync" /mnt/baz + + + + # Mount the filesystem and replay the log. + $ mount /dev/sdc /mnt + + # Check the filesystem content. + $ ls -1R /mnt + /mnt/: + A + B + baz + + /mnt/A: + bar + + /mnt/B: + $ + + # File foo is gone, it's neither in A/ nor in B/. + +Fix this by using the inode_logged() helper at btrfs_log_new_name(), which +safely checks if an inode was logged before in the current transaction. + +A test case for fstests will follow soon. + +CC: stable@vger.kernel.org # 4.14+ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/tree-log.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 9912b5231bcf..5412361d0c27 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -6450,8 +6450,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, + * if this inode hasn't been logged and directory we're renaming it + * from hasn't been logged, we don't need to log it + */ +- if (inode->logged_trans < trans->transid && +- (!old_dir || old_dir->logged_trans < trans->transid)) ++ if (!inode_logged(trans, inode) && ++ (!old_dir || !inode_logged(trans, old_dir))) + return; + + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); +-- +2.30.2 + diff --git a/queue-5.4/btrfs-fix-race-causing-unnecessary-inode-logging-dur.patch b/queue-5.4/btrfs-fix-race-causing-unnecessary-inode-logging-dur.patch new file mode 100644 index 00000000000..b5aebbced28 --- /dev/null +++ b/queue-5.4/btrfs-fix-race-causing-unnecessary-inode-logging-dur.patch @@ -0,0 +1,93 @@ +From f795577dcfd1a34ef1486eb80cbf005137a68199 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 25 Nov 2020 12:19:23 +0000 +Subject: btrfs: fix race causing unnecessary inode logging during link and + rename + +From: Filipe Manana + +[ Upstream commit de53d892e5c51dfa0a158e812575a75a6c991f39 ] + +When we are doing a rename or a link operation for an inode that was logged +in the previous transaction and that transaction is still committing, we +have a time window where we incorrectly consider that the inode was logged +previously in the current transaction and therefore decide to log it to +update it in the log. The following steps give an example on how this +happens during a link operation: + +1) Inode X is logged in transaction 1000, so its logged_trans field is set + to 1000; + +2) Task A starts to commit transaction 1000; + +3) The state of transaction 1000 is changed to TRANS_STATE_UNBLOCKED; + +4) Task B starts a link operation for inode X, and as a consequence it + starts transaction 1001; + +5) Task A is still committing transaction 1000, therefore the value stored + at fs_info->last_trans_committed is still 999; + +6) Task B calls btrfs_log_new_name(), it reads a value of 999 from + fs_info->last_trans_committed and because the logged_trans field of + inode X has a value of 1000, the function does not return immediately, + instead it proceeds to logging the inode, which should not happen + because the inode was logged in the previous transaction (1000) and + not in the current one (1001). + +This is not a functional problem, just wasted time and space logging an +inode that does not need to be logged, contributing to higher latency +for link and rename operations. + +So fix this by comparing the inodes' logged_trans field with the +generation of the current transaction instead of comparing with the value +stored in fs_info->last_trans_committed. + +This case is often hit when running dbench for a long enough duration, as +it does lots of rename operations. + +This patch belongs to a patch set that is comprised of the following +patches: + + btrfs: fix race causing unnecessary inode logging during link and rename + btrfs: fix race that results in logging old extents during a fast fsync + btrfs: fix race that causes unnecessary logging of ancestor inodes + btrfs: fix race that makes inode logging fallback to transaction commit + btrfs: fix race leading to unnecessary transaction commit when logging inode + btrfs: do not block inode logging for so long during transaction commit + +Performance results are mentioned in the change log of the last patch. + +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/tree-log.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 53607156b008..9912b5231bcf 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -6437,7 +6437,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, struct btrfs_inode *old_dir, + struct dentry *parent) + { +- struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_log_ctx ctx; + + /* +@@ -6451,8 +6450,8 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, + * if this inode hasn't been logged and directory we're renaming it + * from hasn't been logged, we don't need to log it + */ +- if (inode->logged_trans <= fs_info->last_trans_committed && +- (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) ++ if (inode->logged_trans < trans->transid && ++ (!old_dir || old_dir->logged_trans < trans->transid)) + return; + + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); +-- +2.30.2 + diff --git a/queue-5.4/series b/queue-5.4/series index 2a78876aab3..922470bd83c 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -38,3 +38,7 @@ powerpc-pseries-fix-regression-while-building-external-modules.patch revert-perf-map-fix-dso-nsinfo-refcounting.patch i40e-add-additional-info-to-phy-type-error.patch can-j1939-j1939_session_deactivate-clarify-lifetime-of-session-object.patch +btrfs-delete-duplicated-words-other-fixes-in-comment.patch +btrfs-do-not-commit-logs-and-transactions-during-lin.patch +btrfs-fix-race-causing-unnecessary-inode-logging-dur.patch +btrfs-fix-lost-inode-on-log-replay-after-mix-of-fsyn.patch