From 55e5fbd66b64ac0688822128c1361fea8f3462d2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 16 May 2016 17:04:17 -0700 Subject: [PATCH] 4.5-stable patches added patches: btrfs-do-not-collect-ordered-extents-when-logging-that-inode-exists.patch btrfs-fix-deadlock-between-direct-io-reads-and-buffered-writes.patch btrfs-fix-extent_same-allowing-destination-offset-beyond-i_size.patch btrfs-fix-race-when-checking-if-we-can-skip-fsync-ing-an-inode.patch --- ...tents-when-logging-that-inode-exists.patch | 60 +++ ...-direct-io-reads-and-buffered-writes.patch | 390 ++++++++++++++++++ ...ing-destination-offset-beyond-i_size.patch | 81 ++++ ...ng-if-we-can-skip-fsync-ing-an-inode.patch | 51 +++ queue-4.5/series | 4 + 5 files changed, 586 insertions(+) create mode 100644 queue-4.5/btrfs-do-not-collect-ordered-extents-when-logging-that-inode-exists.patch create mode 100644 queue-4.5/btrfs-fix-deadlock-between-direct-io-reads-and-buffered-writes.patch create mode 100644 queue-4.5/btrfs-fix-extent_same-allowing-destination-offset-beyond-i_size.patch create mode 100644 queue-4.5/btrfs-fix-race-when-checking-if-we-can-skip-fsync-ing-an-inode.patch diff --git a/queue-4.5/btrfs-do-not-collect-ordered-extents-when-logging-that-inode-exists.patch b/queue-4.5/btrfs-do-not-collect-ordered-extents-when-logging-that-inode-exists.patch new file mode 100644 index 00000000000..cc769937a1b --- /dev/null +++ b/queue-4.5/btrfs-do-not-collect-ordered-extents-when-logging-that-inode-exists.patch @@ -0,0 +1,60 @@ +From 5e33a2bd7ca7fa687fb0965869196eea6815d1f3 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 25 Feb 2016 23:19:38 +0000 +Subject: Btrfs: do not collect ordered extents when logging that inode exists + +From: Filipe Manana + +commit 5e33a2bd7ca7fa687fb0965869196eea6815d1f3 upstream. 
+ +When logging that an inode exists, for example as part of a directory +fsync operation, we were collecting any ordered extents for the inode but +we ended up doing nothing with them except tagging them as processed, by +setting the flag BTRFS_ORDERED_LOGGED on them, which prevented a +subsequent fsync of that inode (using the LOG_INODE_ALL mode) from +collecting and processing them. This created a time window where a second +fsync against the inode, using the fast path, ended up not logging the +checksums for the new extents but it logged the extents since they were +part of the list of modified extents. This happened because the ordered +extents were not collected and checksums were not yet added to the csum +tree - the ordered extents have not gone through btrfs_finish_ordered_io() +yet (which is where we add them to the csum tree by calling +inode.c:add_pending_csums()). + +So fix this by not collecting an inode's ordered extents if we are logging +it with the LOG_INODE_EXISTS mode. + +Signed-off-by: Filipe Manana +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-log.c | 17 ++++++++++++++++- + 1 file changed, 16 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -4621,7 +4621,22 @@ static int btrfs_log_inode(struct btrfs_ + + mutex_lock(&BTRFS_I(inode)->log_mutex); + +- btrfs_get_logged_extents(inode, &logged_list, start, end); ++ /* ++ * Collect ordered extents only if we are logging data. This is to ++ * ensure a subsequent request to log this inode in LOG_INODE_ALL mode ++ * will process the ordered extents if they still exist at the time, ++ * because when we collect them we test and set for the flag ++ * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the ++ * same ordered extents. 
The consequence for the LOG_INODE_ALL log mode ++ * not processing the ordered extents is that we end up logging the ++ * corresponding file extent items, based on the extent maps in the ++ * inode's extent_map_tree's modified_list, without logging the ++ * respective checksums (since they may still be only attached to the ++ * ordered extents and have not been inserted in the csum tree by ++ * btrfs_finish_ordered_io() yet). ++ */ ++ if (inode_only == LOG_INODE_ALL) ++ btrfs_get_logged_extents(inode, &logged_list, start, end); + + /* + * a brute force approach to making sure we get the most uptodate diff --git a/queue-4.5/btrfs-fix-deadlock-between-direct-io-reads-and-buffered-writes.patch b/queue-4.5/btrfs-fix-deadlock-between-direct-io-reads-and-buffered-writes.patch new file mode 100644 index 00000000000..4699782b318 --- /dev/null +++ b/queue-4.5/btrfs-fix-deadlock-between-direct-io-reads-and-buffered-writes.patch @@ -0,0 +1,390 @@ +From ade770294df29e08f913e5d733a756893128f45e Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 18 Feb 2016 14:28:55 +0000 +Subject: Btrfs: fix deadlock between direct IO reads and buffered writes + +From: Filipe Manana + +commit ade770294df29e08f913e5d733a756893128f45e upstream. + +While running a test with a mix of buffered IO and direct IO against +the same files I hit a deadlock reported by the following trace: + +[11642.140352] INFO: task kworker/u32:3:15282 blocked for more than 120 seconds. +[11642.142452] Not tainted 4.4.0-rc6-btrfs-next-21+ #1 +[11642.143982] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +[11642.146332] kworker/u32:3 D ffff880230ef7988 [11642.147737] systemd-journald[571]: Sent WATCHDOG=1 notification. 
+[11642.149771] 0 15282 2 0x00000000 +[11642.151205] Workqueue: btrfs-flush_delalloc btrfs_flush_delalloc_helper [btrfs] +[11642.154074] ffff880230ef7988 0000000000000246 0000000000014ec0 ffff88023ec94ec0 +[11642.156722] ffff880233fe8f80 ffff880230ef8000 ffff88023ec94ec0 7fffffffffffffff +[11642.159205] 0000000000000002 ffffffff8147b7f9 ffff880230ef79a0 ffffffff8147b541 +[11642.161403] Call Trace: +[11642.162129] [] ? bit_wait+0x2f/0x2f +[11642.163396] [] schedule+0x82/0x9a +[11642.164871] [] schedule_timeout+0x43/0x109 +[11642.167020] [] ? bit_wait+0x2f/0x2f +[11642.167931] [] ? trace_hardirqs_on_caller+0x17b/0x197 +[11642.182320] [] ? trace_hardirqs_on+0xd/0xf +[11642.183762] [] ? timekeeping_get_ns+0xe/0x33 +[11642.185308] [] ? ktime_get+0x41/0x52 +[11642.186782] [] io_schedule_timeout+0xa0/0x102 +[11642.188217] [] ? io_schedule_timeout+0xa0/0x102 +[11642.189626] [] bit_wait_io+0x1b/0x39 +[11642.190803] [] __wait_on_bit_lock+0x4c/0x90 +[11642.192158] [] __lock_page+0x66/0x68 +[11642.193379] [] ? autoremove_wake_function+0x3a/0x3a +[11642.194831] [] lock_page+0x31/0x34 [btrfs] +[11642.197068] [] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs] +[11642.199188] [] extent_writepages+0x4b/0x5c [btrfs] +[11642.200723] [] ? btrfs_writepage_start_hook+0xce/0xce [btrfs] +[11642.202465] [] btrfs_writepages+0x28/0x2a [btrfs] +[11642.203836] [] do_writepages+0x23/0x2c +[11642.205624] [] __filemap_fdatawrite_range+0x5a/0x61 +[11642.207057] [] filemap_fdatawrite_range+0x13/0x15 +[11642.208529] [] btrfs_start_ordered_extent+0xd0/0x1a1 [btrfs] +[11642.210375] [] ? btrfs_scrubparity_helper+0x140/0x33a [btrfs] +[11642.212132] [] btrfs_run_ordered_extent_work+0x25/0x34 [btrfs] +[11642.213837] [] btrfs_scrubparity_helper+0x15c/0x33a [btrfs] +[11642.215457] [] btrfs_flush_delalloc_helper+0xe/0x10 [btrfs] +[11642.217095] [] process_one_work+0x256/0x48b +[11642.218324] [] worker_thread+0x1f5/0x2a7 +[11642.219466] [] ? 
rescuer_thread+0x289/0x289 +[11642.220801] [] kthread+0xd4/0xdc +[11642.222032] [] ? kthread_parkme+0x24/0x24 +[11642.223190] [] ret_from_fork+0x3f/0x70 +[11642.224394] [] ? kthread_parkme+0x24/0x24 +[11642.226295] 2 locks held by kworker/u32:3/15282: +[11642.227273] #0: ("%s-%s""btrfs", name){++++.+}, at: [] process_one_work+0x165/0x48b +[11642.229412] #1: ((&work->normal_work)){+.+.+.}, at: [] process_one_work+0x165/0x48b +[11642.231414] INFO: task kworker/u32:8:15289 blocked for more than 120 seconds. +[11642.232872] Not tainted 4.4.0-rc6-btrfs-next-21+ #1 +[11642.234109] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +[11642.235776] kworker/u32:8 D ffff88020de5f848 0 15289 2 0x00000000 +[11642.237412] Workqueue: writeback wb_workfn (flush-btrfs-481) +[11642.238670] ffff88020de5f848 0000000000000246 0000000000014ec0 ffff88023ed54ec0 +[11642.240475] ffff88021b1ece40 ffff88020de60000 ffff88023ed54ec0 7fffffffffffffff +[11642.242154] 0000000000000002 ffffffff8147b7f9 ffff88020de5f860 ffffffff8147b541 +[11642.243715] Call Trace: +[11642.244390] [] ? bit_wait+0x2f/0x2f +[11642.245432] [] schedule+0x82/0x9a +[11642.246392] [] schedule_timeout+0x43/0x109 +[11642.247479] [] ? bit_wait+0x2f/0x2f +[11642.248551] [] ? trace_hardirqs_on_caller+0x17b/0x197 +[11642.249968] [] ? trace_hardirqs_on+0xd/0xf +[11642.251043] [] ? timekeeping_get_ns+0xe/0x33 +[11642.252202] [] ? ktime_get+0x41/0x52 +[11642.253210] [] io_schedule_timeout+0xa0/0x102 +[11642.254307] [] ? io_schedule_timeout+0xa0/0x102 +[11642.256118] [] bit_wait_io+0x1b/0x39 +[11642.257131] [] __wait_on_bit_lock+0x4c/0x90 +[11642.258200] [] __lock_page+0x66/0x68 +[11642.259168] [] ? autoremove_wake_function+0x3a/0x3a +[11642.260516] [] lock_page+0x31/0x34 [btrfs] +[11642.261841] [] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs] +[11642.263531] [] extent_writepages+0x4b/0x5c [btrfs] +[11642.264747] [] ? 
btrfs_writepage_start_hook+0xce/0xce [btrfs] +[11642.266148] [] btrfs_writepages+0x28/0x2a [btrfs] +[11642.267264] [] do_writepages+0x23/0x2c +[11642.268280] [] __writeback_single_inode+0xda/0x5ba +[11642.269407] [] writeback_sb_inodes+0x27b/0x43d +[11642.270476] [] __writeback_inodes_wb+0x76/0xae +[11642.271547] [] wb_writeback+0x19e/0x41c +[11642.272588] [] wb_workfn+0x201/0x341 +[11642.273523] [] ? wb_workfn+0x201/0x341 +[11642.274479] [] process_one_work+0x256/0x48b +[11642.275497] [] worker_thread+0x1f5/0x2a7 +[11642.276518] [] ? rescuer_thread+0x289/0x289 +[11642.277520] [] ? rescuer_thread+0x289/0x289 +[11642.278517] [] kthread+0xd4/0xdc +[11642.279371] [] ? kthread_parkme+0x24/0x24 +[11642.280468] [] ret_from_fork+0x3f/0x70 +[11642.281607] [] ? kthread_parkme+0x24/0x24 +[11642.282604] 3 locks held by kworker/u32:8/15289: +[11642.283423] #0: ("writeback"){++++.+}, at: [] process_one_work+0x165/0x48b +[11642.285629] #1: ((&(&wb->dwork)->work)){+.+.+.}, at: [] process_one_work+0x165/0x48b +[11642.287538] #2: (&type->s_umount_key#37){+++++.}, at: [] trylock_super+0x1b/0x4b +[11642.289423] INFO: task fdm-stress:26848 blocked for more than 120 seconds. +[11642.290547] Not tainted 4.4.0-rc6-btrfs-next-21+ #1 +[11642.291453] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +[11642.292864] fdm-stress D ffff88022c107c20 0 26848 26591 0x00000000 +[11642.294118] ffff88022c107c20 000000038108affa 0000000000014ec0 ffff88023ed54ec0 +[11642.295602] ffff88013ab1ca40 ffff88022c108000 ffff8800b2fc19d0 00000000000e0fff +[11642.297098] ffff8800b2fc19b0 ffff88022c107c88 ffff88022c107c38 ffffffff8147b541 +[11642.298433] Call Trace: +[11642.298896] [] schedule+0x82/0x9a +[11642.299738] [] lock_extent_bits+0xfe/0x1a3 [btrfs] +[11642.300833] [] ? add_wait_queue_exclusive+0x44/0x44 +[11642.301943] [] lock_and_cleanup_extent_if_need+0x68/0x18e [btrfs] +[11642.303270] [] __btrfs_buffered_write+0x238/0x4c1 [btrfs] +[11642.304552] [] ? 
btrfs_file_write_iter+0x17c/0x408 [btrfs] +[11642.305782] [] btrfs_file_write_iter+0x2f4/0x408 [btrfs] +[11642.306878] [] __vfs_write+0x7c/0xa5 +[11642.307729] [] vfs_write+0x9d/0xe8 +[11642.308602] [] SyS_write+0x50/0x7e +[11642.309410] [] entry_SYSCALL_64_fastpath+0x12/0x6b +[11642.310403] 3 locks held by fdm-stress/26848: +[11642.311108] #0: (&f->f_pos_lock){+.+.+.}, at: [] __fdget_pos+0x3a/0x40 +[11642.312578] #1: (sb_writers#11){.+.+.+}, at: [] __sb_start_write+0x5f/0xb0 +[11642.314170] #2: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: [] btrfs_file_write_iter+0x73/0x408 [btrfs] +[11642.316796] INFO: task fdm-stress:26849 blocked for more than 120 seconds. +[11642.317842] Not tainted 4.4.0-rc6-btrfs-next-21+ #1 +[11642.318691] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +[11642.319959] fdm-stress D ffff8801964ffa68 0 26849 26591 0x00000000 +[11642.321312] ffff8801964ffa68 00ff8801e9975f80 0000000000014ec0 ffff88023ed94ec0 +[11642.322555] ffff8800b00b4840 ffff880196500000 ffff8801e9975f20 0000000000000002 +[11642.323715] ffff8801e9975f18 ffff8800b00b4840 ffff8801964ffa80 ffffffff8147b541 +[11642.325096] Call Trace: +[11642.325532] [] schedule+0x82/0x9a +[11642.326303] [] schedule_timeout+0x43/0x109 +[11642.327180] [] ? mark_held_locks+0x5e/0x74 +[11642.328114] [] ? _raw_spin_unlock_irq+0x2c/0x4a +[11642.329051] [] ? trace_hardirqs_on_caller+0x17b/0x197 +[11642.330053] [] __wait_for_common+0x109/0x147 +[11642.330952] [] ? __wait_for_common+0x109/0x147 +[11642.331869] [] ? usleep_range+0x4a/0x4a +[11642.332925] [] ? wake_up_q+0x47/0x47 +[11642.333736] [] wait_for_completion+0x24/0x26 +[11642.334672] [] btrfs_wait_ordered_extents+0x1c8/0x217 [btrfs] +[11642.335858] [] btrfs_mksubvol+0x224/0x45d [btrfs] +[11642.336854] [] ? 
add_wait_queue_exclusive+0x44/0x44 +[11642.337820] [] btrfs_ioctl_snap_create_transid+0x148/0x17a [btrfs] +[11642.339026] [] btrfs_ioctl_snap_create_v2+0xc7/0x110 [btrfs] +[11642.340214] [] btrfs_ioctl+0x590/0x27bd [btrfs] +[11642.341123] [] ? mutex_unlock+0xe/0x10 +[11642.341934] [] ? ext4_file_write_iter+0x2a3/0x36f [ext4] +[11642.342936] [] ? __lock_is_held+0x3c/0x57 +[11642.343772] [] ? rcu_read_unlock+0x3e/0x5d +[11642.344673] [] do_vfs_ioctl+0x458/0x4dc +[11642.346024] [] ? __fget_light+0x62/0x71 +[11642.346873] [] SyS_ioctl+0x57/0x79 +[11642.347720] [] entry_SYSCALL_64_fastpath+0x12/0x6b +[11642.350222] 4 locks held by fdm-stress/26849: +[11642.350898] #0: (sb_writers#11){.+.+.+}, at: [] __sb_start_write+0x5f/0xb0 +[11642.352375] #1: (&type->i_mutex_dir_key#4/1){+.+.+.}, at: [] btrfs_mksubvol+0x4b/0x45d [btrfs] +[11642.354072] #2: (&fs_info->subvol_sem){++++..}, at: [] btrfs_mksubvol+0xf4/0x45d [btrfs] +[11642.355647] #3: (&root->ordered_extent_mutex){+.+...}, at: [] btrfs_wait_ordered_extents+0x50/0x217 [btrfs] +[11642.357516] INFO: task fdm-stress:26850 blocked for more than 120 seconds. +[11642.358508] Not tainted 4.4.0-rc6-btrfs-next-21+ #1 +[11642.359376] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +[11642.368625] fdm-stress D ffff88021f167688 0 26850 26591 0x00000000 +[11642.369716] ffff88021f167688 0000000000000001 0000000000014ec0 ffff88023edd4ec0 +[11642.370950] ffff880128a98680 ffff88021f168000 ffff88023edd4ec0 7fffffffffffffff +[11642.372210] 0000000000000002 ffffffff8147b7f9 ffff88021f1676a0 ffffffff8147b541 +[11642.373430] Call Trace: +[11642.373853] [] ? bit_wait+0x2f/0x2f +[11642.374623] [] schedule+0x82/0x9a +[11642.375948] [] schedule_timeout+0x43/0x109 +[11642.376862] [] ? bit_wait+0x2f/0x2f +[11642.377637] [] ? trace_hardirqs_on_caller+0x17b/0x197 +[11642.378610] [] ? trace_hardirqs_on+0xd/0xf +[11642.379457] [] ? timekeeping_get_ns+0xe/0x33 +[11642.380366] [] ? 
ktime_get+0x41/0x52 +[11642.381353] [] io_schedule_timeout+0xa0/0x102 +[11642.382255] [] ? io_schedule_timeout+0xa0/0x102 +[11642.383162] [] bit_wait_io+0x1b/0x39 +[11642.383945] [] __wait_on_bit_lock+0x4c/0x90 +[11642.384875] [] __lock_page+0x66/0x68 +[11642.385749] [] ? autoremove_wake_function+0x3a/0x3a +[11642.386721] [] lock_page+0x31/0x34 [btrfs] +[11642.387596] [] extent_write_cache_pages.isra.19.constprop.35+0x1af/0x2f4 [btrfs] +[11642.389030] [] extent_writepages+0x4b/0x5c [btrfs] +[11642.389973] [] ? rcu_read_lock_sched_held+0x61/0x69 +[11642.390939] [] ? btrfs_writepage_start_hook+0xce/0xce [btrfs] +[11642.392271] [] ? __clear_extent_bit+0x26e/0x2c0 [btrfs] +[11642.393305] [] btrfs_writepages+0x28/0x2a [btrfs] +[11642.394239] [] do_writepages+0x23/0x2c +[11642.395045] [] __filemap_fdatawrite_range+0x5a/0x61 +[11642.395991] [] filemap_fdatawrite_range+0x13/0x15 +[11642.397144] [] btrfs_start_ordered_extent+0xd0/0x1a1 [btrfs] +[11642.398392] [] ? clear_extent_bit+0x17/0x19 [btrfs] +[11642.399363] [] btrfs_get_blocks_direct+0x12b/0x61c [btrfs] +[11642.400445] [] ? dio_bio_add_page+0x3d/0x54 +[11642.401309] [] ? submit_page_section+0x7b/0x111 +[11642.402213] [] do_blockdev_direct_IO+0x685/0xc24 +[11642.403139] [] ? btrfs_page_exists_in_range+0x1a1/0x1a1 [btrfs] +[11642.404360] [] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs] +[11642.406187] [] __blockdev_direct_IO+0x31/0x33 +[11642.407070] [] ? __blockdev_direct_IO+0x31/0x33 +[11642.407990] [] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs] +[11642.409192] [] btrfs_direct_IO+0x1c7/0x27e [btrfs] +[11642.410146] [] ? btrfs_get_extent_fiemap+0x1c0/0x1c0 [btrfs] +[11642.411291] [] generic_file_read_iter+0x89/0x4e1 +[11642.412263] [] ? 
mark_lock+0x24/0x201 +[11642.413057] [] __vfs_read+0x79/0x9d +[11642.413897] [] vfs_read+0x8f/0xd2 +[11642.414708] [] SyS_read+0x50/0x7e +[11642.415573] [] entry_SYSCALL_64_fastpath+0x12/0x6b +[11642.416572] 1 lock held by fdm-stress/26850: +[11642.417345] #0: (&f->f_pos_lock){+.+.+.}, at: [] __fdget_pos+0x3a/0x40 +[11642.418703] INFO: task fdm-stress:26851 blocked for more than 120 seconds. +[11642.419698] Not tainted 4.4.0-rc6-btrfs-next-21+ #1 +[11642.420612] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +[11642.421807] fdm-stress D ffff880196483d28 0 26851 26591 0x00000000 +[11642.422878] ffff880196483d28 00ff8801c8f60740 0000000000014ec0 ffff88023ed94ec0 +[11642.424149] ffff8801c8f60740 ffff880196484000 0000000000000246 ffff8801c8f60740 +[11642.425374] ffff8801bb711840 ffff8801bb711878 ffff880196483d40 ffffffff8147b541 +[11642.426591] Call Trace: +[11642.427013] [] schedule+0x82/0x9a +[11642.427856] [] schedule_preempt_disabled+0x18/0x24 +[11642.428852] [] mutex_lock_nested+0x1d7/0x3b4 +[11642.429743] [] ? btrfs_wait_ordered_extents+0x50/0x217 [btrfs] +[11642.430911] [] btrfs_wait_ordered_extents+0x50/0x217 [btrfs] +[11642.432102] [] ? btrfs_wait_ordered_roots+0x57/0x191 [btrfs] +[11642.433259] [] ? btrfs_wait_ordered_extents+0x50/0x217 [btrfs] +[11642.434431] [] btrfs_wait_ordered_roots+0xcd/0x191 [btrfs] +[11642.436079] [] btrfs_sync_fs+0xe0/0x1ad [btrfs] +[11642.437009] [] ? 
SyS_tee+0x23c/0x23c +[11642.437860] [] sync_fs_one_sb+0x20/0x22 +[11642.438723] [] iterate_supers+0x75/0xc2 +[11642.439597] [] sys_sync+0x52/0x80 +[11642.440454] [] entry_SYSCALL_64_fastpath+0x12/0x6b +[11642.441533] 3 locks held by fdm-stress/26851: +[11642.442370] #0: (&type->s_umount_key#37){+++++.}, at: [] iterate_supers+0x5f/0xc2 +[11642.444043] #1: (&fs_info->ordered_operations_mutex){+.+...}, at: [] btrfs_wait_ordered_roots+0x44/0x191 [btrfs] +[11642.446010] #2: (&root->ordered_extent_mutex){+.+...}, at: [] btrfs_wait_ordered_extents+0x50/0x217 [btrfs] + +This happened because under specific timings the path for direct IO reads +can deadlock with concurrent buffered writes. The diagram below shows how +this happens for an example file that has the following layout: + + [ extent A ] [ extent B ] [ .... + 0K 4K 8K + + CPU 1 CPU 2 CPU 3 + +DIO read against range + [0K, 8K[ starts + +btrfs_direct_IO() + --> calls btrfs_get_blocks_direct() + which finds the extent map for the + extent A and leaves the range + [0K, 4K[ locked in the inode's + io tree + + buffered write against + range [4K, 8K[ starts + + __btrfs_buffered_write() + --> dirties page at 4K + + a user space + task calls sync + for e.g or + writepages() is + invoked by mm + + writepages() + run_delalloc_range() + cow_file_range() + --> ordered extent X + for the buffered + write is created + and + writeback starts + + --> calls btrfs_get_blocks_direct() + again, without submitting first + a bio for reading extent A, and + finds the extent map for extent B + + --> calls lock_extent_direct() + + --> locks range [4K, 8K[ + --> finds ordered extent X + covering range [4K, 8K[ + --> unlocks range [4K, 8K[ + + buffered write against + range [0K, 8K[ starts + + __btrfs_buffered_write() + prepare_pages() + --> locks pages with + offsets 0 and 4K + lock_and_cleanup_extent_if_need() + --> blocks attempting to + lock range [0K, 8K[ in + the inode's io tree, + because the range [0, 4K[ + is already locked by the + 
direct IO task at CPU 1 + + --> calls + btrfs_start_ordered_extent(oe X) + + btrfs_start_ordered_extent(oe X) + + --> At this point writeback for ordered + extent X has not finished yet + + filemap_fdatawrite_range() + btrfs_writepages() + extent_writepages() + extent_write_cache_pages() + --> finds page with offset 0 + with the writeback tag + (and not dirty) + --> tries to lock it + --> deadlock, task at CPU 2 + has the page locked and + is blocked on the io range + [0, 4K[ that was locked + earlier by this task + +So fix this by falling back to a buffered read in the direct IO read path +when an ordered extent for a buffered write is found. + +Signed-off-by: Filipe Manana +Reviewed-by: Liu Bo +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 25 +++++++++++++++++++++++-- + 1 file changed, 23 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -7423,7 +7423,26 @@ static int lock_extent_direct(struct ino + cached_state, GFP_NOFS); + + if (ordered) { +- btrfs_start_ordered_extent(inode, ordered, 1); ++ /* ++ * If we are doing a DIO read and the ordered extent we ++ * found is for a buffered write, we can not wait for it ++ * to complete and retry, because if we do so we can ++ * deadlock with concurrent buffered writes on page ++ * locks. This happens only if our DIO read covers more ++ * than one extent map, if at this point has already ++ * created an ordered extent for a previous extent map ++ * and locked its range in the inode's io tree, and a ++ * concurrent write against that previous extent map's ++ * range and this range started (we unlock the ranges ++ * in the io tree only when the bios complete and ++ * buffered writes always lock pages before attempting ++ * to lock range in the io tree). 
++ */ ++ if (writing || ++ test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) ++ btrfs_start_ordered_extent(inode, ordered, 1); ++ else ++ ret = -ENOTBLK; + btrfs_put_ordered_extent(ordered); + } else { + /* +@@ -7440,9 +7459,11 @@ static int lock_extent_direct(struct ino + * that page. + */ + ret = -ENOTBLK; +- break; + } + ++ if (ret) ++ break; ++ + cond_resched(); + } + diff --git a/queue-4.5/btrfs-fix-extent_same-allowing-destination-offset-beyond-i_size.patch b/queue-4.5/btrfs-fix-extent_same-allowing-destination-offset-beyond-i_size.patch new file mode 100644 index 00000000000..bcd570d0d0f --- /dev/null +++ b/queue-4.5/btrfs-fix-extent_same-allowing-destination-offset-beyond-i_size.patch @@ -0,0 +1,81 @@ +From f4dfe6871006c62abdccc77b2818b11f376e98e2 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Fri, 12 Feb 2016 14:44:00 +0000 +Subject: Btrfs: fix extent_same allowing destination offset beyond i_size + +From: Filipe Manana + +commit f4dfe6871006c62abdccc77b2818b11f376e98e2 upstream. + +When using the same file as the source and destination for a dedup +(extent_same ioctl) operation we were allowing it to dedup to a +destination offset beyond the file's size, which doesn't make sense and +it's not allowed for the case where the source and destination files are +not the same file. This made the deduplication operation successful only +when the source range corresponded to a hole, a prealloc extent or an +extent with all bytes having a value of 0x00. 
This was also leaving a +file hole (between i_size and destination offset) without the +corresponding file extent items, which can be reproduced with the +following steps for example: + + $ mkfs.btrfs -f /dev/sdi + $ mount /dev/sdi /mnt/sdi + + $ xfs_io -f -c "pwrite -S 0xab 304457 404990" /mnt/sdi/foobar + wrote 404990/404990 bytes at offset 304457 + 395 KiB, 99 ops; 0.0000 sec (31.150 MiB/sec and 7984.5149 ops/sec) + + $ /git/hub/duperemove/btrfs-extent-same 24576 /mnt/sdi/foobar 28672 /mnt/sdi/foobar 929792 + Deduping 2 total files + (28672, 24576): /mnt/sdi/foobar + (929792, 24576): /mnt/sdi/foobar + 1 files asked to be deduped + i: 0, status: 0, bytes_deduped: 24576 + 24576 total bytes deduped in this operation + + $ umount /mnt/sdi + $ btrfsck /dev/sdi + Checking filesystem on /dev/sdi + UUID: 98c528aa-0833-427d-9403-b98032ffbf9d + checking extents + checking free space cache + checking fs roots + root 5 inode 257 errors 100, file extent discount + Found file extent holes: + start: 712704, len: 217088 + found 540673 bytes used err is 1 + total csum bytes: 400 + total tree bytes: 131072 + total fs tree bytes: 32768 + total extent tree bytes: 16384 + btree space waste bytes: 123675 + file data blocks allocated: 671744 + referenced 671744 + btrfs-progs v4.2.3 + +So fix this by not allowing the destination to go beyond the file's size, +just as we do for the case where the source and destination files are not +the same. + +A test for xfstests follows. 
+ +Signed-off-by: Filipe Manana +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -3069,6 +3069,9 @@ static int btrfs_extent_same(struct inod + ret = extent_same_check_offsets(src, loff, &len, olen); + if (ret) + goto out_unlock; ++ ret = extent_same_check_offsets(src, dst_loff, &len, olen); ++ if (ret) ++ goto out_unlock; + + /* + * Single inode case wants the same checks, except we diff --git a/queue-4.5/btrfs-fix-race-when-checking-if-we-can-skip-fsync-ing-an-inode.patch b/queue-4.5/btrfs-fix-race-when-checking-if-we-can-skip-fsync-ing-an-inode.patch new file mode 100644 index 00000000000..641b53101af --- /dev/null +++ b/queue-4.5/btrfs-fix-race-when-checking-if-we-can-skip-fsync-ing-an-inode.patch @@ -0,0 +1,51 @@ +From affc0ff902d539ebe9bba405d330410314f46e9f Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 24 Feb 2016 07:35:05 +0000 +Subject: Btrfs: fix race when checking if we can skip fsync'ing an inode + +From: Filipe Manana + +commit affc0ff902d539ebe9bba405d330410314f46e9f upstream. + +If we're about to do a fast fsync for an inode and btrfs_inode_in_log() +returns false, it's possible that we had an ordered extent in progress +(btrfs_finish_ordered_io() not run yet) when we noticed that the inode's +last_trans field was not greater than the id of the last committed +transaction, but shortly after, before we checked if there were any +ongoing ordered extents, the ordered extent had just completed and +removed itself from the inode's ordered tree, in which case we end up not +logging the inode, losing some data if a power failure or crash happens +after the fsync handler returns and before the transaction is committed. 
+ +Fix this by checking first if there are any ongoing ordered extents +before comparing the inode's last_trans with the id of the last committed +transaction - when it completes, an ordered extent always updates the +inode's last_trans before it removes itself from the inode's ordered +tree (at btrfs_finish_ordered_io()). + +Signed-off-by: Filipe Manana +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -1996,10 +1996,11 @@ int btrfs_sync_file(struct file *file, l + */ + smp_mb(); + if (btrfs_inode_in_log(inode, root->fs_info->generation) || +- (BTRFS_I(inode)->last_trans <= +- root->fs_info->last_trans_committed && +- (full_sync || +- !btrfs_have_ordered_extents_in_range(inode, start, len)))) { ++ (full_sync && BTRFS_I(inode)->last_trans <= ++ root->fs_info->last_trans_committed) || ++ (!btrfs_have_ordered_extents_in_range(inode, start, len) && ++ BTRFS_I(inode)->last_trans ++ <= root->fs_info->last_trans_committed)) { + /* + * We'v had everything committed since the last time we were + * modified so clear this flag in case it was set for whatever diff --git a/queue-4.5/series b/queue-4.5/series index f7a8be4f0e6..803be872d93 100644 --- a/queue-4.5/series +++ b/queue-4.5/series @@ -87,3 +87,7 @@ btrfs-remove-error-message-from-search-ioctl-for-nonexistent-tree.patch btrfs-change-max_inline-default-to-2048.patch btrfs-fix-unreplayable-log-after-snapshot-delete-parent-dir-fsync.patch btrfs-fix-file-loss-on-log-replay-after-renaming-a-file-and-fsync.patch +btrfs-fix-extent_same-allowing-destination-offset-beyond-i_size.patch +btrfs-fix-deadlock-between-direct-io-reads-and-buffered-writes.patch +btrfs-fix-race-when-checking-if-we-can-skip-fsync-ing-an-inode.patch +btrfs-do-not-collect-ordered-extents-when-logging-that-inode-exists.patch -- 2.47.3