From: Greg Kroah-Hartman Date: Mon, 27 May 2019 14:09:45 +0000 (+0200) Subject: 4.4-stable patches X-Git-Tag: v5.1.6~38 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=4c58ed0579ec1e49025d3855b258f0ccb881da9a;p=thirdparty%2Fkernel%2Fstable-queue.git 4.4-stable patches added patches: btrfs-fix-race-between-ranged-fsync-and-writeback-of-adjacent-ranges.patch btrfs-sysfs-don-t-leak-memory-when-failing-add-fsid.patch fbdev-fix-divide-error-in-fb_var_to_videomode.patch gfs2-fix-sign-extension-bug-in-gfs2_update_stats.patch hugetlb-use-same-fault-hash-key-for-shared-and-private-mappings.patch --- diff --git a/queue-4.4/btrfs-fix-race-between-ranged-fsync-and-writeback-of-adjacent-ranges.patch b/queue-4.4/btrfs-fix-race-between-ranged-fsync-and-writeback-of-adjacent-ranges.patch new file mode 100644 index 00000000000..c7ea099ecfe --- /dev/null +++ b/queue-4.4/btrfs-fix-race-between-ranged-fsync-and-writeback-of-adjacent-ranges.patch @@ -0,0 +1,243 @@ +From 0c713cbab6200b0ab6473b50435e450a6e1de85d Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 6 May 2019 16:44:02 +0100 +Subject: Btrfs: fix race between ranged fsync and writeback of adjacent ranges + +From: Filipe Manana + +commit 0c713cbab6200b0ab6473b50435e450a6e1de85d upstream. + +When we do a full fsync (the bit BTRFS_INODE_NEEDS_FULL_SYNC is set in the +inode) that happens to be ranged, which happens during a msync() or writes +for files opened with O_SYNC for example, we can end up with a corrupt log, +due to different file extent items representing ranges that overlap with +each other, or hit some assertion failures. + +When doing a ranged fsync we only flush delalloc and wait for ordered +extents within that range. If while we are logging items from our inode +ordered extents for adjacent ranges complete, we end up in a race that can +make us insert file extent items that overlap with others we logged +previously and hit the assertion failures. 
+ +For example, if tree-log.c:copy_items() receives a leaf that has the +following file extents items, all with a length of 4K and therefore there +is an implicit hole in the range 68K to 72K - 1: + + (257 EXTENT_ITEM 64K), (257 EXTENT_ITEM 72K), (257 EXTENT_ITEM 76K), ... + +It copies them to the log tree. However due to the need to detect implicit +holes, it may release the path, in order to look at the previous leaf to +detect an implicit hole, and then later it will search again in the tree +for the first file extent item key, with the goal of locking again the +leaf (which might have changed due to concurrent changes to other inodes). + +However when it locks again the leaf containing the first key, the key +corresponding to the extent at offset 72K may not be there anymore since +there is an ordered extent for that range that is finishing (that is, +somewhere in the middle of btrfs_finish_ordered_io()), and it just +removed the file extent item but has not yet replaced it with a new file +extent item, so the part of copy_items() that does hole detection will +decide that there is a hole in the range starting from 68K to 76K - 1, +and therefore insert a file extent item to represent that hole, having +a key offset of 68K. After that we now have a log tree with 2 different +extent items that have overlapping ranges: + + 1) The file extent item copied before copy_items() released the path, + which has a key offset of 72K and a length of 4K, representing the + file range 72K to 76K - 1. + + 2) And a file extent item representing a hole that has a key offset of + 68K and a length of 8K, representing the range 68K to 76K - 1. This + item was inserted after releasing the path, and overlaps with the + extent item inserted before. 
+ +The overlapping extent items can cause all sorts of unpredictable and +incorrect behaviour, either when replayed or if a fast (non full) fsync +happens later, which can trigger a BUG_ON() when calling +btrfs_set_item_key_safe() through __btrfs_drop_extents(), producing a +trace like the following: + + [61666.783269] ------------[ cut here ]------------ + [61666.783943] kernel BUG at fs/btrfs/ctree.c:3182! + [61666.784644] invalid opcode: 0000 [#1] PREEMPT SMP + (...) + [61666.786253] task: ffff880117b88c40 task.stack: ffffc90008168000 + [61666.786253] RIP: 0010:btrfs_set_item_key_safe+0x7c/0xd2 [btrfs] + [61666.786253] RSP: 0018:ffffc9000816b958 EFLAGS: 00010246 + [61666.786253] RAX: 0000000000000000 RBX: 000000000000000f RCX: 0000000000030000 + [61666.786253] RDX: 0000000000000000 RSI: ffffc9000816ba4f RDI: ffffc9000816b937 + [61666.786253] RBP: ffffc9000816b998 R08: ffff88011dae2428 R09: 0000000000001000 + [61666.786253] R10: 0000160000000000 R11: 6db6db6db6db6db7 R12: ffff88011dae2418 + [61666.786253] R13: ffffc9000816ba4f R14: ffff8801e10c4118 R15: ffff8801e715c000 + [61666.786253] FS: 00007f6060a18700(0000) GS:ffff88023f5c0000(0000) knlGS:0000000000000000 + [61666.786253] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [61666.786253] CR2: 00007f6060a28000 CR3: 0000000213e69000 CR4: 00000000000006e0 + [61666.786253] Call Trace: + [61666.786253] __btrfs_drop_extents+0x5e3/0xaad [btrfs] + [61666.786253] ? time_hardirqs_on+0x9/0x14 + [61666.786253] btrfs_log_changed_extents+0x294/0x4e0 [btrfs] + [61666.786253] ? release_extent_buffer+0x38/0xb4 [btrfs] + [61666.786253] btrfs_log_inode+0xb6e/0xcdc [btrfs] + [61666.786253] ? lock_acquire+0x131/0x1c5 + [61666.786253] ? btrfs_log_inode_parent+0xee/0x659 [btrfs] + [61666.786253] ? arch_local_irq_save+0x9/0xc + [61666.786253] ? btrfs_log_inode_parent+0x1f5/0x659 [btrfs] + [61666.786253] btrfs_log_inode_parent+0x223/0x659 [btrfs] + [61666.786253] ? arch_local_irq_save+0x9/0xc + [61666.786253] ? 
lockref_get_not_zero+0x2c/0x34 + [61666.786253] ? rcu_read_unlock+0x3e/0x5d + [61666.786253] btrfs_log_dentry_safe+0x60/0x7b [btrfs] + [61666.786253] btrfs_sync_file+0x317/0x42c [btrfs] + [61666.786253] vfs_fsync_range+0x8c/0x9e + [61666.786253] SyS_msync+0x13c/0x1c9 + [61666.786253] entry_SYSCALL_64_fastpath+0x18/0xad + +A sample of a corrupt log tree leaf with overlapping extents I got from +running btrfs/072: + + item 14 key (295 108 200704) itemoff 2599 itemsize 53 + extent data disk bytenr 0 nr 0 + extent data offset 0 nr 458752 ram 458752 + item 15 key (295 108 659456) itemoff 2546 itemsize 53 + extent data disk bytenr 4343541760 nr 770048 + extent data offset 606208 nr 163840 ram 770048 + item 16 key (295 108 663552) itemoff 2493 itemsize 53 + extent data disk bytenr 4343541760 nr 770048 + extent data offset 610304 nr 155648 ram 770048 + item 17 key (295 108 819200) itemoff 2440 itemsize 53 + extent data disk bytenr 4334788608 nr 4096 + extent data offset 0 nr 4096 ram 4096 + +The file extent item at offset 659456 (item 15) ends at offset 823296 +(659456 + 163840) while the next file extent item (item 16) starts at +offset 663552. + +Another different problem that the race can trigger is a failure in the +assertions at tree-log.c:copy_items(), which expect that the first file +extent item key we found before releasing the path exists after we have +released path and that the last key we found before releasing the path +also exists after releasing the path: + + $ cat -n fs/btrfs/tree-log.c + 4080 if (need_find_last_extent) { + 4081 /* btrfs_prev_leaf could return 1 without releasing the path */ + 4082 btrfs_release_path(src_path); + 4083 ret = btrfs_search_slot(NULL, inode->root, &first_key, + 4084 src_path, 0, 0); + 4085 if (ret < 0) + 4086 return ret; + 4087 ASSERT(ret == 0); + (...) 
+ 4103 if (i >= btrfs_header_nritems(src_path->nodes[0])) { + 4104 ret = btrfs_next_leaf(inode->root, src_path); + 4105 if (ret < 0) + 4106 return ret; + 4107 ASSERT(ret == 0); + 4108 src = src_path->nodes[0]; + 4109 i = 0; + 4110 need_find_last_extent = true; + 4111 } + (...) + +The second assertion implicitly expects that the last key before the path +release still exists, because the surrounding while loop only stops after +we have found that key. When this assertion fails it produces a stack like +this: + + [139590.037075] assertion failed: ret == 0, file: fs/btrfs/tree-log.c, line: 4107 + [139590.037406] ------------[ cut here ]------------ + [139590.037707] kernel BUG at fs/btrfs/ctree.h:3546! + [139590.038034] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC PTI + [139590.038340] CPU: 1 PID: 31841 Comm: fsstress Tainted: G W 5.0.0-btrfs-next-46 #1 + (...) + [139590.039354] RIP: 0010:assfail.constprop.24+0x18/0x1a [btrfs] + (...) + [139590.040397] RSP: 0018:ffffa27f48f2b9b0 EFLAGS: 00010282 + [139590.040730] RAX: 0000000000000041 RBX: ffff897c635d92c8 RCX: 0000000000000000 + [139590.041105] RDX: 0000000000000000 RSI: ffff897d36a96868 RDI: ffff897d36a96868 + [139590.041470] RBP: ffff897d1b9a0708 R08: 0000000000000000 R09: 0000000000000000 + [139590.041815] R10: 0000000000000008 R11: 0000000000000000 R12: 0000000000000013 + [139590.042159] R13: 0000000000000227 R14: ffff897cffcbba88 R15: 0000000000000001 + [139590.042501] FS: 00007f2efc8dee80(0000) GS:ffff897d36a80000(0000) knlGS:0000000000000000 + [139590.042847] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [139590.043199] CR2: 00007f8c064935e0 CR3: 0000000232252002 CR4: 00000000003606e0 + [139590.043547] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + [139590.043899] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + [139590.044250] Call Trace: + [139590.044631] copy_items+0xa3f/0x1000 [btrfs] + [139590.045009] ? 
generic_bin_search.constprop.32+0x61/0x200 [btrfs] + [139590.045396] btrfs_log_inode+0x7b3/0xd70 [btrfs] + [139590.045773] btrfs_log_inode_parent+0x2b3/0xce0 [btrfs] + [139590.046143] ? do_raw_spin_unlock+0x49/0xc0 + [139590.046510] btrfs_log_dentry_safe+0x4a/0x70 [btrfs] + [139590.046872] btrfs_sync_file+0x3b6/0x440 [btrfs] + [139590.047243] btrfs_file_write_iter+0x45b/0x5c0 [btrfs] + [139590.047592] __vfs_write+0x129/0x1c0 + [139590.047932] vfs_write+0xc2/0x1b0 + [139590.048270] ksys_write+0x55/0xc0 + [139590.048608] do_syscall_64+0x60/0x1b0 + [139590.048946] entry_SYSCALL_64_after_hwframe+0x49/0xbe + [139590.049287] RIP: 0033:0x7f2efc4be190 + (...) + [139590.050342] RSP: 002b:00007ffe743243a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 + [139590.050701] RAX: ffffffffffffffda RBX: 0000000000008d58 RCX: 00007f2efc4be190 + [139590.051067] RDX: 0000000000008d58 RSI: 00005567eca0f370 RDI: 0000000000000003 + [139590.051459] RBP: 0000000000000024 R08: 0000000000000003 R09: 0000000000008d60 + [139590.051863] R10: 0000000000000078 R11: 0000000000000246 R12: 0000000000000003 + [139590.052252] R13: 00000000003d3507 R14: 00005567eca0f370 R15: 0000000000000000 + (...) + [139590.055128] ---[ end trace 193f35d0215cdeeb ]--- + +So fix this race between a full ranged fsync and writeback of adjacent +ranges by flushing all delalloc and waiting for all ordered extents to +complete before logging the inode. This is the simplest way to solve the +problem because currently the full fsync path does not deal with ranges +at all (it assumes a full range from 0 to LLONG_MAX) and it always needs +to look at adjacent ranges for hole detection. For use cases of ranged +fsyncs this can make a few fsyncs slower but on the other hand it can +make some following fsyncs to other ranges do less work or no need to do +anything at all. A full fsync is rare anyway and happens only once after +loading/creating an inode and once after less common operations such as a +shrinking truncate. 
+ +This is an issue that exists for a long time, and was often triggered by +generic/127, because it does mmap'ed writes and msync (which triggers a +ranged fsync). Adding support for the tree checker to detect overlapping +extents (next patch in the series) and trigger a WARN() when such cases +are found, and then calling btrfs_check_leaf_full() at the end of +btrfs_insert_file_extent() made the issue much easier to detect. Running +btrfs/072 with that change to the tree checker and making fsstress open +files always with O_SYNC made it much easier to trigger the issue (as +triggering it with generic/127 is very rare). + +CC: stable@vger.kernel.org # 3.16+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -1901,6 +1901,18 @@ int btrfs_sync_file(struct file *file, l + u64 len; + + /* ++ * If the inode needs a full sync, make sure we use a full range to ++ * avoid log tree corruption, due to hole detection racing with ordered ++ * extent completion for adjacent ranges, and assertion failures during ++ * hole detection. ++ */ ++ if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, ++ &BTRFS_I(inode)->runtime_flags)) { ++ start = 0; ++ end = LLONG_MAX; ++ } ++ ++ /* + * The range length can be represented by u64, we have to do the typecasts + * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync() + */ diff --git a/queue-4.4/btrfs-sysfs-don-t-leak-memory-when-failing-add-fsid.patch b/queue-4.4/btrfs-sysfs-don-t-leak-memory-when-failing-add-fsid.patch new file mode 100644 index 00000000000..c29d83f8651 --- /dev/null +++ b/queue-4.4/btrfs-sysfs-don-t-leak-memory-when-failing-add-fsid.patch @@ -0,0 +1,53 @@ +From e32773357d5cc271b1d23550b3ed026eb5c2a468 Mon Sep 17 00:00:00 2001 +From: "Tobin C. 
Harding" +Date: Mon, 13 May 2019 13:39:12 +1000 +Subject: btrfs: sysfs: don't leak memory when failing add fsid + +From: Tobin C. Harding + +commit e32773357d5cc271b1d23550b3ed026eb5c2a468 upstream. + +A failed call to kobject_init_and_add() must be followed by a call to +kobject_put(). Currently in the error path when adding fs_devices we +are missing this call. This could be fixed by calling +btrfs_sysfs_remove_fsid() if btrfs_sysfs_add_fsid() returns an error or +by adding a call to kobject_put() directly in btrfs_sysfs_add_fsid(). +Here we choose the second option because it prevents the slightly +unusual error path handling requirements of kobject from leaking out +into btrfs functions. + +Add a call to kobject_put() in the error path of kobject_init_and_add(). +This causes the release method to be called if kobject_init_and_add() +fails. open_tree() is the function that calls btrfs_sysfs_add_fsid() +and the error code in this function is already written with the +assumption that the release method is called during the error path of +open_tree() (as seen by the call to btrfs_sysfs_remove_fsid() under the +fail_fsdev_sysfs label). + +Cc: stable@vger.kernel.org # v4.4+ +Reviewed-by: Greg Kroah-Hartman +Signed-off-by: Tobin C. 
Harding +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/sysfs.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -733,7 +733,12 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs + fs_devs->fsid_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_devs->fsid_kobj, + &btrfs_ktype, parent, "%pU", fs_devs->fsid); +- return error; ++ if (error) { ++ kobject_put(&fs_devs->fsid_kobj); ++ return error; ++ } ++ ++ return 0; + } + + int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) diff --git a/queue-4.4/fbdev-fix-divide-error-in-fb_var_to_videomode.patch b/queue-4.4/fbdev-fix-divide-error-in-fb_var_to_videomode.patch new file mode 100644 index 00000000000..548e9af99ca --- /dev/null +++ b/queue-4.4/fbdev-fix-divide-error-in-fb_var_to_videomode.patch @@ -0,0 +1,81 @@ +From cf84807f6dd0be5214378e66460cfc9187f532f9 Mon Sep 17 00:00:00 2001 +From: Shile Zhang +Date: Mon, 1 Apr 2019 17:47:00 +0200 +Subject: fbdev: fix divide error in fb_var_to_videomode + +From: Shile Zhang + +commit cf84807f6dd0be5214378e66460cfc9187f532f9 upstream. 
+ +To fix following divide-by-zero error found by Syzkaller: + + divide error: 0000 [#1] SMP PTI + CPU: 7 PID: 8447 Comm: test Kdump: loaded Not tainted 4.19.24-8.al7.x86_64 #1 + Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014 + RIP: 0010:fb_var_to_videomode+0xae/0xc0 + Code: 04 44 03 46 78 03 4e 7c 44 03 46 68 03 4e 70 89 ce d1 ee 69 c0 e8 03 00 00 f6 c2 01 0f 45 ce 83 e2 02 8d 34 09 0f 45 ce 31 d2 <41> f7 f0 31 d2 f7 f1 89 47 08 f3 c3 66 0f 1f 44 00 00 0f 1f 44 00 + RSP: 0018:ffffb7e189347bf0 EFLAGS: 00010246 + RAX: 00000000e1692410 RBX: ffffb7e189347d60 RCX: 0000000000000000 + RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffb7e189347c10 + RBP: ffff99972a091c00 R08: 0000000000000000 R09: 0000000000000000 + R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000100 + R13: 0000000000010000 R14: 00007ffd66baf6d0 R15: 0000000000000000 + FS: 00007f2054d11740(0000) GS:ffff99972fbc0000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007f205481fd20 CR3: 00000004288a0001 CR4: 00000000001606a0 + Call Trace: + fb_set_var+0x257/0x390 + ? lookup_fast+0xbb/0x2b0 + ? fb_open+0xc0/0x140 + ? chrdev_open+0xa6/0x1a0 + do_fb_ioctl+0x445/0x5a0 + do_vfs_ioctl+0x92/0x5f0 + ? 
__alloc_fd+0x3d/0x160 + ksys_ioctl+0x60/0x90 + __x64_sys_ioctl+0x16/0x20 + do_syscall_64+0x5b/0x190 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + RIP: 0033:0x7f20548258d7 + Code: 44 00 00 48 8b 05 b9 15 2d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 89 15 2d 00 f7 d8 64 89 01 48 + +It can be triggered easily with following test code: + + #include + #include + #include + int main(void) + { + struct fb_var_screeninfo var = {.activate = 0x100, .pixclock = 60}; + int fd = open("/dev/fb0", O_RDWR); + if (fd < 0) + return 1; + + if (ioctl(fd, FBIOPUT_VSCREENINFO, &var)) + return 1; + + return 0; + } + +Signed-off-by: Shile Zhang +Cc: Fredrik Noring +Cc: Daniel Vetter +Reviewed-by: Mukesh Ojha +Signed-off-by: Bartlomiej Zolnierkiewicz +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/video/fbdev/core/modedb.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/video/fbdev/core/modedb.c ++++ b/drivers/video/fbdev/core/modedb.c +@@ -933,6 +933,9 @@ void fb_var_to_videomode(struct fb_video + if (var->vmode & FB_VMODE_DOUBLE) + vtotal *= 2; + ++ if (!htotal || !vtotal) ++ return; ++ + hfreq = pixclock/htotal; + mode->refresh = hfreq/vtotal; + } diff --git a/queue-4.4/gfs2-fix-sign-extension-bug-in-gfs2_update_stats.patch b/queue-4.4/gfs2-fix-sign-extension-bug-in-gfs2_update_stats.patch new file mode 100644 index 00000000000..d9b73693919 --- /dev/null +++ b/queue-4.4/gfs2-fix-sign-extension-bug-in-gfs2_update_stats.patch @@ -0,0 +1,49 @@ +From 5a5ec83d6ac974b12085cd99b196795f14079037 Mon Sep 17 00:00:00 2001 +From: Andreas Gruenbacher +Date: Fri, 17 May 2019 19:18:43 +0100 +Subject: gfs2: Fix sign extension bug in gfs2_update_stats + +From: Andreas Gruenbacher + +commit 5a5ec83d6ac974b12085cd99b196795f14079037 upstream. + +Commit 4d207133e9c3 changed the types of the statistic values in struct +gfs2_lkstats from s64 to u64. 
Because of that, what should be a signed +value in gfs2_update_stats turned into an unsigned value. When shifted +right, we end up with a large positive value instead of a small negative +value, which results in an incorrect variance estimate. + +Fixes: 4d207133e9c3 ("gfs2: Make statistics unsigned, suitable for use with do_div()") +Signed-off-by: Andreas Gruenbacher +Cc: stable@vger.kernel.org # v4.4+ +Signed-off-by: Greg Kroah-Hartman + +--- + fs/gfs2/lock_dlm.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/fs/gfs2/lock_dlm.c ++++ b/fs/gfs2/lock_dlm.c +@@ -32,9 +32,10 @@ extern struct workqueue_struct *gfs2_con + * @delta is the difference between the current rtt sample and the + * running average srtt. We add 1/8 of that to the srtt in order to + * update the current srtt estimate. The variance estimate is a bit +- * more complicated. We subtract the abs value of the @delta from +- * the current variance estimate and add 1/4 of that to the running +- * total. ++ * more complicated. We subtract the current variance estimate from ++ * the abs value of the @delta and add 1/4 of that to the running ++ * total. That's equivalent to 3/4 of the current variance ++ * estimate plus 1/4 of the abs of @delta. 
+ * + * Note that the index points at the array entry containing the smoothed + * mean value, and the variance is always in the following entry +@@ -50,7 +51,7 @@ static inline void gfs2_update_stats(str + s64 delta = sample - s->stats[index]; + s->stats[index] += (delta >> 3); + index++; +- s->stats[index] += ((abs(delta) - s->stats[index]) >> 2); ++ s->stats[index] += (s64)(abs(delta) - s->stats[index]) >> 2; + } + + /** diff --git a/queue-4.4/hugetlb-use-same-fault-hash-key-for-shared-and-private-mappings.patch b/queue-4.4/hugetlb-use-same-fault-hash-key-for-shared-and-private-mappings.patch new file mode 100644 index 00000000000..793d7c23c39 --- /dev/null +++ b/queue-4.4/hugetlb-use-same-fault-hash-key-for-shared-and-private-mappings.patch @@ -0,0 +1,155 @@ +From 1b426bac66e6cc83c9f2d92b96e4e72acf43419a Mon Sep 17 00:00:00 2001 +From: Mike Kravetz +Date: Mon, 13 May 2019 17:19:41 -0700 +Subject: hugetlb: use same fault hash key for shared and private mappings + +From: Mike Kravetz + +commit 1b426bac66e6cc83c9f2d92b96e4e72acf43419a upstream. + +hugetlb uses a fault mutex hash table to prevent page faults of the +same pages concurrently. The key for shared and private mappings is +different. Shared keys off address_space and file index. Private keys +off mm and virtual address. Consider a private mappings of a populated +hugetlbfs file. A fault will map the page from the file and if needed +do a COW to map a writable page. + +Hugetlbfs hole punch uses the fault mutex to prevent mappings of file +pages. It uses the address_space file index key. However, private +mappings will use a different key and could race with this code to map +the file page. This causes problems (BUG) for the page cache remove +code as it expects the page to be unmapped. A sample stack is: + +page dumped because: VM_BUG_ON_PAGE(page_mapped(page)) +kernel BUG at mm/filemap.c:169! +... +RIP: 0010:unaccount_page_cache_page+0x1b8/0x200 +... 
+Call Trace: +__delete_from_page_cache+0x39/0x220 +delete_from_page_cache+0x45/0x70 +remove_inode_hugepages+0x13c/0x380 +? __add_to_page_cache_locked+0x162/0x380 +hugetlbfs_fallocate+0x403/0x540 +? _cond_resched+0x15/0x30 +? __inode_security_revalidate+0x5d/0x70 +? selinux_file_permission+0x100/0x130 +vfs_fallocate+0x13f/0x270 +ksys_fallocate+0x3c/0x80 +__x64_sys_fallocate+0x1a/0x20 +do_syscall_64+0x5b/0x180 +entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +There seems to be another potential COW issue/race with this approach +of different private and shared keys as noted in commit 8382d914ebf7 +("mm, hugetlb: improve page-fault scalability"). + +Since every hugetlb mapping (even anon and private) is actually a file +mapping, just use the address_space index key for all mappings. This +results in potentially more hash collisions. However, this should not +be the common case. + +Link: http://lkml.kernel.org/r/20190328234704.27083-3-mike.kravetz@oracle.com +Link: http://lkml.kernel.org/r/20190412165235.t4sscoujczfhuiyt@linux-r8p5 +Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages") +Signed-off-by: Mike Kravetz +Reviewed-by: Naoya Horiguchi +Reviewed-by: Davidlohr Bueso +Cc: Joonsoo Kim +Cc: "Kirill A . 
Shutemov" +Cc: Michal Hocko +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/hugetlbfs/inode.c | 7 ++----- + include/linux/hugetlb.h | 4 +--- + mm/hugetlb.c | 19 +++++-------------- + 3 files changed, 8 insertions(+), 22 deletions(-) + +--- a/fs/hugetlbfs/inode.c ++++ b/fs/hugetlbfs/inode.c +@@ -414,9 +414,7 @@ static void remove_inode_hugepages(struc + if (next >= end) + break; + +- hash = hugetlb_fault_mutex_hash(h, current->mm, +- &pseudo_vma, +- mapping, next, 0); ++ hash = hugetlb_fault_mutex_hash(h, mapping, next, 0); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + lock_page(page); +@@ -633,8 +631,7 @@ static long hugetlbfs_fallocate(struct f + addr = index * hpage_size; + + /* mutex taken here, fault path and hole punch */ +- hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, +- index, addr); ++ hash = hugetlb_fault_mutex_hash(h, mapping, index, addr); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + /* See if already present in mapping to avoid alloc/free */ +--- a/include/linux/hugetlb.h ++++ b/include/linux/hugetlb.h +@@ -91,9 +91,7 @@ void putback_active_hugepage(struct page + void free_huge_page(struct page *page); + void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve); + extern struct mutex *hugetlb_fault_mutex_table; +-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, +- struct vm_area_struct *vma, +- struct address_space *mapping, ++u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, + pgoff_t idx, unsigned long address); + + #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -3703,21 +3703,14 @@ backout_unlocked: + } + + #ifdef CONFIG_SMP +-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, +- struct vm_area_struct *vma, +- struct address_space *mapping, ++u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, + pgoff_t idx, 
unsigned long address) + { + unsigned long key[2]; + u32 hash; + +- if (vma->vm_flags & VM_SHARED) { +- key[0] = (unsigned long) mapping; +- key[1] = idx; +- } else { +- key[0] = (unsigned long) mm; +- key[1] = address >> huge_page_shift(h); +- } ++ key[0] = (unsigned long) mapping; ++ key[1] = idx; + + hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); + +@@ -3728,9 +3721,7 @@ u32 hugetlb_fault_mutex_hash(struct hsta + * For uniprocesor systems we always use a single mutex, so just + * return 0 and avoid the hashing overhead. + */ +-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, +- struct vm_area_struct *vma, +- struct address_space *mapping, ++u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, + pgoff_t idx, unsigned long address) + { + return 0; +@@ -3776,7 +3767,7 @@ int hugetlb_fault(struct mm_struct *mm, + * get spurious allocation failures if two CPUs race to instantiate + * the same page in the page cache. + */ +- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address); ++ hash = hugetlb_fault_mutex_hash(h, mapping, idx, address); + mutex_lock(&hugetlb_fault_mutex_table[hash]); + + entry = huge_ptep_get(ptep); diff --git a/queue-4.4/series b/queue-4.4/series index e479b95dc34..7c35cc81bf2 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -79,3 +79,8 @@ kvm-x86-fix-return-value-for-reserved-efer.patch bio-fix-improper-use-of-smp_mb__before_atomic.patch revert-scsi-sd-keep-disk-read-only-when-re-reading-partition.patch crypto-vmx-ctr-always-increment-iv-as-quadword.patch +gfs2-fix-sign-extension-bug-in-gfs2_update_stats.patch +btrfs-fix-race-between-ranged-fsync-and-writeback-of-adjacent-ranges.patch +btrfs-sysfs-don-t-leak-memory-when-failing-add-fsid.patch +fbdev-fix-divide-error-in-fb_var_to_videomode.patch +hugetlb-use-same-fault-hash-key-for-shared-and-private-mappings.patch