From c58dd7a7d3ea84de1c3246678772633445c4cf6b Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 2 Jul 2020 20:21:59 -0400 Subject: [PATCH] Fixes for 4.9 Signed-off-by: Sasha Levin --- ...ange-num_bytes-and-disk_num_bytes-ar.patch | 86 ++++++++ ...k-group-ref-counter-leak-after-failu.patch | 119 +++++++++++ ...lock-group-relocation-failure-due-to.patch | 201 ++++++++++++++++++ ...back-the-scrub-rate-pci-register-on-.patch | 47 ++++ ...-fix-swap-cache-node-allocation-mask.patch | 97 +++++++++ queue-4.9/series | 5 + 6 files changed, 555 insertions(+) create mode 100644 queue-4.9/btrfs-cow_file_range-num_bytes-and-disk_num_bytes-ar.patch create mode 100644 queue-4.9/btrfs-fix-a-block-group-ref-counter-leak-after-failu.patch create mode 100644 queue-4.9/btrfs-fix-data-block-group-relocation-failure-due-to.patch create mode 100644 queue-4.9/edac-amd64-read-back-the-scrub-rate-pci-register-on-.patch create mode 100644 queue-4.9/mm-fix-swap-cache-node-allocation-mask.patch create mode 100644 queue-4.9/series diff --git a/queue-4.9/btrfs-cow_file_range-num_bytes-and-disk_num_bytes-ar.patch b/queue-4.9/btrfs-cow_file_range-num_bytes-and-disk_num_bytes-ar.patch new file mode 100644 index 00000000000..3c53301c643 --- /dev/null +++ b/queue-4.9/btrfs-cow_file_range-num_bytes-and-disk_num_bytes-ar.patch @@ -0,0 +1,86 @@ +From c38c187d3bf0e7d8d2827d1c450f85e8987c38a9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 15 Feb 2018 12:29:38 +0800 +Subject: btrfs: cow_file_range() num_bytes and disk_num_bytes are same + +From: Anand Jain + +[ Upstream commit 3752d22fcea160cc2493e34f5e0e41cdd7fdd921 ] + +This patch deletes local variable disk_num_bytes as its value +is same as num_bytes in the function cow_file_range(). 
+ +Signed-off-by: Anand Jain +Reviewed-by: Nikolay Borisov +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/inode.c | 17 ++++++++--------- + 1 file changed, 8 insertions(+), 9 deletions(-) + +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index c425443c31fea..6d63050abe214 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -947,7 +947,6 @@ static noinline int cow_file_range(struct inode *inode, + u64 alloc_hint = 0; + u64 num_bytes; + unsigned long ram_size; +- u64 disk_num_bytes; + u64 cur_alloc_size; + u64 blocksize = root->sectorsize; + struct btrfs_key ins; +@@ -963,7 +962,6 @@ static noinline int cow_file_range(struct inode *inode, + + num_bytes = ALIGN(end - start + 1, blocksize); + num_bytes = max(blocksize, num_bytes); +- disk_num_bytes = num_bytes; + + /* if this is a small write inside eof, kick off defrag */ + if (num_bytes < SZ_64K && +@@ -992,16 +990,15 @@ static noinline int cow_file_range(struct inode *inode, + } + } + +- BUG_ON(disk_num_bytes > +- btrfs_super_total_bytes(root->fs_info->super_copy)); ++ BUG_ON(num_bytes > btrfs_super_total_bytes(root->fs_info->super_copy)); + + alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + +- while (disk_num_bytes > 0) { ++ while (num_bytes > 0) { + unsigned long op; + +- cur_alloc_size = disk_num_bytes; ++ cur_alloc_size = num_bytes; + ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, + root->sectorsize, 0, alloc_hint, + &ins, 1, 1); +@@ -1058,7 +1055,7 @@ static noinline int cow_file_range(struct inode *inode, + + btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + +- if (disk_num_bytes < cur_alloc_size) ++ if (num_bytes < cur_alloc_size) + break; + + /* we're not doing compressed IO, don't unlock the first +@@ -1076,8 +1073,10 @@ static noinline int cow_file_range(struct inode *inode, + delalloc_end, locked_page, + EXTENT_LOCKED | 
EXTENT_DELALLOC, + op); +- disk_num_bytes -= cur_alloc_size; +- num_bytes -= cur_alloc_size; ++ if (num_bytes < cur_alloc_size) ++ num_bytes = 0; ++ else ++ num_bytes -= cur_alloc_size; + alloc_hint = ins.objectid + ins.offset; + start += cur_alloc_size; + } +-- +2.25.1 + diff --git a/queue-4.9/btrfs-fix-a-block-group-ref-counter-leak-after-failu.patch b/queue-4.9/btrfs-fix-a-block-group-ref-counter-leak-after-failu.patch new file mode 100644 index 00000000000..716b7e696c1 --- /dev/null +++ b/queue-4.9/btrfs-fix-a-block-group-ref-counter-leak-after-failu.patch @@ -0,0 +1,119 @@ +From 55659cf78daf65c3b85ae5d618bf81bfccd8b581 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 1 Jun 2020 19:12:06 +0100 +Subject: btrfs: fix a block group ref counter leak after failure to remove + block group + +From: Filipe Manana + +[ Upstream commit 9fecd13202f520f3f25d5b1c313adb740fe19773 ] + +When removing a block group, if we fail to delete the block group's item +from the extent tree, we jump to the 'out' label and end up decrementing +the block group's reference count once only (by 1), resulting in a counter +leak because the block group at that point was already removed from the +block group cache rbtree - so we have to decrement the reference count +twice, once for the rbtree and once for our lookup at the start of the +function. + +There is a second bug where if removing the free space tree entries (the +call to remove_block_group_free_space()) fails we end up jumping to the +'out_put_group' label but end up decrementing the reference count only +once, when we should have done it twice, since we have already removed +the block group from the block group cache rbtree. This happens because +the reference count decrement for the rbtree reference happens after +attempting to remove the free space tree entries, which is far away from +the place where we remove the block group from the rbtree. 
+ +To make things less error prone, decrement the reference count for the +rbtree immediately after removing the block group from it. This also +eliminates the need for two different exit labels on error, renaming +'out_put_group' to just 'out' and removing the old 'out'. + +Fixes: f6033c5e333238 ("btrfs: fix block group leak when removing fails") +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Nikolay Borisov +Reviewed-by: Anand Jain +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/extent-tree.c | 19 +++++++++---------- + 1 file changed, 9 insertions(+), 10 deletions(-) + +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index a83f353e44188..c0033a0d00787 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -10645,7 +10645,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; +- goto out_put_group; ++ goto out; + } + + /* +@@ -10684,7 +10684,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + ret = btrfs_orphan_add(trans, inode); + if (ret) { + btrfs_add_delayed_iput(inode); +- goto out_put_group; ++ goto out; + } + clear_nlink(inode); + /* One for the block groups ref */ +@@ -10707,13 +10707,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + if (ret < 0) +- goto out_put_group; ++ goto out; + if (ret > 0) + btrfs_release_path(path); + if (ret == 0) { + ret = btrfs_del_item(trans, tree_root, path); + if (ret) +- goto out_put_group; ++ goto out; + btrfs_release_path(path); + } + +@@ -10722,6 +10722,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + &root->fs_info->block_group_cache_tree); + RB_CLEAR_NODE(&block_group->cache_node); + ++ /* Once for the block groups rbtree */ ++ btrfs_put_block_group(block_group); ++ + if (root->fs_info->first_logical_byte == 
block_group->key.objectid) + root->fs_info->first_logical_byte = (u64)-1; + spin_unlock(&root->fs_info->block_group_cache_lock); +@@ -10871,10 +10874,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + + ret = remove_block_group_free_space(trans, root->fs_info, block_group); + if (ret) +- goto out_put_group; +- +- /* Once for the block groups rbtree */ +- btrfs_put_block_group(block_group); ++ goto out; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) +@@ -10884,10 +10884,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + + ret = btrfs_del_item(trans, root, path); + +-out_put_group: ++out: + /* Once for the lookup reference */ + btrfs_put_block_group(block_group); +-out: + btrfs_free_path(path); + return ret; + } +-- +2.25.1 + diff --git a/queue-4.9/btrfs-fix-data-block-group-relocation-failure-due-to.patch b/queue-4.9/btrfs-fix-data-block-group-relocation-failure-due-to.patch new file mode 100644 index 00000000000..1aca4111e16 --- /dev/null +++ b/queue-4.9/btrfs-fix-data-block-group-relocation-failure-due-to.patch @@ -0,0 +1,201 @@ +From 1184718438be44e5164c2c5a8c60c0d0043b0f3d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 8 Jun 2020 13:32:55 +0100 +Subject: btrfs: fix data block group relocation failure due to concurrent + scrub + +From: Filipe Manana + +[ Upstream commit 432cd2a10f1c10cead91fe706ff5dc52f06d642a ] + +When running relocation of a data block group while scrub is running in +parallel, it is possible that the relocation will fail and abort the +current transaction with an -EINVAL error: + + [134243.988595] BTRFS info (device sdc): found 14 extents, stage: move data extents + [134243.999871] ------------[ cut here ]------------ + [134244.000741] BTRFS: Transaction aborted (error -22) + [134244.001692] WARNING: CPU: 0 PID: 26954 at fs/btrfs/ctree.c:1071 __btrfs_cow_block+0x6a7/0x790 [btrfs] + [134244.003380] Modules linked in: btrfs blake2b_generic xor raid6_pq (...) 
+ [134244.012577] CPU: 0 PID: 26954 Comm: btrfs Tainted: G W 5.6.0-rc7-btrfs-next-58 #5 + [134244.014162] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014 + [134244.016184] RIP: 0010:__btrfs_cow_block+0x6a7/0x790 [btrfs] + [134244.017151] Code: 48 c7 c7 (...) + [134244.020549] RSP: 0018:ffffa41607863888 EFLAGS: 00010286 + [134244.021515] RAX: 0000000000000000 RBX: ffff9614bdfe09c8 RCX: 0000000000000000 + [134244.022822] RDX: 0000000000000001 RSI: ffffffffb3d63980 RDI: 0000000000000001 + [134244.024124] RBP: ffff961589e8c000 R08: 0000000000000000 R09: 0000000000000001 + [134244.025424] R10: ffffffffc0ae5955 R11: 0000000000000000 R12: ffff9614bd530d08 + [134244.026725] R13: ffff9614ced41b88 R14: ffff9614bdfe2a48 R15: 0000000000000000 + [134244.028024] FS: 00007f29b63c08c0(0000) GS:ffff9615ba600000(0000) knlGS:0000000000000000 + [134244.029491] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [134244.030560] CR2: 00007f4eb339b000 CR3: 0000000130d6e006 CR4: 00000000003606f0 + [134244.031997] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + [134244.033153] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + [134244.034484] Call Trace: + [134244.034984] btrfs_cow_block+0x12b/0x2b0 [btrfs] + [134244.035859] do_relocation+0x30b/0x790 [btrfs] + [134244.036681] ? do_raw_spin_unlock+0x49/0xc0 + [134244.037460] ? _raw_spin_unlock+0x29/0x40 + [134244.038235] relocate_tree_blocks+0x37b/0x730 [btrfs] + [134244.039245] relocate_block_group+0x388/0x770 [btrfs] + [134244.040228] btrfs_relocate_block_group+0x161/0x2e0 [btrfs] + [134244.041323] btrfs_relocate_chunk+0x36/0x110 [btrfs] + [134244.041345] btrfs_balance+0xc06/0x1860 [btrfs] + [134244.043382] ? btrfs_ioctl_balance+0x27c/0x310 [btrfs] + [134244.045586] btrfs_ioctl_balance+0x1ed/0x310 [btrfs] + [134244.045611] btrfs_ioctl+0x1880/0x3760 [btrfs] + [134244.049043] ? do_raw_spin_unlock+0x49/0xc0 + [134244.049838] ? 
_raw_spin_unlock+0x29/0x40 + [134244.050587] ? __handle_mm_fault+0x11b3/0x14b0 + [134244.051417] ? ksys_ioctl+0x92/0xb0 + [134244.052070] ksys_ioctl+0x92/0xb0 + [134244.052701] ? trace_hardirqs_off_thunk+0x1a/0x1c + [134244.053511] __x64_sys_ioctl+0x16/0x20 + [134244.054206] do_syscall_64+0x5c/0x280 + [134244.054891] entry_SYSCALL_64_after_hwframe+0x49/0xbe + [134244.055819] RIP: 0033:0x7f29b51c9dd7 + [134244.056491] Code: 00 00 00 (...) + [134244.059767] RSP: 002b:00007ffcccc1dd08 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 + [134244.061168] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f29b51c9dd7 + [134244.062474] RDX: 00007ffcccc1dda0 RSI: 00000000c4009420 RDI: 0000000000000003 + [134244.063771] RBP: 0000000000000003 R08: 00005565cea4b000 R09: 0000000000000000 + [134244.065032] R10: 0000000000000541 R11: 0000000000000202 R12: 00007ffcccc2060a + [134244.066327] R13: 00007ffcccc1dda0 R14: 0000000000000002 R15: 00007ffcccc1dec0 + [134244.067626] irq event stamp: 0 + [134244.068202] hardirqs last enabled at (0): [<0000000000000000>] 0x0 + [134244.069351] hardirqs last disabled at (0): [] copy_process+0x74f/0x2020 + [134244.070909] softirqs last enabled at (0): [] copy_process+0x74f/0x2020 + [134244.072392] softirqs last disabled at (0): [<0000000000000000>] 0x0 + [134244.073432] ---[ end trace bd7c03622e0b0a99 ]--- + +The -EINVAL error comes from the following chain of function calls: + + __btrfs_cow_block() <-- aborts the transaction + btrfs_reloc_cow_block() + replace_file_extents() + get_new_location() <-- returns -EINVAL + +When relocating a data block group, for each allocated extent of the block +group, we preallocate another extent (at prealloc_file_extent_cluster()), +associated with the data relocation inode, and then dirty all its pages. +These preallocated extents have, and must have, the same size that extents +from the data block group being relocated have. 
+ +Later before we start the relocation stage that updates pointers (bytenr +field of file extent items) to point to the new extents, we trigger +writeback for the data relocation inode. The expectation is that writeback +will write the pages to the previously preallocated extents, that it +follows the NOCOW path. That is generally the case, however, if a scrub +is running it may have turned the block group that contains those extents +into RO mode, in which case writeback falls back to the COW path. + +However in the COW path instead of allocating exactly one extent with the +expected size, the allocator may end up allocating several smaller extents +due to free space fragmentation - because we tell it at cow_file_range() +that the minimum allocation size can match the filesystem's sector size. +This later breaks the relocation's expectation that an extent associated +to a file extent item in the data relocation inode has the same size as +the respective extent pointed by a file extent item in another tree - in +this case the extent to which the relocation inode points to is smaller, +causing relocation.c:get_new_location() to return -EINVAL. + +For example, if we are relocating a data block group X that has a logical +address of X and the block group has an extent allocated at the logical +address X + 128KiB with a size of 64KiB: + +1) At prealloc_file_extent_cluster() we allocate an extent for the data + relocation inode with a size of 64KiB and associate it to the file + offset 128KiB (X + 128KiB - X) of the data relocation inode. This + preallocated extent was allocated at block group Z; + +2) A scrub running in parallel turns block group Z into RO mode and + starts scrubbing its extents; + +3) Relocation triggers writeback for the data relocation inode; + +4) When running delalloc (btrfs_run_delalloc_range()), we try first the + NOCOW path because the data relocation inode has BTRFS_INODE_PREALLOC + set in its flags. 
However, because block group Z is in RO mode, the + NOCOW path (run_delalloc_nocow()) falls back into the COW path, by + calling cow_file_range(); + +5) At cow_file_range(), in the first iteration of the while loop we call + btrfs_reserve_extent() to allocate a 64KiB extent and pass it a minimum + allocation size of 4KiB (fs_info->sectorsize). Due to free space + fragmentation, btrfs_reserve_extent() ends up allocating two extents + of 32KiB each, each one on a different iteration of that while loop; + +6) Writeback of the data relocation inode completes; + +7) Relocation proceeds and ends up at relocation.c:replace_file_extents(), + with a leaf which has a file extent item that points to the data extent + from block group X, that has a logical address (bytenr) of X + 128KiB + and a size of 64KiB. Then it calls get_new_location(), which does a + lookup in the data relocation tree for a file extent item starting at + offset 128KiB (X + 128KiB - X) and belonging to the data relocation + inode. It finds a corresponding file extent item, however that item + points to an extent that has a size of 32KiB, which doesn't match the + expected size of 64KiB, resulting in -EINVAL being returned from this + function and propagated up to __btrfs_cow_block(), which aborts the + current transaction. + +To fix this make sure that at cow_file_range() when we call the allocator +we pass it a minimum allocation size corresponding to the desired extent size +if the inode belongs to the data relocation tree, otherwise pass it the +filesystem's sector size as the minimum allocation size. 
+ +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/inode.c | 19 ++++++++++++++++++- + 1 file changed, 18 insertions(+), 1 deletion(-) + +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 6d63050abe214..dfc0b3adf57af 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -947,6 +947,7 @@ static noinline int cow_file_range(struct inode *inode, + u64 alloc_hint = 0; + u64 num_bytes; + unsigned long ram_size; ++ u64 min_alloc_size; + u64 cur_alloc_size; + u64 blocksize = root->sectorsize; + struct btrfs_key ins; +@@ -995,12 +996,28 @@ static noinline int cow_file_range(struct inode *inode, + alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + ++ /* ++ * Relocation relies on the relocated extents to have exactly the same ++ * size as the original extents. Normally writeback for relocation data ++ * extents follows a NOCOW path because relocation preallocates the ++ * extents. However, due to an operation such as scrub turning a block ++ * group to RO mode, it may fallback to COW mode, so we must make sure ++ * an extent allocated during COW has exactly the requested size and can ++ * not be split into smaller extents, otherwise relocation breaks and ++ * fails during the stage where it updates the bytenr of file extent ++ * items. 
++ */ ++ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) ++ min_alloc_size = num_bytes; ++ else ++ min_alloc_size = root->sectorsize; ++ + while (num_bytes > 0) { + unsigned long op; + + cur_alloc_size = num_bytes; + ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, +- root->sectorsize, 0, alloc_hint, ++ min_alloc_size, 0, alloc_hint, + &ins, 1, 1); + if (ret < 0) + goto out_unlock; +-- +2.25.1 + diff --git a/queue-4.9/edac-amd64-read-back-the-scrub-rate-pci-register-on-.patch b/queue-4.9/edac-amd64-read-back-the-scrub-rate-pci-register-on-.patch new file mode 100644 index 00000000000..c6a3fba069b --- /dev/null +++ b/queue-4.9/edac-amd64-read-back-the-scrub-rate-pci-register-on-.patch @@ -0,0 +1,47 @@ +From 85e6ae137b1c9bf163a0ed518c0d29b8452ebb3b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jun 2020 20:25:25 +0200 +Subject: EDAC/amd64: Read back the scrub rate PCI register on F15h + +From: Borislav Petkov + +[ Upstream commit ee470bb25d0dcdf126f586ec0ae6dca66cb340a4 ] + +Commit: + + da92110dfdfa ("EDAC, amd64_edac: Extend scrub rate support to F15hM60h") + +added support for F15h, model 0x60 CPUs but in doing so, missed to read +back SCRCTRL PCI config register on F15h CPUs which are *not* model +0x60. Add that read so that doing + + $ cat /sys/devices/system/edac/mc/mc0/sdram_scrub_rate + +can show the previously set DRAM scrub rate. + +Fixes: da92110dfdfa ("EDAC, amd64_edac: Extend scrub rate support to F15hM60h") +Reported-by: Anders Andersson +Signed-off-by: Borislav Petkov +Cc: #v4.4.. 
+Link: https://lkml.kernel.org/r/CAKkunMbNWppx_i6xSdDHLseA2QQmGJqj_crY=NF-GZML5np4Vw@mail.gmail.com +Signed-off-by: Sasha Levin +--- + drivers/edac/amd64_edac.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c +index 1c5f23224b3cb..020dd07d1c23a 100644 +--- a/drivers/edac/amd64_edac.c ++++ b/drivers/edac/amd64_edac.c +@@ -243,6 +243,8 @@ static int get_scrub_rate(struct mem_ctl_info *mci) + + if (pvt->model == 0x60) + amd64_read_pci_cfg(pvt->F2, F15H_M60H_SCRCTRL, &scrubval); ++ else ++ amd64_read_pci_cfg(pvt->F3, SCRCTRL, &scrubval); + } else + amd64_read_pci_cfg(pvt->F3, SCRCTRL, &scrubval); + +-- +2.25.1 + diff --git a/queue-4.9/mm-fix-swap-cache-node-allocation-mask.patch b/queue-4.9/mm-fix-swap-cache-node-allocation-mask.patch new file mode 100644 index 00000000000..a2d1e0918b4 --- /dev/null +++ b/queue-4.9/mm-fix-swap-cache-node-allocation-mask.patch @@ -0,0 +1,97 @@ +From 05169e6bafcbfbb07962e053706e06db389f5ada Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 25 Jun 2020 20:29:59 -0700 +Subject: mm: fix swap cache node allocation mask + +From: Hugh Dickins + +[ Upstream commit 243bce09c91b0145aeaedd5afba799d81841c030 ] + +Chris Murphy reports that a slightly overcommitted load, testing swap +and zram along with i915, splats and keeps on splatting, when it had +better fail less noisily: + + gnome-shell: page allocation failure: order:0, + mode:0x400d0(__GFP_IO|__GFP_FS|__GFP_COMP|__GFP_RECLAIMABLE), + nodemask=(null),cpuset=/,mems_allowed=0 + CPU: 2 PID: 1155 Comm: gnome-shell Not tainted 5.7.0-1.fc33.x86_64 #1 + Call Trace: + dump_stack+0x64/0x88 + warn_alloc.cold+0x75/0xd9 + __alloc_pages_slowpath.constprop.0+0xcfa/0xd30 + __alloc_pages_nodemask+0x2df/0x320 + alloc_slab_page+0x195/0x310 + allocate_slab+0x3c5/0x440 + ___slab_alloc+0x40c/0x5f0 + __slab_alloc+0x1c/0x30 + kmem_cache_alloc+0x20e/0x220 + xas_nomem+0x28/0x70 + add_to_swap_cache+0x321/0x400 + __read_swap_cache_async+0x105/0x240 + 
swap_cluster_readahead+0x22c/0x2e0 + shmem_swapin+0x8e/0xc0 + shmem_swapin_page+0x196/0x740 + shmem_getpage_gfp+0x3a2/0xa60 + shmem_read_mapping_page_gfp+0x32/0x60 + shmem_get_pages+0x155/0x5e0 [i915] + __i915_gem_object_get_pages+0x68/0xa0 [i915] + i915_vma_pin+0x3fe/0x6c0 [i915] + eb_add_vma+0x10b/0x2c0 [i915] + i915_gem_do_execbuffer+0x704/0x3430 [i915] + i915_gem_execbuffer2_ioctl+0x1ea/0x3e0 [i915] + drm_ioctl_kernel+0x86/0xd0 [drm] + drm_ioctl+0x206/0x390 [drm] + ksys_ioctl+0x82/0xc0 + __x64_sys_ioctl+0x16/0x20 + do_syscall_64+0x5b/0xf0 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +Reported on 5.7, but it goes back really to 3.1: when +shmem_read_mapping_page_gfp() was implemented for use by i915, and +allowed for __GFP_NORETRY and __GFP_NOWARN flags in most places, but +missed swapin's "& GFP_KERNEL" mask for page tree node allocation in +__read_swap_cache_async() - that was to mask off HIGHUSER_MOVABLE bits +from what page cache uses, but GFP_RECLAIM_MASK is now what's needed. + +Link: https://bugzilla.kernel.org/show_bug.cgi?id=208085 +Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2006151330070.11064@eggly.anvils +Fixes: 68da9f055755 ("tmpfs: pass gfp to shmem_getpage_gfp") +Signed-off-by: Hugh Dickins +Reviewed-by: Vlastimil Babka +Reviewed-by: Matthew Wilcox (Oracle) +Reported-by: Chris Murphy +Analyzed-by: Vlastimil Babka +Analyzed-by: Matthew Wilcox +Tested-by: Chris Murphy +Cc: [3.1+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + mm/swap_state.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/mm/swap_state.c b/mm/swap_state.c +index 35d7e0ee1c77c..f5cb6b23cedaf 100644 +--- a/mm/swap_state.c ++++ b/mm/swap_state.c +@@ -19,6 +19,7 @@ + #include + + #include ++#include "internal.h" + + /* + * swapper_space is a fiction, retained to simplify the path through +@@ -326,7 +327,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, + /* + * call 
radix_tree_preload() while we can wait. + */ +- err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); ++ err = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK); + if (err) + break; + +-- +2.25.1 + diff --git a/queue-4.9/series b/queue-4.9/series new file mode 100644 index 00000000000..5b2b3c753ef --- /dev/null +++ b/queue-4.9/series @@ -0,0 +1,5 @@ +btrfs-fix-a-block-group-ref-counter-leak-after-failu.patch +btrfs-cow_file_range-num_bytes-and-disk_num_bytes-ar.patch +btrfs-fix-data-block-group-relocation-failure-due-to.patch +mm-fix-swap-cache-node-allocation-mask.patch +edac-amd64-read-back-the-scrub-rate-pci-register-on-.patch -- 2.47.3