From: Greg Kroah-Hartman Date: Sat, 28 Nov 2020 12:38:41 +0000 (+0100) Subject: 5.9-stable patches X-Git-Tag: v4.4.247~41 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=06d3ca5474323d7adfefbe76a53b32caa2488c9e;p=thirdparty%2Fkernel%2Fstable-queue.git 5.9-stable patches added patches: btrfs-don-t-access-possibly-stale-fs_info-data-for-printing-duplicate-device.patch btrfs-fix-lockdep-splat-when-reading-qgroup-config-on-mount.patch btrfs-fix-missing-delalloc-new-bit-for-new-delalloc-ranges.patch btrfs-tree-checker-add-missing-return-after-error-in-root_item.patch btrfs-tree-checker-add-missing-returns-after-data_ref-alignment-checks.patch ib-hfi1-ensure-correct-mm-is-used-at-all-times.patch rdma-i40iw-address-an-mmap-handler-exploit-in-i40iw.patch --- diff --git a/queue-4.4/series b/queue-4.4/series new file mode 100644 index 00000000000..e69de29bb2d diff --git a/queue-5.9/btrfs-don-t-access-possibly-stale-fs_info-data-for-printing-duplicate-device.patch b/queue-5.9/btrfs-don-t-access-possibly-stale-fs_info-data-for-printing-duplicate-device.patch new file mode 100644 index 00000000000..4f185adea87 --- /dev/null +++ b/queue-5.9/btrfs-don-t-access-possibly-stale-fs_info-data-for-printing-duplicate-device.patch @@ -0,0 +1,171 @@ +From 0697d9a610998b8bdee6b2390836cb2391d8fd1a Mon Sep 17 00:00:00 2001 +From: Johannes Thumshirn +Date: Wed, 18 Nov 2020 18:03:26 +0900 +Subject: btrfs: don't access possibly stale fs_info data for printing duplicate device + +From: Johannes Thumshirn + +commit 0697d9a610998b8bdee6b2390836cb2391d8fd1a upstream. + +Syzbot reported a possible use-after-free when printing a duplicate device +warning device_list_add(). + +At this point it can happen that a btrfs_device::fs_info is not correctly +setup yet, so we're accessing stale data, when printing the warning +message using the btrfs_printk() wrappers. 
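In reduced form, the defensive pattern applied further below is a printk-style wrapper that only dereferences its context argument when the caller can vouch for it, and prints without the device name otherwise. The following is a minimal userspace sketch for illustration only; warn_in_rcu() and struct fs_info here are simplified stand-ins, not the real btrfs helpers.

  /* Stand-in for btrfs_warn_in_rcu(): dereference fs_info only if the
   * caller passed a pointer it can still trust; NULL skips the name. */
  #include <stdarg.h>
  #include <stdio.h>

  struct fs_info {                /* mock of struct btrfs_fs_info */
          char sb_id[32];         /* mock of fs_info->sb->s_id    */
  };

  static void warn_in_rcu(const struct fs_info *fs_info, const char *fmt, ...)
  {
          va_list ap;

          if (fs_info)
                  fprintf(stderr, "BTRFS warning (device %s): ", fs_info->sb_id);
          else
                  fprintf(stderr, "BTRFS warning: ");
          va_start(ap, fmt);
          vfprintf(stderr, fmt, ap);
          va_end(ap);
          fputc('\n', stderr);
  }

  int main(void)
  {
          /* duplicate-device path: fs_info may already be freed, pass NULL */
          warn_in_rcu(NULL, "duplicate device %s devid %llu", "/dev/loop0", 1ULL);
          return 0;
  }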
+ + ================================================================== + BUG: KASAN: use-after-free in btrfs_printk+0x3eb/0x435 fs/btrfs/super.c:245 + Read of size 8 at addr ffff8880878e06a8 by task syz-executor225/7068 + + CPU: 1 PID: 7068 Comm: syz-executor225 Not tainted 5.9.0-rc5-syzkaller #0 + Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 + Call Trace: + __dump_stack lib/dump_stack.c:77 [inline] + dump_stack+0x1d6/0x29e lib/dump_stack.c:118 + print_address_description+0x66/0x620 mm/kasan/report.c:383 + __kasan_report mm/kasan/report.c:513 [inline] + kasan_report+0x132/0x1d0 mm/kasan/report.c:530 + btrfs_printk+0x3eb/0x435 fs/btrfs/super.c:245 + device_list_add+0x1a88/0x1d60 fs/btrfs/volumes.c:943 + btrfs_scan_one_device+0x196/0x490 fs/btrfs/volumes.c:1359 + btrfs_mount_root+0x48f/0xb60 fs/btrfs/super.c:1634 + legacy_get_tree+0xea/0x180 fs/fs_context.c:592 + vfs_get_tree+0x88/0x270 fs/super.c:1547 + fc_mount fs/namespace.c:978 [inline] + vfs_kern_mount+0xc9/0x160 fs/namespace.c:1008 + btrfs_mount+0x33c/0xae0 fs/btrfs/super.c:1732 + legacy_get_tree+0xea/0x180 fs/fs_context.c:592 + vfs_get_tree+0x88/0x270 fs/super.c:1547 + do_new_mount fs/namespace.c:2875 [inline] + path_mount+0x179d/0x29e0 fs/namespace.c:3192 + do_mount fs/namespace.c:3205 [inline] + __do_sys_mount fs/namespace.c:3413 [inline] + __se_sys_mount+0x126/0x180 fs/namespace.c:3390 + do_syscall_64+0x31/0x70 arch/x86/entry/common.c:46 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + RIP: 0033:0x44840a + RSP: 002b:00007ffedfffd608 EFLAGS: 00000293 ORIG_RAX: 00000000000000a5 + RAX: ffffffffffffffda RBX: 00007ffedfffd670 RCX: 000000000044840a + RDX: 0000000020000000 RSI: 0000000020000100 RDI: 00007ffedfffd630 + RBP: 00007ffedfffd630 R08: 00007ffedfffd670 R09: 0000000000000000 + R10: 0000000000000000 R11: 0000000000000293 R12: 000000000000001a + R13: 0000000000000004 R14: 0000000000000003 R15: 0000000000000003 + + Allocated by task 6945: + kasan_save_stack mm/kasan/common.c:48 [inline] + kasan_set_track mm/kasan/common.c:56 [inline] + __kasan_kmalloc+0x100/0x130 mm/kasan/common.c:461 + kmalloc_node include/linux/slab.h:577 [inline] + kvmalloc_node+0x81/0x110 mm/util.c:574 + kvmalloc include/linux/mm.h:757 [inline] + kvzalloc include/linux/mm.h:765 [inline] + btrfs_mount_root+0xd0/0xb60 fs/btrfs/super.c:1613 + legacy_get_tree+0xea/0x180 fs/fs_context.c:592 + vfs_get_tree+0x88/0x270 fs/super.c:1547 + fc_mount fs/namespace.c:978 [inline] + vfs_kern_mount+0xc9/0x160 fs/namespace.c:1008 + btrfs_mount+0x33c/0xae0 fs/btrfs/super.c:1732 + legacy_get_tree+0xea/0x180 fs/fs_context.c:592 + vfs_get_tree+0x88/0x270 fs/super.c:1547 + do_new_mount fs/namespace.c:2875 [inline] + path_mount+0x179d/0x29e0 fs/namespace.c:3192 + do_mount fs/namespace.c:3205 [inline] + __do_sys_mount fs/namespace.c:3413 [inline] + __se_sys_mount+0x126/0x180 fs/namespace.c:3390 + do_syscall_64+0x31/0x70 arch/x86/entry/common.c:46 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + Freed by task 6945: + kasan_save_stack mm/kasan/common.c:48 [inline] + kasan_set_track+0x3d/0x70 mm/kasan/common.c:56 + kasan_set_free_info+0x17/0x30 mm/kasan/generic.c:355 + __kasan_slab_free+0xdd/0x110 mm/kasan/common.c:422 + __cache_free mm/slab.c:3418 [inline] + kfree+0x113/0x200 mm/slab.c:3756 + deactivate_locked_super+0xa7/0xf0 fs/super.c:335 + btrfs_mount_root+0x72b/0xb60 fs/btrfs/super.c:1678 + legacy_get_tree+0xea/0x180 fs/fs_context.c:592 + vfs_get_tree+0x88/0x270 fs/super.c:1547 + fc_mount fs/namespace.c:978 [inline] + vfs_kern_mount+0xc9/0x160 
fs/namespace.c:1008 + btrfs_mount+0x33c/0xae0 fs/btrfs/super.c:1732 + legacy_get_tree+0xea/0x180 fs/fs_context.c:592 + vfs_get_tree+0x88/0x270 fs/super.c:1547 + do_new_mount fs/namespace.c:2875 [inline] + path_mount+0x179d/0x29e0 fs/namespace.c:3192 + do_mount fs/namespace.c:3205 [inline] + __do_sys_mount fs/namespace.c:3413 [inline] + __se_sys_mount+0x126/0x180 fs/namespace.c:3390 + do_syscall_64+0x31/0x70 arch/x86/entry/common.c:46 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + + The buggy address belongs to the object at ffff8880878e0000 + which belongs to the cache kmalloc-16k of size 16384 + The buggy address is located 1704 bytes inside of + 16384-byte region [ffff8880878e0000, ffff8880878e4000) + The buggy address belongs to the page: + page:0000000060704f30 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x878e0 + head:0000000060704f30 order:3 compound_mapcount:0 compound_pincount:0 + flags: 0xfffe0000010200(slab|head) + raw: 00fffe0000010200 ffffea00028e9a08 ffffea00021e3608 ffff8880aa440b00 + raw: 0000000000000000 ffff8880878e0000 0000000100000001 0000000000000000 + page dumped because: kasan: bad access detected + + Memory state around the buggy address: + ffff8880878e0580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ffff8880878e0600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + >ffff8880878e0680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ^ + ffff8880878e0700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ffff8880878e0780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ================================================================== + +The syzkaller reproducer for this use-after-free crafts a filesystem image +and loop mounts it twice in a loop. The mount will fail as the crafted +image has an invalid chunk tree. When this happens btrfs_mount_root() will +call deactivate_locked_super(), which then cleans up fs_info and +fs_info::sb. If a second thread now adds the same block-device to the +filesystem, it will get detected as a duplicate device and +device_list_add() will reject the duplicate and print a warning. But as +the fs_info pointer passed in is non-NULL this will result in a +use-after-free. + +Instead of printing possibly uninitialized or already freed memory in +btrfs_printk(), explicitly pass in a NULL fs_info so the printing of the +device name will be skipped altogether. + +There was a slightly different approach discussed in +https://lore.kernel.org/linux-btrfs/20200114060920.4527-1-anand.jain@oracle.com/t/#u + +Link: https://lore.kernel.org/linux-btrfs/000000000000c9e14b05afcc41ba@google.com +Reported-by: syzbot+582e66e5edf36a22c7b0@syzkaller.appspotmail.com +CC: stable@vger.kernel.org # 4.19+ +Reviewed-by: Nikolay Borisov +Reviewed-by: Anand Jain +Signed-off-by: Johannes Thumshirn +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/volumes.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -941,7 +941,13 @@ static noinline struct btrfs_device *dev + if (device->bdev != path_bdev) { + bdput(path_bdev); + mutex_unlock(&fs_devices->device_list_mutex); +- btrfs_warn_in_rcu(device->fs_info, ++ /* ++ * device->fs_info may not be reliable here, so ++ * pass in a NULL instead. This avoids a ++ * possible use-after-free when the fs_info and ++ * fs_info->sb are already torn down. 
++ */ ++ btrfs_warn_in_rcu(NULL, + "duplicate device %s devid %llu generation %llu scanned by %s (%d)", + path, devid, found_transid, + current->comm, diff --git a/queue-5.9/btrfs-fix-lockdep-splat-when-reading-qgroup-config-on-mount.patch b/queue-5.9/btrfs-fix-lockdep-splat-when-reading-qgroup-config-on-mount.patch new file mode 100644 index 00000000000..07f87918784 --- /dev/null +++ b/queue-5.9/btrfs-fix-lockdep-splat-when-reading-qgroup-config-on-mount.patch @@ -0,0 +1,162 @@ +From 3d05cad3c357a2b749912914356072b38435edfa Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 23 Nov 2020 14:28:44 +0000 +Subject: btrfs: fix lockdep splat when reading qgroup config on mount + +From: Filipe Manana + +commit 3d05cad3c357a2b749912914356072b38435edfa upstream. + +Lockdep reported the following splat when running test btrfs/190 from +fstests: + + [ 9482.126098] ====================================================== + [ 9482.126184] WARNING: possible circular locking dependency detected + [ 9482.126281] 5.10.0-rc4-btrfs-next-73 #1 Not tainted + [ 9482.126365] ------------------------------------------------------ + [ 9482.126456] mount/24187 is trying to acquire lock: + [ 9482.126534] ffffa0c869a7dac0 (&fs_info->qgroup_rescan_lock){+.+.}-{3:3}, at: qgroup_rescan_init+0x43/0xf0 [btrfs] + [ 9482.126647] + but task is already holding lock: + [ 9482.126777] ffffa0c892ebd3a0 (btrfs-quota-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x27/0x120 [btrfs] + [ 9482.126886] + which lock already depends on the new lock. + + [ 9482.127078] + the existing dependency chain (in reverse order) is: + [ 9482.127213] + -> #1 (btrfs-quota-00){++++}-{3:3}: + [ 9482.127366] lock_acquire+0xd8/0x490 + [ 9482.127436] down_read_nested+0x45/0x220 + [ 9482.127528] __btrfs_tree_read_lock+0x27/0x120 [btrfs] + [ 9482.127613] btrfs_read_lock_root_node+0x41/0x130 [btrfs] + [ 9482.127702] btrfs_search_slot+0x514/0xc30 [btrfs] + [ 9482.127788] update_qgroup_status_item+0x72/0x140 [btrfs] + [ 9482.127877] btrfs_qgroup_rescan_worker+0xde/0x680 [btrfs] + [ 9482.127964] btrfs_work_helper+0xf1/0x600 [btrfs] + [ 9482.128039] process_one_work+0x24e/0x5e0 + [ 9482.128110] worker_thread+0x50/0x3b0 + [ 9482.128181] kthread+0x153/0x170 + [ 9482.128256] ret_from_fork+0x22/0x30 + [ 9482.128327] + -> #0 (&fs_info->qgroup_rescan_lock){+.+.}-{3:3}: + [ 9482.128464] check_prev_add+0x91/0xc60 + [ 9482.128551] __lock_acquire+0x1740/0x3110 + [ 9482.128623] lock_acquire+0xd8/0x490 + [ 9482.130029] __mutex_lock+0xa3/0xb30 + [ 9482.130590] qgroup_rescan_init+0x43/0xf0 [btrfs] + [ 9482.131577] btrfs_read_qgroup_config+0x43a/0x550 [btrfs] + [ 9482.132175] open_ctree+0x1228/0x18a0 [btrfs] + [ 9482.132756] btrfs_mount_root.cold+0x13/0xed [btrfs] + [ 9482.133325] legacy_get_tree+0x30/0x60 + [ 9482.133866] vfs_get_tree+0x28/0xe0 + [ 9482.134392] fc_mount+0xe/0x40 + [ 9482.134908] vfs_kern_mount.part.0+0x71/0x90 + [ 9482.135428] btrfs_mount+0x13b/0x3e0 [btrfs] + [ 9482.135942] legacy_get_tree+0x30/0x60 + [ 9482.136444] vfs_get_tree+0x28/0xe0 + [ 9482.136949] path_mount+0x2d7/0xa70 + [ 9482.137438] do_mount+0x75/0x90 + [ 9482.137923] __x64_sys_mount+0x8e/0xd0 + [ 9482.138400] do_syscall_64+0x33/0x80 + [ 9482.138873] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + [ 9482.139346] + other info that might help us debug this: + + [ 9482.140735] Possible unsafe locking scenario: + + [ 9482.141594] CPU0 CPU1 + [ 9482.142011] ---- ---- + [ 9482.142411] lock(btrfs-quota-00); + [ 9482.142806] lock(&fs_info->qgroup_rescan_lock); + [ 9482.143216] lock(btrfs-quota-00); + [ 
9482.143629] lock(&fs_info->qgroup_rescan_lock); + [ 9482.144056] + *** DEADLOCK *** + + [ 9482.145242] 2 locks held by mount/24187: + [ 9482.145637] #0: ffffa0c8411c40e8 (&type->s_umount_key#44/1){+.+.}-{3:3}, at: alloc_super+0xb9/0x400 + [ 9482.146061] #1: ffffa0c892ebd3a0 (btrfs-quota-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x27/0x120 [btrfs] + [ 9482.146509] + stack backtrace: + [ 9482.147350] CPU: 1 PID: 24187 Comm: mount Not tainted 5.10.0-rc4-btrfs-next-73 #1 + [ 9482.147788] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 + [ 9482.148709] Call Trace: + [ 9482.149169] dump_stack+0x8d/0xb5 + [ 9482.149628] check_noncircular+0xff/0x110 + [ 9482.150090] check_prev_add+0x91/0xc60 + [ 9482.150561] ? kvm_clock_read+0x14/0x30 + [ 9482.151017] ? kvm_sched_clock_read+0x5/0x10 + [ 9482.151470] __lock_acquire+0x1740/0x3110 + [ 9482.151941] ? __btrfs_tree_read_lock+0x27/0x120 [btrfs] + [ 9482.152402] lock_acquire+0xd8/0x490 + [ 9482.152887] ? qgroup_rescan_init+0x43/0xf0 [btrfs] + [ 9482.153354] __mutex_lock+0xa3/0xb30 + [ 9482.153826] ? qgroup_rescan_init+0x43/0xf0 [btrfs] + [ 9482.154301] ? qgroup_rescan_init+0x43/0xf0 [btrfs] + [ 9482.154768] ? qgroup_rescan_init+0x43/0xf0 [btrfs] + [ 9482.155226] qgroup_rescan_init+0x43/0xf0 [btrfs] + [ 9482.155690] btrfs_read_qgroup_config+0x43a/0x550 [btrfs] + [ 9482.156160] open_ctree+0x1228/0x18a0 [btrfs] + [ 9482.156643] btrfs_mount_root.cold+0x13/0xed [btrfs] + [ 9482.157108] ? rcu_read_lock_sched_held+0x5d/0x90 + [ 9482.157567] ? kfree+0x31f/0x3e0 + [ 9482.158030] legacy_get_tree+0x30/0x60 + [ 9482.158489] vfs_get_tree+0x28/0xe0 + [ 9482.158947] fc_mount+0xe/0x40 + [ 9482.159403] vfs_kern_mount.part.0+0x71/0x90 + [ 9482.159875] btrfs_mount+0x13b/0x3e0 [btrfs] + [ 9482.160335] ? rcu_read_lock_sched_held+0x5d/0x90 + [ 9482.160805] ? kfree+0x31f/0x3e0 + [ 9482.161260] ? legacy_get_tree+0x30/0x60 + [ 9482.161714] legacy_get_tree+0x30/0x60 + [ 9482.162166] vfs_get_tree+0x28/0xe0 + [ 9482.162616] path_mount+0x2d7/0xa70 + [ 9482.163070] do_mount+0x75/0x90 + [ 9482.163525] __x64_sys_mount+0x8e/0xd0 + [ 9482.163986] do_syscall_64+0x33/0x80 + [ 9482.164437] entry_SYSCALL_64_after_hwframe+0x44/0xa9 + [ 9482.164902] RIP: 0033:0x7f51e907caaa + +This happens because at btrfs_read_qgroup_config() we can call +qgroup_rescan_init() while holding a read lock on a quota btree leaf, +acquired by the previous call to btrfs_search_slot_for_read(), and +qgroup_rescan_init() acquires the mutex qgroup_rescan_lock. + +A qgroup rescan worker does the opposite: it acquires the mutex +qgroup_rescan_lock, at btrfs_qgroup_rescan_worker(), and then tries to +update the qgroup status item in the quota btree through the call to +update_qgroup_status_item(). This inversion of locking order +between the qgroup_rescan_lock mutex and quota btree locks causes the +splat. + +Fix this simply by releasing and freeing the path before calling +qgroup_rescan_init() at btrfs_read_qgroup_config(). 
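The inversion can be reproduced in miniature with two plain mutexes. The sketch below uses pthreads and made-up thread names purely to illustrate why dropping the tree lock before taking qgroup_rescan_lock removes the cycle lockdep complains about; it is not btrfs code.

  #include <pthread.h>

  static pthread_mutex_t tree_lock   = PTHREAD_MUTEX_INITIALIZER; /* quota btree leaf lock        */
  static pthread_mutex_t rescan_lock = PTHREAD_MUTEX_INITIALIZER; /* fs_info->qgroup_rescan_lock  */

  /* mount path: with the fix, the tree lock is dropped before rescan init */
  static void *mount_path(void *arg)
  {
          (void)arg;
          pthread_mutex_lock(&tree_lock);      /* btrfs_search_slot_for_read() */
          /* ... read qgroup items ... */
          pthread_mutex_unlock(&tree_lock);    /* btrfs_free_path() comes first now */
          pthread_mutex_lock(&rescan_lock);    /* qgroup_rescan_init()         */
          pthread_mutex_unlock(&rescan_lock);
          return NULL;
  }

  /* rescan worker: always rescan_lock first, then the quota tree */
  static void *rescan_worker(void *arg)
  {
          (void)arg;
          pthread_mutex_lock(&rescan_lock);
          pthread_mutex_lock(&tree_lock);      /* update_qgroup_status_item()  */
          pthread_mutex_unlock(&tree_lock);
          pthread_mutex_unlock(&rescan_lock);
          return NULL;
  }

  int main(void)
  {
          pthread_t t1, t2;

          pthread_create(&t1, NULL, mount_path, NULL);
          pthread_create(&t2, NULL, rescan_worker, NULL);
          pthread_join(t1, NULL);
          pthread_join(t2, NULL);
          return 0;
  }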
+ +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/qgroup.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -497,13 +497,13 @@ next2: + break; + } + out: ++ btrfs_free_path(path); + fs_info->qgroup_flags |= flags; + if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) + clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && + ret >= 0) + ret = qgroup_rescan_init(fs_info, rescan_progress, 0); +- btrfs_free_path(path); + + if (ret < 0) { + ulist_free(fs_info->qgroup_ulist); diff --git a/queue-5.9/btrfs-fix-missing-delalloc-new-bit-for-new-delalloc-ranges.patch b/queue-5.9/btrfs-fix-missing-delalloc-new-bit-for-new-delalloc-ranges.patch new file mode 100644 index 00000000000..2c76d7b86f4 --- /dev/null +++ b/queue-5.9/btrfs-fix-missing-delalloc-new-bit-for-new-delalloc-ranges.patch @@ -0,0 +1,280 @@ +From c334730988ee07908ba4eb816ce78d3fe06fecaa Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 4 Nov 2020 11:07:31 +0000 +Subject: btrfs: fix missing delalloc new bit for new delalloc ranges + +From: Filipe Manana + +commit c334730988ee07908ba4eb816ce78d3fe06fecaa upstream. + +When doing a buffered write, through one of the write family syscalls, we +look for ranges which currently don't have allocated extents and set the +'delalloc new' bit on them, so that we can report a correct number of used +blocks to the stat(2) syscall until delalloc is flushed and ordered extents +complete. + +However there are a few other places where we can do a buffered write +against a range that is mapped to a hole (no extent allocated) and where +we do not set the 'new delalloc' bit. Those places are: + +- Doing a memory mapped write against a hole; + +- Cloning an inline extent into a hole starting at file offset 0; + +- Calling btrfs_cont_expand() when the i_size of the file is not aligned + to the sector size and is located in a hole. For example when cloning + to a destination offset beyond EOF. + +So after such cases, until the corresponding delalloc range is flushed and +the respective ordered extents complete, we can report an incorrect number +of blocks used through the stat(2) syscall. 
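Concretely, the number being misreported is the st_blocks field returned by stat(2), counted in 512-byte units. A minimal sketch of how a tool observes it, using the same file path as the reproducer further below, is:

  #include <stdio.h>
  #include <sys/stat.h>

  int main(void)
  {
          struct stat st;

          if (stat("/mnt/sdi/foo", &st) != 0) {
                  perror("stat");
                  return 1;
          }
          /* st_blocks counts 512-byte units actually allocated to the file */
          printf("size %lld bytes, %lld blocks allocated\n",
                 (long long)st.st_size, (long long)st.st_blocks);
          if (st.st_size > 0 && st.st_blocks == 0)
                  printf("file looks entirely sparse\n");
          return 0;
  }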
+ +In some cases we can end up reporting 0 used blocks to stat(2), which is a +particular bad value to report as it may mislead tools to think a file is +completely sparse when its i_size is not zero, making them skip reading +any data, an undesired consequence for tools such as archivers and other +backup tools, as reported a long time ago in the following thread (and +other past threads): + + https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html + +Example reproducer: + + $ cat reproducer.sh + #!/bin/bash + + MNT=/mnt/sdi + DEV=/dev/sdi + + mkfs.btrfs -f $DEV > /dev/null + # mkfs.xfs -f $DEV > /dev/null + # mkfs.ext4 -F $DEV > /dev/null + # mkfs.f2fs -f $DEV > /dev/null + mount $DEV $MNT + + xfs_io -f -c "truncate 64K" \ + -c "mmap -w 0 64K" \ + -c "mwrite -S 0xab 0 64K" \ + -c "munmap" \ + $MNT/foo + + blocks_used=$(stat -c %b $MNT/foo) + echo "blocks used: $blocks_used" + + if [ $blocks_used -eq 0 ]; then + echo "ERROR: blocks used is 0" + fi + + umount $DEV + + $ ./reproducer.sh + blocks used: 0 + ERROR: blocks used is 0 + +So move the logic that decides to set the 'delalloc bit' bit into the +function btrfs_set_extent_delalloc(), since that is what we use for all +those missing cases as well as for the cases that currently work well. + +This change is also preparatory work for an upcoming patch that fixes +other problems related to tracking and reporting the number of bytes used +by an inode. + +CC: stable@vger.kernel.org # 4.19+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 57 ------------------------------------------ + fs/btrfs/inode.c | 58 +++++++++++++++++++++++++++++++++++++++++++ + fs/btrfs/tests/inode-tests.c | 12 +++++--- + 3 files changed, 66 insertions(+), 61 deletions(-) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -452,46 +452,6 @@ static void btrfs_drop_pages(struct page + } + } + +-static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, +- const u64 start, +- const u64 len, +- struct extent_state **cached_state) +-{ +- u64 search_start = start; +- const u64 end = start + len - 1; +- +- while (search_start < end) { +- const u64 search_len = end - search_start + 1; +- struct extent_map *em; +- u64 em_len; +- int ret = 0; +- +- em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); +- if (IS_ERR(em)) +- return PTR_ERR(em); +- +- if (em->block_start != EXTENT_MAP_HOLE) +- goto next; +- +- em_len = em->len; +- if (em->start < search_start) +- em_len -= search_start - em->start; +- if (em_len > search_len) +- em_len = search_len; +- +- ret = set_extent_bit(&inode->io_tree, search_start, +- search_start + em_len - 1, +- EXTENT_DELALLOC_NEW, +- NULL, cached_state, GFP_NOFS); +-next: +- search_start = extent_map_end(em); +- free_extent_map(em); +- if (ret) +- return ret; +- } +- return 0; +-} +- + /* + * after copy_from_user, pages need to be dirtied and we need to make + * sure holes are created between the current EOF and the start of +@@ -528,23 +488,6 @@ int btrfs_dirty_pages(struct btrfs_inode + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, cached); + +- if (!btrfs_is_free_space_inode(inode)) { +- if (start_pos >= isize && +- !(inode->flags & BTRFS_INODE_PREALLOC)) { +- /* +- * There can't be any extents following eof in this case +- * so just set the delalloc new bit for the range +- * directly. 
+- */ +- extra_bits |= EXTENT_DELALLOC_NEW; +- } else { +- err = btrfs_find_new_delalloc_bytes(inode, start_pos, +- num_bytes, cached); +- if (err) +- return err; +- } +- } +- + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, + extra_bits, cached); + if (err) +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -2262,11 +2262,69 @@ static noinline int add_pending_csums(st + return 0; + } + ++static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, ++ const u64 start, ++ const u64 len, ++ struct extent_state **cached_state) ++{ ++ u64 search_start = start; ++ const u64 end = start + len - 1; ++ ++ while (search_start < end) { ++ const u64 search_len = end - search_start + 1; ++ struct extent_map *em; ++ u64 em_len; ++ int ret = 0; ++ ++ em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); ++ if (IS_ERR(em)) ++ return PTR_ERR(em); ++ ++ if (em->block_start != EXTENT_MAP_HOLE) ++ goto next; ++ ++ em_len = em->len; ++ if (em->start < search_start) ++ em_len -= search_start - em->start; ++ if (em_len > search_len) ++ em_len = search_len; ++ ++ ret = set_extent_bit(&inode->io_tree, search_start, ++ search_start + em_len - 1, ++ EXTENT_DELALLOC_NEW, ++ NULL, cached_state, GFP_NOFS); ++next: ++ search_start = extent_map_end(em); ++ free_extent_map(em); ++ if (ret) ++ return ret; ++ } ++ return 0; ++} ++ + int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + unsigned int extra_bits, + struct extent_state **cached_state) + { + WARN_ON(PAGE_ALIGNED(end)); ++ ++ if (start >= i_size_read(&inode->vfs_inode) && ++ !(inode->flags & BTRFS_INODE_PREALLOC)) { ++ /* ++ * There can't be any extents following eof in this case so just ++ * set the delalloc new bit for the range directly. ++ */ ++ extra_bits |= EXTENT_DELALLOC_NEW; ++ } else { ++ int ret; ++ ++ ret = btrfs_find_new_delalloc_bytes(inode, start, ++ end + 1 - start, ++ cached_state); ++ if (ret) ++ return ret; ++ } ++ + return set_extent_delalloc(&inode->io_tree, start, end, extra_bits, + cached_state); + } +--- a/fs/btrfs/tests/inode-tests.c ++++ b/fs/btrfs/tests/inode-tests.c +@@ -986,7 +986,8 @@ static int test_extent_accounting(u32 se + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, +- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); ++ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | ++ EXTENT_UPTODATE, 0, 0, NULL); + if (ret) { + test_err("clear_extent_bit returned %d", ret); + goto out; +@@ -1053,7 +1054,8 @@ static int test_extent_accounting(u32 se + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, +- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); ++ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | ++ EXTENT_UPTODATE, 0, 0, NULL); + if (ret) { + test_err("clear_extent_bit returned %d", ret); + goto out; +@@ -1085,7 +1087,8 @@ static int test_extent_accounting(u32 se + + /* Empty */ + ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, +- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); ++ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | ++ EXTENT_UPTODATE, 0, 0, NULL); + if (ret) { + test_err("clear_extent_bit returned %d", ret); + goto out; +@@ -1100,7 +1103,8 @@ static int test_extent_accounting(u32 se + out: + if (ret) + clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, +- EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL); ++ EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | ++ EXTENT_UPTODATE, 0, 0, NULL); + iput(inode); + 
btrfs_free_dummy_root(root); + btrfs_free_dummy_fs_info(fs_info); diff --git a/queue-5.9/btrfs-tree-checker-add-missing-return-after-error-in-root_item.patch b/queue-5.9/btrfs-tree-checker-add-missing-return-after-error-in-root_item.patch new file mode 100644 index 00000000000..5804554fe85 --- /dev/null +++ b/queue-5.9/btrfs-tree-checker-add-missing-return-after-error-in-root_item.patch @@ -0,0 +1,36 @@ +From 1a49a97df657c63a4e8ffcd1ea9b6ed95581789b Mon Sep 17 00:00:00 2001 +From: Daniel Xu +Date: Thu, 12 Nov 2020 17:55:06 -0800 +Subject: btrfs: tree-checker: add missing return after error in root_item + +From: Daniel Xu + +commit 1a49a97df657c63a4e8ffcd1ea9b6ed95581789b upstream. + +There's a missing return statement after an error is found in the +root_item, this can cause further problems when a crafted image triggers +the error. + +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=210181 +Fixes: 259ee7754b67 ("btrfs: tree-checker: Add ROOT_ITEM check") +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Qu Wenruo +Signed-off-by: Daniel Xu +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-checker.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -1068,6 +1068,7 @@ static int check_root_item(struct extent + "invalid root item size, have %u expect %zu or %u", + btrfs_item_size_nr(leaf, slot), sizeof(ri), + btrfs_legacy_root_item_size()); ++ return -EUCLEAN; + } + + /* diff --git a/queue-5.9/btrfs-tree-checker-add-missing-returns-after-data_ref-alignment-checks.patch b/queue-5.9/btrfs-tree-checker-add-missing-returns-after-data_ref-alignment-checks.patch new file mode 100644 index 00000000000..8758d1e9cb3 --- /dev/null +++ b/queue-5.9/btrfs-tree-checker-add-missing-returns-after-data_ref-alignment-checks.patch @@ -0,0 +1,41 @@ +From 6d06b0ad94d3dd7e3503d8ad39c39c4634884611 Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Mon, 16 Nov 2020 19:53:52 +0100 +Subject: btrfs: tree-checker: add missing returns after data_ref alignment checks + +From: David Sterba + +commit 6d06b0ad94d3dd7e3503d8ad39c39c4634884611 upstream. + +There are sectorsize alignment checks that are reported but then +check_extent_data_ref continues. This was not intended, wrong alignment +is not a minor problem and we should return with error. 
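Both tree-checker fixes in this batch address the same bug class: a validation helper that logs the problem but forgets to return, so the caller keeps processing an item it has just declared invalid. A generic, self-contained illustration (the helper name and values are made up):

  #include <stdio.h>

  #define EUCLEAN 117   /* Linux "Structure needs cleaning" errno, as returned by the tree checker */

  static int check_alignment(unsigned long long offset, unsigned int sectorsize)
  {
          if (offset % sectorsize) {
                  fprintf(stderr, "invalid offset %llu, expect aligned to %u\n",
                          offset, sectorsize);
                  return -EUCLEAN;   /* the missing return: without it the caller
                                        would continue as if the item were valid */
          }
          return 0;
  }

  int main(void)
  {
          return check_alignment(4097, 4096) ? 1 : 0;
  }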
+ +CC: stable@vger.kernel.org # 5.4+ +Fixes: 0785a9aacf9d ("btrfs: tree-checker: Add EXTENT_DATA_REF check") +Reviewed-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-checker.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -1424,6 +1424,7 @@ static int check_extent_data_ref(struct + "invalid item size, have %u expect aligned to %zu for key type %u", + btrfs_item_size_nr(leaf, slot), + sizeof(*dref), key->type); ++ return -EUCLEAN; + } + if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) { + generic_err(leaf, slot, +@@ -1452,6 +1453,7 @@ static int check_extent_data_ref(struct + extent_err(leaf, slot, + "invalid extent data backref offset, have %llu expect aligned to %u", + offset, leaf->fs_info->sectorsize); ++ return -EUCLEAN; + } + } + return 0; diff --git a/queue-5.9/ib-hfi1-ensure-correct-mm-is-used-at-all-times.patch b/queue-5.9/ib-hfi1-ensure-correct-mm-is-used-at-all-times.patch new file mode 100644 index 00000000000..f5a1e0a36a8 --- /dev/null +++ b/queue-5.9/ib-hfi1-ensure-correct-mm-is-used-at-all-times.patch @@ -0,0 +1,444 @@ +From 3d2a9d642512c21a12d19b9250e7a835dcb41a79 Mon Sep 17 00:00:00 2001 +From: Dennis Dalessandro +Date: Wed, 25 Nov 2020 16:01:12 -0500 +Subject: IB/hfi1: Ensure correct mm is used at all times + +From: Dennis Dalessandro + +commit 3d2a9d642512c21a12d19b9250e7a835dcb41a79 upstream. + +Two earlier bug fixes have created a security problem in the hfi1 +driver. One fix aimed to solve an issue where current->mm was not valid +when closing the hfi1 cdev. It attempted to do this by saving a cached +value of the current->mm pointer at file open time. This is a problem if +another process with access to the FD calls in via write() or ioctl() to +pin pages via the hfi driver. The other fix tried to solve a use after +free by taking a reference on the mm. + +To fix this correctly we use the existing cached value of the mm in the +mmu notifier. Now we can check in the insert, evict, etc. routines that +current->mm matched what the notifier was registered for. If not, then +don't allow access. The register of the mmu notifier will save the mm +pointer. + +Since in do_exit() the exit_mm() is called before exit_files(), which +would call our close routine a reference is needed on the mm. We rely on +the mmgrab done by the registration of the notifier, whereas before it was +explicit. The mmu notifier deregistration happens when the user context is +torn down, the creation of which triggered the registration. + +Also of note is we do not do any explicit work to protect the interval +tree notifier. It doesn't seem that this is going to be needed since we +aren't actually doing anything with current->mm. The interval tree +notifier stuff still has a FIXME noted from a previous commit that will be +addressed in a follow on patch. 
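The guard being added is conceptually simple: remember which mm the mmu notifier was registered against and refuse insert/evict/remove calls issued from a different one. A userspace analogy, with the process id standing in for the saved mm pointer (illustrative only, not driver code):

  #include <errno.h>
  #include <stdio.h>
  #include <sys/types.h>
  #include <unistd.h>

  struct rb_handler {
          pid_t owner;              /* stands in for handler->mn.mm */
  };

  static void handler_register(struct rb_handler *h)
  {
          h->owner = getpid();      /* captured once, at registration time */
  }

  static int handler_insert(struct rb_handler *h)
  {
          if (getpid() != h->owner) /* caller does not match the registered context */
                  return -EPERM;
          /* ... pin pages / insert into the interval tree ... */
          return 0;
  }

  int main(void)
  {
          struct rb_handler h;

          handler_register(&h);
          if (handler_insert(&h) == 0)
                  puts("insert allowed: caller matches the registered context");
          return 0;
  }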
+ +Cc: +Fixes: e0cf75deab81 ("IB/hfi1: Fix mm_struct use after free") +Fixes: 3faa3d9a308e ("IB/hfi1: Make use of mm consistent") +Link: https://lore.kernel.org/r/20201125210112.104301.51331.stgit@awfm-01.aw.intel.com +Suggested-by: Jann Horn +Reported-by: Jason Gunthorpe +Reviewed-by: Ira Weiny +Reviewed-by: Mike Marciniszyn +Signed-off-by: Dennis Dalessandro +Signed-off-by: Jason Gunthorpe +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/hw/hfi1/file_ops.c | 4 - + drivers/infiniband/hw/hfi1/hfi.h | 2 + drivers/infiniband/hw/hfi1/mmu_rb.c | 66 +++++++++++++++--------------- + drivers/infiniband/hw/hfi1/mmu_rb.h | 16 ++++++- + drivers/infiniband/hw/hfi1/user_exp_rcv.c | 12 +++-- + drivers/infiniband/hw/hfi1/user_exp_rcv.h | 6 ++ + drivers/infiniband/hw/hfi1/user_sdma.c | 13 +++-- + drivers/infiniband/hw/hfi1/user_sdma.h | 7 ++- + 8 files changed, 78 insertions(+), 48 deletions(-) + +--- a/drivers/infiniband/hw/hfi1/file_ops.c ++++ b/drivers/infiniband/hw/hfi1/file_ops.c +@@ -1,4 +1,5 @@ + /* ++ * Copyright(c) 2020 Cornelis Networks, Inc. + * Copyright(c) 2015-2020 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or +@@ -206,8 +207,6 @@ static int hfi1_file_open(struct inode * + spin_lock_init(&fd->tid_lock); + spin_lock_init(&fd->invalid_lock); + fd->rec_cpu_num = -1; /* no cpu affinity by default */ +- fd->mm = current->mm; +- mmgrab(fd->mm); + fd->dd = dd; + fp->private_data = fd; + return 0; +@@ -711,7 +710,6 @@ static int hfi1_file_close(struct inode + + deallocate_ctxt(uctxt); + done: +- mmdrop(fdata->mm); + + if (atomic_dec_and_test(&dd->user_refcount)) + complete(&dd->user_comp); +--- a/drivers/infiniband/hw/hfi1/hfi.h ++++ b/drivers/infiniband/hw/hfi1/hfi.h +@@ -1,6 +1,7 @@ + #ifndef _HFI1_KERNEL_H + #define _HFI1_KERNEL_H + /* ++ * Copyright(c) 2020 Cornelis Networks, Inc. + * Copyright(c) 2015-2020 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or +@@ -1451,7 +1452,6 @@ struct hfi1_filedata { + u32 invalid_tid_idx; + /* protect invalid_tids array and invalid_tid_idx */ + spinlock_t invalid_lock; +- struct mm_struct *mm; + }; + + extern struct xarray hfi1_dev_table; +--- a/drivers/infiniband/hw/hfi1/mmu_rb.c ++++ b/drivers/infiniband/hw/hfi1/mmu_rb.c +@@ -1,4 +1,5 @@ + /* ++ * Copyright(c) 2020 Cornelis Networks, Inc. + * Copyright(c) 2016 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. 
When using or +@@ -48,23 +49,11 @@ + #include + #include + #include ++#include + + #include "mmu_rb.h" + #include "trace.h" + +-struct mmu_rb_handler { +- struct mmu_notifier mn; +- struct rb_root_cached root; +- void *ops_arg; +- spinlock_t lock; /* protect the RB tree */ +- struct mmu_rb_ops *ops; +- struct mm_struct *mm; +- struct list_head lru_list; +- struct work_struct del_work; +- struct list_head del_list; +- struct workqueue_struct *wq; +-}; +- + static unsigned long mmu_node_start(struct mmu_rb_node *); + static unsigned long mmu_node_last(struct mmu_rb_node *); + static int mmu_notifier_range_start(struct mmu_notifier *, +@@ -92,37 +81,36 @@ static unsigned long mmu_node_last(struc + return PAGE_ALIGN(node->addr + node->len) - 1; + } + +-int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, ++int hfi1_mmu_rb_register(void *ops_arg, + struct mmu_rb_ops *ops, + struct workqueue_struct *wq, + struct mmu_rb_handler **handler) + { +- struct mmu_rb_handler *handlr; ++ struct mmu_rb_handler *h; + int ret; + +- handlr = kmalloc(sizeof(*handlr), GFP_KERNEL); +- if (!handlr) ++ h = kmalloc(sizeof(*h), GFP_KERNEL); ++ if (!h) + return -ENOMEM; + +- handlr->root = RB_ROOT_CACHED; +- handlr->ops = ops; +- handlr->ops_arg = ops_arg; +- INIT_HLIST_NODE(&handlr->mn.hlist); +- spin_lock_init(&handlr->lock); +- handlr->mn.ops = &mn_opts; +- handlr->mm = mm; +- INIT_WORK(&handlr->del_work, handle_remove); +- INIT_LIST_HEAD(&handlr->del_list); +- INIT_LIST_HEAD(&handlr->lru_list); +- handlr->wq = wq; ++ h->root = RB_ROOT_CACHED; ++ h->ops = ops; ++ h->ops_arg = ops_arg; ++ INIT_HLIST_NODE(&h->mn.hlist); ++ spin_lock_init(&h->lock); ++ h->mn.ops = &mn_opts; ++ INIT_WORK(&h->del_work, handle_remove); ++ INIT_LIST_HEAD(&h->del_list); ++ INIT_LIST_HEAD(&h->lru_list); ++ h->wq = wq; + +- ret = mmu_notifier_register(&handlr->mn, handlr->mm); ++ ret = mmu_notifier_register(&h->mn, current->mm); + if (ret) { +- kfree(handlr); ++ kfree(h); + return ret; + } + +- *handler = handlr; ++ *handler = h; + return 0; + } + +@@ -134,7 +122,7 @@ void hfi1_mmu_rb_unregister(struct mmu_r + struct list_head del_list; + + /* Unregister first so we don't get any more notifications. */ +- mmu_notifier_unregister(&handler->mn, handler->mm); ++ mmu_notifier_unregister(&handler->mn, handler->mn.mm); + + /* + * Make sure the wq delete handler is finished running. 
It will not +@@ -166,6 +154,10 @@ int hfi1_mmu_rb_insert(struct mmu_rb_han + int ret = 0; + + trace_hfi1_mmu_rb_insert(mnode->addr, mnode->len); ++ ++ if (current->mm != handler->mn.mm) ++ return -EPERM; ++ + spin_lock_irqsave(&handler->lock, flags); + node = __mmu_rb_search(handler, mnode->addr, mnode->len); + if (node) { +@@ -180,6 +172,7 @@ int hfi1_mmu_rb_insert(struct mmu_rb_han + __mmu_int_rb_remove(mnode, &handler->root); + list_del(&mnode->list); /* remove from LRU list */ + } ++ mnode->handler = handler; + unlock: + spin_unlock_irqrestore(&handler->lock, flags); + return ret; +@@ -217,6 +210,9 @@ bool hfi1_mmu_rb_remove_unless_exact(str + unsigned long flags; + bool ret = false; + ++ if (current->mm != handler->mn.mm) ++ return ret; ++ + spin_lock_irqsave(&handler->lock, flags); + node = __mmu_rb_search(handler, addr, len); + if (node) { +@@ -239,6 +235,9 @@ void hfi1_mmu_rb_evict(struct mmu_rb_han + unsigned long flags; + bool stop = false; + ++ if (current->mm != handler->mn.mm) ++ return; ++ + INIT_LIST_HEAD(&del_list); + + spin_lock_irqsave(&handler->lock, flags); +@@ -272,6 +271,9 @@ void hfi1_mmu_rb_remove(struct mmu_rb_ha + { + unsigned long flags; + ++ if (current->mm != handler->mn.mm) ++ return; ++ + /* Validity of handler and node pointers has been checked by caller. */ + trace_hfi1_mmu_rb_remove(node->addr, node->len); + spin_lock_irqsave(&handler->lock, flags); +--- a/drivers/infiniband/hw/hfi1/mmu_rb.h ++++ b/drivers/infiniband/hw/hfi1/mmu_rb.h +@@ -1,4 +1,5 @@ + /* ++ * Copyright(c) 2020 Cornelis Networks, Inc. + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or +@@ -54,6 +55,7 @@ struct mmu_rb_node { + unsigned long len; + unsigned long __last; + struct rb_node node; ++ struct mmu_rb_handler *handler; + struct list_head list; + }; + +@@ -71,7 +73,19 @@ struct mmu_rb_ops { + void *evict_arg, bool *stop); + }; + +-int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm, ++struct mmu_rb_handler { ++ struct mmu_notifier mn; ++ struct rb_root_cached root; ++ void *ops_arg; ++ spinlock_t lock; /* protect the RB tree */ ++ struct mmu_rb_ops *ops; ++ struct list_head lru_list; ++ struct work_struct del_work; ++ struct list_head del_list; ++ struct workqueue_struct *wq; ++}; ++ ++int hfi1_mmu_rb_register(void *ops_arg, + struct mmu_rb_ops *ops, + struct workqueue_struct *wq, + struct mmu_rb_handler **handler); +--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c ++++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c +@@ -1,4 +1,5 @@ + /* ++ * Copyright(c) 2020 Cornelis Networks, Inc. + * Copyright(c) 2015-2018 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or +@@ -173,15 +174,18 @@ static void unpin_rcv_pages(struct hfi1_ + { + struct page **pages; + struct hfi1_devdata *dd = fd->uctxt->dd; ++ struct mm_struct *mm; + + if (mapped) { + pci_unmap_single(dd->pcidev, node->dma_addr, + node->npages * PAGE_SIZE, PCI_DMA_FROMDEVICE); + pages = &node->pages[idx]; ++ mm = mm_from_tid_node(node); + } else { + pages = &tidbuf->pages[idx]; ++ mm = current->mm; + } +- hfi1_release_user_pages(fd->mm, pages, npages, mapped); ++ hfi1_release_user_pages(mm, pages, npages, mapped); + fd->tid_n_pinned -= npages; + } + +@@ -216,12 +220,12 @@ static int pin_rcv_pages(struct hfi1_fil + * pages, accept the amount pinned so far and program only that. + * User space knows how to deal with partially programmed buffers. 
+ */ +- if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) { ++ if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) { + kfree(pages); + return -ENOMEM; + } + +- pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages); ++ pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages); + if (pinned <= 0) { + kfree(pages); + return pinned; +@@ -756,7 +760,7 @@ static int set_rcvarray_entry(struct hfi + + if (fd->use_mn) { + ret = mmu_interval_notifier_insert( +- &node->notifier, fd->mm, ++ &node->notifier, current->mm, + tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE, + &tid_mn_ops); + if (ret) +--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h ++++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h +@@ -1,6 +1,7 @@ + #ifndef _HFI1_USER_EXP_RCV_H + #define _HFI1_USER_EXP_RCV_H + /* ++ * Copyright(c) 2020 - Cornelis Networks, Inc. + * Copyright(c) 2015 - 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or +@@ -95,4 +96,9 @@ int hfi1_user_exp_rcv_clear(struct hfi1_ + int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd, + struct hfi1_tid_info *tinfo); + ++static inline struct mm_struct *mm_from_tid_node(struct tid_rb_node *node) ++{ ++ return node->notifier.mm; ++} ++ + #endif /* _HFI1_USER_EXP_RCV_H */ +--- a/drivers/infiniband/hw/hfi1/user_sdma.c ++++ b/drivers/infiniband/hw/hfi1/user_sdma.c +@@ -1,4 +1,5 @@ + /* ++ * Copyright(c) 2020 - Cornelis Networks, Inc. + * Copyright(c) 2015 - 2018 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or +@@ -188,7 +189,6 @@ int hfi1_user_sdma_alloc_queues(struct h + atomic_set(&pq->n_reqs, 0); + init_waitqueue_head(&pq->wait); + atomic_set(&pq->n_locked, 0); +- pq->mm = fd->mm; + + iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue, + activate_packet_queue, NULL, NULL); +@@ -230,7 +230,7 @@ int hfi1_user_sdma_alloc_queues(struct h + + cq->nentries = hfi1_sdma_comp_ring_size; + +- ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq, ++ ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq, + &pq->handler); + if (ret) { + dd_dev_err(dd, "Failed to register with MMU %d", ret); +@@ -980,13 +980,13 @@ static int pin_sdma_pages(struct user_sd + + npages -= node->npages; + retry: +- if (!hfi1_can_pin_pages(pq->dd, pq->mm, ++ if (!hfi1_can_pin_pages(pq->dd, current->mm, + atomic_read(&pq->n_locked), npages)) { + cleared = sdma_cache_evict(pq, npages); + if (cleared >= npages) + goto retry; + } +- pinned = hfi1_acquire_user_pages(pq->mm, ++ pinned = hfi1_acquire_user_pages(current->mm, + ((unsigned long)iovec->iov.iov_base + + (node->npages * PAGE_SIZE)), npages, 0, + pages + node->npages); +@@ -995,7 +995,7 @@ retry: + return pinned; + } + if (pinned != npages) { +- unpin_vector_pages(pq->mm, pages, node->npages, pinned); ++ unpin_vector_pages(current->mm, pages, node->npages, pinned); + return -EFAULT; + } + kfree(node->pages); +@@ -1008,7 +1008,8 @@ retry: + static void unpin_sdma_pages(struct sdma_mmu_node *node) + { + if (node->npages) { +- unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages); ++ unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0, ++ node->npages); + atomic_sub(node->npages, &node->pq->n_locked); + } + } +--- a/drivers/infiniband/hw/hfi1/user_sdma.h ++++ b/drivers/infiniband/hw/hfi1/user_sdma.h +@@ -1,6 +1,7 @@ + #ifndef _HFI1_USER_SDMA_H + #define _HFI1_USER_SDMA_H + /* ++ * Copyright(c) 2020 - Cornelis Networks, Inc. 
+ * Copyright(c) 2015 - 2018 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or +@@ -133,7 +134,6 @@ struct hfi1_user_sdma_pkt_q { + unsigned long unpinned; + struct mmu_rb_handler *handler; + atomic_t n_locked; +- struct mm_struct *mm; + }; + + struct hfi1_user_sdma_comp_q { +@@ -250,4 +250,9 @@ int hfi1_user_sdma_process_request(struc + struct iovec *iovec, unsigned long dim, + unsigned long *count); + ++static inline struct mm_struct *mm_from_sdma_node(struct sdma_mmu_node *node) ++{ ++ return node->rb.handler->mn.mm; ++} ++ + #endif /* _HFI1_USER_SDMA_H */ diff --git a/queue-5.9/rdma-i40iw-address-an-mmap-handler-exploit-in-i40iw.patch b/queue-5.9/rdma-i40iw-address-an-mmap-handler-exploit-in-i40iw.patch new file mode 100644 index 00000000000..e692987ef5f --- /dev/null +++ b/queue-5.9/rdma-i40iw-address-an-mmap-handler-exploit-in-i40iw.patch @@ -0,0 +1,106 @@ +From 2ed381439e89fa6d1a0839ef45ccd45d99d8e915 Mon Sep 17 00:00:00 2001 +From: Shiraz Saleem +Date: Tue, 24 Nov 2020 18:56:16 -0600 +Subject: RDMA/i40iw: Address an mmap handler exploit in i40iw + +From: Shiraz Saleem + +commit 2ed381439e89fa6d1a0839ef45ccd45d99d8e915 upstream. + +i40iw_mmap manipulates the vma->vm_pgoff to differentiate a push page mmap +vs a doorbell mmap, and uses it to compute the pfn in remap_pfn_range +without any validation. This is vulnerable to an mmap exploit as described +in: https://lore.kernel.org/r/20201119093523.7588-1-zhudi21@huawei.com + +The push feature is disabled in the driver currently and therefore no push +mmaps are issued from user-space. The feature does not work as expected in +the x722 product. + +Remove the push module parameter and all VMA attribute manipulations for +this feature in i40iw_mmap. Update i40iw_mmap to only allow DB user +mmapings at offset = 0. Check vm_pgoff for zero and if the mmaps are bound +to a single page. + +Cc: +Fixes: d37498417947 ("i40iw: add files for iwarp interface") +Link: https://lore.kernel.org/r/20201125005616.1800-2-shiraz.saleem@intel.com +Reported-by: Di Zhu +Signed-off-by: Shiraz Saleem +Signed-off-by: Jason Gunthorpe +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/infiniband/hw/i40iw/i40iw_main.c | 5 ---- + drivers/infiniband/hw/i40iw/i40iw_verbs.c | 37 +++++------------------------- + 2 files changed, 7 insertions(+), 35 deletions(-) + +--- a/drivers/infiniband/hw/i40iw/i40iw_main.c ++++ b/drivers/infiniband/hw/i40iw/i40iw_main.c +@@ -54,10 +54,6 @@ + #define DRV_VERSION __stringify(DRV_VERSION_MAJOR) "." \ + __stringify(DRV_VERSION_MINOR) "." 
__stringify(DRV_VERSION_BUILD) + +-static int push_mode; +-module_param(push_mode, int, 0644); +-MODULE_PARM_DESC(push_mode, "Low latency mode: 0=disabled (default), 1=enabled)"); +- + static int debug; + module_param(debug, int, 0644); + MODULE_PARM_DESC(debug, "debug flags: 0=disabled (default), 0x7fffffff=all"); +@@ -1580,7 +1576,6 @@ static enum i40iw_status_code i40iw_setu + if (status) + goto exit; + iwdev->obj_next = iwdev->obj_mem; +- iwdev->push_mode = push_mode; + + init_waitqueue_head(&iwdev->vchnl_waitq); + init_waitqueue_head(&dev->vf_reqs); +--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c ++++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c +@@ -167,39 +167,16 @@ static void i40iw_dealloc_ucontext(struc + */ + static int i40iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) + { +- struct i40iw_ucontext *ucontext; +- u64 db_addr_offset, push_offset, pfn; ++ struct i40iw_ucontext *ucontext = to_ucontext(context); ++ u64 dbaddr; + +- ucontext = to_ucontext(context); +- if (ucontext->iwdev->sc_dev.is_pf) { +- db_addr_offset = I40IW_DB_ADDR_OFFSET; +- push_offset = I40IW_PUSH_OFFSET; +- if (vma->vm_pgoff) +- vma->vm_pgoff += I40IW_PF_FIRST_PUSH_PAGE_INDEX - 1; +- } else { +- db_addr_offset = I40IW_VF_DB_ADDR_OFFSET; +- push_offset = I40IW_VF_PUSH_OFFSET; +- if (vma->vm_pgoff) +- vma->vm_pgoff += I40IW_VF_FIRST_PUSH_PAGE_INDEX - 1; +- } ++ if (vma->vm_pgoff || vma->vm_end - vma->vm_start != PAGE_SIZE) ++ return -EINVAL; + +- vma->vm_pgoff += db_addr_offset >> PAGE_SHIFT; ++ dbaddr = I40IW_DB_ADDR_OFFSET + pci_resource_start(ucontext->iwdev->ldev->pcidev, 0); + +- if (vma->vm_pgoff == (db_addr_offset >> PAGE_SHIFT)) { +- vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); +- } else { +- if ((vma->vm_pgoff - (push_offset >> PAGE_SHIFT)) % 2) +- vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); +- else +- vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); +- } +- +- pfn = vma->vm_pgoff + +- (pci_resource_start(ucontext->iwdev->ldev->pcidev, 0) >> +- PAGE_SHIFT); +- +- return rdma_user_mmap_io(context, vma, pfn, PAGE_SIZE, +- vma->vm_page_prot, NULL); ++ return rdma_user_mmap_io(context, vma, dbaddr >> PAGE_SHIFT, PAGE_SIZE, ++ pgprot_noncached(vma->vm_page_prot), NULL); + } + + /** diff --git a/queue-5.9/series b/queue-5.9/series index e421fbb407b..85cf3033dcf 100644 --- a/queue-5.9/series +++ b/queue-5.9/series @@ -3,3 +3,10 @@ io_uring-order-refnode-recycling.patch spi-bcm-qspi-fix-use-after-free-on-unbind.patch spi-bcm2835-fix-use-after-free-on-unbind.patch ipv4-use-is_enabled-instead-of-ifdef.patch +ib-hfi1-ensure-correct-mm-is-used-at-all-times.patch +rdma-i40iw-address-an-mmap-handler-exploit-in-i40iw.patch +btrfs-fix-missing-delalloc-new-bit-for-new-delalloc-ranges.patch +btrfs-tree-checker-add-missing-return-after-error-in-root_item.patch +btrfs-tree-checker-add-missing-returns-after-data_ref-alignment-checks.patch +btrfs-don-t-access-possibly-stale-fs_info-data-for-printing-duplicate-device.patch +btrfs-fix-lockdep-splat-when-reading-qgroup-config-on-mount.patch
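Stripped of driver specifics, the validation added to i40iw_mmap above accepts only a doorbell mapping at offset zero and exactly one page long, and never derives the target address from the caller-controlled offset. A made-up, self-contained sketch of that shape (the base address and page size are example values, not the real BAR layout):

  #include <errno.h>
  #include <stdio.h>

  #define PAGE_SIZE   4096UL
  #define DB_BAR_BASE 0xfe000000UL   /* hypothetical BAR 0 start */

  static int validate_db_mmap(unsigned long pgoff, unsigned long len,
                              unsigned long *phys)
  {
          if (pgoff != 0 || len != PAGE_SIZE)  /* only offset 0, exactly one page */
                  return -EINVAL;
          *phys = DB_BAR_BASE;                 /* fixed target, no pgoff arithmetic */
          return 0;
  }

  int main(void)
  {
          unsigned long phys = 0;

          printf("offset 0, one page -> %d\n", validate_db_mmap(0, PAGE_SIZE, &phys));
          printf("offset 3, one page -> %d\n", validate_db_mmap(3, PAGE_SIZE, &phys));
          return 0;
  }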