]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.9-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 28 Nov 2020 12:38:41 +0000 (13:38 +0100)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 28 Nov 2020 12:38:41 +0000 (13:38 +0100)
added patches:
btrfs-don-t-access-possibly-stale-fs_info-data-for-printing-duplicate-device.patch
btrfs-fix-lockdep-splat-when-reading-qgroup-config-on-mount.patch
btrfs-fix-missing-delalloc-new-bit-for-new-delalloc-ranges.patch
btrfs-tree-checker-add-missing-return-after-error-in-root_item.patch
btrfs-tree-checker-add-missing-returns-after-data_ref-alignment-checks.patch
ib-hfi1-ensure-correct-mm-is-used-at-all-times.patch
rdma-i40iw-address-an-mmap-handler-exploit-in-i40iw.patch

queue-4.4/series [new file with mode: 0644]
queue-5.9/btrfs-don-t-access-possibly-stale-fs_info-data-for-printing-duplicate-device.patch [new file with mode: 0644]
queue-5.9/btrfs-fix-lockdep-splat-when-reading-qgroup-config-on-mount.patch [new file with mode: 0644]
queue-5.9/btrfs-fix-missing-delalloc-new-bit-for-new-delalloc-ranges.patch [new file with mode: 0644]
queue-5.9/btrfs-tree-checker-add-missing-return-after-error-in-root_item.patch [new file with mode: 0644]
queue-5.9/btrfs-tree-checker-add-missing-returns-after-data_ref-alignment-checks.patch [new file with mode: 0644]
queue-5.9/ib-hfi1-ensure-correct-mm-is-used-at-all-times.patch [new file with mode: 0644]
queue-5.9/rdma-i40iw-address-an-mmap-handler-exploit-in-i40iw.patch [new file with mode: 0644]
queue-5.9/series

diff --git a/queue-4.4/series b/queue-4.4/series
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/queue-5.9/btrfs-don-t-access-possibly-stale-fs_info-data-for-printing-duplicate-device.patch b/queue-5.9/btrfs-don-t-access-possibly-stale-fs_info-data-for-printing-duplicate-device.patch
new file mode 100644 (file)
index 0000000..4f185ad
--- /dev/null
@@ -0,0 +1,171 @@
+From 0697d9a610998b8bdee6b2390836cb2391d8fd1a Mon Sep 17 00:00:00 2001
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Date: Wed, 18 Nov 2020 18:03:26 +0900
+Subject: btrfs: don't access possibly stale fs_info data for printing duplicate device
+
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+
+commit 0697d9a610998b8bdee6b2390836cb2391d8fd1a upstream.
+
+Syzbot reported a possible use-after-free when printing a duplicate device
+warning in device_list_add().
+
+At this point it can happen that a btrfs_device::fs_info is not correctly
+setup yet, so we're accessing stale data, when printing the warning
+message using the btrfs_printk() wrappers.
+
+  ==================================================================
+  BUG: KASAN: use-after-free in btrfs_printk+0x3eb/0x435 fs/btrfs/super.c:245
+  Read of size 8 at addr ffff8880878e06a8 by task syz-executor225/7068
+
+  CPU: 1 PID: 7068 Comm: syz-executor225 Not tainted 5.9.0-rc5-syzkaller #0
+  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+  Call Trace:
+   __dump_stack lib/dump_stack.c:77 [inline]
+   dump_stack+0x1d6/0x29e lib/dump_stack.c:118
+   print_address_description+0x66/0x620 mm/kasan/report.c:383
+   __kasan_report mm/kasan/report.c:513 [inline]
+   kasan_report+0x132/0x1d0 mm/kasan/report.c:530
+   btrfs_printk+0x3eb/0x435 fs/btrfs/super.c:245
+   device_list_add+0x1a88/0x1d60 fs/btrfs/volumes.c:943
+   btrfs_scan_one_device+0x196/0x490 fs/btrfs/volumes.c:1359
+   btrfs_mount_root+0x48f/0xb60 fs/btrfs/super.c:1634
+   legacy_get_tree+0xea/0x180 fs/fs_context.c:592
+   vfs_get_tree+0x88/0x270 fs/super.c:1547
+   fc_mount fs/namespace.c:978 [inline]
+   vfs_kern_mount+0xc9/0x160 fs/namespace.c:1008
+   btrfs_mount+0x33c/0xae0 fs/btrfs/super.c:1732
+   legacy_get_tree+0xea/0x180 fs/fs_context.c:592
+   vfs_get_tree+0x88/0x270 fs/super.c:1547
+   do_new_mount fs/namespace.c:2875 [inline]
+   path_mount+0x179d/0x29e0 fs/namespace.c:3192
+   do_mount fs/namespace.c:3205 [inline]
+   __do_sys_mount fs/namespace.c:3413 [inline]
+   __se_sys_mount+0x126/0x180 fs/namespace.c:3390
+   do_syscall_64+0x31/0x70 arch/x86/entry/common.c:46
+   entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  RIP: 0033:0x44840a
+  RSP: 002b:00007ffedfffd608 EFLAGS: 00000293 ORIG_RAX: 00000000000000a5
+  RAX: ffffffffffffffda RBX: 00007ffedfffd670 RCX: 000000000044840a
+  RDX: 0000000020000000 RSI: 0000000020000100 RDI: 00007ffedfffd630
+  RBP: 00007ffedfffd630 R08: 00007ffedfffd670 R09: 0000000000000000
+  R10: 0000000000000000 R11: 0000000000000293 R12: 000000000000001a
+  R13: 0000000000000004 R14: 0000000000000003 R15: 0000000000000003
+
+  Allocated by task 6945:
+   kasan_save_stack mm/kasan/common.c:48 [inline]
+   kasan_set_track mm/kasan/common.c:56 [inline]
+   __kasan_kmalloc+0x100/0x130 mm/kasan/common.c:461
+   kmalloc_node include/linux/slab.h:577 [inline]
+   kvmalloc_node+0x81/0x110 mm/util.c:574
+   kvmalloc include/linux/mm.h:757 [inline]
+   kvzalloc include/linux/mm.h:765 [inline]
+   btrfs_mount_root+0xd0/0xb60 fs/btrfs/super.c:1613
+   legacy_get_tree+0xea/0x180 fs/fs_context.c:592
+   vfs_get_tree+0x88/0x270 fs/super.c:1547
+   fc_mount fs/namespace.c:978 [inline]
+   vfs_kern_mount+0xc9/0x160 fs/namespace.c:1008
+   btrfs_mount+0x33c/0xae0 fs/btrfs/super.c:1732
+   legacy_get_tree+0xea/0x180 fs/fs_context.c:592
+   vfs_get_tree+0x88/0x270 fs/super.c:1547
+   do_new_mount fs/namespace.c:2875 [inline]
+   path_mount+0x179d/0x29e0 fs/namespace.c:3192
+   do_mount fs/namespace.c:3205 [inline]
+   __do_sys_mount fs/namespace.c:3413 [inline]
+   __se_sys_mount+0x126/0x180 fs/namespace.c:3390
+   do_syscall_64+0x31/0x70 arch/x86/entry/common.c:46
+   entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+  Freed by task 6945:
+   kasan_save_stack mm/kasan/common.c:48 [inline]
+   kasan_set_track+0x3d/0x70 mm/kasan/common.c:56
+   kasan_set_free_info+0x17/0x30 mm/kasan/generic.c:355
+   __kasan_slab_free+0xdd/0x110 mm/kasan/common.c:422
+   __cache_free mm/slab.c:3418 [inline]
+   kfree+0x113/0x200 mm/slab.c:3756
+   deactivate_locked_super+0xa7/0xf0 fs/super.c:335
+   btrfs_mount_root+0x72b/0xb60 fs/btrfs/super.c:1678
+   legacy_get_tree+0xea/0x180 fs/fs_context.c:592
+   vfs_get_tree+0x88/0x270 fs/super.c:1547
+   fc_mount fs/namespace.c:978 [inline]
+   vfs_kern_mount+0xc9/0x160 fs/namespace.c:1008
+   btrfs_mount+0x33c/0xae0 fs/btrfs/super.c:1732
+   legacy_get_tree+0xea/0x180 fs/fs_context.c:592
+   vfs_get_tree+0x88/0x270 fs/super.c:1547
+   do_new_mount fs/namespace.c:2875 [inline]
+   path_mount+0x179d/0x29e0 fs/namespace.c:3192
+   do_mount fs/namespace.c:3205 [inline]
+   __do_sys_mount fs/namespace.c:3413 [inline]
+   __se_sys_mount+0x126/0x180 fs/namespace.c:3390
+   do_syscall_64+0x31/0x70 arch/x86/entry/common.c:46
+   entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+  The buggy address belongs to the object at ffff8880878e0000
+   which belongs to the cache kmalloc-16k of size 16384
+  The buggy address is located 1704 bytes inside of
+   16384-byte region [ffff8880878e0000, ffff8880878e4000)
+  The buggy address belongs to the page:
+  page:0000000060704f30 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x878e0
+  head:0000000060704f30 order:3 compound_mapcount:0 compound_pincount:0
+  flags: 0xfffe0000010200(slab|head)
+  raw: 00fffe0000010200 ffffea00028e9a08 ffffea00021e3608 ffff8880aa440b00
+  raw: 0000000000000000 ffff8880878e0000 0000000100000001 0000000000000000
+  page dumped because: kasan: bad access detected
+
+  Memory state around the buggy address:
+   ffff8880878e0580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+   ffff8880878e0600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+  >ffff8880878e0680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+                                   ^
+   ffff8880878e0700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+   ffff8880878e0780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+  ==================================================================
+
+The syzkaller reproducer for this use-after-free crafts a filesystem image
+and loop mounts it twice in a loop. The mount will fail as the crafted
+image has an invalid chunk tree. When this happens btrfs_mount_root() will
+call deactivate_locked_super(), which then cleans up fs_info and
+fs_info::sb. If a second thread now adds the same block-device to the
+filesystem, it will get detected as a duplicate device and
+device_list_add() will reject the duplicate and print a warning. But as
+the fs_info pointer passed in is non-NULL this will result in a
+use-after-free.
+
+Instead of printing possibly uninitialized or already freed memory in
+btrfs_printk(), explicitly pass in a NULL fs_info so the printing of the
+device name will be skipped altogether.
+
+There was a slightly different approach discussed in
+https://lore.kernel.org/linux-btrfs/20200114060920.4527-1-anand.jain@oracle.com/t/#u
+
+Link: https://lore.kernel.org/linux-btrfs/000000000000c9e14b05afcc41ba@google.com
+Reported-by: syzbot+582e66e5edf36a22c7b0@syzkaller.appspotmail.com
+CC: stable@vger.kernel.org # 4.19+
+Reviewed-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/volumes.c |    8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -941,7 +941,13 @@ static noinline struct btrfs_device *dev
+                       if (device->bdev != path_bdev) {
+                               bdput(path_bdev);
+                               mutex_unlock(&fs_devices->device_list_mutex);
+-                              btrfs_warn_in_rcu(device->fs_info,
++                              /*
++                               * device->fs_info may not be reliable here, so
++                               * pass in a NULL instead. This avoids a
++                               * possible use-after-free when the fs_info and
++                               * fs_info->sb are already torn down.
++                               */
++                              btrfs_warn_in_rcu(NULL,
+       "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
+                                                 path, devid, found_transid,
+                                                 current->comm,
diff --git a/queue-5.9/btrfs-fix-lockdep-splat-when-reading-qgroup-config-on-mount.patch b/queue-5.9/btrfs-fix-lockdep-splat-when-reading-qgroup-config-on-mount.patch
new file mode 100644 (file)
index 0000000..07f8791
--- /dev/null
@@ -0,0 +1,162 @@
+From 3d05cad3c357a2b749912914356072b38435edfa Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 23 Nov 2020 14:28:44 +0000
+Subject: btrfs: fix lockdep splat when reading qgroup config on mount
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 3d05cad3c357a2b749912914356072b38435edfa upstream.
+
+Lockdep reported the following splat when running test btrfs/190 from
+fstests:
+
+  [ 9482.126098] ======================================================
+  [ 9482.126184] WARNING: possible circular locking dependency detected
+  [ 9482.126281] 5.10.0-rc4-btrfs-next-73 #1 Not tainted
+  [ 9482.126365] ------------------------------------------------------
+  [ 9482.126456] mount/24187 is trying to acquire lock:
+  [ 9482.126534] ffffa0c869a7dac0 (&fs_info->qgroup_rescan_lock){+.+.}-{3:3}, at: qgroup_rescan_init+0x43/0xf0 [btrfs]
+  [ 9482.126647]
+                but task is already holding lock:
+  [ 9482.126777] ffffa0c892ebd3a0 (btrfs-quota-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x27/0x120 [btrfs]
+  [ 9482.126886]
+                which lock already depends on the new lock.
+
+  [ 9482.127078]
+                the existing dependency chain (in reverse order) is:
+  [ 9482.127213]
+                -> #1 (btrfs-quota-00){++++}-{3:3}:
+  [ 9482.127366]        lock_acquire+0xd8/0x490
+  [ 9482.127436]        down_read_nested+0x45/0x220
+  [ 9482.127528]        __btrfs_tree_read_lock+0x27/0x120 [btrfs]
+  [ 9482.127613]        btrfs_read_lock_root_node+0x41/0x130 [btrfs]
+  [ 9482.127702]        btrfs_search_slot+0x514/0xc30 [btrfs]
+  [ 9482.127788]        update_qgroup_status_item+0x72/0x140 [btrfs]
+  [ 9482.127877]        btrfs_qgroup_rescan_worker+0xde/0x680 [btrfs]
+  [ 9482.127964]        btrfs_work_helper+0xf1/0x600 [btrfs]
+  [ 9482.128039]        process_one_work+0x24e/0x5e0
+  [ 9482.128110]        worker_thread+0x50/0x3b0
+  [ 9482.128181]        kthread+0x153/0x170
+  [ 9482.128256]        ret_from_fork+0x22/0x30
+  [ 9482.128327]
+                -> #0 (&fs_info->qgroup_rescan_lock){+.+.}-{3:3}:
+  [ 9482.128464]        check_prev_add+0x91/0xc60
+  [ 9482.128551]        __lock_acquire+0x1740/0x3110
+  [ 9482.128623]        lock_acquire+0xd8/0x490
+  [ 9482.130029]        __mutex_lock+0xa3/0xb30
+  [ 9482.130590]        qgroup_rescan_init+0x43/0xf0 [btrfs]
+  [ 9482.131577]        btrfs_read_qgroup_config+0x43a/0x550 [btrfs]
+  [ 9482.132175]        open_ctree+0x1228/0x18a0 [btrfs]
+  [ 9482.132756]        btrfs_mount_root.cold+0x13/0xed [btrfs]
+  [ 9482.133325]        legacy_get_tree+0x30/0x60
+  [ 9482.133866]        vfs_get_tree+0x28/0xe0
+  [ 9482.134392]        fc_mount+0xe/0x40
+  [ 9482.134908]        vfs_kern_mount.part.0+0x71/0x90
+  [ 9482.135428]        btrfs_mount+0x13b/0x3e0 [btrfs]
+  [ 9482.135942]        legacy_get_tree+0x30/0x60
+  [ 9482.136444]        vfs_get_tree+0x28/0xe0
+  [ 9482.136949]        path_mount+0x2d7/0xa70
+  [ 9482.137438]        do_mount+0x75/0x90
+  [ 9482.137923]        __x64_sys_mount+0x8e/0xd0
+  [ 9482.138400]        do_syscall_64+0x33/0x80
+  [ 9482.138873]        entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  [ 9482.139346]
+                other info that might help us debug this:
+
+  [ 9482.140735]  Possible unsafe locking scenario:
+
+  [ 9482.141594]        CPU0                    CPU1
+  [ 9482.142011]        ----                    ----
+  [ 9482.142411]   lock(btrfs-quota-00);
+  [ 9482.142806]                                lock(&fs_info->qgroup_rescan_lock);
+  [ 9482.143216]                                lock(btrfs-quota-00);
+  [ 9482.143629]   lock(&fs_info->qgroup_rescan_lock);
+  [ 9482.144056]
+                 *** DEADLOCK ***
+
+  [ 9482.145242] 2 locks held by mount/24187:
+  [ 9482.145637]  #0: ffffa0c8411c40e8 (&type->s_umount_key#44/1){+.+.}-{3:3}, at: alloc_super+0xb9/0x400
+  [ 9482.146061]  #1: ffffa0c892ebd3a0 (btrfs-quota-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x27/0x120 [btrfs]
+  [ 9482.146509]
+                stack backtrace:
+  [ 9482.147350] CPU: 1 PID: 24187 Comm: mount Not tainted 5.10.0-rc4-btrfs-next-73 #1
+  [ 9482.147788] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+  [ 9482.148709] Call Trace:
+  [ 9482.149169]  dump_stack+0x8d/0xb5
+  [ 9482.149628]  check_noncircular+0xff/0x110
+  [ 9482.150090]  check_prev_add+0x91/0xc60
+  [ 9482.150561]  ? kvm_clock_read+0x14/0x30
+  [ 9482.151017]  ? kvm_sched_clock_read+0x5/0x10
+  [ 9482.151470]  __lock_acquire+0x1740/0x3110
+  [ 9482.151941]  ? __btrfs_tree_read_lock+0x27/0x120 [btrfs]
+  [ 9482.152402]  lock_acquire+0xd8/0x490
+  [ 9482.152887]  ? qgroup_rescan_init+0x43/0xf0 [btrfs]
+  [ 9482.153354]  __mutex_lock+0xa3/0xb30
+  [ 9482.153826]  ? qgroup_rescan_init+0x43/0xf0 [btrfs]
+  [ 9482.154301]  ? qgroup_rescan_init+0x43/0xf0 [btrfs]
+  [ 9482.154768]  ? qgroup_rescan_init+0x43/0xf0 [btrfs]
+  [ 9482.155226]  qgroup_rescan_init+0x43/0xf0 [btrfs]
+  [ 9482.155690]  btrfs_read_qgroup_config+0x43a/0x550 [btrfs]
+  [ 9482.156160]  open_ctree+0x1228/0x18a0 [btrfs]
+  [ 9482.156643]  btrfs_mount_root.cold+0x13/0xed [btrfs]
+  [ 9482.157108]  ? rcu_read_lock_sched_held+0x5d/0x90
+  [ 9482.157567]  ? kfree+0x31f/0x3e0
+  [ 9482.158030]  legacy_get_tree+0x30/0x60
+  [ 9482.158489]  vfs_get_tree+0x28/0xe0
+  [ 9482.158947]  fc_mount+0xe/0x40
+  [ 9482.159403]  vfs_kern_mount.part.0+0x71/0x90
+  [ 9482.159875]  btrfs_mount+0x13b/0x3e0 [btrfs]
+  [ 9482.160335]  ? rcu_read_lock_sched_held+0x5d/0x90
+  [ 9482.160805]  ? kfree+0x31f/0x3e0
+  [ 9482.161260]  ? legacy_get_tree+0x30/0x60
+  [ 9482.161714]  legacy_get_tree+0x30/0x60
+  [ 9482.162166]  vfs_get_tree+0x28/0xe0
+  [ 9482.162616]  path_mount+0x2d7/0xa70
+  [ 9482.163070]  do_mount+0x75/0x90
+  [ 9482.163525]  __x64_sys_mount+0x8e/0xd0
+  [ 9482.163986]  do_syscall_64+0x33/0x80
+  [ 9482.164437]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
+  [ 9482.164902] RIP: 0033:0x7f51e907caaa
+
+This happens because at btrfs_read_qgroup_config() we can call
+qgroup_rescan_init() while holding a read lock on a quota btree leaf,
+acquired by the previous call to btrfs_search_slot_for_read(), and
+qgroup_rescan_init() acquires the mutex qgroup_rescan_lock.
+
+A qgroup rescan worker does the opposite: it acquires the mutex
+qgroup_rescan_lock, at btrfs_qgroup_rescan_worker(), and then tries to
+update the qgroup status item in the quota btree through the call to
+update_qgroup_status_item(). This inversion of locking order
+between the qgroup_rescan_lock mutex and quota btree locks causes the
+splat.
+
+Fix this simply by releasing and freeing the path before calling
+qgroup_rescan_init() at btrfs_read_qgroup_config().
+
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/qgroup.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -497,13 +497,13 @@ next2:
+                       break;
+       }
+ out:
++      btrfs_free_path(path);
+       fs_info->qgroup_flags |= flags;
+       if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
+               clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+       else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
+                ret >= 0)
+               ret = qgroup_rescan_init(fs_info, rescan_progress, 0);
+-      btrfs_free_path(path);
+       if (ret < 0) {
+               ulist_free(fs_info->qgroup_ulist);
diff --git a/queue-5.9/btrfs-fix-missing-delalloc-new-bit-for-new-delalloc-ranges.patch b/queue-5.9/btrfs-fix-missing-delalloc-new-bit-for-new-delalloc-ranges.patch
new file mode 100644 (file)
index 0000000..2c76d7b
--- /dev/null
@@ -0,0 +1,280 @@
+From c334730988ee07908ba4eb816ce78d3fe06fecaa Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 4 Nov 2020 11:07:31 +0000
+Subject: btrfs: fix missing delalloc new bit for new delalloc ranges
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit c334730988ee07908ba4eb816ce78d3fe06fecaa upstream.
+
+When doing a buffered write, through one of the write family syscalls, we
+look for ranges which currently don't have allocated extents and set the
+'delalloc new' bit on them, so that we can report a correct number of used
+blocks to the stat(2) syscall until delalloc is flushed and ordered extents
+complete.
+
+However there are a few other places where we can do a buffered write
+against a range that is mapped to a hole (no extent allocated) and where
+we do not set the 'new delalloc' bit. Those places are:
+
+- Doing a memory mapped write against a hole;
+
+- Cloning an inline extent into a hole starting at file offset 0;
+
+- Calling btrfs_cont_expand() when the i_size of the file is not aligned
+  to the sector size and is located in a hole. For example when cloning
+  to a destination offset beyond EOF.
+
+So after such cases, until the corresponding delalloc range is flushed and
+the respective ordered extents complete, we can report an incorrect number
+of blocks used through the stat(2) syscall.
+
+In some cases we can end up reporting 0 used blocks to stat(2), which is a
+particular bad value to report as it may mislead tools to think a file is
+completely sparse when its i_size is not zero, making them skip reading
+any data, an undesired consequence for tools such as archivers and other
+backup tools, as reported a long time ago in the following thread (and
+other past threads):
+
+  https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
+
+Example reproducer:
+
+  $ cat reproducer.sh
+  #!/bin/bash
+
+  MNT=/mnt/sdi
+  DEV=/dev/sdi
+
+  mkfs.btrfs -f $DEV > /dev/null
+  # mkfs.xfs -f $DEV > /dev/null
+  # mkfs.ext4 -F $DEV > /dev/null
+  # mkfs.f2fs -f $DEV > /dev/null
+  mount $DEV $MNT
+
+  xfs_io -f -c "truncate 64K"   \
+      -c "mmap -w 0 64K"        \
+      -c "mwrite -S 0xab 0 64K" \
+      -c "munmap"               \
+      $MNT/foo
+
+  blocks_used=$(stat -c %b $MNT/foo)
+  echo "blocks used: $blocks_used"
+
+  if [ $blocks_used -eq 0 ]; then
+      echo "ERROR: blocks used is 0"
+  fi
+
+  umount $DEV
+
+  $ ./reproducer.sh
+  blocks used: 0
+  ERROR: blocks used is 0
+
+So move the logic that decides to set the 'delalloc bit' bit into the
+function btrfs_set_extent_delalloc(), since that is what we use for all
+those missing cases as well as for the cases that currently work well.
+
+This change is also preparatory work for an upcoming patch that fixes
+other problems related to tracking and reporting the number of bytes used
+by an inode.
+
+CC: stable@vger.kernel.org # 4.19+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/file.c              |   57 ------------------------------------------
+ fs/btrfs/inode.c             |   58 +++++++++++++++++++++++++++++++++++++++++++
+ fs/btrfs/tests/inode-tests.c |   12 +++++---
+ 3 files changed, 66 insertions(+), 61 deletions(-)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -452,46 +452,6 @@ static void btrfs_drop_pages(struct page
+       }
+ }
+-static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+-                                       const u64 start,
+-                                       const u64 len,
+-                                       struct extent_state **cached_state)
+-{
+-      u64 search_start = start;
+-      const u64 end = start + len - 1;
+-
+-      while (search_start < end) {
+-              const u64 search_len = end - search_start + 1;
+-              struct extent_map *em;
+-              u64 em_len;
+-              int ret = 0;
+-
+-              em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
+-              if (IS_ERR(em))
+-                      return PTR_ERR(em);
+-
+-              if (em->block_start != EXTENT_MAP_HOLE)
+-                      goto next;
+-
+-              em_len = em->len;
+-              if (em->start < search_start)
+-                      em_len -= search_start - em->start;
+-              if (em_len > search_len)
+-                      em_len = search_len;
+-
+-              ret = set_extent_bit(&inode->io_tree, search_start,
+-                                   search_start + em_len - 1,
+-                                   EXTENT_DELALLOC_NEW,
+-                                   NULL, cached_state, GFP_NOFS);
+-next:
+-              search_start = extent_map_end(em);
+-              free_extent_map(em);
+-              if (ret)
+-                      return ret;
+-      }
+-      return 0;
+-}
+-
+ /*
+  * after copy_from_user, pages need to be dirtied and we need to make
+  * sure holes are created between the current EOF and the start of
+@@ -528,23 +488,6 @@ int btrfs_dirty_pages(struct btrfs_inode
+                        EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+                        0, 0, cached);
+-      if (!btrfs_is_free_space_inode(inode)) {
+-              if (start_pos >= isize &&
+-                  !(inode->flags & BTRFS_INODE_PREALLOC)) {
+-                      /*
+-                       * There can't be any extents following eof in this case
+-                       * so just set the delalloc new bit for the range
+-                       * directly.
+-                       */
+-                      extra_bits |= EXTENT_DELALLOC_NEW;
+-              } else {
+-                      err = btrfs_find_new_delalloc_bytes(inode, start_pos,
+-                                                          num_bytes, cached);
+-                      if (err)
+-                              return err;
+-              }
+-      }
+-
+       err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+                                       extra_bits, cached);
+       if (err)
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -2262,11 +2262,69 @@ static noinline int add_pending_csums(st
+       return 0;
+ }
++static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
++                                       const u64 start,
++                                       const u64 len,
++                                       struct extent_state **cached_state)
++{
++      u64 search_start = start;
++      const u64 end = start + len - 1;
++
++      while (search_start < end) {
++              const u64 search_len = end - search_start + 1;
++              struct extent_map *em;
++              u64 em_len;
++              int ret = 0;
++
++              em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
++              if (IS_ERR(em))
++                      return PTR_ERR(em);
++
++              if (em->block_start != EXTENT_MAP_HOLE)
++                      goto next;
++
++              em_len = em->len;
++              if (em->start < search_start)
++                      em_len -= search_start - em->start;
++              if (em_len > search_len)
++                      em_len = search_len;
++
++              ret = set_extent_bit(&inode->io_tree, search_start,
++                                   search_start + em_len - 1,
++                                   EXTENT_DELALLOC_NEW,
++                                   NULL, cached_state, GFP_NOFS);
++next:
++              search_start = extent_map_end(em);
++              free_extent_map(em);
++              if (ret)
++                      return ret;
++      }
++      return 0;
++}
++
+ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
+                             unsigned int extra_bits,
+                             struct extent_state **cached_state)
+ {
+       WARN_ON(PAGE_ALIGNED(end));
++
++      if (start >= i_size_read(&inode->vfs_inode) &&
++          !(inode->flags & BTRFS_INODE_PREALLOC)) {
++              /*
++               * There can't be any extents following eof in this case so just
++               * set the delalloc new bit for the range directly.
++               */
++              extra_bits |= EXTENT_DELALLOC_NEW;
++      } else {
++              int ret;
++
++              ret = btrfs_find_new_delalloc_bytes(inode, start,
++                                                  end + 1 - start,
++                                                  cached_state);
++              if (ret)
++                      return ret;
++      }
++
+       return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
+                                  cached_state);
+ }
+--- a/fs/btrfs/tests/inode-tests.c
++++ b/fs/btrfs/tests/inode-tests.c
+@@ -986,7 +986,8 @@ static int test_extent_accounting(u32 se
+       ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+                              BTRFS_MAX_EXTENT_SIZE >> 1,
+                              (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1,
+-                             EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
++                             EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
++                             EXTENT_UPTODATE, 0, 0, NULL);
+       if (ret) {
+               test_err("clear_extent_bit returned %d", ret);
+               goto out;
+@@ -1053,7 +1054,8 @@ static int test_extent_accounting(u32 se
+       ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+                              BTRFS_MAX_EXTENT_SIZE + sectorsize,
+                              BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1,
+-                             EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
++                             EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
++                             EXTENT_UPTODATE, 0, 0, NULL);
+       if (ret) {
+               test_err("clear_extent_bit returned %d", ret);
+               goto out;
+@@ -1085,7 +1087,8 @@ static int test_extent_accounting(u32 se
+       /* Empty */
+       ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+-                             EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
++                             EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
++                             EXTENT_UPTODATE, 0, 0, NULL);
+       if (ret) {
+               test_err("clear_extent_bit returned %d", ret);
+               goto out;
+@@ -1100,7 +1103,8 @@ static int test_extent_accounting(u32 se
+ out:
+       if (ret)
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+-                               EXTENT_DELALLOC | EXTENT_UPTODATE, 0, 0, NULL);
++                               EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
++                               EXTENT_UPTODATE, 0, 0, NULL);
+       iput(inode);
+       btrfs_free_dummy_root(root);
+       btrfs_free_dummy_fs_info(fs_info);
diff --git a/queue-5.9/btrfs-tree-checker-add-missing-return-after-error-in-root_item.patch b/queue-5.9/btrfs-tree-checker-add-missing-return-after-error-in-root_item.patch
new file mode 100644 (file)
index 0000000..5804554
--- /dev/null
@@ -0,0 +1,36 @@
+From 1a49a97df657c63a4e8ffcd1ea9b6ed95581789b Mon Sep 17 00:00:00 2001
+From: Daniel Xu <dxu@dxuuu.xyz>
+Date: Thu, 12 Nov 2020 17:55:06 -0800
+Subject: btrfs: tree-checker: add missing return after error in root_item
+
+From: Daniel Xu <dxu@dxuuu.xyz>
+
+commit 1a49a97df657c63a4e8ffcd1ea9b6ed95581789b upstream.
+
+There's a missing return statement after an error is found in the
+root_item, this can cause further problems when a crafted image triggers
+the error.
+
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=210181
+Fixes: 259ee7754b67 ("btrfs: tree-checker: Add ROOT_ITEM check")
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-checker.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -1068,6 +1068,7 @@ static int check_root_item(struct extent
+                           "invalid root item size, have %u expect %zu or %u",
+                           btrfs_item_size_nr(leaf, slot), sizeof(ri),
+                           btrfs_legacy_root_item_size());
++              return -EUCLEAN;
+       }
+       /*
diff --git a/queue-5.9/btrfs-tree-checker-add-missing-returns-after-data_ref-alignment-checks.patch b/queue-5.9/btrfs-tree-checker-add-missing-returns-after-data_ref-alignment-checks.patch
new file mode 100644 (file)
index 0000000..8758d1e
--- /dev/null
@@ -0,0 +1,41 @@
+From 6d06b0ad94d3dd7e3503d8ad39c39c4634884611 Mon Sep 17 00:00:00 2001
+From: David Sterba <dsterba@suse.com>
+Date: Mon, 16 Nov 2020 19:53:52 +0100
+Subject: btrfs: tree-checker: add missing returns after data_ref alignment checks
+
+From: David Sterba <dsterba@suse.com>
+
+commit 6d06b0ad94d3dd7e3503d8ad39c39c4634884611 upstream.
+
+There are sectorsize alignment checks that are reported but then
+check_extent_data_ref continues. This was not intended, wrong alignment
+is not a minor problem and we should return with error.
+
+CC: stable@vger.kernel.org # 5.4+
+Fixes: 0785a9aacf9d ("btrfs: tree-checker: Add EXTENT_DATA_REF check")
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/tree-checker.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -1424,6 +1424,7 @@ static int check_extent_data_ref(struct
+       "invalid item size, have %u expect aligned to %zu for key type %u",
+                           btrfs_item_size_nr(leaf, slot),
+                           sizeof(*dref), key->type);
++              return -EUCLEAN;
+       }
+       if (!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize)) {
+               generic_err(leaf, slot,
+@@ -1452,6 +1453,7 @@ static int check_extent_data_ref(struct
+                       extent_err(leaf, slot,
+       "invalid extent data backref offset, have %llu expect aligned to %u",
+                                  offset, leaf->fs_info->sectorsize);
++                      return -EUCLEAN;
+               }
+       }
+       return 0;
diff --git a/queue-5.9/ib-hfi1-ensure-correct-mm-is-used-at-all-times.patch b/queue-5.9/ib-hfi1-ensure-correct-mm-is-used-at-all-times.patch
new file mode 100644 (file)
index 0000000..f5a1e0a
--- /dev/null
@@ -0,0 +1,444 @@
+From 3d2a9d642512c21a12d19b9250e7a835dcb41a79 Mon Sep 17 00:00:00 2001
+From: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
+Date: Wed, 25 Nov 2020 16:01:12 -0500
+Subject: IB/hfi1: Ensure correct mm is used at all times
+
+From: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
+
+commit 3d2a9d642512c21a12d19b9250e7a835dcb41a79 upstream.
+
+Two earlier bug fixes have created a security problem in the hfi1
+driver. One fix aimed to solve an issue where current->mm was not valid
+when closing the hfi1 cdev. It attempted to do this by saving a cached
+value of the current->mm pointer at file open time. This is a problem if
+another process with access to the FD calls in via write() or ioctl() to
+pin pages via the hfi driver. The other fix tried to solve a use after
+free by taking a reference on the mm.
+
+To fix this correctly we use the existing cached value of the mm in the
+mmu notifier. Now we can check in the insert, evict, etc. routines that
+current->mm matched what the notifier was registered for. If not, then
+don't allow access. The register of the mmu notifier will save the mm
+pointer.
+
+Since in do_exit() the exit_mm() is called before exit_files(), which
+would call our close routine a reference is needed on the mm. We rely on
+the mmgrab done by the registration of the notifier, whereas before it was
+explicit. The mmu notifier deregistration happens when the user context is
+torn down, the creation of which triggered the registration.
+
+Also of note is we do not do any explicit work to protect the interval
+tree notifier. It doesn't seem that this is going to be needed since we
+aren't actually doing anything with current->mm. The interval tree
+notifier stuff still has a FIXME noted from a previous commit that will be
+addressed in a follow on patch.
+
+Cc: <stable@vger.kernel.org>
+Fixes: e0cf75deab81 ("IB/hfi1: Fix mm_struct use after free")
+Fixes: 3faa3d9a308e ("IB/hfi1: Make use of mm consistent")
+Link: https://lore.kernel.org/r/20201125210112.104301.51331.stgit@awfm-01.aw.intel.com
+Suggested-by: Jann Horn <jannh@google.com>
+Reported-by: Jason Gunthorpe <jgg@nvidia.com>
+Reviewed-by: Ira Weiny <ira.weiny@intel.com>
+Reviewed-by: Mike Marciniszyn <mike.marciniszyn@cornelisnetworks.com>
+Signed-off-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/infiniband/hw/hfi1/file_ops.c     |    4 -
+ drivers/infiniband/hw/hfi1/hfi.h          |    2 
+ drivers/infiniband/hw/hfi1/mmu_rb.c       |   66 +++++++++++++++---------------
+ drivers/infiniband/hw/hfi1/mmu_rb.h       |   16 ++++++-
+ drivers/infiniband/hw/hfi1/user_exp_rcv.c |   12 +++--
+ drivers/infiniband/hw/hfi1/user_exp_rcv.h |    6 ++
+ drivers/infiniband/hw/hfi1/user_sdma.c    |   13 +++--
+ drivers/infiniband/hw/hfi1/user_sdma.h    |    7 ++-
+ 8 files changed, 78 insertions(+), 48 deletions(-)
+
+--- a/drivers/infiniband/hw/hfi1/file_ops.c
++++ b/drivers/infiniband/hw/hfi1/file_ops.c
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright(c) 2020 Cornelis Networks, Inc.
+  * Copyright(c) 2015-2020 Intel Corporation.
+  *
+  * This file is provided under a dual BSD/GPLv2 license.  When using or
+@@ -206,8 +207,6 @@ static int hfi1_file_open(struct inode *
+       spin_lock_init(&fd->tid_lock);
+       spin_lock_init(&fd->invalid_lock);
+       fd->rec_cpu_num = -1; /* no cpu affinity by default */
+-      fd->mm = current->mm;
+-      mmgrab(fd->mm);
+       fd->dd = dd;
+       fp->private_data = fd;
+       return 0;
+@@ -711,7 +710,6 @@ static int hfi1_file_close(struct inode
+       deallocate_ctxt(uctxt);
+ done:
+-      mmdrop(fdata->mm);
+       if (atomic_dec_and_test(&dd->user_refcount))
+               complete(&dd->user_comp);
+--- a/drivers/infiniband/hw/hfi1/hfi.h
++++ b/drivers/infiniband/hw/hfi1/hfi.h
+@@ -1,6 +1,7 @@
+ #ifndef _HFI1_KERNEL_H
+ #define _HFI1_KERNEL_H
+ /*
++ * Copyright(c) 2020 Cornelis Networks, Inc.
+  * Copyright(c) 2015-2020 Intel Corporation.
+  *
+  * This file is provided under a dual BSD/GPLv2 license.  When using or
+@@ -1451,7 +1452,6 @@ struct hfi1_filedata {
+       u32 invalid_tid_idx;
+       /* protect invalid_tids array and invalid_tid_idx */
+       spinlock_t invalid_lock;
+-      struct mm_struct *mm;
+ };
+ extern struct xarray hfi1_dev_table;
+--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
++++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright(c) 2020 Cornelis Networks, Inc.
+  * Copyright(c) 2016 - 2017 Intel Corporation.
+  *
+  * This file is provided under a dual BSD/GPLv2 license.  When using or
+@@ -48,23 +49,11 @@
+ #include <linux/rculist.h>
+ #include <linux/mmu_notifier.h>
+ #include <linux/interval_tree_generic.h>
++#include <linux/sched/mm.h>
+ #include "mmu_rb.h"
+ #include "trace.h"
+-struct mmu_rb_handler {
+-      struct mmu_notifier mn;
+-      struct rb_root_cached root;
+-      void *ops_arg;
+-      spinlock_t lock;        /* protect the RB tree */
+-      struct mmu_rb_ops *ops;
+-      struct mm_struct *mm;
+-      struct list_head lru_list;
+-      struct work_struct del_work;
+-      struct list_head del_list;
+-      struct workqueue_struct *wq;
+-};
+-
+ static unsigned long mmu_node_start(struct mmu_rb_node *);
+ static unsigned long mmu_node_last(struct mmu_rb_node *);
+ static int mmu_notifier_range_start(struct mmu_notifier *,
+@@ -92,37 +81,36 @@ static unsigned long mmu_node_last(struc
+       return PAGE_ALIGN(node->addr + node->len) - 1;
+ }
+-int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
++int hfi1_mmu_rb_register(void *ops_arg,
+                        struct mmu_rb_ops *ops,
+                        struct workqueue_struct *wq,
+                        struct mmu_rb_handler **handler)
+ {
+-      struct mmu_rb_handler *handlr;
++      struct mmu_rb_handler *h;
+       int ret;
+-      handlr = kmalloc(sizeof(*handlr), GFP_KERNEL);
+-      if (!handlr)
++      h = kmalloc(sizeof(*h), GFP_KERNEL);
++      if (!h)
+               return -ENOMEM;
+-      handlr->root = RB_ROOT_CACHED;
+-      handlr->ops = ops;
+-      handlr->ops_arg = ops_arg;
+-      INIT_HLIST_NODE(&handlr->mn.hlist);
+-      spin_lock_init(&handlr->lock);
+-      handlr->mn.ops = &mn_opts;
+-      handlr->mm = mm;
+-      INIT_WORK(&handlr->del_work, handle_remove);
+-      INIT_LIST_HEAD(&handlr->del_list);
+-      INIT_LIST_HEAD(&handlr->lru_list);
+-      handlr->wq = wq;
++      h->root = RB_ROOT_CACHED;
++      h->ops = ops;
++      h->ops_arg = ops_arg;
++      INIT_HLIST_NODE(&h->mn.hlist);
++      spin_lock_init(&h->lock);
++      h->mn.ops = &mn_opts;
++      INIT_WORK(&h->del_work, handle_remove);
++      INIT_LIST_HEAD(&h->del_list);
++      INIT_LIST_HEAD(&h->lru_list);
++      h->wq = wq;
+-      ret = mmu_notifier_register(&handlr->mn, handlr->mm);
++      ret = mmu_notifier_register(&h->mn, current->mm);
+       if (ret) {
+-              kfree(handlr);
++              kfree(h);
+               return ret;
+       }
+-      *handler = handlr;
++      *handler = h;
+       return 0;
+ }
+@@ -134,7 +122,7 @@ void hfi1_mmu_rb_unregister(struct mmu_r
+       struct list_head del_list;
+       /* Unregister first so we don't get any more notifications. */
+-      mmu_notifier_unregister(&handler->mn, handler->mm);
++      mmu_notifier_unregister(&handler->mn, handler->mn.mm);
+       /*
+        * Make sure the wq delete handler is finished running.  It will not
+@@ -166,6 +154,10 @@ int hfi1_mmu_rb_insert(struct mmu_rb_han
+       int ret = 0;
+       trace_hfi1_mmu_rb_insert(mnode->addr, mnode->len);
++
++      if (current->mm != handler->mn.mm)
++              return -EPERM;
++
+       spin_lock_irqsave(&handler->lock, flags);
+       node = __mmu_rb_search(handler, mnode->addr, mnode->len);
+       if (node) {
+@@ -180,6 +172,7 @@ int hfi1_mmu_rb_insert(struct mmu_rb_han
+               __mmu_int_rb_remove(mnode, &handler->root);
+               list_del(&mnode->list); /* remove from LRU list */
+       }
++      mnode->handler = handler;
+ unlock:
+       spin_unlock_irqrestore(&handler->lock, flags);
+       return ret;
+@@ -217,6 +210,9 @@ bool hfi1_mmu_rb_remove_unless_exact(str
+       unsigned long flags;
+       bool ret = false;
++      if (current->mm != handler->mn.mm)
++              return ret;
++
+       spin_lock_irqsave(&handler->lock, flags);
+       node = __mmu_rb_search(handler, addr, len);
+       if (node) {
+@@ -239,6 +235,9 @@ void hfi1_mmu_rb_evict(struct mmu_rb_han
+       unsigned long flags;
+       bool stop = false;
++      if (current->mm != handler->mn.mm)
++              return;
++
+       INIT_LIST_HEAD(&del_list);
+       spin_lock_irqsave(&handler->lock, flags);
+@@ -272,6 +271,9 @@ void hfi1_mmu_rb_remove(struct mmu_rb_ha
+ {
+       unsigned long flags;
++      if (current->mm != handler->mn.mm)
++              return;
++
+       /* Validity of handler and node pointers has been checked by caller. */
+       trace_hfi1_mmu_rb_remove(node->addr, node->len);
+       spin_lock_irqsave(&handler->lock, flags);
+--- a/drivers/infiniband/hw/hfi1/mmu_rb.h
++++ b/drivers/infiniband/hw/hfi1/mmu_rb.h
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright(c) 2020 Cornelis Networks, Inc.
+  * Copyright(c) 2016 Intel Corporation.
+  *
+  * This file is provided under a dual BSD/GPLv2 license.  When using or
+@@ -54,6 +55,7 @@ struct mmu_rb_node {
+       unsigned long len;
+       unsigned long __last;
+       struct rb_node node;
++      struct mmu_rb_handler *handler;
+       struct list_head list;
+ };
+@@ -71,7 +73,19 @@ struct mmu_rb_ops {
+                    void *evict_arg, bool *stop);
+ };
+-int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
++struct mmu_rb_handler {
++      struct mmu_notifier mn;
++      struct rb_root_cached root;
++      void *ops_arg;
++      spinlock_t lock;        /* protect the RB tree */
++      struct mmu_rb_ops *ops;
++      struct list_head lru_list;
++      struct work_struct del_work;
++      struct list_head del_list;
++      struct workqueue_struct *wq;
++};
++
++int hfi1_mmu_rb_register(void *ops_arg,
+                        struct mmu_rb_ops *ops,
+                        struct workqueue_struct *wq,
+                        struct mmu_rb_handler **handler);
+--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
++++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright(c) 2020 Cornelis Networks, Inc.
+  * Copyright(c) 2015-2018 Intel Corporation.
+  *
+  * This file is provided under a dual BSD/GPLv2 license.  When using or
+@@ -173,15 +174,18 @@ static void unpin_rcv_pages(struct hfi1_
+ {
+       struct page **pages;
+       struct hfi1_devdata *dd = fd->uctxt->dd;
++      struct mm_struct *mm;
+       if (mapped) {
+               pci_unmap_single(dd->pcidev, node->dma_addr,
+                                node->npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
+               pages = &node->pages[idx];
++              mm = mm_from_tid_node(node);
+       } else {
+               pages = &tidbuf->pages[idx];
++              mm = current->mm;
+       }
+-      hfi1_release_user_pages(fd->mm, pages, npages, mapped);
++      hfi1_release_user_pages(mm, pages, npages, mapped);
+       fd->tid_n_pinned -= npages;
+ }
+@@ -216,12 +220,12 @@ static int pin_rcv_pages(struct hfi1_fil
+        * pages, accept the amount pinned so far and program only that.
+        * User space knows how to deal with partially programmed buffers.
+        */
+-      if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) {
++      if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
+               kfree(pages);
+               return -ENOMEM;
+       }
+-      pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages);
++      pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
+       if (pinned <= 0) {
+               kfree(pages);
+               return pinned;
+@@ -756,7 +760,7 @@ static int set_rcvarray_entry(struct hfi
+       if (fd->use_mn) {
+               ret = mmu_interval_notifier_insert(
+-                      &node->notifier, fd->mm,
++                      &node->notifier, current->mm,
+                       tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
+                       &tid_mn_ops);
+               if (ret)
+--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h
++++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
+@@ -1,6 +1,7 @@
+ #ifndef _HFI1_USER_EXP_RCV_H
+ #define _HFI1_USER_EXP_RCV_H
+ /*
++ * Copyright(c) 2020 - Cornelis Networks, Inc.
+  * Copyright(c) 2015 - 2017 Intel Corporation.
+  *
+  * This file is provided under a dual BSD/GPLv2 license.  When using or
+@@ -95,4 +96,9 @@ int hfi1_user_exp_rcv_clear(struct hfi1_
+ int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
+                             struct hfi1_tid_info *tinfo);
++static inline struct mm_struct *mm_from_tid_node(struct tid_rb_node *node)
++{
++      return node->notifier.mm;
++}
++
+ #endif /* _HFI1_USER_EXP_RCV_H */
+--- a/drivers/infiniband/hw/hfi1/user_sdma.c
++++ b/drivers/infiniband/hw/hfi1/user_sdma.c
+@@ -1,4 +1,5 @@
+ /*
++ * Copyright(c) 2020 - Cornelis Networks, Inc.
+  * Copyright(c) 2015 - 2018 Intel Corporation.
+  *
+  * This file is provided under a dual BSD/GPLv2 license.  When using or
+@@ -188,7 +189,6 @@ int hfi1_user_sdma_alloc_queues(struct h
+       atomic_set(&pq->n_reqs, 0);
+       init_waitqueue_head(&pq->wait);
+       atomic_set(&pq->n_locked, 0);
+-      pq->mm = fd->mm;
+       iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
+                   activate_packet_queue, NULL, NULL);
+@@ -230,7 +230,7 @@ int hfi1_user_sdma_alloc_queues(struct h
+       cq->nentries = hfi1_sdma_comp_ring_size;
+-      ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
++      ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq,
+                                  &pq->handler);
+       if (ret) {
+               dd_dev_err(dd, "Failed to register with MMU %d", ret);
+@@ -980,13 +980,13 @@ static int pin_sdma_pages(struct user_sd
+       npages -= node->npages;
+ retry:
+-      if (!hfi1_can_pin_pages(pq->dd, pq->mm,
++      if (!hfi1_can_pin_pages(pq->dd, current->mm,
+                               atomic_read(&pq->n_locked), npages)) {
+               cleared = sdma_cache_evict(pq, npages);
+               if (cleared >= npages)
+                       goto retry;
+       }
+-      pinned = hfi1_acquire_user_pages(pq->mm,
++      pinned = hfi1_acquire_user_pages(current->mm,
+                                        ((unsigned long)iovec->iov.iov_base +
+                                        (node->npages * PAGE_SIZE)), npages, 0,
+                                        pages + node->npages);
+@@ -995,7 +995,7 @@ retry:
+               return pinned;
+       }
+       if (pinned != npages) {
+-              unpin_vector_pages(pq->mm, pages, node->npages, pinned);
++              unpin_vector_pages(current->mm, pages, node->npages, pinned);
+               return -EFAULT;
+       }
+       kfree(node->pages);
+@@ -1008,7 +1008,8 @@ retry:
+ static void unpin_sdma_pages(struct sdma_mmu_node *node)
+ {
+       if (node->npages) {
+-              unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
++              unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
++                                 node->npages);
+               atomic_sub(node->npages, &node->pq->n_locked);
+       }
+ }
+--- a/drivers/infiniband/hw/hfi1/user_sdma.h
++++ b/drivers/infiniband/hw/hfi1/user_sdma.h
+@@ -1,6 +1,7 @@
+ #ifndef _HFI1_USER_SDMA_H
+ #define _HFI1_USER_SDMA_H
+ /*
++ * Copyright(c) 2020 - Cornelis Networks, Inc.
+  * Copyright(c) 2015 - 2018 Intel Corporation.
+  *
+  * This file is provided under a dual BSD/GPLv2 license.  When using or
+@@ -133,7 +134,6 @@ struct hfi1_user_sdma_pkt_q {
+       unsigned long unpinned;
+       struct mmu_rb_handler *handler;
+       atomic_t n_locked;
+-      struct mm_struct *mm;
+ };
+ struct hfi1_user_sdma_comp_q {
+@@ -250,4 +250,9 @@ int hfi1_user_sdma_process_request(struc
+                                  struct iovec *iovec, unsigned long dim,
+                                  unsigned long *count);
++static inline struct mm_struct *mm_from_sdma_node(struct sdma_mmu_node *node)
++{
++      return node->rb.handler->mn.mm;
++}
++
+ #endif /* _HFI1_USER_SDMA_H */
diff --git a/queue-5.9/rdma-i40iw-address-an-mmap-handler-exploit-in-i40iw.patch b/queue-5.9/rdma-i40iw-address-an-mmap-handler-exploit-in-i40iw.patch
new file mode 100644 (file)
index 0000000..e692987
--- /dev/null
@@ -0,0 +1,106 @@
+From 2ed381439e89fa6d1a0839ef45ccd45d99d8e915 Mon Sep 17 00:00:00 2001
+From: Shiraz Saleem <shiraz.saleem@intel.com>
+Date: Tue, 24 Nov 2020 18:56:16 -0600
+Subject: RDMA/i40iw: Address an mmap handler exploit in i40iw
+
+From: Shiraz Saleem <shiraz.saleem@intel.com>
+
+commit 2ed381439e89fa6d1a0839ef45ccd45d99d8e915 upstream.
+
+i40iw_mmap manipulates the vma->vm_pgoff to differentiate a push page mmap
+vs a doorbell mmap, and uses it to compute the pfn in remap_pfn_range
+without any validation. This is vulnerable to an mmap exploit as described
+in: https://lore.kernel.org/r/20201119093523.7588-1-zhudi21@huawei.com
+
+The push feature is disabled in the driver currently and therefore no push
+mmaps are issued from user-space. The feature does not work as expected in
+the x722 product.
+
+Remove the push module parameter and all VMA attribute manipulations for
+this feature in i40iw_mmap. Update i40iw_mmap to only allow DB user
+mmapings at offset = 0. Check vm_pgoff for zero and if the mmaps are bound
+to a single page.
+
+Cc: <stable@kernel.org>
+Fixes: d37498417947 ("i40iw: add files for iwarp interface")
+Link: https://lore.kernel.org/r/20201125005616.1800-2-shiraz.saleem@intel.com
+Reported-by: Di Zhu <zhudi21@huawei.com>
+Signed-off-by: Shiraz Saleem <shiraz.saleem@intel.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/infiniband/hw/i40iw/i40iw_main.c  |    5 ----
+ drivers/infiniband/hw/i40iw/i40iw_verbs.c |   37 +++++-------------------------
+ 2 files changed, 7 insertions(+), 35 deletions(-)
+
+--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
++++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
+@@ -54,10 +54,6 @@
+ #define DRV_VERSION   __stringify(DRV_VERSION_MAJOR) "."              \
+       __stringify(DRV_VERSION_MINOR) "." __stringify(DRV_VERSION_BUILD)
+-static int push_mode;
+-module_param(push_mode, int, 0644);
+-MODULE_PARM_DESC(push_mode, "Low latency mode: 0=disabled (default), 1=enabled)");
+-
+ static int debug;
+ module_param(debug, int, 0644);
+ MODULE_PARM_DESC(debug, "debug flags: 0=disabled (default), 0x7fffffff=all");
+@@ -1580,7 +1576,6 @@ static enum i40iw_status_code i40iw_setu
+       if (status)
+               goto exit;
+       iwdev->obj_next = iwdev->obj_mem;
+-      iwdev->push_mode = push_mode;
+       init_waitqueue_head(&iwdev->vchnl_waitq);
+       init_waitqueue_head(&dev->vf_reqs);
+--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
++++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+@@ -167,39 +167,16 @@ static void i40iw_dealloc_ucontext(struc
+  */
+ static int i40iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+ {
+-      struct i40iw_ucontext *ucontext;
+-      u64 db_addr_offset, push_offset, pfn;
++      struct i40iw_ucontext *ucontext = to_ucontext(context);
++      u64 dbaddr;
+-      ucontext = to_ucontext(context);
+-      if (ucontext->iwdev->sc_dev.is_pf) {
+-              db_addr_offset = I40IW_DB_ADDR_OFFSET;
+-              push_offset = I40IW_PUSH_OFFSET;
+-              if (vma->vm_pgoff)
+-                      vma->vm_pgoff += I40IW_PF_FIRST_PUSH_PAGE_INDEX - 1;
+-      } else {
+-              db_addr_offset = I40IW_VF_DB_ADDR_OFFSET;
+-              push_offset = I40IW_VF_PUSH_OFFSET;
+-              if (vma->vm_pgoff)
+-                      vma->vm_pgoff += I40IW_VF_FIRST_PUSH_PAGE_INDEX - 1;
+-      }
++      if (vma->vm_pgoff || vma->vm_end - vma->vm_start != PAGE_SIZE)
++              return -EINVAL;
+-      vma->vm_pgoff += db_addr_offset >> PAGE_SHIFT;
++      dbaddr = I40IW_DB_ADDR_OFFSET + pci_resource_start(ucontext->iwdev->ldev->pcidev, 0);
+-      if (vma->vm_pgoff == (db_addr_offset >> PAGE_SHIFT)) {
+-              vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+-      } else {
+-              if ((vma->vm_pgoff - (push_offset >> PAGE_SHIFT)) % 2)
+-                      vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+-              else
+-                      vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+-      }
+-
+-      pfn = vma->vm_pgoff +
+-            (pci_resource_start(ucontext->iwdev->ldev->pcidev, 0) >>
+-             PAGE_SHIFT);
+-
+-      return rdma_user_mmap_io(context, vma, pfn, PAGE_SIZE,
+-                               vma->vm_page_prot, NULL);
++      return rdma_user_mmap_io(context, vma, dbaddr >> PAGE_SHIFT, PAGE_SIZE,
++                               pgprot_noncached(vma->vm_page_prot), NULL);
+ }
+ /**
index e421fbb407b3aafd7458a525c02835454c73a269..85cf3033dcf5d02b80130df33ea7471406b4716e 100644 (file)
@@ -3,3 +3,10 @@ io_uring-order-refnode-recycling.patch
 spi-bcm-qspi-fix-use-after-free-on-unbind.patch
 spi-bcm2835-fix-use-after-free-on-unbind.patch
 ipv4-use-is_enabled-instead-of-ifdef.patch
+ib-hfi1-ensure-correct-mm-is-used-at-all-times.patch
+rdma-i40iw-address-an-mmap-handler-exploit-in-i40iw.patch
+btrfs-fix-missing-delalloc-new-bit-for-new-delalloc-ranges.patch
+btrfs-tree-checker-add-missing-return-after-error-in-root_item.patch
+btrfs-tree-checker-add-missing-returns-after-data_ref-alignment-checks.patch
+btrfs-don-t-access-possibly-stale-fs_info-data-for-printing-duplicate-device.patch
+btrfs-fix-lockdep-splat-when-reading-qgroup-config-on-mount.patch