From df47820a8aca53cb3b574b5cd9b187531d33d70c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 27 Feb 2020 10:02:02 +0100 Subject: [PATCH] 5.5-stable patches added patches: drm-i915-execlists-always-force-a-context-reload-when-rewinding-ring_tail.patch ext4-add-cond_resched-to-__ext4_find_entry.patch ext4-fix-a-data-race-in-ext4_i-inode-i_disksize.patch ext4-fix-mount-failure-with-quota-configured-as-module.patch ext4-fix-potential-race-between-online-resizing-and-write-operations.patch ext4-fix-potential-race-between-s_flex_groups-online-resizing-and-access.patch ext4-fix-potential-race-between-s_group_info-online-resizing-and-access.patch ext4-fix-race-between-writepages-and-enabling-ext4_extents_fl.patch ext4-rename-s_journal_flag_rwsem-to-s_writepages_rwsem.patch kvm-nvmx-check-io-instruction-vm-exit-conditions.patch kvm-nvmx-refactor-io-bitmap-checks-into-helper-function.patch --- ...text-reload-when-rewinding-ring_tail.patch | 141 ++++++++ ...dd-cond_resched-to-__ext4_find_entry.patch | 71 ++++ ...data-race-in-ext4_i-inode-i_disksize.patch | 87 +++++ ...lure-with-quota-configured-as-module.patch | 36 ++ ...online-resizing-and-write-operations.patch | 316 ++++++++++++++++++ ...ex_groups-online-resizing-and-access.patch | 291 ++++++++++++++++ ...roup_info-online-resizing-and-access.patch | 180 ++++++++++ ...tepages-and-enabling-ext4_extents_fl.patch | 167 +++++++++ ...nal_flag_rwsem-to-s_writepages_rwsem.patch | 130 +++++++ ...ck-io-instruction-vm-exit-conditions.patch | 113 +++++++ ...o-bitmap-checks-into-helper-function.patch | 93 ++++++ queue-5.5/series | 11 + 12 files changed, 1636 insertions(+) create mode 100644 queue-5.5/drm-i915-execlists-always-force-a-context-reload-when-rewinding-ring_tail.patch create mode 100644 queue-5.5/ext4-add-cond_resched-to-__ext4_find_entry.patch create mode 100644 queue-5.5/ext4-fix-a-data-race-in-ext4_i-inode-i_disksize.patch create mode 100644 
queue-5.5/ext4-fix-mount-failure-with-quota-configured-as-module.patch create mode 100644 queue-5.5/ext4-fix-potential-race-between-online-resizing-and-write-operations.patch create mode 100644 queue-5.5/ext4-fix-potential-race-between-s_flex_groups-online-resizing-and-access.patch create mode 100644 queue-5.5/ext4-fix-potential-race-between-s_group_info-online-resizing-and-access.patch create mode 100644 queue-5.5/ext4-fix-race-between-writepages-and-enabling-ext4_extents_fl.patch create mode 100644 queue-5.5/ext4-rename-s_journal_flag_rwsem-to-s_writepages_rwsem.patch create mode 100644 queue-5.5/kvm-nvmx-check-io-instruction-vm-exit-conditions.patch create mode 100644 queue-5.5/kvm-nvmx-refactor-io-bitmap-checks-into-helper-function.patch diff --git a/queue-5.5/drm-i915-execlists-always-force-a-context-reload-when-rewinding-ring_tail.patch b/queue-5.5/drm-i915-execlists-always-force-a-context-reload-when-rewinding-ring_tail.patch new file mode 100644 index 00000000000..7b7514f35d9 --- /dev/null +++ b/queue-5.5/drm-i915-execlists-always-force-a-context-reload-when-rewinding-ring_tail.patch @@ -0,0 +1,141 @@ +From b1339ecac661e1cf3e1dc78ac56bff3aeeaeb92c Mon Sep 17 00:00:00 2001 +From: Chris Wilson +Date: Fri, 7 Feb 2020 21:14:52 +0000 +Subject: drm/i915/execlists: Always force a context reload when rewinding RING_TAIL + +From: Chris Wilson + +commit b1339ecac661e1cf3e1dc78ac56bff3aeeaeb92c upstream. + +If we rewind the RING_TAIL on a context, due to a preemption event, we +must force the context restore for the RING_TAIL update to be properly +handled. Rather than note which preemption events may cause us to rewind +the tail, compare the new request's tail with the previously submitted +RING_TAIL, as it turns out that timeslicing was causing unexpected +rewinds. 
+ + -0 0d.s2 1280851190us : __execlists_submission_tasklet: 0000:00:02.0 rcs0: expired last=130:4698, prio=3, hint=3 + -0 0d.s2 1280851192us : __i915_request_unsubmit: 0000:00:02.0 rcs0: fence 66:119966, current 119964 + -0 0d.s2 1280851195us : __i915_request_unsubmit: 0000:00:02.0 rcs0: fence 130:4698, current 4695 + -0 0d.s2 1280851198us : __i915_request_unsubmit: 0000:00:02.0 rcs0: fence 130:4696, current 4695 +^---- Note we unwind 2 requests from the same context + + -0 0d.s2 1280851208us : __i915_request_submit: 0000:00:02.0 rcs0: fence 130:4696, current 4695 + -0 0d.s2 1280851213us : __i915_request_submit: 0000:00:02.0 rcs0: fence 134:1508, current 1506 +^---- But to apply the new timeslice, we have to replay the first request + before the new client can start -- the unexpected RING_TAIL rewind + + -0 0d.s2 1280851219us : trace_ports: 0000:00:02.0 rcs0: submit { 130:4696*, 134:1508 } + synmark2-5425 2..s. 1280851239us : process_csb: 0000:00:02.0 rcs0: cs-irq head=5, tail=0 + synmark2-5425 2..s. 1280851240us : process_csb: 0000:00:02.0 rcs0: csb[0]: status=0x00008002:0x00000000 +^---- Preemption event for the ELSP update; note the lite-restore + + synmark2-5425 2..s. 1280851243us : trace_ports: 0000:00:02.0 rcs0: preempted { 130:4698, 66:119966 } + synmark2-5425 2..s. 1280851246us : trace_ports: 0000:00:02.0 rcs0: promote { 130:4696*, 134:1508 } + synmark2-5425 2.... 1280851462us : __i915_request_commit: 0000:00:02.0 rcs0: fence 130:4700, current 4695 + synmark2-5425 2.... 
1280852111us : __i915_request_commit: 0000:00:02.0 rcs0: fence 130:4702, current 4695 + synmark2-5425 2.Ns1 1280852296us : process_csb: 0000:00:02.0 rcs0: cs-irq head=0, tail=2 + synmark2-5425 2.Ns1 1280852297us : process_csb: 0000:00:02.0 rcs0: csb[1]: status=0x00000814:0x00000000 + synmark2-5425 2.Ns1 1280852299us : trace_ports: 0000:00:02.0 rcs0: completed { 130:4696!, 134:1508 } + synmark2-5425 2.Ns1 1280852301us : process_csb: 0000:00:02.0 rcs0: csb[2]: status=0x00000818:0x00000040 + synmark2-5425 2.Ns1 1280852302us : trace_ports: 0000:00:02.0 rcs0: completed { 134:1508, 0:0 } + synmark2-5425 2.Ns1 1280852313us : process_csb: process_csb:2336 GEM_BUG_ON(!i915_request_completed(*execlists->active) && !reset_in_progress(execlists)) + +Fixes: 8ee36e048c98 ("drm/i915/execlists: Minimalistic timeslicing") +References: 82c69bf58650 ("drm/i915/gt: Detect if we miss WaIdleLiteRestore") +Signed-off-by: Chris Wilson +Cc: Mika Kuoppala +Reviewed-by: Mika Kuoppala +Cc: # v5.4+ +Link: https://patchwork.freedesktop.org/patch/msgid/20200207211452.2860634-1-chris@chris-wilson.co.uk +(cherry picked from commit 5ba32c7be81e53ea8a27190b0f6be98e6c6779af) +Signed-off-by: Jani Nikula +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/gpu/drm/i915/gt/intel_lrc.c | 18 ++++++++---------- + drivers/gpu/drm/i915/gt/intel_ring.c | 1 + + drivers/gpu/drm/i915/gt/intel_ring.h | 8 ++++++++ + drivers/gpu/drm/i915/gt/intel_ring_types.h | 1 + + 4 files changed, 18 insertions(+), 10 deletions(-) + +--- a/drivers/gpu/drm/i915/gt/intel_lrc.c ++++ b/drivers/gpu/drm/i915/gt/intel_lrc.c +@@ -1157,7 +1157,7 @@ static u64 execlists_update_context(stru + { + struct intel_context *ce = rq->hw_context; + u64 desc = ce->lrc_desc; +- u32 tail; ++ u32 tail, prev; + + /* + * WaIdleLiteRestore:bdw,skl +@@ -1170,9 +1170,15 @@ static u64 execlists_update_context(stru + * subsequent resubmissions (for lite restore). Should that fail us, + * and we try and submit the same tail again, force the context + * reload. 
++ * ++ * If we need to return to a preempted context, we need to skip the ++ * lite-restore and force it to reload the RING_TAIL. Otherwise, the ++ * HW has a tendency to ignore us rewinding the TAIL to the end of ++ * an earlier request. + */ + tail = intel_ring_set_tail(rq->ring, rq->tail); +- if (unlikely(ce->lrc_reg_state[CTX_RING_TAIL] == tail)) ++ prev = ce->lrc_reg_state[CTX_RING_TAIL]; ++ if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) + desc |= CTX_DESC_FORCE_RESTORE; + ce->lrc_reg_state[CTX_RING_TAIL] = tail; + rq->tail = rq->wa_tail; +@@ -1651,14 +1657,6 @@ static void execlists_dequeue(struct int + */ + __unwind_incomplete_requests(engine); + +- /* +- * If we need to return to the preempted context, we +- * need to skip the lite-restore and force it to +- * reload the RING_TAIL. Otherwise, the HW has a +- * tendency to ignore us rewinding the TAIL to the +- * end of an earlier request. +- */ +- last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE; + last = NULL; + } else if (need_timeslice(engine, last) && + timer_expired(&engine->execlists.timer)) { +--- a/drivers/gpu/drm/i915/gt/intel_ring.c ++++ b/drivers/gpu/drm/i915/gt/intel_ring.c +@@ -145,6 +145,7 @@ intel_engine_create_ring(struct intel_en + + kref_init(&ring->ref); + ring->size = size; ++ ring->wrap = BITS_PER_TYPE(ring->size) - ilog2(size); + + /* + * Workaround an erratum on the i830 which causes a hang if +--- a/drivers/gpu/drm/i915/gt/intel_ring.h ++++ b/drivers/gpu/drm/i915/gt/intel_ring.h +@@ -56,6 +56,14 @@ static inline u32 intel_ring_wrap(const + return pos & (ring->size - 1); + } + ++static inline int intel_ring_direction(const struct intel_ring *ring, ++ u32 next, u32 prev) ++{ ++ typecheck(typeof(ring->size), next); ++ typecheck(typeof(ring->size), prev); ++ return (next - prev) << ring->wrap; ++} ++ + static inline bool + intel_ring_offset_valid(const struct intel_ring *ring, + unsigned int pos) +--- a/drivers/gpu/drm/i915/gt/intel_ring_types.h ++++ 
b/drivers/gpu/drm/i915/gt/intel_ring_types.h +@@ -45,6 +45,7 @@ struct intel_ring { + + u32 space; + u32 size; ++ u32 wrap; + u32 effective_size; + }; + diff --git a/queue-5.5/ext4-add-cond_resched-to-__ext4_find_entry.patch b/queue-5.5/ext4-add-cond_resched-to-__ext4_find_entry.patch new file mode 100644 index 00000000000..658281e64ed --- /dev/null +++ b/queue-5.5/ext4-add-cond_resched-to-__ext4_find_entry.patch @@ -0,0 +1,71 @@ +From 9424ef56e13a1f14c57ea161eed3ecfdc7b2770e Mon Sep 17 00:00:00 2001 +From: Shijie Luo +Date: Sat, 15 Feb 2020 03:02:06 -0500 +Subject: ext4: add cond_resched() to __ext4_find_entry() + +From: Shijie Luo + +commit 9424ef56e13a1f14c57ea161eed3ecfdc7b2770e upstream. + +We tested a soft lockup problem in linux 4.19 which could also +be found in linux 5.x. + +When dir inode takes up a large number of blocks, and if the +directory is growing when we are searching, it's possible the +restart branch could be called many times, and the do while loop +could hold cpu a long time. + +Here is the call trace in linux 4.19. 
+ +[ 473.756186] Call trace: +[ 473.756196] dump_backtrace+0x0/0x198 +[ 473.756199] show_stack+0x24/0x30 +[ 473.756205] dump_stack+0xa4/0xcc +[ 473.756210] watchdog_timer_fn+0x300/0x3e8 +[ 473.756215] __hrtimer_run_queues+0x114/0x358 +[ 473.756217] hrtimer_interrupt+0x104/0x2d8 +[ 473.756222] arch_timer_handler_virt+0x38/0x58 +[ 473.756226] handle_percpu_devid_irq+0x90/0x248 +[ 473.756231] generic_handle_irq+0x34/0x50 +[ 473.756234] __handle_domain_irq+0x68/0xc0 +[ 473.756236] gic_handle_irq+0x6c/0x150 +[ 473.756238] el1_irq+0xb8/0x140 +[ 473.756286] ext4_es_lookup_extent+0xdc/0x258 [ext4] +[ 473.756310] ext4_map_blocks+0x64/0x5c0 [ext4] +[ 473.756333] ext4_getblk+0x6c/0x1d0 [ext4] +[ 473.756356] ext4_bread_batch+0x7c/0x1f8 [ext4] +[ 473.756379] ext4_find_entry+0x124/0x3f8 [ext4] +[ 473.756402] ext4_lookup+0x8c/0x258 [ext4] +[ 473.756407] __lookup_hash+0x8c/0xe8 +[ 473.756411] filename_create+0xa0/0x170 +[ 473.756413] do_mkdirat+0x6c/0x140 +[ 473.756415] __arm64_sys_mkdirat+0x28/0x38 +[ 473.756419] el0_svc_common+0x78/0x130 +[ 473.756421] el0_svc_handler+0x38/0x78 +[ 473.756423] el0_svc+0x8/0xc +[ 485.755156] watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [tmp:5149] + +Add cond_resched() to avoid soft lockup and to provide a better +system responding. + +Link: https://lore.kernel.org/r/20200215080206.13293-1-luoshijie1@huawei.com +Signed-off-by: Shijie Luo +Signed-off-by: Theodore Ts'o +Reviewed-by: Jan Kara +Cc: stable@kernel.org +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/namei.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -1507,6 +1507,7 @@ restart: + /* + * We deal with the read-ahead logic here. 
+ */ ++ cond_resched(); + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; diff --git a/queue-5.5/ext4-fix-a-data-race-in-ext4_i-inode-i_disksize.patch b/queue-5.5/ext4-fix-a-data-race-in-ext4_i-inode-i_disksize.patch new file mode 100644 index 00000000000..1faaddd4a9e --- /dev/null +++ b/queue-5.5/ext4-fix-a-data-race-in-ext4_i-inode-i_disksize.patch @@ -0,0 +1,87 @@ +From 35df4299a6487f323b0aca120ea3f485dfee2ae3 Mon Sep 17 00:00:00 2001 +From: Qian Cai +Date: Fri, 7 Feb 2020 09:29:11 -0500 +Subject: ext4: fix a data race in EXT4_I(inode)->i_disksize + +From: Qian Cai + +commit 35df4299a6487f323b0aca120ea3f485dfee2ae3 upstream. + +EXT4_I(inode)->i_disksize could be accessed concurrently as noticed by +KCSAN, + + BUG: KCSAN: data-race in ext4_write_end [ext4] / ext4_writepages [ext4] + + write to 0xffff91c6713b00f8 of 8 bytes by task 49268 on cpu 127: + ext4_write_end+0x4e3/0x750 [ext4] + ext4_update_i_disksize at fs/ext4/ext4.h:3032 + (inlined by) ext4_update_inode_size at fs/ext4/ext4.h:3046 + (inlined by) ext4_write_end at fs/ext4/inode.c:1287 + generic_perform_write+0x208/0x2a0 + ext4_buffered_write_iter+0x11f/0x210 [ext4] + ext4_file_write_iter+0xce/0x9e0 [ext4] + new_sync_write+0x29c/0x3b0 + __vfs_write+0x92/0xa0 + vfs_write+0x103/0x260 + ksys_write+0x9d/0x130 + __x64_sys_write+0x4c/0x60 + do_syscall_64+0x91/0xb47 + entry_SYSCALL_64_after_hwframe+0x49/0xbe + + read to 0xffff91c6713b00f8 of 8 bytes by task 24872 on cpu 37: + ext4_writepages+0x10ac/0x1d00 [ext4] + mpage_map_and_submit_extent at fs/ext4/inode.c:2468 + (inlined by) ext4_writepages at fs/ext4/inode.c:2772 + do_writepages+0x5e/0x130 + __writeback_single_inode+0xeb/0xb20 + writeback_sb_inodes+0x429/0x900 + __writeback_inodes_wb+0xc4/0x150 + wb_writeback+0x4bd/0x870 + wb_workfn+0x6b4/0x960 + process_one_work+0x54c/0xbe0 + worker_thread+0x80/0x650 + kthread+0x1e0/0x200 + ret_from_fork+0x27/0x50 + + Reported by Kernel Concurrency Sanitizer on: + CPU: 37 PID: 24872 Comm: 
kworker/u261:2 Tainted: G W O L 5.5.0-next-20200204+ #5 + Hardware name: HPE ProLiant DL385 Gen10/ProLiant DL385 Gen10, BIOS A40 07/10/2019 + Workqueue: writeback wb_workfn (flush-7:0) + +Since only the read is operating as lockless (outside of the +"i_data_sem"), load tearing could introduce a logic bug. Fix it by +adding READ_ONCE() for the read and WRITE_ONCE() for the write. + +Signed-off-by: Qian Cai +Link: https://lore.kernel.org/r/1581085751-31793-1-git-send-email-cai@lca.pw +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/ext4.h | 2 +- + fs/ext4/inode.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -2972,7 +2972,7 @@ static inline void ext4_update_i_disksiz + !inode_is_locked(inode)); + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) +- EXT4_I(inode)->i_disksize = newsize; ++ WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize); + up_write(&EXT4_I(inode)->i_data_sem); + } + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2466,7 +2466,7 @@ update_disksize: + * truncate are avoided by checking i_size under i_data_sem. + */ + disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; +- if (disksize > EXT4_I(inode)->i_disksize) { ++ if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { + int err2; + loff_t i_size; + diff --git a/queue-5.5/ext4-fix-mount-failure-with-quota-configured-as-module.patch b/queue-5.5/ext4-fix-mount-failure-with-quota-configured-as-module.patch new file mode 100644 index 00000000000..5d2509da0ac --- /dev/null +++ b/queue-5.5/ext4-fix-mount-failure-with-quota-configured-as-module.patch @@ -0,0 +1,36 @@ +From 9db176bceb5c5df4990486709da386edadc6bd1d Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Fri, 21 Feb 2020 11:08:35 +0100 +Subject: ext4: fix mount failure with quota configured as module + +From: Jan Kara + +commit 9db176bceb5c5df4990486709da386edadc6bd1d upstream. 
+ +When CONFIG_QFMT_V2 is configured as a module, the test in +ext4_feature_set_ok() fails and so mount of filesystems with quota or +project features fails. Fix the test to use IS_ENABLED macro which +works properly even for modules. + +Link: https://lore.kernel.org/r/20200221100835.9332-1-jack@suse.cz +Fixes: d65d87a07476 ("ext4: improve explanation of a mount failure caused by a misconfigured kernel") +Signed-off-by: Jan Kara +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/super.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -2989,7 +2989,7 @@ static int ext4_feature_set_ok(struct su + return 0; + } + +-#if !defined(CONFIG_QUOTA) || !defined(CONFIG_QFMT_V2) ++#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2) + if (!readonly && (ext4_has_feature_quota(sb) || + ext4_has_feature_project(sb))) { + ext4_msg(sb, KERN_ERR, diff --git a/queue-5.5/ext4-fix-potential-race-between-online-resizing-and-write-operations.patch b/queue-5.5/ext4-fix-potential-race-between-online-resizing-and-write-operations.patch new file mode 100644 index 00000000000..36983f9142a --- /dev/null +++ b/queue-5.5/ext4-fix-potential-race-between-online-resizing-and-write-operations.patch @@ -0,0 +1,316 @@ +From 1d0c3924a92e69bfa91163bda83c12a994b4d106 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sat, 15 Feb 2020 16:40:37 -0500 +Subject: ext4: fix potential race between online resizing and write operations + +From: Theodore Ts'o + +commit 1d0c3924a92e69bfa91163bda83c12a994b4d106 upstream. + +During an online resize an array of pointers to buffer heads gets +replaced so it can get enlarged. If there is a racing block +allocation or deallocation which uses the old array, and the old array +has gotten reused this can lead to a GPF or some other random kernel +memory getting modified. 
+ +Link: https://bugzilla.kernel.org/show_bug.cgi?id=206443 +Link: https://lore.kernel.org/r/20200221053458.730016-2-tytso@mit.edu +Reported-by: Suraj Jitindar Singh +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/balloc.c | 14 +++++++++++--- + fs/ext4/ext4.h | 20 +++++++++++++++++++- + fs/ext4/resize.c | 55 ++++++++++++++++++++++++++++++++++++++++++++----------- + fs/ext4/super.c | 33 +++++++++++++++++++++++---------- + 4 files changed, 97 insertions(+), 25 deletions(-) + +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -270,6 +270,7 @@ struct ext4_group_desc * ext4_get_group_ + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_group_desc *desc; + struct ext4_sb_info *sbi = EXT4_SB(sb); ++ struct buffer_head *bh_p; + + if (block_group >= ngroups) { + ext4_error(sb, "block_group >= groups_count - block_group = %u," +@@ -280,7 +281,14 @@ struct ext4_group_desc * ext4_get_group_ + + group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); + offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); +- if (!sbi->s_group_desc[group_desc]) { ++ bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc); ++ /* ++ * sbi_array_rcu_deref returns with rcu unlocked, this is ok since ++ * the pointer being dereferenced won't be dereferenced again. By ++ * looking at the usage in add_new_gdb() the value isn't modified, ++ * just the pointer, and so it remains valid. 
++ */ ++ if (!bh_p) { + ext4_error(sb, "Group descriptor not loaded - " + "block_group = %u, group_desc = %u, desc = %u", + block_group, group_desc, offset); +@@ -288,10 +296,10 @@ struct ext4_group_desc * ext4_get_group_ + } + + desc = (struct ext4_group_desc *)( +- (__u8 *)sbi->s_group_desc[group_desc]->b_data + ++ (__u8 *)bh_p->b_data + + offset * EXT4_DESC_SIZE(sb)); + if (bh) +- *bh = sbi->s_group_desc[group_desc]; ++ *bh = bh_p; + return desc; + } + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1401,7 +1401,7 @@ struct ext4_sb_info { + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ +- struct buffer_head **s_group_desc; ++ struct buffer_head * __rcu *s_group_desc; + unsigned int s_mount_opt; + unsigned int s_mount_opt2; + unsigned int s_mount_flags; +@@ -1575,6 +1575,23 @@ static inline int ext4_valid_inum(struct + } + + /* ++ * Returns: sbi->field[index] ++ * Used to access an array element from the following sbi fields which require ++ * rcu protection to avoid dereferencing an invalid pointer due to reassignment ++ * - s_group_desc ++ * - s_group_info ++ * - s_flex_group ++ */ ++#define sbi_array_rcu_deref(sbi, field, index) \ ++({ \ ++ typeof(*((sbi)->field)) _v; \ ++ rcu_read_lock(); \ ++ _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \ ++ rcu_read_unlock(); \ ++ _v; \ ++}) ++ ++/* + * Inode dynamic state flags + */ + enum { +@@ -2669,6 +2686,7 @@ extern int ext4_generic_delete_entry(han + extern bool ext4_empty_dir(struct inode *inode); + + /* resize.c */ ++extern void ext4_kvfree_array_rcu(void *to_free); + extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); + extern int ext4_group_extend(struct super_block *sb, +--- a/fs/ext4/resize.c ++++ b/fs/ext4/resize.c +@@ -17,6 +17,33 @@ + + #include "ext4_jbd2.h" + ++struct ext4_rcu_ptr { ++ struct 
rcu_head rcu; ++ void *ptr; ++}; ++ ++static void ext4_rcu_ptr_callback(struct rcu_head *head) ++{ ++ struct ext4_rcu_ptr *ptr; ++ ++ ptr = container_of(head, struct ext4_rcu_ptr, rcu); ++ kvfree(ptr->ptr); ++ kfree(ptr); ++} ++ ++void ext4_kvfree_array_rcu(void *to_free) ++{ ++ struct ext4_rcu_ptr *ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); ++ ++ if (ptr) { ++ ptr->ptr = to_free; ++ call_rcu(&ptr->rcu, ext4_rcu_ptr_callback); ++ return; ++ } ++ synchronize_rcu(); ++ kvfree(to_free); ++} ++ + int ext4_resize_begin(struct super_block *sb) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +@@ -542,8 +569,8 @@ static int setup_new_flex_group_blocks(s + brelse(gdb); + goto out; + } +- memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data, +- gdb->b_size); ++ memcpy(gdb->b_data, sbi_array_rcu_deref(sbi, ++ s_group_desc, j)->b_data, gdb->b_size); + set_buffer_uptodate(gdb); + + err = ext4_handle_dirty_metadata(handle, NULL, gdb); +@@ -861,13 +888,15 @@ static int add_new_gdb(handle_t *handle, + } + brelse(dind); + +- o_group_desc = EXT4_SB(sb)->s_group_desc; ++ rcu_read_lock(); ++ o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc); + memcpy(n_group_desc, o_group_desc, + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); ++ rcu_read_unlock(); + n_group_desc[gdb_num] = gdb_bh; +- EXT4_SB(sb)->s_group_desc = n_group_desc; ++ rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc); + EXT4_SB(sb)->s_gdb_count++; +- kvfree(o_group_desc); ++ ext4_kvfree_array_rcu(o_group_desc); + + le16_add_cpu(&es->s_reserved_gdt_blocks, -1); + err = ext4_handle_dirty_super(handle, sb); +@@ -911,9 +940,11 @@ static int add_new_gdb_meta_bg(struct su + return err; + } + +- o_group_desc = EXT4_SB(sb)->s_group_desc; ++ rcu_read_lock(); ++ o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc); + memcpy(n_group_desc, o_group_desc, + EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); ++ rcu_read_unlock(); + n_group_desc[gdb_num] = gdb_bh; + + BUFFER_TRACE(gdb_bh, 
"get_write_access"); +@@ -924,9 +955,9 @@ static int add_new_gdb_meta_bg(struct su + return err; + } + +- EXT4_SB(sb)->s_group_desc = n_group_desc; ++ rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc); + EXT4_SB(sb)->s_gdb_count++; +- kvfree(o_group_desc); ++ ext4_kvfree_array_rcu(o_group_desc); + return err; + } + +@@ -1190,7 +1221,8 @@ static int ext4_add_new_descs(handle_t * + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { +- gdb_bh = sbi->s_group_desc[gdb_num]; ++ gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, ++ gdb_num); + BUFFER_TRACE(gdb_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, gdb_bh); + +@@ -1272,7 +1304,7 @@ static int ext4_setup_new_descs(handle_t + /* + * get_write_access() has been called on gdb_bh by ext4_add_new_desc(). + */ +- gdb_bh = sbi->s_group_desc[gdb_num]; ++ gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); + /* Update group descriptor block for new group */ + gdp = (struct ext4_group_desc *)(gdb_bh->b_data + + gdb_off * EXT4_DESC_SIZE(sb)); +@@ -1499,7 +1531,8 @@ exit_journal: + for (; gdb_num <= gdb_num_end; gdb_num++) { + struct buffer_head *gdb_bh; + +- gdb_bh = sbi->s_group_desc[gdb_num]; ++ gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, ++ gdb_num); + if (old_gdb == gdb_bh->b_blocknr) + continue; + update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -970,6 +970,7 @@ static void ext4_put_super(struct super_ + { + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; ++ struct buffer_head **group_desc; + int aborted = 0; + int i, err; + +@@ -1000,9 +1001,12 @@ static void ext4_put_super(struct super_ + if (!sb_rdonly(sb)) + ext4_commit_super(sb, 1); + ++ rcu_read_lock(); ++ group_desc = rcu_dereference(sbi->s_group_desc); + for (i = 0; i < sbi->s_gdb_count; i++) +- brelse(sbi->s_group_desc[i]); +- kvfree(sbi->s_group_desc); ++ brelse(group_desc[i]); ++ 
kvfree(group_desc); ++ rcu_read_unlock(); + kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); +@@ -3589,7 +3593,7 @@ static int ext4_fill_super(struct super_ + { + struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); + char *orig_data = kstrdup(data, GFP_KERNEL); +- struct buffer_head *bh; ++ struct buffer_head *bh, **group_desc; + struct ext4_super_block *es = NULL; + struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + ext4_fsblk_t block; +@@ -4245,9 +4249,10 @@ static int ext4_fill_super(struct super_ + goto failed_mount; + } + } +- sbi->s_group_desc = kvmalloc_array(db_count, +- sizeof(struct buffer_head *), +- GFP_KERNEL); ++ rcu_assign_pointer(sbi->s_group_desc, ++ kvmalloc_array(db_count, ++ sizeof(struct buffer_head *), ++ GFP_KERNEL)); + if (sbi->s_group_desc == NULL) { + ext4_msg(sb, KERN_ERR, "not enough memory"); + ret = -ENOMEM; +@@ -4263,14 +4268,19 @@ static int ext4_fill_super(struct super_ + } + + for (i = 0; i < db_count; i++) { ++ struct buffer_head *bh; ++ + block = descriptor_loc(sb, logical_sb_block, i); +- sbi->s_group_desc[i] = sb_bread_unmovable(sb, block); +- if (!sbi->s_group_desc[i]) { ++ bh = sb_bread_unmovable(sb, block); ++ if (!bh) { + ext4_msg(sb, KERN_ERR, + "can't read group descriptor %d", i); + db_count = i; + goto failed_mount2; + } ++ rcu_read_lock(); ++ rcu_dereference(sbi->s_group_desc)[i] = bh; ++ rcu_read_unlock(); + } + sbi->s_gdb_count = db_count; + if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { +@@ -4672,9 +4682,12 @@ failed_mount3: + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); + failed_mount2: ++ rcu_read_lock(); ++ group_desc = rcu_dereference(sbi->s_group_desc); + for (i = 0; i < db_count; i++) +- brelse(sbi->s_group_desc[i]); +- kvfree(sbi->s_group_desc); ++ brelse(group_desc[i]); ++ kvfree(group_desc); ++ rcu_read_unlock(); + failed_mount: + if (sbi->s_chksum_driver) + 
crypto_free_shash(sbi->s_chksum_driver); diff --git a/queue-5.5/ext4-fix-potential-race-between-s_flex_groups-online-resizing-and-access.patch b/queue-5.5/ext4-fix-potential-race-between-s_flex_groups-online-resizing-and-access.patch new file mode 100644 index 00000000000..a44ba16f622 --- /dev/null +++ b/queue-5.5/ext4-fix-potential-race-between-s_flex_groups-online-resizing-and-access.patch @@ -0,0 +1,291 @@ +From 7c990728b99ed6fbe9c75fc202fce1172d9916da Mon Sep 17 00:00:00 2001 +From: Suraj Jitindar Singh +Date: Tue, 18 Feb 2020 19:08:51 -0800 +Subject: ext4: fix potential race between s_flex_groups online resizing and access + +From: Suraj Jitindar Singh + +commit 7c990728b99ed6fbe9c75fc202fce1172d9916da upstream. + +During an online resize an array of s_flex_groups structures gets replaced +so it can get enlarged. If there is a concurrent access to the array and +this memory has been reused then this can lead to an invalid memory access. + +The s_flex_group array has been converted into an array of pointers rather +than an array of structures. This is to ensure that the information +contained in the structures cannot get out of sync during a resize due to +an accessor updating the value in the old structure after it has been +copied but before the array pointer is updated. Since the structures them- +selves are no longer copied but only the pointers to them this case is +mitigated. 
+ +Link: https://bugzilla.kernel.org/show_bug.cgi?id=206443 +Link: https://lore.kernel.org/r/20200221053458.730016-4-tytso@mit.edu +Signed-off-by: Suraj Jitindar Singh +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/ext4.h | 2 - + fs/ext4/ialloc.c | 23 ++++++++++------- + fs/ext4/mballoc.c | 9 ++++-- + fs/ext4/resize.c | 7 +++-- + fs/ext4/super.c | 72 +++++++++++++++++++++++++++++++++++++----------------- + 5 files changed, 76 insertions(+), 37 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1513,7 +1513,7 @@ struct ext4_sb_info { + unsigned int s_extent_max_zeroout_kb; + + unsigned int s_log_groups_per_flex; +- struct flex_groups *s_flex_groups; ++ struct flex_groups * __rcu *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + + /* workqueue for reserved extent conversions (buffered io) */ +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -325,11 +325,13 @@ void ext4_free_inode(handle_t *handle, s + + percpu_counter_inc(&sbi->s_freeinodes_counter); + if (sbi->s_log_groups_per_flex) { +- ext4_group_t f = ext4_flex_group(sbi, block_group); ++ struct flex_groups *fg; + +- atomic_inc(&sbi->s_flex_groups[f].free_inodes); ++ fg = sbi_array_rcu_deref(sbi, s_flex_groups, ++ ext4_flex_group(sbi, block_group)); ++ atomic_inc(&fg->free_inodes); + if (is_directory) +- atomic_dec(&sbi->s_flex_groups[f].used_dirs); ++ atomic_dec(&fg->used_dirs); + } + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + fatal = ext4_handle_dirty_metadata(handle, NULL, bh2); +@@ -365,12 +367,13 @@ static void get_orlov_stats(struct super + int flex_size, struct orlov_stats *stats) + { + struct ext4_group_desc *desc; +- struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups; + + if (flex_size > 1) { +- stats->free_inodes = atomic_read(&flex_group[g].free_inodes); +- stats->free_clusters = atomic64_read(&flex_group[g].free_clusters); +- stats->used_dirs = atomic_read(&flex_group[g].used_dirs); ++ struct 
flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb), ++ s_flex_groups, g); ++ stats->free_inodes = atomic_read(&fg->free_inodes); ++ stats->free_clusters = atomic64_read(&fg->free_clusters); ++ stats->used_dirs = atomic_read(&fg->used_dirs); + return; + } + +@@ -1051,7 +1054,8 @@ got: + if (sbi->s_log_groups_per_flex) { + ext4_group_t f = ext4_flex_group(sbi, group); + +- atomic_inc(&sbi->s_flex_groups[f].used_dirs); ++ atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups, ++ f)->used_dirs); + } + } + if (ext4_has_group_desc_csum(sb)) { +@@ -1074,7 +1078,8 @@ got: + + if (sbi->s_log_groups_per_flex) { + flex_group = ext4_flex_group(sbi, group); +- atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); ++ atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups, ++ flex_group)->free_inodes); + } + + inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -3038,7 +3038,8 @@ ext4_mb_mark_diskspace_used(struct ext4_ + ext4_group_t flex_group = ext4_flex_group(sbi, + ac->ac_b_ex.fe_group); + atomic64_sub(ac->ac_b_ex.fe_len, +- &sbi->s_flex_groups[flex_group].free_clusters); ++ &sbi_array_rcu_deref(sbi, s_flex_groups, ++ flex_group)->free_clusters); + } + + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); +@@ -4932,7 +4933,8 @@ do_more: + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic64_add(count_clusters, +- &sbi->s_flex_groups[flex_group].free_clusters); ++ &sbi_array_rcu_deref(sbi, s_flex_groups, ++ flex_group)->free_clusters); + } + + /* +@@ -5089,7 +5091,8 @@ int ext4_group_add_blocks(handle_t *hand + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic64_add(clusters_freed, +- &sbi->s_flex_groups[flex_group].free_clusters); ++ &sbi_array_rcu_deref(sbi, s_flex_groups, ++ flex_group)->free_clusters); + } + + ext4_mb_unload_buddy(&e4b); +--- a/fs/ext4/resize.c ++++ b/fs/ext4/resize.c +@@ -1432,11 
+1432,14 @@ static void ext4_update_super(struct sup + percpu_counter_read(&sbi->s_freeclusters_counter)); + if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) { + ext4_group_t flex_group; ++ struct flex_groups *fg; ++ + flex_group = ext4_flex_group(sbi, group_data[0].group); ++ fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); + atomic64_add(EXT4_NUM_B2C(sbi, free_blocks), +- &sbi->s_flex_groups[flex_group].free_clusters); ++ &fg->free_clusters); + atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count, +- &sbi->s_flex_groups[flex_group].free_inodes); ++ &fg->free_inodes); + } + + /* +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -971,6 +971,7 @@ static void ext4_put_super(struct super_ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + struct buffer_head **group_desc; ++ struct flex_groups **flex_groups; + int aborted = 0; + int i, err; + +@@ -1006,8 +1007,13 @@ static void ext4_put_super(struct super_ + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(group_desc[i]); + kvfree(group_desc); ++ flex_groups = rcu_dereference(sbi->s_flex_groups); ++ if (flex_groups) { ++ for (i = 0; i < sbi->s_flex_groups_allocated; i++) ++ kvfree(flex_groups[i]); ++ kvfree(flex_groups); ++ } + rcu_read_unlock(); +- kvfree(sbi->s_flex_groups); + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); +@@ -2339,8 +2345,8 @@ done: + int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +- struct flex_groups *new_groups; +- int size; ++ struct flex_groups **old_groups, **new_groups; ++ int size, i; + + if (!sbi->s_log_groups_per_flex) + return 0; +@@ -2349,22 +2355,37 @@ int ext4_alloc_flex_bg_array(struct supe + if (size <= sbi->s_flex_groups_allocated) + return 0; + +- size = roundup_pow_of_two(size * sizeof(struct flex_groups)); +- new_groups = kvzalloc(size, 
GFP_KERNEL); ++ new_groups = kvzalloc(roundup_pow_of_two(size * ++ sizeof(*sbi->s_flex_groups)), GFP_KERNEL); + if (!new_groups) { +- ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", +- size / (int) sizeof(struct flex_groups)); ++ ext4_msg(sb, KERN_ERR, ++ "not enough memory for %d flex group pointers", size); + return -ENOMEM; + } +- +- if (sbi->s_flex_groups) { +- memcpy(new_groups, sbi->s_flex_groups, +- (sbi->s_flex_groups_allocated * +- sizeof(struct flex_groups))); +- kvfree(sbi->s_flex_groups); ++ for (i = sbi->s_flex_groups_allocated; i < size; i++) { ++ new_groups[i] = kvzalloc(roundup_pow_of_two( ++ sizeof(struct flex_groups)), ++ GFP_KERNEL); ++ if (!new_groups[i]) { ++ for (i--; i >= sbi->s_flex_groups_allocated; i--) ++ kvfree(new_groups[i]); ++ kvfree(new_groups); ++ ext4_msg(sb, KERN_ERR, ++ "not enough memory for %d flex groups", size); ++ return -ENOMEM; ++ } + } +- sbi->s_flex_groups = new_groups; +- sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); ++ rcu_read_lock(); ++ old_groups = rcu_dereference(sbi->s_flex_groups); ++ if (old_groups) ++ memcpy(new_groups, old_groups, ++ (sbi->s_flex_groups_allocated * ++ sizeof(struct flex_groups *))); ++ rcu_read_unlock(); ++ rcu_assign_pointer(sbi->s_flex_groups, new_groups); ++ sbi->s_flex_groups_allocated = size; ++ if (old_groups) ++ ext4_kvfree_array_rcu(old_groups); + return 0; + } + +@@ -2372,6 +2393,7 @@ static int ext4_fill_flex_info(struct su + { + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = NULL; ++ struct flex_groups *fg; + ext4_group_t flex_group; + int i, err; + +@@ -2389,12 +2411,11 @@ static int ext4_fill_flex_info(struct su + gdp = ext4_get_group_desc(sb, i, NULL); + + flex_group = ext4_flex_group(sbi, i); +- atomic_add(ext4_free_inodes_count(sb, gdp), +- &sbi->s_flex_groups[flex_group].free_inodes); ++ fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); ++ atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes); + 
atomic64_add(ext4_free_group_clusters(sb, gdp), +- &sbi->s_flex_groups[flex_group].free_clusters); +- atomic_add(ext4_used_dirs_count(sb, gdp), +- &sbi->s_flex_groups[flex_group].used_dirs); ++ &fg->free_clusters); ++ atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs); + } + + return 1; +@@ -3596,6 +3617,7 @@ static int ext4_fill_super(struct super_ + struct buffer_head *bh, **group_desc; + struct ext4_super_block *es = NULL; + struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); ++ struct flex_groups **flex_groups; + ext4_fsblk_t block; + ext4_fsblk_t sb_block = get_sb_block(&data); + ext4_fsblk_t logical_sb_block; +@@ -4647,8 +4669,14 @@ failed_mount7: + ext4_unregister_li_request(sb); + failed_mount6: + ext4_mb_release(sb); +- if (sbi->s_flex_groups) +- kvfree(sbi->s_flex_groups); ++ rcu_read_lock(); ++ flex_groups = rcu_dereference(sbi->s_flex_groups); ++ if (flex_groups) { ++ for (i = 0; i < sbi->s_flex_groups_allocated; i++) ++ kvfree(flex_groups[i]); ++ kvfree(flex_groups); ++ } ++ rcu_read_unlock(); + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); diff --git a/queue-5.5/ext4-fix-potential-race-between-s_group_info-online-resizing-and-access.patch b/queue-5.5/ext4-fix-potential-race-between-s_group_info-online-resizing-and-access.patch new file mode 100644 index 00000000000..10ae46b01c8 --- /dev/null +++ b/queue-5.5/ext4-fix-potential-race-between-s_group_info-online-resizing-and-access.patch @@ -0,0 +1,180 @@ +From df3da4ea5a0fc5d115c90d5aa6caa4dd433750a7 Mon Sep 17 00:00:00 2001 +From: Suraj Jitindar Singh +Date: Tue, 18 Feb 2020 19:08:50 -0800 +Subject: ext4: fix potential race between s_group_info online resizing and access + +From: Suraj Jitindar Singh + +commit df3da4ea5a0fc5d115c90d5aa6caa4dd433750a7 upstream. + +During an online resize an array of pointers to s_group_info gets replaced +so it can get enlarged. 
If there is a concurrent access to the array in +ext4_get_group_info() and this memory has been reused then this can lead to +an invalid memory access. + +Link: https://bugzilla.kernel.org/show_bug.cgi?id=206443 +Link: https://lore.kernel.org/r/20200221053458.730016-3-tytso@mit.edu +Signed-off-by: Suraj Jitindar Singh +Signed-off-by: Theodore Ts'o +Reviewed-by: Balbir Singh +Cc: stable@kernel.org +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/ext4.h | 8 ++++---- + fs/ext4/mballoc.c | 52 +++++++++++++++++++++++++++++++++++----------------- + 2 files changed, 39 insertions(+), 21 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1463,7 +1463,7 @@ struct ext4_sb_info { + #endif + + /* for buddy allocator */ +- struct ext4_group_info ***s_group_info; ++ struct ext4_group_info ** __rcu *s_group_info; + struct inode *s_buddy_cache; + spinlock_t s_md_lock; + unsigned short *s_mb_offsets; +@@ -2934,13 +2934,13 @@ static inline + struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) + { +- struct ext4_group_info ***grp_info; ++ struct ext4_group_info **grp_info; + long indexv, indexh; + BUG_ON(group >= EXT4_SB(sb)->s_groups_count); +- grp_info = EXT4_SB(sb)->s_group_info; + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); +- return grp_info[indexv][indexh]; ++ grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv); ++ return grp_info[indexh]; + } + + /* +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -2356,7 +2356,7 @@ int ext4_mb_alloc_groupinfo(struct super + { + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned size; +- struct ext4_group_info ***new_groupinfo; ++ struct ext4_group_info ***old_groupinfo, ***new_groupinfo; + + size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); +@@ -2369,13 +2369,16 @@ int ext4_mb_alloc_groupinfo(struct super + ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); + return -ENOMEM; 
+ } +- if (sbi->s_group_info) { +- memcpy(new_groupinfo, sbi->s_group_info, ++ rcu_read_lock(); ++ old_groupinfo = rcu_dereference(sbi->s_group_info); ++ if (old_groupinfo) ++ memcpy(new_groupinfo, old_groupinfo, + sbi->s_group_info_size * sizeof(*sbi->s_group_info)); +- kvfree(sbi->s_group_info); +- } +- sbi->s_group_info = new_groupinfo; ++ rcu_read_unlock(); ++ rcu_assign_pointer(sbi->s_group_info, new_groupinfo); + sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); ++ if (old_groupinfo) ++ ext4_kvfree_array_rcu(old_groupinfo); + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", + sbi->s_group_info_size); + return 0; +@@ -2387,6 +2390,7 @@ int ext4_mb_add_groupinfo(struct super_b + { + int i; + int metalen = 0; ++ int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_info **meta_group_info; + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); +@@ -2405,12 +2409,12 @@ int ext4_mb_add_groupinfo(struct super_b + "for a buddy group"); + goto exit_meta_group_info; + } +- sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = +- meta_group_info; ++ rcu_read_lock(); ++ rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; ++ rcu_read_unlock(); + } + +- meta_group_info = +- sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; ++ meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx); + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); + + meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); +@@ -2458,8 +2462,13 @@ int ext4_mb_add_groupinfo(struct super_b + exit_group_info: + /* If a meta_group_info table has been allocated, release it now */ + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { +- kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); +- sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL; ++ struct ext4_group_info ***group_info; ++ ++ rcu_read_lock(); ++ group_info = rcu_dereference(sbi->s_group_info); ++ 
kfree(group_info[idx]); ++ group_info[idx] = NULL; ++ rcu_read_unlock(); + } + exit_meta_group_info: + return -ENOMEM; +@@ -2472,6 +2481,7 @@ static int ext4_mb_init_backend(struct s + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err; + struct ext4_group_desc *desc; ++ struct ext4_group_info ***group_info; + struct kmem_cache *cachep; + + err = ext4_mb_alloc_groupinfo(sb, ngroups); +@@ -2507,11 +2517,16 @@ err_freebuddy: + while (i-- > 0) + kmem_cache_free(cachep, ext4_get_group_info(sb, i)); + i = sbi->s_group_info_size; ++ rcu_read_lock(); ++ group_info = rcu_dereference(sbi->s_group_info); + while (i-- > 0) +- kfree(sbi->s_group_info[i]); ++ kfree(group_info[i]); ++ rcu_read_unlock(); + iput(sbi->s_buddy_cache); + err_freesgi: +- kvfree(sbi->s_group_info); ++ rcu_read_lock(); ++ kvfree(rcu_dereference(sbi->s_group_info)); ++ rcu_read_unlock(); + return -ENOMEM; + } + +@@ -2700,7 +2715,7 @@ int ext4_mb_release(struct super_block * + ext4_group_t ngroups = ext4_get_groups_count(sb); + ext4_group_t i; + int num_meta_group_infos; +- struct ext4_group_info *grinfo; ++ struct ext4_group_info *grinfo, ***group_info; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + +@@ -2719,9 +2734,12 @@ int ext4_mb_release(struct super_block * + num_meta_group_infos = (ngroups + + EXT4_DESC_PER_BLOCK(sb) - 1) >> + EXT4_DESC_PER_BLOCK_BITS(sb); ++ rcu_read_lock(); ++ group_info = rcu_dereference(sbi->s_group_info); + for (i = 0; i < num_meta_group_infos; i++) +- kfree(sbi->s_group_info[i]); +- kvfree(sbi->s_group_info); ++ kfree(group_info[i]); ++ kvfree(group_info); ++ rcu_read_unlock(); + } + kfree(sbi->s_mb_offsets); + kfree(sbi->s_mb_maxs); diff --git a/queue-5.5/ext4-fix-race-between-writepages-and-enabling-ext4_extents_fl.patch b/queue-5.5/ext4-fix-race-between-writepages-and-enabling-ext4_extents_fl.patch new file mode 100644 index 00000000000..cc8540255c7 --- /dev/null +++ 
b/queue-5.5/ext4-fix-race-between-writepages-and-enabling-ext4_extents_fl.patch @@ -0,0 +1,167 @@ +From cb85f4d23f794e24127f3e562cb3b54b0803f456 Mon Sep 17 00:00:00 2001 +From: Eric Biggers +Date: Wed, 19 Feb 2020 10:30:47 -0800 +Subject: ext4: fix race between writepages and enabling EXT4_EXTENTS_FL + +From: Eric Biggers + +commit cb85f4d23f794e24127f3e562cb3b54b0803f456 upstream. + +If EXT4_EXTENTS_FL is set on an inode while ext4_writepages() is running +on it, the following warning in ext4_add_complete_io() can be hit: + +WARNING: CPU: 1 PID: 0 at fs/ext4/page-io.c:234 ext4_put_io_end_defer+0xf0/0x120 + +Here's a minimal reproducer (not 100% reliable) (root isn't required): + + while true; do + sync + done & + while true; do + rm -f file + touch file + chattr -e file + echo X >> file + chattr +e file + done + +The problem is that in ext4_writepages(), ext4_should_dioread_nolock() +(which only returns true on extent-based files) is checked once to set +the number of reserved journal credits, and also again later to select +the flags for ext4_map_blocks() and copy the reserved journal handle to +ext4_io_end::handle. But if EXT4_EXTENTS_FL is being concurrently set, +the first check can see dioread_nolock disabled while the later one can +see it enabled, causing the reserved handle to unexpectedly be NULL. + +Since changing EXT4_EXTENTS_FL is uncommon, and there may be other races +related to doing so as well, fix this by synchronizing changing +EXT4_EXTENTS_FL with ext4_writepages() via the existing +s_writepages_rwsem (previously called s_journal_flag_rwsem). + +This was originally reported by syzbot without a reproducer at +https://syzkaller.appspot.com/bug?extid=2202a584a00fffd19fbf, +but now that dioread_nolock is the default I also started seeing this +when running syzkaller locally. 
+ +Link: https://lore.kernel.org/r/20200219183047.47417-3-ebiggers@kernel.org +Reported-by: syzbot+2202a584a00fffd19fbf@syzkaller.appspotmail.com +Fixes: 6b523df4fb5a ("ext4: use transaction reservation for extent conversion in ext4_end_io") +Signed-off-by: Eric Biggers +Signed-off-by: Theodore Ts'o +Reviewed-by: Jan Kara +Cc: stable@kernel.org +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/ext4.h | 5 ++++- + fs/ext4/migrate.c | 27 +++++++++++++++++++-------- + 2 files changed, 23 insertions(+), 9 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1553,7 +1553,10 @@ struct ext4_sb_info { + struct ratelimit_state s_warning_ratelimit_state; + struct ratelimit_state s_msg_ratelimit_state; + +- /* Barrier between changing inodes' journal flags and writepages ops. */ ++ /* ++ * Barrier between writepages ops and changing any inode's JOURNAL_DATA ++ * or EXTENTS flag. ++ */ + struct percpu_rw_semaphore s_writepages_rwsem; + struct dax_device *s_daxdev; + }; +--- a/fs/ext4/migrate.c ++++ b/fs/ext4/migrate.c +@@ -407,6 +407,7 @@ static int free_ext_block(handle_t *hand + + int ext4_ext_migrate(struct inode *inode) + { ++ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + handle_t *handle; + int retval = 0, i; + __le32 *i_data; +@@ -431,6 +432,8 @@ int ext4_ext_migrate(struct inode *inode + */ + return retval; + ++ percpu_down_write(&sbi->s_writepages_rwsem); ++ + /* + * Worst case we can touch the allocation bitmaps, a bgd + * block, and a block to link in the orphan list. 
We do need +@@ -441,7 +444,7 @@ int ext4_ext_migrate(struct inode *inode + + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); +- return retval; ++ goto out_unlock; + } + goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) * + EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; +@@ -452,7 +455,7 @@ int ext4_ext_migrate(struct inode *inode + if (IS_ERR(tmp_inode)) { + retval = PTR_ERR(tmp_inode); + ext4_journal_stop(handle); +- return retval; ++ goto out_unlock; + } + i_size_write(tmp_inode, i_size_read(inode)); + /* +@@ -494,7 +497,7 @@ int ext4_ext_migrate(struct inode *inode + */ + ext4_orphan_del(NULL, tmp_inode); + retval = PTR_ERR(handle); +- goto out; ++ goto out_tmp_inode; + } + + ei = EXT4_I(inode); +@@ -576,10 +579,11 @@ err_out: + ext4_ext_tree_init(handle, tmp_inode); + out_stop: + ext4_journal_stop(handle); +-out: ++out_tmp_inode: + unlock_new_inode(tmp_inode); + iput(tmp_inode); +- ++out_unlock: ++ percpu_up_write(&sbi->s_writepages_rwsem); + return retval; + } + +@@ -589,7 +593,8 @@ out: + int ext4_ind_migrate(struct inode *inode) + { + struct ext4_extent_header *eh; +- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; ++ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ++ struct ext4_super_block *es = sbi->s_es; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_extent *ex; + unsigned int i, len; +@@ -613,9 +618,13 @@ int ext4_ind_migrate(struct inode *inode + if (test_opt(inode->i_sb, DELALLOC)) + ext4_alloc_da_blocks(inode); + ++ percpu_down_write(&sbi->s_writepages_rwsem); ++ + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); +- if (IS_ERR(handle)) +- return PTR_ERR(handle); ++ if (IS_ERR(handle)) { ++ ret = PTR_ERR(handle); ++ goto out_unlock; ++ } + + down_write(&EXT4_I(inode)->i_data_sem); + ret = ext4_ext_check_inode(inode); +@@ -650,5 +659,7 @@ int ext4_ind_migrate(struct inode *inode + errout: + ext4_journal_stop(handle); + up_write(&EXT4_I(inode)->i_data_sem); ++out_unlock: ++ 
percpu_up_write(&sbi->s_writepages_rwsem); + return ret; + } diff --git a/queue-5.5/ext4-rename-s_journal_flag_rwsem-to-s_writepages_rwsem.patch b/queue-5.5/ext4-rename-s_journal_flag_rwsem-to-s_writepages_rwsem.patch new file mode 100644 index 00000000000..73c6c474c76 --- /dev/null +++ b/queue-5.5/ext4-rename-s_journal_flag_rwsem-to-s_writepages_rwsem.patch @@ -0,0 +1,130 @@ +From bbd55937de8f2754adc5792b0f8e5ff7d9c0420e Mon Sep 17 00:00:00 2001 +From: Eric Biggers +Date: Wed, 19 Feb 2020 10:30:46 -0800 +Subject: ext4: rename s_journal_flag_rwsem to s_writepages_rwsem + +From: Eric Biggers + +commit bbd55937de8f2754adc5792b0f8e5ff7d9c0420e upstream. + +In preparation for making s_journal_flag_rwsem synchronize +ext4_writepages() with changes to both the EXTENTS and JOURNAL_DATA +flags (rather than just JOURNAL_DATA as it does currently), rename it to +s_writepages_rwsem. + +Link: https://lore.kernel.org/r/20200219183047.47417-2-ebiggers@kernel.org +Signed-off-by: Eric Biggers +Signed-off-by: Theodore Ts'o +Reviewed-by: Jan Kara +Cc: stable@kernel.org +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/ext4.h | 2 +- + fs/ext4/inode.c | 14 +++++++------- + fs/ext4/super.c | 6 +++--- + 3 files changed, 11 insertions(+), 11 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1554,7 +1554,7 @@ struct ext4_sb_info { + struct ratelimit_state s_msg_ratelimit_state; + + /* Barrier between changing inodes' journal flags and writepages ops. 
*/ +- struct percpu_rw_semaphore s_journal_flag_rwsem; ++ struct percpu_rw_semaphore s_writepages_rwsem; + struct dax_device *s_daxdev; + }; + +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2627,7 +2627,7 @@ static int ext4_writepages(struct addres + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + +- percpu_down_read(&sbi->s_journal_flag_rwsem); ++ percpu_down_read(&sbi->s_writepages_rwsem); + trace_ext4_writepages(inode, wbc); + + /* +@@ -2848,7 +2848,7 @@ unplug: + out_writepages: + trace_ext4_writepages_result(inode, wbc, ret, + nr_to_write - wbc->nr_to_write); +- percpu_up_read(&sbi->s_journal_flag_rwsem); ++ percpu_up_read(&sbi->s_writepages_rwsem); + return ret; + } + +@@ -2863,13 +2863,13 @@ static int ext4_dax_writepages(struct ad + if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + return -EIO; + +- percpu_down_read(&sbi->s_journal_flag_rwsem); ++ percpu_down_read(&sbi->s_writepages_rwsem); + trace_ext4_writepages(inode, wbc); + + ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc); + trace_ext4_writepages_result(inode, wbc, ret, + nr_to_write - wbc->nr_to_write); +- percpu_up_read(&sbi->s_journal_flag_rwsem); ++ percpu_up_read(&sbi->s_writepages_rwsem); + return ret; + } + +@@ -5830,7 +5830,7 @@ int ext4_change_inode_journal_flag(struc + } + } + +- percpu_down_write(&sbi->s_journal_flag_rwsem); ++ percpu_down_write(&sbi->s_writepages_rwsem); + jbd2_journal_lock_updates(journal); + + /* +@@ -5847,7 +5847,7 @@ int ext4_change_inode_journal_flag(struc + err = jbd2_journal_flush(journal); + if (err < 0) { + jbd2_journal_unlock_updates(journal); +- percpu_up_write(&sbi->s_journal_flag_rwsem); ++ percpu_up_write(&sbi->s_writepages_rwsem); + return err; + } + ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); +@@ -5855,7 +5855,7 @@ int ext4_change_inode_journal_flag(struc + ext4_set_aops(inode); + + jbd2_journal_unlock_updates(journal); +- percpu_up_write(&sbi->s_journal_flag_rwsem); ++ 
percpu_up_write(&sbi->s_writepages_rwsem); + + if (val) + up_write(&EXT4_I(inode)->i_mmap_sem); +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1018,7 +1018,7 @@ static void ext4_put_super(struct super_ + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); +- percpu_free_rwsem(&sbi->s_journal_flag_rwsem); ++ percpu_free_rwsem(&sbi->s_writepages_rwsem); + #ifdef CONFIG_QUOTA + for (i = 0; i < EXT4_MAXQUOTAS; i++) + kfree(get_qf_name(sb, sbi, i)); +@@ -4581,7 +4581,7 @@ no_journal: + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, + GFP_KERNEL); + if (!err) +- err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem); ++ err = percpu_init_rwsem(&sbi->s_writepages_rwsem); + + if (err) { + ext4_msg(sb, KERN_ERR, "insufficient memory"); +@@ -4681,7 +4681,7 @@ failed_mount6: + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); +- percpu_free_rwsem(&sbi->s_journal_flag_rwsem); ++ percpu_free_rwsem(&sbi->s_writepages_rwsem); + failed_mount5: + ext4_ext_release(sb); + ext4_release_system_zone(sb); diff --git a/queue-5.5/kvm-nvmx-check-io-instruction-vm-exit-conditions.patch b/queue-5.5/kvm-nvmx-check-io-instruction-vm-exit-conditions.patch new file mode 100644 index 00000000000..c404e74ac96 --- /dev/null +++ b/queue-5.5/kvm-nvmx-check-io-instruction-vm-exit-conditions.patch @@ -0,0 +1,113 @@ +From 35a571346a94fb93b5b3b6a599675ef3384bc75c Mon Sep 17 00:00:00 2001 +From: Oliver Upton +Date: Tue, 4 Feb 2020 15:26:31 -0800 +Subject: KVM: nVMX: Check IO instruction VM-exit conditions + +From: Oliver Upton + +commit 35a571346a94fb93b5b3b6a599675ef3384bc75c upstream. + +Consult the 'unconditional IO exiting' and 'use IO bitmaps' VM-execution +controls when checking instruction interception. 
If the 'use IO bitmaps' +VM-execution control is 1, check the instruction access against the IO +bitmaps to determine if the instruction causes a VM-exit. + +Signed-off-by: Oliver Upton +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/vmx/nested.c | 2 - + arch/x86/kvm/vmx/vmx.c | 57 +++++++++++++++++++++++++++++++++++++++++----- + 2 files changed, 52 insertions(+), 7 deletions(-) + +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -5345,7 +5345,7 @@ static bool nested_vmx_exit_handled_io(s + struct vmcs12 *vmcs12) + { + unsigned long exit_qualification; +- unsigned int port; ++ unsigned short port; + int size; + + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) +--- a/arch/x86/kvm/vmx/vmx.c ++++ b/arch/x86/kvm/vmx/vmx.c +@@ -7146,6 +7146,39 @@ static void vmx_request_immediate_exit(s + to_vmx(vcpu)->req_immediate_exit = true; + } + ++static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, ++ struct x86_instruction_info *info) ++{ ++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu); ++ unsigned short port; ++ bool intercept; ++ int size; ++ ++ if (info->intercept == x86_intercept_in || ++ info->intercept == x86_intercept_ins) { ++ port = info->src_val; ++ size = info->dst_bytes; ++ } else { ++ port = info->dst_val; ++ size = info->src_bytes; ++ } ++ ++ /* ++ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction ++ * VM-exits depend on the 'unconditional IO exiting' VM-execution ++ * control. ++ * ++ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps. ++ */ ++ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) ++ intercept = nested_cpu_has(vmcs12, ++ CPU_BASED_UNCOND_IO_EXITING); ++ else ++ intercept = nested_vmx_check_io_bitmaps(vcpu, port, size); ++ ++ return intercept ? 
X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; ++} ++ + static int vmx_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage) +@@ -7153,18 +7186,30 @@ static int vmx_check_intercept(struct kv + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + ++ switch (info->intercept) { + /* + * RDPID causes #UD if disabled through secondary execution controls. + * Because it is marked as EmulateOnUD, we need to intercept it here. + */ +- if (info->intercept == x86_intercept_rdtscp && +- !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { +- ctxt->exception.vector = UD_VECTOR; +- ctxt->exception.error_code_valid = false; +- return X86EMUL_PROPAGATE_FAULT; +- } ++ case x86_intercept_rdtscp: ++ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { ++ ctxt->exception.vector = UD_VECTOR; ++ ctxt->exception.error_code_valid = false; ++ return X86EMUL_PROPAGATE_FAULT; ++ } ++ break; ++ ++ case x86_intercept_in: ++ case x86_intercept_ins: ++ case x86_intercept_out: ++ case x86_intercept_outs: ++ return vmx_check_intercept_io(vcpu, info); + + /* TODO: check more intercepts... */ ++ default: ++ break; ++ } ++ + return X86EMUL_UNHANDLEABLE; + } + diff --git a/queue-5.5/kvm-nvmx-refactor-io-bitmap-checks-into-helper-function.patch b/queue-5.5/kvm-nvmx-refactor-io-bitmap-checks-into-helper-function.patch new file mode 100644 index 00000000000..0f4d7923628 --- /dev/null +++ b/queue-5.5/kvm-nvmx-refactor-io-bitmap-checks-into-helper-function.patch @@ -0,0 +1,93 @@ +From e71237d3ff1abf9f3388337cfebf53b96df2020d Mon Sep 17 00:00:00 2001 +From: Oliver Upton +Date: Tue, 4 Feb 2020 15:26:30 -0800 +Subject: KVM: nVMX: Refactor IO bitmap checks into helper function + +From: Oliver Upton + +commit e71237d3ff1abf9f3388337cfebf53b96df2020d upstream. + +Checks against the IO bitmap are useful for both instruction emulation +and VM-exit reflection. Refactor the IO bitmap checks into a helper +function. 
+ +Signed-off-by: Oliver Upton +Reviewed-by: Vitaly Kuznetsov +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/vmx/nested.c | 39 +++++++++++++++++++++++++-------------- + arch/x86/kvm/vmx/nested.h | 2 ++ + 2 files changed, 27 insertions(+), 14 deletions(-) + +--- a/arch/x86/kvm/vmx/nested.c ++++ b/arch/x86/kvm/vmx/nested.c +@@ -5304,24 +5304,17 @@ fail: + return 1; + } + +- +-static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, +- struct vmcs12 *vmcs12) ++/* ++ * Return true if an IO instruction with the specified port and size should cause ++ * a VM-exit into L1. ++ */ ++bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, ++ int size) + { +- unsigned long exit_qualification; ++ struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + gpa_t bitmap, last_bitmap; +- unsigned int port; +- int size; + u8 b; + +- if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) +- return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); +- +- exit_qualification = vmcs_readl(EXIT_QUALIFICATION); +- +- port = exit_qualification >> 16; +- size = (exit_qualification & 7) + 1; +- + last_bitmap = (gpa_t)-1; + b = -1; + +@@ -5348,6 +5341,24 @@ static bool nested_vmx_exit_handled_io(s + return false; + } + ++static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, ++ struct vmcs12 *vmcs12) ++{ ++ unsigned long exit_qualification; ++ unsigned int port; ++ int size; ++ ++ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) ++ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); ++ ++ exit_qualification = vmcs_readl(EXIT_QUALIFICATION); ++ ++ port = exit_qualification >> 16; ++ size = (exit_qualification & 7) + 1; ++ ++ return nested_vmx_check_io_bitmaps(vcpu, port, size); ++} ++ + /* + * Return 1 if we should exit from L2 to L1 to handle an MSR access access, + * rather than handle it ourselves in L0. 
I.e., check whether L1 expressed +--- a/arch/x86/kvm/vmx/nested.h ++++ b/arch/x86/kvm/vmx/nested.h +@@ -34,6 +34,8 @@ int vmx_get_vmx_msr(struct nested_vmx_ms + int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, + u32 vmx_instruction_info, bool wr, int len, gva_t *ret); + void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu); ++bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, ++ int size); + + static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) + { diff --git a/queue-5.5/series b/queue-5.5/series index 7114d780ff6..4c5b08924f0 100644 --- a/queue-5.5/series +++ b/queue-5.5/series @@ -86,3 +86,14 @@ drm-i915-update-drm-i915-bug-filing-url.patch sched-psi-fix-oob-write-when-writing-0-bytes-to-psi-files.patch kvm-nvmx-don-t-emulate-instructions-in-guest-mode.patch kvm-x86-don-t-notify-userspace-ioapic-on-edge-triggered-interrupt-eoi.patch +ext4-fix-a-data-race-in-ext4_i-inode-i_disksize.patch +ext4-add-cond_resched-to-__ext4_find_entry.patch +ext4-fix-potential-race-between-online-resizing-and-write-operations.patch +ext4-fix-potential-race-between-s_group_info-online-resizing-and-access.patch +ext4-fix-potential-race-between-s_flex_groups-online-resizing-and-access.patch +ext4-fix-mount-failure-with-quota-configured-as-module.patch +ext4-rename-s_journal_flag_rwsem-to-s_writepages_rwsem.patch +ext4-fix-race-between-writepages-and-enabling-ext4_extents_fl.patch +drm-i915-execlists-always-force-a-context-reload-when-rewinding-ring_tail.patch +kvm-nvmx-refactor-io-bitmap-checks-into-helper-function.patch +kvm-nvmx-check-io-instruction-vm-exit-conditions.patch -- 2.47.3