From: Greg Kroah-Hartman
Date: Thu, 23 May 2019 18:01:30 +0000 (+0200)
Subject: 4.14-stable patches
X-Git-Tag: v5.1.5~10
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=3d4bcb19bc3e5440735168e7c84e9fef4472bb5b;p=thirdparty%2Fkernel%2Fstable-queue.git

4.14-stable patches

added patches:
	bpf-add-map_lookup_elem_sys_only-for-lookups-from-syscall-side.patch
	bpf-lru-avoid-messing-with-eviction-heuristics-upon-syscall-lookup.patch
	btrfs-honour-fitrim-range-constraints-during-free-space-trim.patch
	driver-core-postpone-dma-tear-down-until-after-devres-release-for-probe-failure.patch
	md-raid-raid5-preserve-the-writeback-action-after-the-parity-check.patch
	revert-don-t-jump-to-compute_result-state-from-check_result-state.patch
---

diff --git a/queue-4.14/bpf-add-map_lookup_elem_sys_only-for-lookups-from-syscall-side.patch b/queue-4.14/bpf-add-map_lookup_elem_sys_only-for-lookups-from-syscall-side.patch
new file mode 100644
index 00000000000..5c70e148af6
--- /dev/null
+++ b/queue-4.14/bpf-add-map_lookup_elem_sys_only-for-lookups-from-syscall-side.patch
@@ -0,0 +1,54 @@
+From c6110222c6f49ea68169f353565eb865488a8619 Mon Sep 17 00:00:00 2001
+From: Daniel Borkmann
+Date: Tue, 14 May 2019 01:18:55 +0200
+Subject: bpf: add map_lookup_elem_sys_only for lookups from syscall side
+
+From: Daniel Borkmann
+
+commit c6110222c6f49ea68169f353565eb865488a8619 upstream.
+
+Add a callback map_lookup_elem_sys_only() that map implementations
+can use instead of map_lookup_elem() on the system call side in case
+the map implementation needs to handle lookups from there differently
+than lookups from the BPF data path. If map_lookup_elem_sys_only() is
+set, it will be the preferred pick for map lookups out of user space.
+This hook is used in a follow-up fix for the LRU map; once the
+development window opens, we can convert other map types from
+map_lookup_elem() (here, the one called upon the BPF_MAP_LOOKUP_ELEM
+cmd is meant) over to the callback to simplify and clean up the
+latter.
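+
+As an illustration, a map implementation wires the new callback up in
+its ops table roughly like this (hypothetical sketch only; the
+example_* names are made up, only the two ops fields come from this
+patch):
+
+	const struct bpf_map_ops example_map_ops = {
+		/* fast-path lookup used by BPF programs */
+		.map_lookup_elem	  = example_lookup,
+		/* lookup used only for the BPF_MAP_LOOKUP_ELEM syscall */
+		.map_lookup_elem_sys_only = example_lookup_sys,
+	};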
+
+Signed-off-by: Daniel Borkmann
+Acked-by: Martin KaFai Lau
+Signed-off-by: Alexei Starovoitov
+Signed-off-by: Greg Kroah-Hartman
+
+
+---
+ include/linux/bpf.h  |    1 +
+ kernel/bpf/syscall.c |    5 ++++-
+ 2 files changed, 5 insertions(+), 1 deletion(-)
+
+--- a/include/linux/bpf.h
++++ b/include/linux/bpf.h
+@@ -28,6 +28,7 @@ struct bpf_map_ops {
+ 	void (*map_free)(struct bpf_map *map);
+ 	int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
+ 	void (*map_release_uref)(struct bpf_map *map);
++	void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
+ 
+ 	/* funcs callable from userspace and from eBPF programs */
+ 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
+--- a/kernel/bpf/syscall.c
++++ b/kernel/bpf/syscall.c
+@@ -493,7 +493,10 @@ static int map_lookup_elem(union bpf_att
+ 		err = bpf_fd_htab_map_lookup_elem(map, key, value);
+ 	} else {
+ 		rcu_read_lock();
+-		ptr = map->ops->map_lookup_elem(map, key);
++		if (map->ops->map_lookup_elem_sys_only)
++			ptr = map->ops->map_lookup_elem_sys_only(map, key);
++		else
++			ptr = map->ops->map_lookup_elem(map, key);
+ 		if (ptr)
+ 			memcpy(value, ptr, value_size);
+ 		rcu_read_unlock();
diff --git a/queue-4.14/bpf-lru-avoid-messing-with-eviction-heuristics-upon-syscall-lookup.patch b/queue-4.14/bpf-lru-avoid-messing-with-eviction-heuristics-upon-syscall-lookup.patch
new file mode 100644
index 00000000000..153f14b2cb5
--- /dev/null
+++ b/queue-4.14/bpf-lru-avoid-messing-with-eviction-heuristics-upon-syscall-lookup.patch
@@ -0,0 +1,106 @@
+From 50b045a8c0ccf44f76640ac3eea8d80ca53979a3 Mon Sep 17 00:00:00 2001
+From: Daniel Borkmann
+Date: Tue, 14 May 2019 01:18:56 +0200
+Subject: bpf, lru: avoid messing with eviction heuristics upon syscall lookup
+
+From: Daniel Borkmann
+
+commit 50b045a8c0ccf44f76640ac3eea8d80ca53979a3 upstream.
+
+One of the biggest issues we face right now with picking an LRU map
+over a regular hash table is that a map walk out of user space, for
+example to just dump the existing entries or to remove certain ones,
+will completely mess up the LRU eviction heuristics, and the wrong
+entries, such as just-created ones, will get evicted instead. The
+reason for this is that we mark an entry as "in use" via
+bpf_lru_node_set_ref() from the system call lookup side as well.
+Thus upon a walk, all entries get marked, so information about the
+actual least recently used ones is "lost".
+
+In the case of Cilium, where the map can be used (among other things)
+as a BPF based connection tracker, this current behavior causes
+disruption upon control plane changes that need to walk the map from
+user space to evict certain entries. The conclusion of the discussion
+at bpfconf [0] was to simply remove the marking from the system call
+side, as no good use case could be found where it's actually needed
+there. Therefore this patch removes the marking for the regular LRU
+and the per-CPU flavor. If there ever is a need in the future, the
+behavior could be made selectable via a map creation flag, but for
+the reason mentioned above we avoid that here.
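+
+The walk that triggers the problem is the usual dump loop out of user
+space, e.g. (hypothetical sketch using libbpf's syscall wrappers;
+map_fd and the key/value types are assumptions):
+
+	__u32 cur, next;
+	long val;
+	void *key = NULL;
+
+	while (bpf_map_get_next_key(map_fd, key, &next) == 0) {
+		/* Before this fix, every lookup below also called
+		 * bpf_lru_node_set_ref() and marked the entry "in
+		 * use", wrecking the eviction order.
+		 */
+		bpf_map_lookup_elem(map_fd, &next, &val);
+		cur = next;
+		key = &cur;
+	}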
+
+  [0] http://vger.kernel.org/bpfconf.html
+
+Fixes: 29ba732acbee ("bpf: Add BPF_MAP_TYPE_LRU_HASH")
+Fixes: 8f8449384ec3 ("bpf: Add BPF_MAP_TYPE_LRU_PERCPU_HASH")
+Signed-off-by: Daniel Borkmann
+Acked-by: Martin KaFai Lau
+Signed-off-by: Alexei Starovoitov
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ kernel/bpf/hashtab.c |   23 ++++++++++++++++++-----
+ 1 file changed, 18 insertions(+), 5 deletions(-)
+
+--- a/kernel/bpf/hashtab.c
++++ b/kernel/bpf/hashtab.c
+@@ -498,18 +498,30 @@ static u32 htab_map_gen_lookup(struct bp
+ 	return insn - insn_buf;
+ }
+ 
+-static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
++static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
++							void *key, const bool mark)
+ {
+ 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
+ 
+ 	if (l) {
+-		bpf_lru_node_set_ref(&l->lru_node);
++		if (mark)
++			bpf_lru_node_set_ref(&l->lru_node);
+ 		return l->key + round_up(map->key_size, 8);
+ 	}
+ 
+ 	return NULL;
+ }
+ 
++static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
++{
++	return __htab_lru_map_lookup_elem(map, key, true);
++}
++
++static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
++{
++	return __htab_lru_map_lookup_elem(map, key, false);
++}
++
+ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
+ 				   struct bpf_insn *insn_buf)
+ {
+@@ -1160,6 +1172,7 @@ const struct bpf_map_ops htab_lru_map_op
+ 	.map_free = htab_map_free,
+ 	.map_get_next_key = htab_map_get_next_key,
+ 	.map_lookup_elem = htab_lru_map_lookup_elem,
++	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
+ 	.map_update_elem = htab_lru_map_update_elem,
+ 	.map_delete_elem = htab_lru_map_delete_elem,
+ 	.map_gen_lookup = htab_lru_map_gen_lookup,
+@@ -1190,7 +1203,6 @@ static void *htab_lru_percpu_map_lookup_
+ 
+ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
+ {
+-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ 	struct htab_elem *l;
+ 	void __percpu *pptr;
+ 	int ret = -ENOENT;
+@@ -1206,8 +1218,9 @@ int bpf_percpu_hash_copy(struct bpf_map
+ 	l = __htab_map_lookup_elem(map, key);
+ 	if (!l)
+ 		goto out;
+-	if (htab_is_lru(htab))
+-		bpf_lru_node_set_ref(&l->lru_node);
++	/* We do not mark LRU map element here in order to not mess up
++	 * eviction heuristics when user space does a map walk.
++	 */
+ 	pptr = htab_elem_get_ptr(l, map->key_size);
+ 	for_each_possible_cpu(cpu) {
+ 		bpf_long_memcpy(value + off,
diff --git a/queue-4.14/btrfs-honour-fitrim-range-constraints-during-free-space-trim.patch b/queue-4.14/btrfs-honour-fitrim-range-constraints-during-free-space-trim.patch
new file mode 100644
index 00000000000..94067ef1ccf
--- /dev/null
+++ b/queue-4.14/btrfs-honour-fitrim-range-constraints-during-free-space-trim.patch
@@ -0,0 +1,91 @@
+From c2d1b3aae33605a61cbab445d8ae1c708ccd2698 Mon Sep 17 00:00:00 2001
+From: Nikolay Borisov
+Date: Mon, 25 Mar 2019 14:31:21 +0200
+Subject: btrfs: Honour FITRIM range constraints during free space trim
+
+From: Nikolay Borisov
+
+commit c2d1b3aae33605a61cbab445d8ae1c708ccd2698 upstream.
+
+Up until now, trimming the free space was done irrespective of what
+the arguments of the FITRIM ioctl were; for example, fstrim's -o/-l
+arguments would be entirely ignored. Fix this by correctly handling
+those parameters: break out of the loop once the found free space
+extent lies past the end of the passed range, and stop trimming once
+fstrim_range::len bytes have been trimmed.
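+
+For reference, the range being honoured here is what user space hands
+in via the FITRIM ioctl, e.g. (illustrative sketch; the mount point
+and offsets are made up, FITRIM and struct fstrim_range come from
+<linux/fs.h>), equivalent to "fstrim -o 1G -l 1G /mnt":
+
+	struct fstrim_range range = {
+		.start	= 1ULL << 30,	/* like fstrim -o 1G */
+		.len	= 1ULL << 30,	/* like fstrim -l 1G */
+		.minlen	= 0,
+	};
+	int fd = open("/mnt", O_RDONLY);
+
+	/* After this fix, only free space inside [start, start + len)
+	 * may be discarded.
+	 */
+	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0)
+		perror("FITRIM");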
+
+Fixes: 499f377f49f0 ("btrfs: iterate over unused chunk space in FITRIM")
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Nikolay Borisov
+Reviewed-by: David Sterba
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+
+
+---
+ fs/btrfs/extent-tree.c |   25 +++++++++++++++++++------
+ 1 file changed, 19 insertions(+), 6 deletions(-)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -11058,9 +11058,9 @@ int btrfs_error_unpin_extent_range(struc
+  * transaction.
+  */
+ static int btrfs_trim_free_extents(struct btrfs_device *device,
+-				   u64 minlen, u64 *trimmed)
++				   struct fstrim_range *range, u64 *trimmed)
+ {
+-	u64 start = 0, len = 0;
++	u64 start = range->start, len = 0;
+ 	int ret;
+ 
+ 	*trimmed = 0;
+@@ -11096,8 +11096,8 @@ static int btrfs_trim_free_extents(struc
+ 		refcount_inc(&trans->use_count);
+ 		spin_unlock(&fs_info->trans_lock);
+ 
+-		ret = find_free_dev_extent_start(trans, device, minlen, start,
+-						 &start, &len);
++		ret = find_free_dev_extent_start(trans, device, range->minlen,
++						 start, &start, &len);
+ 		if (trans)
+ 			btrfs_put_transaction(trans);
+ 
+@@ -11109,6 +11109,16 @@ static int btrfs_trim_free_extents(struc
+ 			break;
+ 		}
+ 
++		/* If we are out of the passed range break */
++		if (start > range->start + range->len - 1) {
++			mutex_unlock(&fs_info->chunk_mutex);
++			ret = 0;
++			break;
++		}
++
++		start = max(range->start, start);
++		len = min(range->len, len);
++
+ 		ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+ 		up_read(&fs_info->commit_root_sem);
+ 		mutex_unlock(&fs_info->chunk_mutex);
+@@ -11119,6 +11129,10 @@ static int btrfs_trim_free_extents(struc
+ 		start += len;
+ 		*trimmed += bytes;
+ 
++		/* We've trimmed enough */
++		if (*trimmed >= range->len)
++			break;
++
+ 		if (fatal_signal_pending(current)) {
+ 			ret = -ERESTARTSYS;
+ 			break;
+@@ -11202,8 +11216,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *
+ 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ 	devices = &fs_info->fs_devices->devices;
+ 	list_for_each_entry(device, devices, dev_list) {
+-		ret = btrfs_trim_free_extents(device, range->minlen,
+-					      &group_trimmed);
++		ret = btrfs_trim_free_extents(device, range, &group_trimmed);
+ 		if (ret) {
+ 			dev_failed++;
+ 			dev_ret = ret;
diff --git a/queue-4.14/driver-core-postpone-dma-tear-down-until-after-devres-release-for-probe-failure.patch b/queue-4.14/driver-core-postpone-dma-tear-down-until-after-devres-release-for-probe-failure.patch
new file mode 100644
index 00000000000..ab19a3d0c52
--- /dev/null
+++ b/queue-4.14/driver-core-postpone-dma-tear-down-until-after-devres-release-for-probe-failure.patch
@@ -0,0 +1,119 @@
+From 0b777eee88d712256ba8232a9429edb17c4f9ceb Mon Sep 17 00:00:00 2001
+From: John Garry
+Date: Thu, 28 Mar 2019 18:08:05 +0800
+Subject: driver core: Postpone DMA tear-down until after devres release for probe failure
+
+From: John Garry
+
+commit 0b777eee88d712256ba8232a9429edb17c4f9ceb upstream.
+
+In commit 376991db4b64 ("driver core: Postpone DMA tear-down until after
+devres release"), we changed the ordering of tearing down the device DMA
+ops and releasing all the device's resources; this was because the DMA ops
+should be maintained until we release the device's managed DMA memories.
+
+However, we have seen another crash on an arm64 system when a
+device driver probe fails:
+
+  hisi_sas_v3_hw 0000:74:02.0: Adding to iommu group 2
+  scsi host1: hisi_sas_v3_hw
+  BUG: Bad page state in process swapper/0  pfn:313f5
+  page:ffff7e0000c4fd40 count:1 mapcount:0 mapping:0000000000000000 index:0x0
+  flags: 0xfffe00000001000(reserved)
+  raw: 0fffe00000001000 ffff7e0000c4fd48 ffff7e0000c4fd48 0000000000000000
+  raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
+  page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
+  bad because of flags: 0x1000(reserved)
+  Modules linked in:
+  CPU: 49 PID: 1 Comm: swapper/0 Not tainted 5.1.0-rc1-43081-g22d97fd-dirty #1433
+  Hardware name: Huawei D06/D06, BIOS Hisilicon D06 UEFI RC0 - V1.12.01 01/29/2019
+  Call trace:
+   dump_backtrace+0x0/0x118
+   show_stack+0x14/0x1c
+   dump_stack+0xa4/0xc8
+   bad_page+0xe4/0x13c
+   free_pages_check_bad+0x4c/0xc0
+   __free_pages_ok+0x30c/0x340
+   __free_pages+0x30/0x44
+   __dma_direct_free_pages+0x30/0x38
+   dma_direct_free+0x24/0x38
+   dma_free_attrs+0x9c/0xd8
+   dmam_release+0x20/0x28
+   release_nodes+0x17c/0x220
+   devres_release_all+0x34/0x54
+   really_probe+0xc4/0x2c8
+   driver_probe_device+0x58/0xfc
+   device_driver_attach+0x68/0x70
+   __driver_attach+0x94/0xdc
+   bus_for_each_dev+0x5c/0xb4
+   driver_attach+0x20/0x28
+   bus_add_driver+0x14c/0x200
+   driver_register+0x6c/0x124
+   __pci_register_driver+0x48/0x50
+   sas_v3_pci_driver_init+0x20/0x28
+   do_one_initcall+0x40/0x25c
+   kernel_init_freeable+0x2b8/0x3c0
+   kernel_init+0x10/0x100
+   ret_from_fork+0x10/0x18
+  Disabling lock debugging due to kernel taint
+  BUG: Bad page state in process swapper/0  pfn:313f6
+  page:ffff7e0000c4fd80 count:1 mapcount:0 mapping:0000000000000000 index:0x0
+  [   89.322983] flags: 0xfffe00000001000(reserved)
+  raw: 0fffe00000001000 ffff7e0000c4fd88 ffff7e0000c4fd88 0000000000000000
+  raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
+
+The crash occurs for the same reason.
+
+In this case, on the really_probe() failure path, we are still clearing
+the DMA ops prior to releasing the device's managed memories.
+
+This patch fixes this issue by reordering the DMA ops teardown and the
+call to devres_release_all() on the failure path.
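+
+To illustrate why the ordering matters, consider a driver that makes a
+managed DMA allocation in probe (hypothetical sketch, not taken from
+the hisi_sas driver):
+
+	static int example_probe(struct pci_dev *pdev,
+				 const struct pci_device_id *id)
+	{
+		dma_addr_t dma;
+		/* Managed allocation: if probe fails it is freed by
+		 * devres_release_all(), which ends up in
+		 * dma_free_attrs() and therefore still needs the
+		 * device's DMA ops to be configured.
+		 */
+		void *buf = dmam_alloc_coherent(&pdev->dev, SZ_4K, &dma,
+						GFP_KERNEL);
+		return buf ? 0 : -ENOMEM;
+	}
+
+Tearing down the DMA ops before devres_release_all() frees such a
+buffer is what leads to the bad page state above.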
+
+Reported-by: Xiang Chen
+Tested-by: Xiang Chen
+Signed-off-by: John Garry
+Reviewed-by: Robin Murphy
+[jpg: backport to 4.19.x and earlier]
+Signed-off-by: John Garry
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/base/dd.c |    5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/drivers/base/dd.c
++++ b/drivers/base/dd.c
+@@ -387,7 +387,7 @@ re_probe:
+ 
+ 	ret = dma_configure(dev);
+ 	if (ret)
+-		goto dma_failed;
++		goto probe_failed;
+ 
+ 	if (driver_sysfs_add(dev)) {
+ 		printk(KERN_ERR "%s: driver_sysfs_add(%s) failed\n",
+@@ -442,14 +442,13 @@ re_probe:
+ 	goto done;
+ 
+ probe_failed:
+-	dma_deconfigure(dev);
+-dma_failed:
+ 	if (dev->bus)
+ 		blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
+ 					     BUS_NOTIFY_DRIVER_NOT_BOUND, dev);
+ pinctrl_bind_failed:
+ 	device_links_no_driver(dev);
+ 	devres_release_all(dev);
++	dma_deconfigure(dev);
+ 	driver_sysfs_remove(dev);
+ 	dev->driver = NULL;
+ 	dev_set_drvdata(dev, NULL);
diff --git a/queue-4.14/md-raid-raid5-preserve-the-writeback-action-after-the-parity-check.patch b/queue-4.14/md-raid-raid5-preserve-the-writeback-action-after-the-parity-check.patch
new file mode 100644
index 00000000000..33d3de72735
--- /dev/null
+++ b/queue-4.14/md-raid-raid5-preserve-the-writeback-action-after-the-parity-check.patch
@@ -0,0 +1,52 @@
+From b2176a1dfb518d870ee073445d27055fea64dfb8 Mon Sep 17 00:00:00 2001
+From: Nigel Croxon
+Date: Tue, 16 Apr 2019 09:50:09 -0700
+Subject: md/raid: raid5 preserve the writeback action after the parity check
+
+From: Nigel Croxon
+
+commit b2176a1dfb518d870ee073445d27055fea64dfb8 upstream.
+
+The problem is that any 'uptodate' vs 'disks' check is not precise
+in this path. Put a "WARN_ON(!test_bit(R5_UPTODATE, &dev->flags))"
+on the device that might try to kick off writes and then skip the
+action. Better to prevent the raid driver from taking unexpected
+action *and* keep the system alive than to kill the machine with a
+BUG_ON().
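+
+The shape of the fix, reduced to its core (illustrative sketch only;
+see the diff below for the real thing):
+
+	/* Before: a violated invariant kills the whole machine. */
+	BUG_ON(s->uptodate < disks - 1);
+
+	/* After: complain once, then back out the write we were about
+	 * to kick off so the array keeps running instead.
+	 */
+	if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
+		      "disk not up to date\n")) {
+		clear_bit(R5_LOCKED, &dev->flags);
+		clear_bit(R5_Wantwrite, &dev->flags);
+		s->locked--;
+	}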
+
+Note: fixed warning reported by kbuild test robot
+
+Signed-off-by: Dan Williams
+Signed-off-by: Nigel Croxon
+Signed-off-by: Song Liu
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/md/raid5.c |   10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/drivers/md/raid5.c
++++ b/drivers/md/raid5.c
+@@ -4182,7 +4182,7 @@ static void handle_parity_checks6(struct
+ 	/* now write out any block on a failed drive,
+ 	 * or P or Q if they were recomputed
+ 	 */
+-	BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
++	dev = NULL;
+ 	if (s->failed == 2) {
+ 		dev = &sh->dev[s->failed_num[1]];
+ 		s->locked++;
+@@ -4207,6 +4207,14 @@ static void handle_parity_checks6(struct
+ 		set_bit(R5_LOCKED, &dev->flags);
+ 		set_bit(R5_Wantwrite, &dev->flags);
+ 	}
++	if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
++		      "%s: disk%td not up to date\n",
++		      mdname(conf->mddev),
++		      dev - (struct r5dev *) &sh->dev)) {
++		clear_bit(R5_LOCKED, &dev->flags);
++		clear_bit(R5_Wantwrite, &dev->flags);
++		s->locked--;
++	}
+ 	clear_bit(STRIPE_DEGRADED, &sh->state);
+ 
+ 	set_bit(STRIPE_INSYNC, &sh->state);
diff --git a/queue-4.14/revert-don-t-jump-to-compute_result-state-from-check_result-state.patch b/queue-4.14/revert-don-t-jump-to-compute_result-state-from-check_result-state.patch
new file mode 100644
index 00000000000..e6728686f0f
--- /dev/null
+++ b/queue-4.14/revert-don-t-jump-to-compute_result-state-from-check_result-state.patch
@@ -0,0 +1,54 @@
+From a25d8c327bb41742dbd59f8c545f59f3b9c39983 Mon Sep 17 00:00:00 2001
+From: Song Liu
+Date: Tue, 16 Apr 2019 09:34:21 -0700
+Subject: Revert "Don't jump to compute_result state from check_result state"
+
+From: Song Liu
+
+commit a25d8c327bb41742dbd59f8c545f59f3b9c39983 upstream.
+
+This reverts commit 4f4fd7c5798bbdd5a03a60f6269cf1177fbd11ef.
+
+Cc: Dan Williams
+Cc: Nigel Croxon
+Cc: Xiao Ni
+Signed-off-by: Song Liu
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/md/raid5.c |   19 +++++++++++++++----
+ 1 file changed, 15 insertions(+), 4 deletions(-)
+
+--- a/drivers/md/raid5.c
++++ b/drivers/md/raid5.c
+@@ -4218,15 +4218,26 @@ static void handle_parity_checks6(struct
+ 	case check_state_check_result:
+ 		sh->check_state = check_state_idle;
+ 
+-		if (s->failed > 1)
+-			break;
+ 		/* handle a successful check operation, if parity is correct
+ 		 * we are done.  Otherwise update the mismatch count and repair
+ 		 * parity if !MD_RECOVERY_CHECK
+ 		 */
+ 		if (sh->ops.zero_sum_result == 0) {
+-			/* Any parity checked was correct */
+-			set_bit(STRIPE_INSYNC, &sh->state);
++			/* both parities are correct */
++			if (!s->failed)
++				set_bit(STRIPE_INSYNC, &sh->state);
++			else {
++				/* in contrast to the raid5 case we can validate
++				 * parity, but still have a failure to write
++				 * back
++				 */
++				sh->check_state = check_state_compute_result;
++				/* Returning at this point means that we may go
++				 * off and bring p and/or q uptodate again so
++				 * we make sure to check zero_sum_result again
++				 * to verify if p or q need writeback
++				 */
++			}
+ 		} else {
+ 			atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
+ 			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
diff --git a/queue-4.14/series b/queue-4.14/series
index bad08a0cbe9..92c8fb9f286 100644
--- a/queue-4.14/series
+++ b/queue-4.14/series
@@ -69,3 +69,9 @@ sched-cpufreq-fix-kobject-memleak.patch
 x86-mm-mem_encrypt-disable-all-instrumentation-for-e.patch
 ufs-fix-braino-in-ufs_get_inode_gid-for-solaris-ufs-.patch
 perf-bench-numa-add-define-for-rusage_thread-if-not-.patch
+revert-don-t-jump-to-compute_result-state-from-check_result-state.patch
+md-raid-raid5-preserve-the-writeback-action-after-the-parity-check.patch
+driver-core-postpone-dma-tear-down-until-after-devres-release-for-probe-failure.patch
+bpf-add-map_lookup_elem_sys_only-for-lookups-from-syscall-side.patch
+bpf-lru-avoid-messing-with-eviction-heuristics-upon-syscall-lookup.patch
+btrfs-honour-fitrim-range-constraints-during-free-space-trim.patch