From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 23 May 2019 18:02:09 +0000 (+0200)
Subject: 5.0-stable patches
X-Git-Tag: v5.1.5~8
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b8704eb5cf6d2827dd86c3c8d5471358da0d056f;p=thirdparty%2Fkernel%2Fstable-queue.git

5.0-stable patches

added patches:
	bpf-add-map_lookup_elem_sys_only-for-lookups-from-syscall-side.patch
	bpf-lru-avoid-messing-with-eviction-heuristics-upon-syscall-lookup.patch
	bpf-relax-inode-permission-check-for-retrieving-bpf-program.patch
	driver-core-postpone-dma-tear-down-until-after-devres-release-for-probe-failure.patch
	md-raid-raid5-preserve-the-writeback-action-after-the-parity-check.patch
	revert-don-t-jump-to-compute_result-state-from-check_result-state.patch
---

diff --git a/queue-5.0/bpf-add-map_lookup_elem_sys_only-for-lookups-from-syscall-side.patch b/queue-5.0/bpf-add-map_lookup_elem_sys_only-for-lookups-from-syscall-side.patch
new file mode 100644
index 00000000000..ce87d2b4225
--- /dev/null
+++ b/queue-5.0/bpf-add-map_lookup_elem_sys_only-for-lookups-from-syscall-side.patch
@@ -0,0 +1,53 @@
+From c6110222c6f49ea68169f353565eb865488a8619 Mon Sep 17 00:00:00 2001
+From: Daniel Borkmann <daniel@iogearbox.net>
+Date: Tue, 14 May 2019 01:18:55 +0200
+Subject: bpf: add map_lookup_elem_sys_only for lookups from syscall side
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit c6110222c6f49ea68169f353565eb865488a8619 upstream.
+
+Add a callback map_lookup_elem_sys_only() that map implementations
+could use over map_lookup_elem() from system call side in case the
+map implementation needs to handle the latter differently than from
+the BPF data path. If map_lookup_elem_sys_only() is set, this will
+be the preferred pick for map lookups out of user space. This hook is
+used in a follow-up fix for LRU map, but once development window
+opens, we can convert other map types from map_lookup_elem() (here,
+the one called upon BPF_MAP_LOOKUP_ELEM cmd is meant) over to use
+the callback to simplify and clean up the latter.
+
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Acked-by: Martin KaFai Lau <kafai@fb.com>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/bpf.h  |    1 +
+ kernel/bpf/syscall.c |    5 ++++-
+ 2 files changed, 5 insertions(+), 1 deletion(-)
+
+--- a/include/linux/bpf.h
++++ b/include/linux/bpf.h
+@@ -35,6 +35,7 @@ struct bpf_map_ops {
+ 	void (*map_free)(struct bpf_map *map);
+ 	int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key);
+ 	void (*map_release_uref)(struct bpf_map *map);
++	void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key);
+ 
+ 	/* funcs callable from userspace and from eBPF programs */
+ 	void *(*map_lookup_elem)(struct bpf_map *map, void *key);
+--- a/kernel/bpf/syscall.c
++++ b/kernel/bpf/syscall.c
+@@ -738,7 +738,10 @@ static int map_lookup_elem(union bpf_att
+ 		err = map->ops->map_peek_elem(map, value);
+ 	} else {
+ 		rcu_read_lock();
+-		ptr = map->ops->map_lookup_elem(map, key);
++		if (map->ops->map_lookup_elem_sys_only)
++			ptr = map->ops->map_lookup_elem_sys_only(map, key);
++		else
++			ptr = map->ops->map_lookup_elem(map, key);
+ 		if (IS_ERR(ptr)) {
+ 			err = PTR_ERR(ptr);
+ 		} else if (!ptr) {
diff --git a/queue-5.0/bpf-lru-avoid-messing-with-eviction-heuristics-upon-syscall-lookup.patch b/queue-5.0/bpf-lru-avoid-messing-with-eviction-heuristics-upon-syscall-lookup.patch
new file mode 100644
index 00000000000..561b1906fb4
--- /dev/null
+++ b/queue-5.0/bpf-lru-avoid-messing-with-eviction-heuristics-upon-syscall-lookup.patch
@@ -0,0 +1,106 @@
+From 50b045a8c0ccf44f76640ac3eea8d80ca53979a3 Mon Sep 17 00:00:00 2001
+From: Daniel Borkmann <daniel@iogearbox.net>
+Date: Tue, 14 May 2019 01:18:56 +0200
+Subject: bpf, lru: avoid messing with eviction heuristics upon syscall lookup
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit 50b045a8c0ccf44f76640ac3eea8d80ca53979a3 upstream.
+
+One of the biggest issues we face right now with picking LRU map over
+regular hash table is that a map walk out of user space, for example,
+to just dump the existing entries or to remove certain ones, will
+completely mess up LRU eviction heuristics and wrong entries such
+as just created ones will get evicted instead. The reason for this
+is that we mark an entry as "in use" via bpf_lru_node_set_ref() from
+system call lookup side as well. Thus upon walk, all entries are
+being marked, so information about the actual least recently used ones
+is "lost".
+
+In case of Cilium where it can be used (besides others) as a BPF
+based connection tracker, this current behavior causes disruption
+upon control plane changes that need to walk the map from user space
+to evict certain entries. Discussion result from bpfconf [0] was that
+we should simply just remove marking from system call side as no
+good use case could be found where it's actually needed there.
+Therefore this patch removes marking for regular LRU and per-CPU
+flavor. If there ever should be a need in future, the behavior could
+be selected via map creation flag, but due to mentioned reason we
+avoid this here.
+
+  [0] http://vger.kernel.org/bpfconf.html
+
+Fixes: 29ba732acbee ("bpf: Add BPF_MAP_TYPE_LRU_HASH")
+Fixes: 8f8449384ec3 ("bpf: Add BPF_MAP_TYPE_LRU_PERCPU_HASH")
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Acked-by: Martin KaFai Lau <kafai@fb.com>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/bpf/hashtab.c |   23 ++++++++++++++++++-----
+ 1 file changed, 18 insertions(+), 5 deletions(-)
+
+--- a/kernel/bpf/hashtab.c
++++ b/kernel/bpf/hashtab.c
+@@ -527,18 +527,30 @@ static u32 htab_map_gen_lookup(struct bp
+ 	return insn - insn_buf;
+ }
+ 
+-static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
++static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
++							void *key, const bool mark)
+ {
+ 	struct htab_elem *l = __htab_map_lookup_elem(map, key);
+ 
+ 	if (l) {
+-		bpf_lru_node_set_ref(&l->lru_node);
++		if (mark)
++			bpf_lru_node_set_ref(&l->lru_node);
+ 		return l->key + round_up(map->key_size, 8);
+ 	}
+ 
+ 	return NULL;
+ }
+ 
++static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
++{
++	return __htab_lru_map_lookup_elem(map, key, true);
++}
++
++static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
++{
++	return __htab_lru_map_lookup_elem(map, key, false);
++}
++
+ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
+ 				   struct bpf_insn *insn_buf)
+ {
+@@ -1215,6 +1227,7 @@ const struct bpf_map_ops htab_lru_map_op
+ 	.map_free = htab_map_free,
+ 	.map_get_next_key = htab_map_get_next_key,
+ 	.map_lookup_elem = htab_lru_map_lookup_elem,
++	.map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
+ 	.map_update_elem = htab_lru_map_update_elem,
+ 	.map_delete_elem = htab_lru_map_delete_elem,
+ 	.map_gen_lookup = htab_lru_map_gen_lookup,
+@@ -1246,7 +1259,6 @@ static void *htab_lru_percpu_map_lookup_
+ 
+ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
+ {
+-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ 	struct htab_elem *l;
+ 	void __percpu *pptr;
+ 	int ret = -ENOENT;
+@@ -1262,8 +1274,9 @@ int bpf_percpu_hash_copy(struct bpf_map
+ 	l = __htab_map_lookup_elem(map, key);
+ 	if (!l)
+ 		goto out;
+-	if (htab_is_lru(htab))
+-		bpf_lru_node_set_ref(&l->lru_node);
++	/* We do not mark LRU map element here in order to not mess up
++	 * eviction heuristics when user space does a map walk.
++	 */
+ 	pptr = htab_elem_get_ptr(l, map->key_size);
+ 	for_each_possible_cpu(cpu) {
+ 		bpf_long_memcpy(value + off,
diff --git a/queue-5.0/bpf-relax-inode-permission-check-for-retrieving-bpf-program.patch b/queue-5.0/bpf-relax-inode-permission-check-for-retrieving-bpf-program.patch
new file mode 100644
index 00000000000..63f3fc48b04
--- /dev/null
+++ b/queue-5.0/bpf-relax-inode-permission-check-for-retrieving-bpf-program.patch
@@ -0,0 +1,38 @@
+From e547ff3f803e779a3898f1f48447b29f43c54085 Mon Sep 17 00:00:00 2001
+From: Chenbo Feng <fengc@google.com>
+Date: Tue, 14 May 2019 19:42:57 -0700
+Subject: bpf: relax inode permission check for retrieving bpf program
+
+From: Chenbo Feng <fengc@google.com>
+
+commit e547ff3f803e779a3898f1f48447b29f43c54085 upstream.
+
+For the iptables module to load a bpf program from a pinned location, it
+only retrieves a loaded program and cannot change the program content, so
+requiring write permission for it might not be necessary.
+Also, when adding or removing an unrelated iptables rule, it might need to
+flush and reload the xt_bpf related rules as well, which triggers the inode
+permission check. It might be better to remove the write permission
+check for the inode so we won't need to grant write access to all the
+processes that flush and restore iptables rules.
+
+Signed-off-by: Chenbo Feng <fengc@google.com>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/bpf/inode.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/bpf/inode.c
++++ b/kernel/bpf/inode.c
+@@ -518,7 +518,7 @@ out:
+ static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
+ {
+ 	struct bpf_prog *prog;
+-	int ret = inode_permission(inode, MAY_READ | MAY_WRITE);
++	int ret = inode_permission(inode, MAY_READ);
+ 	if (ret)
+ 		return ERR_PTR(ret);
+ 
diff --git a/queue-5.0/driver-core-postpone-dma-tear-down-until-after-devres-release-for-probe-failure.patch b/queue-5.0/driver-core-postpone-dma-tear-down-until-after-devres-release-for-probe-failure.patch
new file mode 100644
index 00000000000..8ce8388e8c4
--- /dev/null
+++ b/queue-5.0/driver-core-postpone-dma-tear-down-until-after-devres-release-for-probe-failure.patch
@@ -0,0 +1,117 @@
+From 0b777eee88d712256ba8232a9429edb17c4f9ceb Mon Sep 17 00:00:00 2001
+From: John Garry <john.garry@huawei.com>
+Date: Thu, 28 Mar 2019 18:08:05 +0800
+Subject: driver core: Postpone DMA tear-down until after devres release for probe failure
+
+From: John Garry <john.garry@huawei.com>
+
+commit 0b777eee88d712256ba8232a9429edb17c4f9ceb upstream.
+
+In commit 376991db4b64 ("driver core: Postpone DMA tear-down until after
+devres release"), we changed the ordering of tearing down the device DMA
+ops and releasing all the device's resources; this was because the DMA ops
+should be maintained until we release the device's managed DMA memories.
+
+However, we have seen another crash on an arm64 system when a
+device driver probe fails:
+
+  hisi_sas_v3_hw 0000:74:02.0: Adding to iommu group 2
+  scsi host1: hisi_sas_v3_hw
+  BUG: Bad page state in process swapper/0  pfn:313f5
+  page:ffff7e0000c4fd40 count:1 mapcount:0
+  mapping:0000000000000000 index:0x0
+  flags: 0xfffe00000001000(reserved)
+  raw: 0fffe00000001000 ffff7e0000c4fd48 ffff7e0000c4fd48
+0000000000000000
+  raw: 0000000000000000 0000000000000000 00000001ffffffff
+0000000000000000
+  page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set
+  bad because of flags: 0x1000(reserved)
+  Modules linked in:
+  CPU: 49 PID: 1 Comm: swapper/0 Not tainted
+5.1.0-rc1-43081-g22d97fd-dirty #1433
+  Hardware name: Huawei D06/D06, BIOS Hisilicon D06 UEFI
+RC0 - V1.12.01 01/29/2019
+  Call trace:
+  dump_backtrace+0x0/0x118
+  show_stack+0x14/0x1c
+  dump_stack+0xa4/0xc8
+  bad_page+0xe4/0x13c
+  free_pages_check_bad+0x4c/0xc0
+  __free_pages_ok+0x30c/0x340
+  __free_pages+0x30/0x44
+  __dma_direct_free_pages+0x30/0x38
+  dma_direct_free+0x24/0x38
+  dma_free_attrs+0x9c/0xd8
+  dmam_release+0x20/0x28
+  release_nodes+0x17c/0x220
+  devres_release_all+0x34/0x54
+  really_probe+0xc4/0x2c8
+  driver_probe_device+0x58/0xfc
+  device_driver_attach+0x68/0x70
+  __driver_attach+0x94/0xdc
+  bus_for_each_dev+0x5c/0xb4
+  driver_attach+0x20/0x28
+  bus_add_driver+0x14c/0x200
+  driver_register+0x6c/0x124
+  __pci_register_driver+0x48/0x50
+  sas_v3_pci_driver_init+0x20/0x28
+  do_one_initcall+0x40/0x25c
+  kernel_init_freeable+0x2b8/0x3c0
+  kernel_init+0x10/0x100
+  ret_from_fork+0x10/0x18
+  Disabling lock debugging due to kernel taint
+  BUG: Bad page state in process swapper/0  pfn:313f6
+  page:ffff7e0000c4fd80 count:1 mapcount:0
+mapping:0000000000000000 index:0x0
+[   89.322983] flags: 0xfffe00000001000(reserved)
+  raw: 0fffe00000001000 ffff7e0000c4fd88 ffff7e0000c4fd88
+0000000000000000
+  raw: 0000000000000000 0000000000000000 00000001ffffffff
+0000000000000000
+
+The crash occurs for the same reason.
+
+In this case, on the really_probe() failure path, we are still clearing
+the DMA ops prior to releasing the device's managed memories.
+
+This patch fixes this issue by reordering the DMA ops teardown and the
+call to devres_release_all() on the failure path.
+
+Reported-by: Xiang Chen <chenxiang66@hisilicon.com>
+Tested-by: Xiang Chen <chenxiang66@hisilicon.com>
+Signed-off-by: John Garry <john.garry@huawei.com>
+Reviewed-by: Robin Murphy <robin.murphy@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/base/dd.c |    5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/drivers/base/dd.c
++++ b/drivers/base/dd.c
+@@ -486,7 +486,7 @@ re_probe:
+ 	if (dev->bus->dma_configure) {
+ 		ret = dev->bus->dma_configure(dev);
+ 		if (ret)
+-			goto dma_failed;
++			goto probe_failed;
+ 	}
+ 
+ 	if (driver_sysfs_add(dev)) {
+@@ -542,14 +542,13 @@ re_probe:
+ 	goto done;
+ 
+ probe_failed:
+-	arch_teardown_dma_ops(dev);
+-dma_failed:
+ 	if (dev->bus)
+ 		blocking_notifier_call_chain(&dev->bus->p->bus_notifier,
+ 					     BUS_NOTIFY_DRIVER_NOT_BOUND, dev);
+ pinctrl_bind_failed:
+ 	device_links_no_driver(dev);
+ 	devres_release_all(dev);
++	arch_teardown_dma_ops(dev);
+ 	driver_sysfs_remove(dev);
+ 	dev->driver = NULL;
+ 	dev_set_drvdata(dev, NULL);
diff --git a/queue-5.0/md-raid-raid5-preserve-the-writeback-action-after-the-parity-check.patch b/queue-5.0/md-raid-raid5-preserve-the-writeback-action-after-the-parity-check.patch
new file mode 100644
index 00000000000..79f3627738d
--- /dev/null
+++ b/queue-5.0/md-raid-raid5-preserve-the-writeback-action-after-the-parity-check.patch
@@ -0,0 +1,52 @@
+From b2176a1dfb518d870ee073445d27055fea64dfb8 Mon Sep 17 00:00:00 2001
+From: Nigel Croxon <ncroxon@redhat.com>
+Date: Tue, 16 Apr 2019 09:50:09 -0700
+Subject: md/raid: raid5 preserve the writeback action after the parity check
+
+From: Nigel Croxon <ncroxon@redhat.com>
+
+commit b2176a1dfb518d870ee073445d27055fea64dfb8 upstream.
+
+The problem is that any 'uptodate' vs 'disks' check is not precise
+in this path. Put a "WARN_ON(!test_bit(R5_UPTODATE, &dev->flags))" on the
+device that might try to kick off writes and then skip the action.
+Better to prevent the raid driver from taking unexpected action *and* keep
+the system alive vs killing the machine with BUG_ON.
+
+Note: fixed warning reported by kbuild test robot <lkp@intel.com>
+
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Nigel Croxon <ncroxon@redhat.com>
+Signed-off-by: Song Liu <songliubraving@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/raid5.c |   10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/drivers/md/raid5.c
++++ b/drivers/md/raid5.c
+@@ -4197,7 +4197,7 @@ static void handle_parity_checks6(struct
+ 		/* now write out any block on a failed drive,
+ 		 * or P or Q if they were recomputed
+ 		 */
+-		BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
++		dev = NULL;
+ 		if (s->failed == 2) {
+ 			dev = &sh->dev[s->failed_num[1]];
+ 			s->locked++;
+@@ -4222,6 +4222,14 @@ static void handle_parity_checks6(struct
+ 			set_bit(R5_LOCKED, &dev->flags);
+ 			set_bit(R5_Wantwrite, &dev->flags);
+ 		}
++		if (WARN_ONCE(dev && !test_bit(R5_UPTODATE, &dev->flags),
++			      "%s: disk%td not up to date\n",
++			      mdname(conf->mddev),
++			      dev - (struct r5dev *) &sh->dev)) {
++			clear_bit(R5_LOCKED, &dev->flags);
++			clear_bit(R5_Wantwrite, &dev->flags);
++			s->locked--;
++		}
+ 		clear_bit(STRIPE_DEGRADED, &sh->state);
+ 
+ 		set_bit(STRIPE_INSYNC, &sh->state);
diff --git a/queue-5.0/revert-don-t-jump-to-compute_result-state-from-check_result-state.patch b/queue-5.0/revert-don-t-jump-to-compute_result-state-from-check_result-state.patch
new file mode 100644
index 00000000000..d1c0ed76216
--- /dev/null
+++ b/queue-5.0/revert-don-t-jump-to-compute_result-state-from-check_result-state.patch
@@ -0,0 +1,54 @@
+From a25d8c327bb41742dbd59f8c545f59f3b9c39983 Mon Sep 17 00:00:00 2001
+From: Song Liu <songliubraving@fb.com>
+Date: Tue, 16 Apr 2019 09:34:21 -0700
+Subject: Revert "Don't jump to compute_result state from check_result state"
+
+From: Song Liu <songliubraving@fb.com>
+
+commit a25d8c327bb41742dbd59f8c545f59f3b9c39983 upstream.
+
+This reverts commit 4f4fd7c5798bbdd5a03a60f6269cf1177fbd11ef.
+
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Nigel Croxon <ncroxon@redhat.com>
+Cc: Xiao Ni <xni@redhat.com>
+Signed-off-by: Song Liu <songliubraving@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/md/raid5.c |   19 +++++++++++++++----
+ 1 file changed, 15 insertions(+), 4 deletions(-)
+
+--- a/drivers/md/raid5.c
++++ b/drivers/md/raid5.c
+@@ -4233,15 +4233,26 @@ static void handle_parity_checks6(struct
+ 	case check_state_check_result:
+ 		sh->check_state = check_state_idle;
+ 
+-		if (s->failed > 1)
+-			break;
+ 		/* handle a successful check operation, if parity is correct
+ 		 * we are done.  Otherwise update the mismatch count and repair
+ 		 * parity if !MD_RECOVERY_CHECK
+ 		 */
+ 		if (sh->ops.zero_sum_result == 0) {
+-			/* Any parity checked was correct */
+-			set_bit(STRIPE_INSYNC, &sh->state);
++			/* both parities are correct */
++			if (!s->failed)
++				set_bit(STRIPE_INSYNC, &sh->state);
++			else {
++				/* in contrast to the raid5 case we can validate
++				 * parity, but still have a failure to write
++				 * back
++				 */
++				sh->check_state = check_state_compute_result;
++				/* Returning at this point means that we may go
++				 * off and bring p and/or q uptodate again so
++				 * we make sure to check zero_sum_result again
++				 * to verify if p or q need writeback
++				 */
++			}
+ 		} else {
+ 			atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches);
+ 			if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) {
diff --git a/queue-5.0/series b/queue-5.0/series
index d29fd0a1899..55aa2d5580d 100644
--- a/queue-5.0/series
+++ b/queue-5.0/series
@@ -131,3 +131,9 @@ i2c-designware-ratelimit-transfer-when-suspended-err.patch
 perf-bench-numa-add-define-for-rusage_thread-if-not-.patch
 perf-cs-etm-always-allocate-memory-for-cs_etm_queue-.patch
 perf-x86-intel-fix-race-in-intel_pmu_disable_event.patch
+revert-don-t-jump-to-compute_result-state-from-check_result-state.patch
+md-raid-raid5-preserve-the-writeback-action-after-the-parity-check.patch
+driver-core-postpone-dma-tear-down-until-after-devres-release-for-probe-failure.patch
+bpf-relax-inode-permission-check-for-retrieving-bpf-program.patch
+bpf-add-map_lookup_elem_sys_only-for-lookups-from-syscall-side.patch
+bpf-lru-avoid-messing-with-eviction-heuristics-upon-syscall-lookup.patch