Fixes for 5.6
author    Sasha Levin <sashal@kernel.org>
Mon, 25 May 2020 20:56:02 +0000 (16:56 -0400)
committer Sasha Levin <sashal@kernel.org>
Mon, 25 May 2020 20:56:02 +0000 (16:56 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
queue-5.6/bpf-prevent-mmap-ing-read-only-maps-as-writable.patch [new file with mode: 0644]
queue-5.6/rxrpc-fix-ack-discard.patch [new file with mode: 0644]
queue-5.6/rxrpc-trace-discarded-acks.patch [new file with mode: 0644]
queue-5.6/sched-fair-fix-enqueue_task_fair-warning-some-more.patch [new file with mode: 0644]
queue-5.6/sched-fair-fix-reordering-of-enqueue-dequeue_task_fa.patch [new file with mode: 0644]
queue-5.6/sched-fair-reorder-enqueue-dequeue_task_fair-path.patch [new file with mode: 0644]
queue-5.6/series

diff --git a/queue-5.6/bpf-prevent-mmap-ing-read-only-maps-as-writable.patch b/queue-5.6/bpf-prevent-mmap-ing-read-only-maps-as-writable.patch
new file mode 100644 (file)
index 0000000..5c95d7b
--- /dev/null
@@ -0,0 +1,112 @@
+From e2e3b3cb1be76755359bfd2ebfc6b2ec5e690910 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 18 May 2020 22:38:24 -0700
+Subject: bpf: Prevent mmap()'ing read-only maps as writable
+
+From: Andrii Nakryiko <andriin@fb.com>
+
+[ Upstream commit dfeb376dd4cb2c5004aeb625e2475f58a5ff2ea7 ]
+
+As discussed in [0], it's dangerous to allow mapping a BPF map that's meant
+to be frozen and is read-only on the BPF program side, because that allows
+user-space to retain a writable view of the page even after it is frozen.
+This is exacerbated by the BPF verifier making a strong assumption that the
+contents of such a frozen map will remain unchanged. To prevent this,
+disallow ever mapping BPF_F_RDONLY_PROG mmap()'able BPF maps as writable.
+
+  [0] https://lore.kernel.org/bpf/CAEf4BzYGWYhXdp6BJ7_=9OQPJxQpgug080MMjdSB72i9R+5c6g@mail.gmail.com/
+
+Fixes: fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY")
+Suggested-by: Jann Horn <jannh@google.com>
+Signed-off-by: Andrii Nakryiko <andriin@fb.com>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Reviewed-by: Jann Horn <jannh@google.com>
+Link: https://lore.kernel.org/bpf/20200519053824.1089415-1-andriin@fb.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/bpf/syscall.c                          | 17 ++++++++++++++---
+ tools/testing/selftests/bpf/prog_tests/mmap.c | 13 ++++++++++++-
+ tools/testing/selftests/bpf/progs/test_mmap.c |  8 ++++++++
+ 3 files changed, 34 insertions(+), 4 deletions(-)
+
+diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
+index e04ea4c8f935..c0ab9bfdf28a 100644
+--- a/kernel/bpf/syscall.c
++++ b/kernel/bpf/syscall.c
+@@ -629,9 +629,20 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
+       mutex_lock(&map->freeze_mutex);
+-      if ((vma->vm_flags & VM_WRITE) && map->frozen) {
+-              err = -EPERM;
+-              goto out;
++      if (vma->vm_flags & VM_WRITE) {
++              if (map->frozen) {
++                      err = -EPERM;
++                      goto out;
++              }
++              /* map is meant to be read-only, so do not allow mapping as
++               * writable, because it's possible to leak a writable page
++               * reference and allows user-space to still modify it after
++               * freezing, while verifier will assume contents do not change
++               */
++              if (map->map_flags & BPF_F_RDONLY_PROG) {
++                      err = -EACCES;
++                      goto out;
++              }
+       }
+       /* set default open/close callbacks */
+diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c
+index b0e789678aa4..5495b669fccc 100644
+--- a/tools/testing/selftests/bpf/prog_tests/mmap.c
++++ b/tools/testing/selftests/bpf/prog_tests/mmap.c
+@@ -19,7 +19,7 @@ void test_mmap(void)
+       const size_t map_sz = roundup_page(sizeof(struct map_data));
+       const int zero = 0, one = 1, two = 2, far = 1500;
+       const long page_size = sysconf(_SC_PAGE_SIZE);
+-      int err, duration = 0, i, data_map_fd;
++      int err, duration = 0, i, data_map_fd, rdmap_fd;
+       struct bpf_map *data_map, *bss_map;
+       void *bss_mmaped = NULL, *map_mmaped = NULL, *tmp1, *tmp2;
+       struct test_mmap__bss *bss_data;
+@@ -36,6 +36,17 @@ void test_mmap(void)
+       data_map = skel->maps.data_map;
+       data_map_fd = bpf_map__fd(data_map);
++      rdmap_fd = bpf_map__fd(skel->maps.rdonly_map);
++      tmp1 = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, rdmap_fd, 0);
++      if (CHECK(tmp1 != MAP_FAILED, "rdonly_write_mmap", "unexpected success\n")) {
++              munmap(tmp1, 4096);
++              goto cleanup;
++      }
++      /* now double-check if it's mmap()'able at all */
++      tmp1 = mmap(NULL, 4096, PROT_READ, MAP_SHARED, rdmap_fd, 0);
++      if (CHECK(tmp1 == MAP_FAILED, "rdonly_read_mmap", "failed: %d\n", errno))
++              goto cleanup;
++
+       bss_mmaped = mmap(NULL, bss_sz, PROT_READ | PROT_WRITE, MAP_SHARED,
+                         bpf_map__fd(bss_map), 0);
+       if (CHECK(bss_mmaped == MAP_FAILED, "bss_mmap",
+diff --git a/tools/testing/selftests/bpf/progs/test_mmap.c b/tools/testing/selftests/bpf/progs/test_mmap.c
+index 6239596cd14e..4eb42cff5fe9 100644
+--- a/tools/testing/selftests/bpf/progs/test_mmap.c
++++ b/tools/testing/selftests/bpf/progs/test_mmap.c
+@@ -7,6 +7,14 @@
+ char _license[] SEC("license") = "GPL";
++struct {
++      __uint(type, BPF_MAP_TYPE_ARRAY);
++      __uint(max_entries, 4096);
++      __uint(map_flags, BPF_F_MMAPABLE | BPF_F_RDONLY_PROG);
++      __type(key, __u32);
++      __type(value, char);
++} rdonly_map SEC(".maps");
++
+ struct {
+       __uint(type, BPF_MAP_TYPE_ARRAY);
+       __uint(max_entries, 512 * 4); /* at least 4 pages of data */
+-- 
+2.25.1
+
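To see the new behaviour from user space, here is a minimal, hypothetical demo (not part of the patch) that creates a BPF_F_MMAPABLE | BPF_F_RDONLY_PROG array map via the raw bpf(2) syscall and checks that a writable mapping is refused with EACCES while a read-only mapping still works. It assumes a kernel carrying this fix, kernel headers that define BPF_F_MMAPABLE (v5.5+), and root privileges:

	#define _GNU_SOURCE
	#include <errno.h>
	#include <linux/bpf.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		long page = sysconf(_SC_PAGE_SIZE);
		union bpf_attr attr;
		void *p;
		int fd;

		/* One page-sized value, mmap()'able but read-only for programs. */
		memset(&attr, 0, sizeof(attr));
		attr.map_type = BPF_MAP_TYPE_ARRAY;
		attr.key_size = sizeof(__u32);
		attr.value_size = page;
		attr.max_entries = 1;
		attr.map_flags = BPF_F_MMAPABLE | BPF_F_RDONLY_PROG;

		fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
		if (fd < 0) {
			perror("BPF_MAP_CREATE");
			return 1;
		}

		/* With the fix, a writable mapping must fail with EACCES... */
		p = mmap(NULL, page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)
			printf("rw mmap refused: %s (expect EACCES)\n", strerror(errno));
		else
			printf("rw mmap unexpectedly succeeded - kernel lacks the fix\n");

		/* ...while a read-only mapping is still allowed. */
		p = mmap(NULL, page, PROT_READ, MAP_SHARED, fd, 0);
		printf("ro mmap: %s\n", p == MAP_FAILED ? strerror(errno) : "ok");
		return 0;
	}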
diff --git a/queue-5.6/rxrpc-fix-ack-discard.patch b/queue-5.6/rxrpc-fix-ack-discard.patch
new file mode 100644 (file)
index 0000000..9908fc4
--- /dev/null
@@ -0,0 +1,157 @@
+From 4284959868e938be733171e662e05e05ef9d1f22 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 29 Apr 2020 23:48:43 +0100
+Subject: rxrpc: Fix ack discard
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 441fdee1eaf050ef0040bde0d7af075c1c6a6d8b ]
+
+The Rx protocol has a "previousPacket" field in it that is not handled in
+the same way by all protocol implementations.  Sometimes it contains the
+serial number of the last DATA packet received, sometimes the sequence
+number of the last DATA packet received and sometimes the highest sequence
+number so far received.
+
+AF_RXRPC is using this to weed out ACKs that are out of date (it's possible
+for ACK packets to get reordered on the wire), but this does not work with
+OpenAFS, which just sticks the sequence number of the last packet seen
+into previousPacket.
+
+The issue being seen is that big AFS FS.StoreData RPCs (e.g. of ~256MiB) are
+timing out when partly sent.  A trace was captured, with an additional
+tracepoint to show ACKs being discarded in rxrpc_input_ack().  Here's an
+excerpt showing the problem.
+
+ 52873.203230: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 0002449c q=00024499 fl=09
+
+A DATA packet with sequence number 00024499 has been transmitted (the "q="
+field).
+
+ ...
+ 52873.243296: rxrpc_rx_ack: c=000004ae 00012a2b DLY r=00024499 f=00024497 p=00024496 n=0
+ 52873.243376: rxrpc_rx_ack: c=000004ae 00012a2c IDL r=0002449b f=00024499 p=00024498 n=0
+ 52873.243383: rxrpc_rx_ack: c=000004ae 00012a2d OOS r=0002449d f=00024499 p=0002449a n=2
+
+The Out-Of-Sequence ACK indicates that the server didn't see DATA sequence
+number 00024499, but did see seq 0002449a (previousPacket, shown as "p=",
+skipped over the missing number, while firstPacket, "f=", shows that the
+bottom of the window is still set at that point).
+
+ 52873.252663: rxrpc_retransmit: c=000004ae q=24499 a=02 xp=14581537
+ 52873.252664: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 000244bc q=00024499 fl=0b *RETRANS*
+
+The packet has been retransmitted.  Retransmission recurs until the peer
+says it got the packet.
+
+ 52873.271013: rxrpc_rx_ack: c=000004ae 00012a31 OOS r=000244a1 f=00024499 p=0002449e n=6
+
+More OOS ACKs indicate that the other packets that are already in the
+transmission pipeline are being received.  The specific-ACK list is up to 6
+ACKs and NAKs.
+
+ ...
+ 52873.284792: rxrpc_rx_ack: c=000004ae 00012a49 OOS r=000244b9 f=00024499 p=000244b6 n=30
+ 52873.284802: rxrpc_retransmit: c=000004ae q=24499 a=0a xp=63505500
+ 52873.284804: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 000244c2 q=00024499 fl=0b *RETRANS*
+ 52873.287468: rxrpc_rx_ack: c=000004ae 00012a4a OOS r=000244ba f=00024499 p=000244b7 n=31
+ 52873.287478: rxrpc_rx_ack: c=000004ae 00012a4b OOS r=000244bb f=00024499 p=000244b8 n=32
+
+At this point, the server's receive window is full (n=32) with presumably 1
+NAK'd packet and 31 ACK'd packets.  We can't transmit any more packets.
+
+ 52873.287488: rxrpc_retransmit: c=000004ae q=24499 a=0a xp=61327980
+ 52873.287489: rxrpc_tx_data: c=000004ae DATA ed1a3584:00000002 000244c3 q=00024499 fl=0b *RETRANS*
+ 52873.293850: rxrpc_rx_ack: c=000004ae 00012a4c DLY r=000244bc f=000244a0 p=00024499 n=25
+
+And now we've received an ACK indicating that a DATA retransmission was
+received.  7 packets have been processed (the occupied part of the window
+moved, as indicated by f= and n=).
+
+ 52873.293853: rxrpc_rx_discard_ack: c=000004ae r=00012a4c 000244a0<00024499 00024499<000244b8
+
+However, the DLY ACK gets discarded because its previousPacket has gone
+backwards (from p=000244b8, in the ACK at 52873.287478 to p=00024499 in the
+ACK at 52873.293850).
+
+We then end up in a continuous cycle of retransmit/discard.  kafs fails to
+update its window because it's discarding the ACKs and can't transmit an
+extra packet that would clear the issue because the window is full.
+OpenAFS doesn't change the previousPacket value in the ACKs because no new
+DATA packets are received with a different previousPacket number.
+
+Fix this by altering the discard check to only discard an ACK based on
+previousPacket if there was no advance in the firstPacket.  This allows us
+to transmit a new packet which will cause previousPacket to advance in the
+next ACK.
+
+The check, however, needs to allow for the possibility that previousPacket
+may actually have had the serial number placed in it instead - in which
+case it will go outside the window and we should ignore it.
+
+Fixes: 1a2391c30c0b ("rxrpc: Fix detection of out of order acks")
+Reported-by: Dave Botsch <botsch@cnf.cornell.edu>
+Signed-off-by: David Howells <dhowells@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/rxrpc/input.c | 30 ++++++++++++++++++++++++++----
+ 1 file changed, 26 insertions(+), 4 deletions(-)
+
+diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
+index 2f22f082a66c..3be4177baf70 100644
+--- a/net/rxrpc/input.c
++++ b/net/rxrpc/input.c
+@@ -802,6 +802,30 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks,
+       }
+ }
++/*
++ * Return true if the ACK is valid - ie. it doesn't appear to have regressed
++ * with respect to the ack state conveyed by preceding ACKs.
++ */
++static bool rxrpc_is_ack_valid(struct rxrpc_call *call,
++                             rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt)
++{
++      rxrpc_seq_t base = READ_ONCE(call->ackr_first_seq);
++
++      if (after(first_pkt, base))
++              return true; /* The window advanced */
++
++      if (before(first_pkt, base))
++              return false; /* firstPacket regressed */
++
++      if (after_eq(prev_pkt, call->ackr_prev_seq))
++              return true; /* previousPacket hasn't regressed. */
++
++      /* Some rx implementations put a serial number in previousPacket. */
++      if (after_eq(prev_pkt, base + call->tx_winsize))
++              return false;
++      return true;
++}
++
+ /*
+  * Process an ACK packet.
+  *
+@@ -865,8 +889,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
+       }
+       /* Discard any out-of-order or duplicate ACKs (outside lock). */
+-      if (before(first_soft_ack, call->ackr_first_seq) ||
+-          before(prev_pkt, call->ackr_prev_seq)) {
++      if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) {
+               trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial,
+                                          first_soft_ack, call->ackr_first_seq,
+                                          prev_pkt, call->ackr_prev_seq);
+@@ -882,8 +905,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
+       spin_lock(&call->input_lock);
+       /* Discard any out-of-order or duplicate ACKs (inside lock). */
+-      if (before(first_soft_ack, call->ackr_first_seq) ||
+-          before(prev_pkt, call->ackr_prev_seq)) {
++      if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) {
+               trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial,
+                                          first_soft_ack, call->ackr_first_seq,
+                                          prev_pkt, call->ackr_prev_seq);
+-- 
+2.25.1
+
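The acceptance rule this patch adds can be mirrored in a small stand-alone sketch (hypothetical user-space code, not part of the patch; seq_before()/seq_after()/ack_is_valid() are illustrative names, and the Tx window size of 64 is chosen arbitrarily), using the same wrap-safe 32-bit comparisons as the kernel's before()/after() helpers:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	typedef uint32_t rxrpc_seq_t;

	/* Wrap-safe comparisons, like the kernel's before()/after() family. */
	static bool seq_before(rxrpc_seq_t a, rxrpc_seq_t b)   { return (int32_t)(a - b) < 0; }
	static bool seq_after(rxrpc_seq_t a, rxrpc_seq_t b)    { return (int32_t)(a - b) > 0; }
	static bool seq_after_eq(rxrpc_seq_t a, rxrpc_seq_t b) { return (int32_t)(a - b) >= 0; }

	/* Mirror of the new rxrpc_is_ack_valid(): accept on a firstPacket
	 * advance, reject on a firstPacket regression, and with firstPacket
	 * unchanged only reject a regressed previousPacket when it lies
	 * outside the Tx window (i.e. it looks like a serial number). */
	static bool ack_is_valid(rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt,
				 rxrpc_seq_t ackr_first_seq,
				 rxrpc_seq_t ackr_prev_seq,
				 unsigned int tx_winsize)
	{
		if (seq_after(first_pkt, ackr_first_seq))
			return true;              /* the window advanced */
		if (seq_before(first_pkt, ackr_first_seq))
			return false;             /* firstPacket regressed */
		if (seq_after_eq(prev_pkt, ackr_prev_seq))
			return true;              /* previousPacket didn't regress */
		if (seq_after_eq(prev_pkt, ackr_first_seq + tx_winsize))
			return false;             /* likely a serial number */
		return true;
	}

	int main(void)
	{
		/* The DLY ACK from the trace: f advanced 00024499 -> 000244a0
		 * while p regressed 000244b8 -> 00024499. */
		printf("DLY ack valid: %d\n",
		       ack_is_valid(0x000244a0, 0x00024499,
				    0x00024499, 0x000244b8, 64));
		return 0;
	}

Under the old check that DLY ACK was discarded because previousPacket regressed; the new rule accepts it because firstPacket advanced, which breaks the retransmit/discard cycle described above.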
diff --git a/queue-5.6/rxrpc-trace-discarded-acks.patch b/queue-5.6/rxrpc-trace-discarded-acks.patch
new file mode 100644 (file)
index 0000000..e8ffef2
--- /dev/null
@@ -0,0 +1,100 @@
+From 115f971a7f21063892a5a522c1ae637d3cee9b94 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Apr 2020 22:06:54 +0100
+Subject: rxrpc: Trace discarded ACKs
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit d1f129470e6cb79b8b97fecd12689f6eb49e27fe ]
+
+Add a tracepoint to track received ACKs that are discarded due to being
+outside of the Tx window.
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/trace/events/rxrpc.h | 35 +++++++++++++++++++++++++++++++++++
+ net/rxrpc/input.c            | 12 ++++++++++--
+ 2 files changed, 45 insertions(+), 2 deletions(-)
+
+diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
+index ab75f261f04a..ba9efdc848f9 100644
+--- a/include/trace/events/rxrpc.h
++++ b/include/trace/events/rxrpc.h
+@@ -1541,6 +1541,41 @@ TRACE_EVENT(rxrpc_notify_socket,
+                     __entry->serial)
+           );
++TRACE_EVENT(rxrpc_rx_discard_ack,
++          TP_PROTO(unsigned int debug_id, rxrpc_serial_t serial,
++                   rxrpc_seq_t first_soft_ack, rxrpc_seq_t call_ackr_first,
++                   rxrpc_seq_t prev_pkt, rxrpc_seq_t call_ackr_prev),
++
++          TP_ARGS(debug_id, serial, first_soft_ack, call_ackr_first,
++                  prev_pkt, call_ackr_prev),
++
++          TP_STRUCT__entry(
++                  __field(unsigned int,       debug_id        )
++                  __field(rxrpc_serial_t,     serial          )
++                  __field(rxrpc_seq_t,        first_soft_ack)
++                  __field(rxrpc_seq_t,        call_ackr_first)
++                  __field(rxrpc_seq_t,        prev_pkt)
++                  __field(rxrpc_seq_t,        call_ackr_prev)
++                           ),
++
++          TP_fast_assign(
++                  __entry->debug_id           = debug_id;
++                  __entry->serial             = serial;
++                  __entry->first_soft_ack     = first_soft_ack;
++                  __entry->call_ackr_first    = call_ackr_first;
++                  __entry->prev_pkt           = prev_pkt;
++                  __entry->call_ackr_prev     = call_ackr_prev;
++                         ),
++
++          TP_printk("c=%08x r=%08x %08x<%08x %08x<%08x",
++                    __entry->debug_id,
++                    __entry->serial,
++                    __entry->first_soft_ack,
++                    __entry->call_ackr_first,
++                    __entry->prev_pkt,
++                    __entry->call_ackr_prev)
++          );
++
+ #endif /* _TRACE_RXRPC_H */
+ /* This part must be outside protection */
+diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
+index e438bfd3fdf5..2f22f082a66c 100644
+--- a/net/rxrpc/input.c
++++ b/net/rxrpc/input.c
+@@ -866,8 +866,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
+       /* Discard any out-of-order or duplicate ACKs (outside lock). */
+       if (before(first_soft_ack, call->ackr_first_seq) ||
+-          before(prev_pkt, call->ackr_prev_seq))
++          before(prev_pkt, call->ackr_prev_seq)) {
++              trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial,
++                                         first_soft_ack, call->ackr_first_seq,
++                                         prev_pkt, call->ackr_prev_seq);
+               return;
++      }
+       buf.info.rxMTU = 0;
+       ioffset = offset + nr_acks + 3;
+@@ -879,8 +883,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
+       /* Discard any out-of-order or duplicate ACKs (inside lock). */
+       if (before(first_soft_ack, call->ackr_first_seq) ||
+-          before(prev_pkt, call->ackr_prev_seq))
++          before(prev_pkt, call->ackr_prev_seq)) {
++              trace_rxrpc_rx_discard_ack(call->debug_id, sp->hdr.serial,
++                                         first_soft_ack, call->ackr_first_seq,
++                                         prev_pkt, call->ackr_prev_seq);
+               goto out;
++      }
+       call->acks_latest_ts = skb->tstamp;
+       call->ackr_first_seq = first_soft_ack;
+-- 
+2.25.1
+
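As a usage sketch (not part of the patch, and assuming tracefs is mounted under debugfs at the usual path; newer kernels also expose it at /sys/kernel/tracing), the new event can be switched on from user space like any other tracepoint:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		const char *ctl = "/sys/kernel/debug/tracing/events/rxrpc/"
				  "rxrpc_rx_discard_ack/enable";
		int fd = open(ctl, O_WRONLY);

		if (fd < 0) {
			perror(ctl);
			return 1;
		}
		if (write(fd, "1", 1) != 1)
			perror("write");
		close(fd);

		/* Discarded ACKs then appear in .../tracing/trace in the
		 * TP_printk format from the patch:
		 *   c=%08x r=%08x %08x<%08x %08x<%08x
		 */
		return 0;
	}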
diff --git a/queue-5.6/sched-fair-fix-enqueue_task_fair-warning-some-more.patch b/queue-5.6/sched-fair-fix-enqueue_task_fair-warning-some-more.patch
new file mode 100644 (file)
index 0000000..9c5d3ac
--- /dev/null
@@ -0,0 +1,57 @@
+From 9b105fa10c3a2471228c157430934272b1598593 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 May 2020 09:52:22 -0400
+Subject: sched/fair: Fix enqueue_task_fair() warning some more
+
+From: Phil Auld <pauld@redhat.com>
+
+[ Upstream commit b34cb07dde7c2346dec73d053ce926aeaa087303 ]
+
+The recent patch fe61468b2cb ("sched/fair: Fix enqueue_task_fair warning")
+did not fully resolve the issues with the rq->tmp_alone_branch !=
+&rq->leaf_cfs_rq_list warning in enqueue_task_fair(). There is a case where
+the first for_each_sched_entity() loop exits early due to on_rq, having
+incompletely updated the list.  In this case the second
+for_each_sched_entity() loop can further modify se. The later code that
+fixes up the list management fails to do what is needed because se no
+longer points to the sched_entity which broke out of the first loop. The
+list is not fixed up because the throttled parent was already added back to
+the list by a task enqueue in a parallel child hierarchy.
+
+Address this by calling list_add_leaf_cfs_rq if there are throttled parents
+while doing the second for_each_sched_entity loop.
+
+Fixes: fe61468b2cb ("sched/fair: Fix enqueue_task_fair warning")
+Suggested-by: Vincent Guittot <vincent.guittot@linaro.org>
+Signed-off-by: Phil Auld <pauld@redhat.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
+Link: https://lkml.kernel.org/r/20200512135222.GC2201@lorien.usersys.redhat.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/fair.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index 7cd86641b44b..603d3d3cbf77 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -5298,6 +5298,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+               /* end evaluation on encountering a throttled cfs_rq */
+               if (cfs_rq_throttled(cfs_rq))
+                       goto enqueue_throttle;
++
++               /*
++                * One parent has been throttled and cfs_rq removed from the
++                * list. Add it back to not break the leaf list.
++                */
++               if (throttled_hierarchy(cfs_rq))
++                       list_add_leaf_cfs_rq(cfs_rq);
+       }
+ enqueue_throttle:
+-- 
+2.25.1
+
diff --git a/queue-5.6/sched-fair-fix-reordering-of-enqueue-dequeue_task_fa.patch b/queue-5.6/sched-fair-fix-reordering-of-enqueue-dequeue_task_fa.patch
new file mode 100644 (file)
index 0000000..9f8663d
--- /dev/null
@@ -0,0 +1,82 @@
+From 815a484c2fc6646d2bc77c2a3ea452d061786365 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 6 Mar 2020 09:42:08 +0100
+Subject: sched/fair: Fix reordering of enqueue/dequeue_task_fair()
+
+From: Vincent Guittot <vincent.guittot@linaro.org>
+
+[ Upstream commit 5ab297bab984310267734dfbcc8104566658ebef ]
+
+Even when a cgroup is throttled, the group se of a child cgroup can still
+be enqueued and its gse->on_rq stays true. When a task is enqueued on such
+a child, we still have to update the load_avg and increase the h_nr_running
+of the throttled cfs_rq. Nevertheless, the 1st for_each_sched_entity() loop
+is skipped because of gse->on_rq == true, and the 2nd loop because the
+cfs_rq is throttled, whereas in that case we have to both update load_avg
+with the old h_nr_running and increase h_nr_running.
+
+The same sequence can happen during dequeue, when se moves to its parent
+before breaking out of the 1st loop.
+
+Note that the update of load_avg will effectively happen only once, in
+order to sync up to the throttled time. The next call to update load_avg
+will stop early because the clock stays unchanged.
+
+Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Fixes: 6d4d22468dae ("sched/fair: Reorder enqueue/dequeue_task_fair path")
+Link: https://lkml.kernel.org/r/20200306084208.12583-1-vincent.guittot@linaro.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/fair.c | 17 +++++++++--------
+ 1 file changed, 9 insertions(+), 8 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index a486bf3d5078..7cd86641b44b 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -5289,15 +5289,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+-              /* end evaluation on encountering a throttled cfs_rq */
+-              if (cfs_rq_throttled(cfs_rq))
+-                      goto enqueue_throttle;
+-
+               update_load_avg(cfs_rq, se, UPDATE_TG);
+               update_cfs_group(se);
+               cfs_rq->h_nr_running++;
+               cfs_rq->idle_h_nr_running += idle_h_nr_running;
++
++              /* end evaluation on encountering a throttled cfs_rq */
++              if (cfs_rq_throttled(cfs_rq))
++                      goto enqueue_throttle;
+       }
+ enqueue_throttle:
+@@ -5386,15 +5386,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+-              /* end evaluation on encountering a throttled cfs_rq */
+-              if (cfs_rq_throttled(cfs_rq))
+-                      goto dequeue_throttle;
+-
+               update_load_avg(cfs_rq, se, UPDATE_TG);
+               update_cfs_group(se);
+               cfs_rq->h_nr_running--;
+               cfs_rq->idle_h_nr_running -= idle_h_nr_running;
++
++              /* end evaluation on encountering a throttled cfs_rq */
++              if (cfs_rq_throttled(cfs_rq))
++                      goto dequeue_throttle;
++
+       }
+ dequeue_throttle:
+-- 
+2.25.1
+
diff --git a/queue-5.6/sched-fair-reorder-enqueue-dequeue_task_fair-path.patch b/queue-5.6/sched-fair-reorder-enqueue-dequeue_task_fair-path.patch
new file mode 100644 (file)
index 0000000..ac50bcc
--- /dev/null
@@ -0,0 +1,137 @@
+From 131a2a3ed376bad4be1470ebbb1db717cd3a5ec2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 24 Feb 2020 09:52:14 +0000
+Subject: sched/fair: Reorder enqueue/dequeue_task_fair path
+
+From: Vincent Guittot <vincent.guittot@linaro.org>
+
+[ Upstream commit 6d4d22468dae3d8757af9f8b81b848a76ef4409d ]
+
+The walk through the cgroup hierarchy during the enqueue/dequeue of a task
+is split into 2 distinct parts for throttled cfs_rqs, without any added
+value but making the code less readable.
+
+Change the code ordering such that everything related to a cfs_rq
+(throttled or not) will be done in the same loop.
+
+In addition, the same ordering of steps is used when updating a cfs_rq:
+
+ - update_load_avg
+ - update_cfs_group
+ - update *h_nr_running
+
+This reordering enables the use of h_nr_running in the PELT algorithm.
+
+No functional or performance changes are expected, and none were noticed
+during tests.
+
+Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Reviewed-by: "Dietmar Eggemann <dietmar.eggemann@arm.com>"
+Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Cc: Juri Lelli <juri.lelli@redhat.com>
+Cc: Valentin Schneider <valentin.schneider@arm.com>
+Cc: Phil Auld <pauld@redhat.com>
+Cc: Hillf Danton <hdanton@sina.com>
+Link: https://lore.kernel.org/r/20200224095223.13361-5-mgorman@techsingularity.net
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/sched/fair.c | 42 ++++++++++++++++++++----------------------
+ 1 file changed, 20 insertions(+), 22 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index c76a20648b72..a486bf3d5078 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -5276,32 +5276,31 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+               cfs_rq = cfs_rq_of(se);
+               enqueue_entity(cfs_rq, se, flags);
+-              /*
+-               * end evaluation on encountering a throttled cfs_rq
+-               *
+-               * note: in the case of encountering a throttled cfs_rq we will
+-               * post the final h_nr_running increment below.
+-               */
+-              if (cfs_rq_throttled(cfs_rq))
+-                      break;
+               cfs_rq->h_nr_running++;
+               cfs_rq->idle_h_nr_running += idle_h_nr_running;
++              /* end evaluation on encountering a throttled cfs_rq */
++              if (cfs_rq_throttled(cfs_rq))
++                      goto enqueue_throttle;
++
+               flags = ENQUEUE_WAKEUP;
+       }
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+-              cfs_rq->h_nr_running++;
+-              cfs_rq->idle_h_nr_running += idle_h_nr_running;
++              /* end evaluation on encountering a throttled cfs_rq */
+               if (cfs_rq_throttled(cfs_rq))
+-                      break;
++                      goto enqueue_throttle;
+               update_load_avg(cfs_rq, se, UPDATE_TG);
+               update_cfs_group(se);
++
++              cfs_rq->h_nr_running++;
++              cfs_rq->idle_h_nr_running += idle_h_nr_running;
+       }
++enqueue_throttle:
+       if (!se) {
+               add_nr_running(rq, 1);
+               /*
+@@ -5362,17 +5361,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+               cfs_rq = cfs_rq_of(se);
+               dequeue_entity(cfs_rq, se, flags);
+-              /*
+-               * end evaluation on encountering a throttled cfs_rq
+-               *
+-               * note: in the case of encountering a throttled cfs_rq we will
+-               * post the final h_nr_running decrement below.
+-              */
+-              if (cfs_rq_throttled(cfs_rq))
+-                      break;
+               cfs_rq->h_nr_running--;
+               cfs_rq->idle_h_nr_running -= idle_h_nr_running;
++              /* end evaluation on encountering a throttled cfs_rq */
++              if (cfs_rq_throttled(cfs_rq))
++                      goto dequeue_throttle;
++
+               /* Don't dequeue parent if it has other entities besides us */
+               if (cfs_rq->load.weight) {
+                       /* Avoid re-evaluating load for this entity: */
+@@ -5390,16 +5385,19 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+       for_each_sched_entity(se) {
+               cfs_rq = cfs_rq_of(se);
+-              cfs_rq->h_nr_running--;
+-              cfs_rq->idle_h_nr_running -= idle_h_nr_running;
++              /* end evaluation on encountering a throttled cfs_rq */
+               if (cfs_rq_throttled(cfs_rq))
+-                      break;
++                      goto dequeue_throttle;
+               update_load_avg(cfs_rq, se, UPDATE_TG);
+               update_cfs_group(se);
++
++              cfs_rq->h_nr_running--;
++              cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+       }
++dequeue_throttle:
+       if (!se)
+               sub_nr_running(rq, 1);
+-- 
+2.25.1
+
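Taken together, the three sched/fair patches (6d4d22468dae, 5ab297bab984, b34cb07dde7c) leave enqueue_task_fair() bumping the hierarchical counts before bailing out on a throttled cfs_rq, and repairing the leaf list on the way up. The control flow can be modelled by a small compilable user-space sketch (not kernel code; struct grp, enqueue_task() and their fields are simplified stand-ins for the kernel's cfs_rq/sched_entity state):

	#include <stdbool.h>
	#include <stdio.h>

	struct grp {
		struct grp *parent;    /* enclosing task group, NULL at the root */
		bool on_rq;            /* group entity already enqueued */
		bool throttled;        /* cfs_rq_throttled() stand-in */
		bool on_leaf_list;     /* leaf_cfs_rq_list membership */
		int h_nr_running;      /* hierarchical runnable count */
	};

	static bool throttled_hierarchy(struct grp *g)
	{
		for (; g; g = g->parent)
			if (g->throttled)
				return true;
		return false;
	}

	static void enqueue_task(struct grp *g)
	{
		/* 1st loop: entities not yet on the runqueue.  The counts are
		 * bumped *before* the throttle check (6d4d22468dae +
		 * 5ab297bab984), so a throttled cfs_rq still sees the task. */
		for (; g && !g->on_rq; g = g->parent) {
			g->on_rq = g->on_leaf_list = true;  /* enqueue_entity() */
			g->h_nr_running++;
			if (g->throttled)
				goto enqueue_throttle;
		}

		/* 2nd loop: ancestors that were already enqueued. */
		for (; g; g = g->parent) {
			/* update_load_avg()/update_cfs_group() run here */
			g->h_nr_running++;
			if (g->throttled)
				goto enqueue_throttle;
			/* A throttled parent may have removed this cfs_rq from
			 * the leaf list; put it back (b34cb07dde7c). */
			if (throttled_hierarchy(g))
				g->on_leaf_list = true;  /* list_add_leaf_cfs_rq() */
		}
	enqueue_throttle:
		;                                    /* add_nr_running() etc. */
	}

	int main(void)
	{
		struct grp root = { 0 };
		struct grp parent = { .parent = &root, .on_rq = true,
				      .throttled = true };
		struct grp child = { .parent = &parent };

		enqueue_task(&child);
		printf("child=%d parent=%d root=%d child_on_leaf_list=%d\n",
		       child.h_nr_running, parent.h_nr_running,
		       root.h_nr_running, child.on_leaf_list);
		return 0;
	}

Running it with a throttled middle group prints child=1 parent=1 root=0: the walk stops at the throttled cfs_rq after the counts below it were updated, while the root's count stays untouched.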
diff --git a/queue-5.6/series b/queue-5.6/series
index c6cd57006390273ae71bb09967ecfd9906ed6de6..c4b36155cd1eb6154ba6b26e59904e98bf91c28e 100644 (file)
--- a/queue-5.6/series
+++ b/queue-5.6/series
@@ -118,3 +118,9 @@ tpm-check-event-log-version-before-reading-final-events.patch
 s390-kexec_file-fix-initrd-location-for-kdump-kernel.patch
 flow_dissector-drop-bpf-flow-dissector-prog-ref-on-netns-cleanup.patch
 x86-unwind-orc-fix-unwind_get_return_address_ptr-for-inactive-tasks.patch
+rxrpc-trace-discarded-acks.patch
+rxrpc-fix-ack-discard.patch
+bpf-prevent-mmap-ing-read-only-maps-as-writable.patch
+sched-fair-reorder-enqueue-dequeue_task_fair-path.patch
+sched-fair-fix-reordering-of-enqueue-dequeue_task_fa.patch
+sched-fair-fix-enqueue_task_fair-warning-some-more.patch