--- /dev/null
+From foo@baz Thu Aug 25 02:12:36 PM CEST 2022
+From: Ankit Jain <ankitja@vmware.com>
+Date: Mon, 22 Aug 2022 13:09:41 +0530
+Subject: sched/deadline: Fix priority inheritance with multiple scheduling classes
+To: juri.lelli@redhat.com, bristot@redhat.com, l.stach@pengutronix.de, suhui_kernel@163.com, msimmons@redhat.com, peterz@infradead.org, glenn@aurora.tech, stable@vger.kernel.org, linux-kernel@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: srivatsab@vmware.com, srivatsa@csail.mit.edu, akaher@vmware.com, amakhalov@vmware.com, vsirnapalli@vmware.com, sturlapati@vmware.com, bordoloih@vmware.com, keerthanak@vmware.com, Ankit Jain <ankitja@vmware.com>
+Message-ID: <20220822073942.218045-4-ankitja@vmware.com>
+
+From: Juri Lelli <juri.lelli@redhat.com>
+
+commit 2279f540ea7d05f22d2f0c4224319330228586bc upstream.
+
+Glenn reported that "an application [he developed] produces a BUG in
+deadline.c when a SCHED_DEADLINE task contends with CFS tasks on nested
+PTHREAD_PRIO_INHERIT mutexes. I believe the bug is triggered when a CFS
+task that was boosted by a SCHED_DEADLINE task boosts another CFS task
+(nested priority inheritance)."
+
+ ------------[ cut here ]------------
+ kernel BUG at kernel/sched/deadline.c:1462!
+ invalid opcode: 0000 [#1] PREEMPT SMP
+ CPU: 12 PID: 19171 Comm: dl_boost_bug Tainted: ...
+ Hardware name: ...
+ RIP: 0010:enqueue_task_dl+0x335/0x910
+ Code: ...
+ RSP: 0018:ffffc9000c2bbc68 EFLAGS: 00010002
+ RAX: 0000000000000009 RBX: ffff888c0af94c00 RCX: ffffffff81e12500
+ RDX: 000000000000002e RSI: ffff888c0af94c00 RDI: ffff888c10b22600
+ RBP: ffffc9000c2bbd08 R08: 0000000000000009 R09: 0000000000000078
+ R10: ffffffff81e12440 R11: ffffffff81e1236c R12: ffff888bc8932600
+ R13: ffff888c0af94eb8 R14: ffff888c10b22600 R15: ffff888bc8932600
+ FS: 00007fa58ac55700(0000) GS:ffff888c10b00000(0000) knlGS:0000000000000000
+ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 00007fa58b523230 CR3: 0000000bf44ab003 CR4: 00000000007606e0
+ DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+ DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+ PKRU: 55555554
+ Call Trace:
+ ? intel_pstate_update_util_hwp+0x13/0x170
+ rt_mutex_setprio+0x1cc/0x4b0
+ task_blocks_on_rt_mutex+0x225/0x260
+ rt_spin_lock_slowlock_locked+0xab/0x2d0
+ rt_spin_lock_slowlock+0x50/0x80
+ hrtimer_grab_expiry_lock+0x20/0x30
+ hrtimer_cancel+0x13/0x30
+ do_nanosleep+0xa0/0x150
+ hrtimer_nanosleep+0xe1/0x230
+ ? __hrtimer_init_sleeper+0x60/0x60
+ __x64_sys_nanosleep+0x8d/0xa0
+ do_syscall_64+0x4a/0x100
+ entry_SYSCALL_64_after_hwframe+0x49/0xbe
+ RIP: 0033:0x7fa58b52330d
+ ...
+ ---[ end trace 0000000000000002 ]---
+
+He also provided a simple reproducer creating the situation below:
+
+ So the execution order of locking steps is the following
+ (N1 and N2 are non-deadline tasks. D1 is a deadline task. M1 and M2
+ are mutexes that are enabled with priority inheritance.)
+
+ Time moves forward as this timeline goes down:
+
+ N1 N2 D1
+ | | |
+ | | |
+ Lock(M1) | |
+ | | |
+ | Lock(M2) |
+ | | |
+ | | Lock(M2)
+ | | |
+ | Lock(M1) |
+ | (!!bug triggered!) |
+
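+As a rough illustration (this is not Glenn's actual reproducer, just a
+hypothetical sketch condensed from the timeline above; the function
+names are made up), the three tasks and the two PTHREAD_PRIO_INHERIT
+mutexes can be set up like this:
+
+    #include <pthread.h>
+
+    static pthread_mutex_t m1, m2;      /* M1, M2: PI-enabled mutexes */
+
+    static void init_pi_mutex(pthread_mutex_t *m)
+    {
+            pthread_mutexattr_t attr;
+
+            pthread_mutexattr_init(&attr);
+            pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
+            pthread_mutex_init(m, &attr);
+            pthread_mutexattr_destroy(&attr);
+    }
+
+    static void *n1_fn(void *arg)       /* N1: plain CFS task */
+    {
+            pthread_mutex_lock(&m1);    /* step 1: N1 owns M1 */
+            /* ... keep holding M1 so the chain below can build up ... */
+            return NULL;
+    }
+
+    static void *n2_fn(void *arg)       /* N2: plain CFS task */
+    {
+            pthread_mutex_lock(&m2);    /* step 2: N2 owns M2 */
+            /*
+             * step 4: N2, already boosted by D1 through M2, blocks on
+             * M1 and in turn tries to boost N1 with static DEADLINE
+             * parameters N2 never had itself -> BUG in
+             * replenish_dl_entity().
+             */
+            pthread_mutex_lock(&m1);
+            return NULL;
+    }
+
+    static void *d1_fn(void *arg)       /* D1: SCHED_DEADLINE task */
+    {
+            pthread_mutex_lock(&m2);    /* step 3: D1 blocks, boosting N2 */
+            return NULL;
+    }
+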
+Daniel reported a similar situation as well, by just letting ksoftirqd
+run with DEADLINE (and eventually block on a mutex).
+
+The problem is that boosted entities (Priority Inheritance) use the
+static DEADLINE parameters of the top priority waiter. However, the top
+waiter could be a non-DEADLINE entity that is currently boosted by a
+DEADLINE entity from a different lock chain (i.e., nested priority
+chains involving entities of non-DEADLINE classes). In this case, the
+top waiter's static DEADLINE parameters are null (initialized to 0 at
+fork()) and replenish_dl_entity() hits a BUG().
+
+Fix this by keeping track of the original donor and using its parameters
+when a task is boosted.
+
+Reported-by: Glenn Elliott <glenn@aurora.tech>
+Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
+Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
+Link: https://lkml.kernel.org/r/20201117061432.517340-1-juri.lelli@redhat.com
+[Ankit: Regenerated the patch for v5.4.y]
+Signed-off-by: Ankit Jain <ankitja@vmware.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/sched.h | 10 ++++
+ kernel/sched/core.c | 11 ++---
+ kernel/sched/deadline.c | 97 ++++++++++++++++++++++++++----------------------
+ 3 files changed, 68 insertions(+), 50 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -544,7 +544,6 @@ struct sched_dl_entity {
+ * overruns.
+ */
+ unsigned int dl_throttled : 1;
+- unsigned int dl_boosted : 1;
+ unsigned int dl_yielded : 1;
+ unsigned int dl_non_contending : 1;
+ unsigned int dl_overrun : 1;
+@@ -563,6 +562,15 @@ struct sched_dl_entity {
+ * time.
+ */
+ struct hrtimer inactive_timer;
++
++#ifdef CONFIG_RT_MUTEXES
++ /*
++ * Priority Inheritance. When a DEADLINE scheduling entity is boosted
++ * pi_se points to the donor, otherwise points to the dl_se it belongs
++ * to (the original one/itself).
++ */
++ struct sched_dl_entity *pi_se;
++#endif
+ };
+
+ #ifdef CONFIG_UCLAMP_TASK
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -4554,20 +4554,21 @@ void rt_mutex_setprio(struct task_struct
+ if (!dl_prio(p->normal_prio) ||
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
+- p->dl.dl_boosted = 1;
++ p->dl.pi_se = pi_task->dl.pi_se;
+ queue_flag |= ENQUEUE_REPLENISH;
+- } else
+- p->dl.dl_boosted = 0;
++ } else {
++ p->dl.pi_se = &p->dl;
++ }
+ p->sched_class = &dl_sched_class;
+ } else if (rt_prio(prio)) {
+ if (dl_prio(oldprio))
+- p->dl.dl_boosted = 0;
++ p->dl.pi_se = &p->dl;
+ if (oldprio < prio)
+ queue_flag |= ENQUEUE_HEAD;
+ p->sched_class = &rt_sched_class;
+ } else {
+ if (dl_prio(oldprio))
+- p->dl.dl_boosted = 0;
++ p->dl.pi_se = &p->dl;
+ if (rt_prio(oldprio))
+ p->rt.timeout = 0;
+ p->sched_class = &fair_sched_class;
+--- a/kernel/sched/deadline.c
++++ b/kernel/sched/deadline.c
+@@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_
+ return !RB_EMPTY_NODE(&dl_se->rb_node);
+ }
+
++#ifdef CONFIG_RT_MUTEXES
++static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
++{
++ return dl_se->pi_se;
++}
++
++static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
++{
++ return pi_of(dl_se) != dl_se;
++}
++#else
++static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
++{
++ return dl_se;
++}
++
++static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
++{
++ return false;
++}
++#endif
++
+ #ifdef CONFIG_SMP
+ static inline struct dl_bw *dl_bw_of(int i)
+ {
+@@ -657,7 +679,7 @@ static inline void setup_new_dl_entity(s
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+- WARN_ON(dl_se->dl_boosted);
++ WARN_ON(is_dl_boosted(dl_se));
+ WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
+
+ /*
+@@ -695,21 +717,20 @@ static inline void setup_new_dl_entity(s
+ * could happen are, typically, a entity voluntarily trying to overcome its
+ * runtime, or it just underestimated it during sched_setattr().
+ */
+-static void replenish_dl_entity(struct sched_dl_entity *dl_se,
+- struct sched_dl_entity *pi_se)
++static void replenish_dl_entity(struct sched_dl_entity *dl_se)
+ {
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+- BUG_ON(pi_se->dl_runtime <= 0);
++ BUG_ON(pi_of(dl_se)->dl_runtime <= 0);
+
+ /*
+ * This could be the case for a !-dl task that is boosted.
+ * Just go with full inherited parameters.
+ */
+ if (dl_se->dl_deadline == 0) {
+- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+- dl_se->runtime = pi_se->dl_runtime;
++ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
++ dl_se->runtime = pi_of(dl_se)->dl_runtime;
+ }
+
+ if (dl_se->dl_yielded && dl_se->runtime > 0)
+@@ -722,8 +743,8 @@ static void replenish_dl_entity(struct s
+ * arbitrary large.
+ */
+ while (dl_se->runtime <= 0) {
+- dl_se->deadline += pi_se->dl_period;
+- dl_se->runtime += pi_se->dl_runtime;
++ dl_se->deadline += pi_of(dl_se)->dl_period;
++ dl_se->runtime += pi_of(dl_se)->dl_runtime;
+ }
+
+ /*
+@@ -737,8 +758,8 @@ static void replenish_dl_entity(struct s
+ */
+ if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
+ printk_deferred_once("sched: DL replenish lagged too much\n");
+- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+- dl_se->runtime = pi_se->dl_runtime;
++ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
++ dl_se->runtime = pi_of(dl_se)->dl_runtime;
+ }
+
+ if (dl_se->dl_yielded)
+@@ -771,8 +792,7 @@ static void replenish_dl_entity(struct s
+ * task with deadline equal to period this is the same of using
+ * dl_period instead of dl_deadline in the equation above.
+ */
+-static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
+- struct sched_dl_entity *pi_se, u64 t)
++static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)
+ {
+ u64 left, right;
+
+@@ -794,9 +814,9 @@ static bool dl_entity_overflow(struct sc
+ * of anything below microseconds resolution is actually fiction
+ * (but still we want to give the user that illusion >;).
+ */
+- left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
++ left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ right = ((dl_se->deadline - t) >> DL_SCALE) *
+- (pi_se->dl_runtime >> DL_SCALE);
++ (pi_of(dl_se)->dl_runtime >> DL_SCALE);
+
+ return dl_time_before(right, left);
+ }
+@@ -881,24 +901,23 @@ static inline bool dl_is_implicit(struct
+ * Please refer to the comments update_dl_revised_wakeup() function to find
+ * more about the Revised CBS rule.
+ */
+-static void update_dl_entity(struct sched_dl_entity *dl_se,
+- struct sched_dl_entity *pi_se)
++static void update_dl_entity(struct sched_dl_entity *dl_se)
+ {
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
+
+ if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
+- dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
++ dl_entity_overflow(dl_se, rq_clock(rq))) {
+
+ if (unlikely(!dl_is_implicit(dl_se) &&
+ !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+- !dl_se->dl_boosted)){
++ !is_dl_boosted(dl_se))) {
+ update_dl_revised_wakeup(dl_se, rq);
+ return;
+ }
+
+- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
+- dl_se->runtime = pi_se->dl_runtime;
++ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
++ dl_se->runtime = pi_of(dl_se)->dl_runtime;
+ }
+ }
+
+@@ -997,7 +1016,7 @@ static enum hrtimer_restart dl_task_time
+ * The task might have been boosted by someone else and might be in the
+ * boosting/deboosting path, its not throttled.
+ */
+- if (dl_se->dl_boosted)
++ if (is_dl_boosted(dl_se))
+ goto unlock;
+
+ /*
+@@ -1025,7 +1044,7 @@ static enum hrtimer_restart dl_task_time
+ * but do not enqueue -- wait for our wakeup to do that.
+ */
+ if (!task_on_rq_queued(p)) {
+- replenish_dl_entity(dl_se, dl_se);
++ replenish_dl_entity(dl_se);
+ goto unlock;
+ }
+
+@@ -1115,7 +1134,7 @@ static inline void dl_check_constrained_
+
+ if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
+- if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
++ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
+ return;
+ dl_se->dl_throttled = 1;
+ if (dl_se->runtime > 0)
+@@ -1246,7 +1265,7 @@ throttle:
+ dl_se->dl_overrun = 1;
+
+ __dequeue_task_dl(rq, curr, 0);
+- if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
++ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
+ enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
+
+ if (!is_leftmost(curr, &rq->dl))
+@@ -1440,8 +1459,7 @@ static void __dequeue_dl_entity(struct s
+ }
+
+ static void
+-enqueue_dl_entity(struct sched_dl_entity *dl_se,
+- struct sched_dl_entity *pi_se, int flags)
++enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
+ {
+ BUG_ON(on_dl_rq(dl_se));
+
+@@ -1452,9 +1470,9 @@ enqueue_dl_entity(struct sched_dl_entity
+ */
+ if (flags & ENQUEUE_WAKEUP) {
+ task_contending(dl_se, flags);
+- update_dl_entity(dl_se, pi_se);
++ update_dl_entity(dl_se);
+ } else if (flags & ENQUEUE_REPLENISH) {
+- replenish_dl_entity(dl_se, pi_se);
++ replenish_dl_entity(dl_se);
+ } else if ((flags & ENQUEUE_RESTORE) &&
+ dl_time_before(dl_se->deadline,
+ rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+@@ -1471,19 +1489,7 @@ static void dequeue_dl_entity(struct sch
+
+ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
+ {
+- struct task_struct *pi_task = rt_mutex_get_top_task(p);
+- struct sched_dl_entity *pi_se = &p->dl;
+-
+- /*
+- * Use the scheduling parameters of the top pi-waiter task if:
+- * - we have a top pi-waiter which is a SCHED_DEADLINE task AND
+- * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
+- * smaller than our deadline OR we are a !SCHED_DEADLINE task getting
+- * boosted due to a SCHED_DEADLINE pi-waiter).
+- * Otherwise we keep our runtime and deadline.
+- */
+- if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
+- pi_se = &pi_task->dl;
++ if (is_dl_boosted(&p->dl)) {
+ /*
+ * Because of delays in the detection of the overrun of a
+ * thread's runtime, it might be the case that a thread
+@@ -1516,7 +1522,7 @@ static void enqueue_task_dl(struct rq *r
+ * the throttle.
+ */
+ p->dl.dl_throttled = 0;
+- BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
++ BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH);
+ return;
+ }
+
+@@ -1553,7 +1559,7 @@ static void enqueue_task_dl(struct rq *r
+ return;
+ }
+
+- enqueue_dl_entity(&p->dl, pi_se, flags);
++ enqueue_dl_entity(&p->dl, flags);
+
+ if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
+ enqueue_pushable_dl_task(rq, p);
+@@ -2722,11 +2728,14 @@ void __dl_clear_params(struct task_struc
+ dl_se->dl_bw = 0;
+ dl_se->dl_density = 0;
+
+- dl_se->dl_boosted = 0;
+ dl_se->dl_throttled = 0;
+ dl_se->dl_yielded = 0;
+ dl_se->dl_non_contending = 0;
+ dl_se->dl_overrun = 0;
++
++#ifdef CONFIG_RT_MUTEXES
++ dl_se->pi_se = dl_se;
++#endif
+ }
+
+ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
--- /dev/null
+From foo@baz Thu Aug 25 02:12:36 PM CEST 2022
+From: Ankit Jain <ankitja@vmware.com>
+Date: Mon, 22 Aug 2022 13:09:40 +0530
+Subject: sched/deadline: Fix stale throttling on de-/boosted tasks
+To: juri.lelli@redhat.com, bristot@redhat.com, l.stach@pengutronix.de, suhui_kernel@163.com, msimmons@redhat.com, peterz@infradead.org, glenn@aurora.tech, stable@vger.kernel.org, linux-kernel@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: srivatsab@vmware.com, srivatsa@csail.mit.edu, akaher@vmware.com, amakhalov@vmware.com, vsirnapalli@vmware.com, sturlapati@vmware.com, bordoloih@vmware.com, keerthanak@vmware.com, Ankit Jain <ankitja@vmware.com>
+Message-ID: <20220822073942.218045-3-ankitja@vmware.com>
+
+From: Lucas Stach <l.stach@pengutronix.de>
+
+commit 46fcc4b00c3cca8adb9b7c9afdd499f64e427135 upstream.
+
+When a boosted task gets throttled, what normally happens is that it's
+immediately enqueued again with ENQUEUE_REPLENISH, which replenishes the
+runtime and clears the dl_throttled flag. There is a special case, however:
+if the throttling happened on sched-out and the task has been deboosted in
+the meantime, the replenish is skipped as the task will return to its
+normal scheduling class. This leaves the task with the dl_throttled flag
+set.
+
+Now if the task gets boosted up to the deadline scheduling class again
+while it is sleeping, it's still in the throttled state. The normal wakeup
+however will enqueue the task with ENQUEUE_REPLENISH not set, so we don't
+actually place it on the rq. Thus we end up with a task that is runnable,
+but not actually on the rq, and neither an immediate replenishment happens,
+nor is the replenishment timer set up, so the task is stuck in
+forever-throttled limbo.
+
+Clear the dl_throttled flag before dropping back to the normal scheduling
+class to fix this issue.
+
+Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Acked-by: Juri Lelli <juri.lelli@redhat.com>
+Link: https://lkml.kernel.org/r/20200831110719.2126930-1-l.stach@pengutronix.de
+[Ankit: Regenerated the patch for v5.4.y]
+Signed-off-by: Ankit Jain <ankitja@vmware.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/deadline.c | 13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+--- a/kernel/sched/deadline.c
++++ b/kernel/sched/deadline.c
+@@ -1507,12 +1507,15 @@ static void enqueue_task_dl(struct rq *r
+ }
+ } else if (!dl_prio(p->normal_prio)) {
+ /*
+- * Special case in which we have a !SCHED_DEADLINE task
+- * that is going to be deboosted, but exceeds its
+- * runtime while doing so. No point in replenishing
+- * it, as it's going to return back to its original
+- * scheduling class after this.
++ * Special case in which we have a !SCHED_DEADLINE task that is going
++ * to be deboosted, but exceeds its runtime while doing so. No point in
++ * replenishing it, as it's going to return back to its original
++ * scheduling class after this. If it has been throttled, we need to
++ * clear the flag, otherwise the task may wake up as throttled after
++ * being boosted again with no means to replenish the runtime and clear
++ * the throttle.
+ */
++ p->dl.dl_throttled = 0;
+ BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+ return;
+ }
--- /dev/null
+From foo@baz Thu Aug 25 02:12:36 PM CEST 2022
+From: Ankit Jain <ankitja@vmware.com>
+Date: Mon, 22 Aug 2022 13:09:39 +0530
+Subject: sched/deadline: Unthrottle PI boosted threads while enqueuing
+To: juri.lelli@redhat.com, bristot@redhat.com, l.stach@pengutronix.de, suhui_kernel@163.com, msimmons@redhat.com, peterz@infradead.org, glenn@aurora.tech, stable@vger.kernel.org, linux-kernel@vger.kernel.org, gregkh@linuxfoundation.org
+Cc: srivatsab@vmware.com, srivatsa@csail.mit.edu, akaher@vmware.com, amakhalov@vmware.com, vsirnapalli@vmware.com, sturlapati@vmware.com, bordoloih@vmware.com, keerthanak@vmware.com, Ankit Jain <ankitja@vmware.com>
+Message-ID: <20220822073942.218045-2-ankitja@vmware.com>
+
+From: Daniel Bristot de Oliveira <bristot@redhat.com>
+
+commit feff2e65efd8d84cf831668e182b2ce73c604bbb upstream.
+
+stress-ng has a test (stress-ng --cyclic) that creates a set of threads
+under SCHED_DEADLINE with the following parameters:
+
+ dl_runtime = 10000 (10 us)
+ dl_deadline = 100000 (100 us)
+ dl_period = 100000 (100 us)
+
+These parameters are very aggressive. When using a system without HRTICK
+set, these threads can easily execute longer than the dl_runtime because
+the throttling happens with 1/HZ resolution.
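+
+As a hedged sketch (not the stress-ng source; the struct layout below is
+a local copy of the uapi definition, since glibc provides no wrapper or
+type for this syscall), a thread can request exactly these parameters
+through sched_setattr():
+
+    #include <stdint.h>
+    #include <string.h>
+    #include <sys/syscall.h>
+    #include <unistd.h>
+
+    #ifndef SCHED_DEADLINE
+    #define SCHED_DEADLINE 6
+    #endif
+
+    struct sched_attr {         /* as in include/uapi/linux/sched/types.h */
+            uint32_t size;
+            uint32_t sched_policy;
+            uint64_t sched_flags;
+            int32_t  sched_nice;
+            uint32_t sched_priority;
+            uint64_t sched_runtime;
+            uint64_t sched_deadline;
+            uint64_t sched_period;
+    };
+
+    static int become_deadline(void)
+    {
+            struct sched_attr attr;
+
+            memset(&attr, 0, sizeof(attr));
+            attr.size           = sizeof(attr);
+            attr.sched_policy   = SCHED_DEADLINE;
+            attr.sched_runtime  = 10000;        /* 10 us, in ns  */
+            attr.sched_deadline = 100000;       /* 100 us, in ns */
+            attr.sched_period   = 100000;       /* 100 us, in ns */
+
+            /* pid 0 == calling thread; no glibc wrapper exists. */
+            return syscall(SYS_sched_setattr, 0, &attr, 0);
+    }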
+
+During the main part of the test, the system works just fine because
+the workload does not try to run over the 10 us. The problem happens at
+the end of the test, on the exit() path. During exit(), the threads need
+to do some cleanups that require real-time mutex locks, mainly those
+related to memory management, resulting in this scenario:
+
+Note: locks are rt_mutexes...
+ ------------------------------------------------------------------------
+ TASK A: TASK B: TASK C:
+ activation
+ activation
+ activation
+
+ lock(a): OK! lock(b): OK!
+ <overrun runtime>
+ lock(a)
+ -> block (task A owns it)
+ -> self notice/set throttled
+ +--< -> arm replenished timer
+ | switch-out
+ | lock(b)
+ | -> <C prio > B prio>
+ | -> boost TASK B
+ | unlock(a) switch-out
+ | -> hand lock a to B
+ | -> wakeup(B)
+ | -> B is throttled:
+ | -> do not enqueue
+ | switch-out
+ |
+ |
+ +---------------------> replenishment timer
+ -> TASK B is boosted:
+ -> do not enqueue
+ ------------------------------------------------------------------------
+
+BOOM: TASK B is runnable but !enqueued, while holding the lock that
+TASK C is waiting for: the system crashes with a hung task C.
+
+This problem is avoided by removing the throttle state from the boosted
+thread while boosting it (by TASK A in the example above), allowing it to
+be queued and run boosted.
+
+The next replenishment will take care of the runtime overrun, pushing
+the deadline further away. See the "while (dl_se->runtime <= 0)" loop
+in replenish_dl_entity() for more information.
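+
+For reference, that loop (as it appears after the donor-tracking change
+earlier in this series) replenishes by the donor's parameters until the
+runtime turns positive:
+
+    while (dl_se->runtime <= 0) {
+            dl_se->deadline += pi_of(dl_se)->dl_period;
+            dl_se->runtime += pi_of(dl_se)->dl_runtime;
+    }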
+
+Reported-by: Mark Simmons <msimmons@redhat.com>
+Signed-off-by: Daniel Bristot de Oliveira <bristot@redhat.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Juri Lelli <juri.lelli@redhat.com>
+Tested-by: Mark Simmons <msimmons@redhat.com>
+Link: https://lkml.kernel.org/r/5076e003450835ec74e6fa5917d02c4fa41687e6.1600170294.git.bristot@redhat.com
+[Ankit: Regenerated the patch for v5.4.y]
+Signed-off-by: Ankit Jain <ankitja@vmware.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/deadline.c | 21 +++++++++++++++++++++
+ 1 file changed, 21 insertions(+)
+
+--- a/kernel/sched/deadline.c
++++ b/kernel/sched/deadline.c
+@@ -1484,6 +1484,27 @@ static void enqueue_task_dl(struct rq *r
+ */
+ if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
+ pi_se = &pi_task->dl;
++ /*
++ * Because of delays in the detection of the overrun of a
++ * thread's runtime, it might be the case that a thread
++ * goes to sleep in a rt mutex with negative runtime. As
++ * a consequence, the thread will be throttled.
++ *
++ * While waiting for the mutex, this thread can also be
++ * boosted via PI, resulting in a thread that is throttled
++ * and boosted at the same time.
++ *
++ * In this case, the boost overrides the throttle.
++ */
++ if (p->dl.dl_throttled) {
++ /*
++ * The replenish timer needs to be canceled. No
++ * problem if it fires concurrently: boosted threads
++ * are ignored in dl_task_timer().
++ */
++ hrtimer_try_to_cancel(&p->dl.dl_timer);
++ p->dl.dl_throttled = 0;
++ }
+ } else if (!dl_prio(p->normal_prio)) {
+ /*
+ * Special case in which we have a !SCHED_DEADLINE task