From: Greg Kroah-Hartman Date: Mon, 7 Nov 2016 16:41:26 +0000 (+0100) Subject: 4.8-stable patches X-Git-Tag: v4.4.31~24 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=124f251da641ee20449d12aff220235cebf33748;p=thirdparty%2Fkernel%2Fstable-queue.git 4.8-stable patches added patches: timers-lock-base-for-same-bucket-optimization.patch timers-plug-locking-race-vs.-timer-migration.patch timers-prevent-base-clock-corruption-when-forwarding.patch timers-prevent-base-clock-rewind-when-forwarding-clock.patch ubifs-abort-readdir-upon-error.patch ubifs-fix-regression-in-ubifs_readdir.patch --- diff --git a/queue-4.8/series b/queue-4.8/series index e9883abd403..675240ad475 100644 --- a/queue-4.8/series +++ b/queue-4.8/series @@ -40,3 +40,9 @@ powerpc-mm-radix-use-tlbiel-only-if-we-ever-ran-on-the-current-cpu.patch powerpc-64-re-fix-race-condition-between-going-idle-and-entering-guest.patch powerpc-64-fix-race-condition-in-setting-lock-bit-in-idle-wakeup-code.patch x86-microcode-amd-fix-more-fallout-from-config_randomize_memory-y.patch +timers-prevent-base-clock-rewind-when-forwarding-clock.patch +timers-prevent-base-clock-corruption-when-forwarding.patch +timers-plug-locking-race-vs.-timer-migration.patch +timers-lock-base-for-same-bucket-optimization.patch +ubifs-abort-readdir-upon-error.patch +ubifs-fix-regression-in-ubifs_readdir.patch diff --git a/queue-4.8/timers-lock-base-for-same-bucket-optimization.patch b/queue-4.8/timers-lock-base-for-same-bucket-optimization.patch new file mode 100644 index 00000000000..4428735ce16 --- /dev/null +++ b/queue-4.8/timers-lock-base-for-same-bucket-optimization.patch @@ -0,0 +1,99 @@ +From 4da9152a4308dcbf611cde399c695c359fc9145f Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 24 Oct 2016 11:55:10 +0200 +Subject: timers: Lock base for same bucket optimization + +From: Thomas Gleixner + +commit 4da9152a4308dcbf611cde399c695c359fc9145f upstream. + +Linus stumbled over the unlocked modification of the timer expiry value in +mod_timer() which is an optimization for timers which stay in the same +bucket - due to the bucket granularity - despite their expiry time getting +updated. + +The optimization itself still makes sense even if we take the lock, because +in case that the bucket stays the same, we avoid the pointless +queue/enqueue dance. + +Make the check and the modification of timer->expires protected by the base +lock and shuffle the remaining code around so we can keep the lock held +when we actually have to requeue the timer to a different bucket. + +Fixes: f00c0afdfa62 ("timers: Implement optimization for same expiry time in mod_timer()") +Reported-by: Linus Torvalds +Signed-off-by: Thomas Gleixner +Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1610241711220.4983@nanos +Cc: Andrew Morton +Cc: Peter Zijlstra +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/time/timer.c | 28 +++++++++++++++++----------- + 1 file changed, 17 insertions(+), 11 deletions(-) + +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -965,6 +965,8 @@ __mod_timer(struct timer_list *timer, un + unsigned long clk = 0, flags; + int ret = 0; + ++ BUG_ON(!timer->function); ++ + /* + * This is a common optimization triggered by the networking code - if + * the timer is re-modified to have the same timeout or ends up in the +@@ -973,13 +975,16 @@ __mod_timer(struct timer_list *timer, un + if (timer_pending(timer)) { + if (timer->expires == expires) + return 1; ++ + /* +- * Take the current timer_jiffies of base, but without holding +- * the lock! ++ * We lock timer base and calculate the bucket index right ++ * here. If the timer ends up in the same bucket, then we ++ * just update the expiry time and avoid the whole ++ * dequeue/enqueue dance. + */ +- base = get_timer_base(timer->flags); +- clk = base->clk; ++ base = lock_timer_base(timer, &flags); + ++ clk = base->clk; + idx = calc_wheel_index(expires, clk); + + /* +@@ -989,14 +994,14 @@ __mod_timer(struct timer_list *timer, un + */ + if (idx == timer_get_idx(timer)) { + timer->expires = expires; +- return 1; ++ ret = 1; ++ goto out_unlock; + } ++ } else { ++ base = lock_timer_base(timer, &flags); + } + + timer_stats_timer_set_start_info(timer); +- BUG_ON(!timer->function); +- +- base = lock_timer_base(timer, &flags); + + ret = detach_if_pending(timer, base, false); + if (!ret && pending_only) +@@ -1032,9 +1037,10 @@ __mod_timer(struct timer_list *timer, un + timer->expires = expires; + /* + * If 'idx' was calculated above and the base time did not advance +- * between calculating 'idx' and taking the lock, only enqueue_timer() +- * and trigger_dyntick_cpu() is required. Otherwise we need to +- * (re)calculate the wheel index via internal_add_timer(). ++ * between calculating 'idx' and possibly switching the base, only ++ * enqueue_timer() and trigger_dyntick_cpu() is required. Otherwise ++ * we need to (re)calculate the wheel index via ++ * internal_add_timer(). + */ + if (idx != UINT_MAX && clk == base->clk) { + enqueue_timer(base, timer, idx); diff --git a/queue-4.8/timers-plug-locking-race-vs.-timer-migration.patch b/queue-4.8/timers-plug-locking-race-vs.-timer-migration.patch new file mode 100644 index 00000000000..18dd5727fff --- /dev/null +++ b/queue-4.8/timers-plug-locking-race-vs.-timer-migration.patch @@ -0,0 +1,47 @@ +From b831275a3553c32091222ac619cfddd73a5553fb Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Mon, 24 Oct 2016 11:41:56 +0200 +Subject: timers: Plug locking race vs. timer migration + +From: Thomas Gleixner + +commit b831275a3553c32091222ac619cfddd73a5553fb upstream. + +Linus noticed that lock_timer_base() lacks a READ_ONCE() for accessing the +timer flags. As a consequence the compiler is allowed to reload the flags +between the initial check for TIMER_MIGRATION and the following timer base +computation and the spin lock of the base. + +While this has not been observed (yet), we need to make sure that it never +happens. + +Fixes: 0eeda71bc30d ("timer: Replace timer base by a cpu index") +Reported-by: Linus Torvalds +Signed-off-by: Thomas Gleixner +Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1610241711220.4983@nanos +Cc: Andrew Morton +Cc: Peter Zijlstra +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/time/timer.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -937,7 +937,14 @@ static struct timer_base *lock_timer_bas + { + for (;;) { + struct timer_base *base; +- u32 tf = timer->flags; ++ u32 tf; ++ ++ /* ++ * We need to use READ_ONCE() here, otherwise the compiler ++ * might re-read @tf between the check for TIMER_MIGRATING ++ * and spin_lock(). ++ */ ++ tf = READ_ONCE(timer->flags); + + if (!(tf & TIMER_MIGRATING)) { + base = get_timer_base(tf); diff --git a/queue-4.8/timers-prevent-base-clock-corruption-when-forwarding.patch b/queue-4.8/timers-prevent-base-clock-corruption-when-forwarding.patch new file mode 100644 index 00000000000..3cb1fc295df --- /dev/null +++ b/queue-4.8/timers-prevent-base-clock-corruption-when-forwarding.patch @@ -0,0 +1,115 @@ +From 6bad6bccf2d717f652d37e63cf261eaa23466009 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Sat, 22 Oct 2016 11:07:37 +0000 +Subject: timers: Prevent base clock corruption when forwarding + +From: Thomas Gleixner + +commit 6bad6bccf2d717f652d37e63cf261eaa23466009 upstream. + +When a timer is enqueued we try to forward the timer base clock. This +mechanism has two issues: + +1) Forwarding a remote base unlocked + +The forwarding function is called from get_target_base() with the current +timer base lock held. But if the new target base is a different base than +the current base (can happen with NOHZ, sigh!) then the forwarding is done +on an unlocked base. This can lead to corruption of base->clk. + +Solution is simple: Invoke the forwarding after the target base is locked. + +2) Possible corruption due to jiffies advancing + +This is similar to the issue in get_net_timer_interrupt() which was fixed +in the previous patch. jiffies can advance between check and assignement +and therefore advancing base->clk beyond the next expiry value. + +So we need to read jiffies into a local variable once and do the checks and +assignment with the local copy. + +Fixes: a683f390b93f("timers: Forward the wheel clock whenever possible") +Reported-by: Ashton Holmes +Reported-by: Michael Thayer +Signed-off-by: Thomas Gleixner +Cc: Michal Necasek +Cc: Peter Zijlstra +Cc: knut.osmundsen@oracle.com +Cc: stern@rowland.harvard.edu +Cc: rt@linutronix.de +Link: http://lkml.kernel.org/r/20161022110552.253640125@linutronix.de +Signed-off-by: Thomas Gleixner +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/time/timer.c | 23 ++++++++++------------- + 1 file changed, 10 insertions(+), 13 deletions(-) + +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -878,7 +878,7 @@ static inline struct timer_base *get_tim + + #ifdef CONFIG_NO_HZ_COMMON + static inline struct timer_base * +-__get_target_base(struct timer_base *base, unsigned tflags) ++get_target_base(struct timer_base *base, unsigned tflags) + { + #ifdef CONFIG_SMP + if ((tflags & TIMER_PINNED) || !base->migration_enabled) +@@ -891,25 +891,27 @@ __get_target_base(struct timer_base *bas + + static inline void forward_timer_base(struct timer_base *base) + { ++ unsigned long jnow = READ_ONCE(jiffies); ++ + /* + * We only forward the base when it's idle and we have a delta between + * base clock and jiffies. + */ +- if (!base->is_idle || (long) (jiffies - base->clk) < 2) ++ if (!base->is_idle || (long) (jnow - base->clk) < 2) + return; + + /* + * If the next expiry value is > jiffies, then we fast forward to + * jiffies otherwise we forward to the next expiry value. + */ +- if (time_after(base->next_expiry, jiffies)) +- base->clk = jiffies; ++ if (time_after(base->next_expiry, jnow)) ++ base->clk = jnow; + else + base->clk = base->next_expiry; + } + #else + static inline struct timer_base * +-__get_target_base(struct timer_base *base, unsigned tflags) ++get_target_base(struct timer_base *base, unsigned tflags) + { + return get_timer_this_cpu_base(tflags); + } +@@ -917,14 +919,6 @@ __get_target_base(struct timer_base *bas + static inline void forward_timer_base(struct timer_base *base) { } + #endif + +-static inline struct timer_base * +-get_target_base(struct timer_base *base, unsigned tflags) +-{ +- struct timer_base *target = __get_target_base(base, tflags); +- +- forward_timer_base(target); +- return target; +-} + + /* + * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means +@@ -1025,6 +1019,9 @@ __mod_timer(struct timer_list *timer, un + } + } + ++ /* Try to forward a stale timer base clock */ ++ forward_timer_base(base); ++ + timer->expires = expires; + /* + * If 'idx' was calculated above and the base time did not advance diff --git a/queue-4.8/timers-prevent-base-clock-rewind-when-forwarding-clock.patch b/queue-4.8/timers-prevent-base-clock-rewind-when-forwarding-clock.patch new file mode 100644 index 00000000000..84faa35e7ed --- /dev/null +++ b/queue-4.8/timers-prevent-base-clock-rewind-when-forwarding-clock.patch @@ -0,0 +1,110 @@ +From 041ad7bc758db259bb960ef795197dd14aab19a6 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Sat, 22 Oct 2016 11:07:35 +0000 +Subject: timers: Prevent base clock rewind when forwarding clock + +From: Thomas Gleixner + +commit 041ad7bc758db259bb960ef795197dd14aab19a6 upstream. + +Ashton and Michael reported, that kernel versions 4.8 and later suffer from +USB timeouts which are caused by the timer wheel rework. + +This is caused by a bug in the base clock forwarding mechanism, which leads +to timers expiring early. The scenario which leads to this is: + +run_timers() + while (jiffies >= base->clk) { + collect_expired_timers(); + base->clk++; + expire_timers(); + } + +So base->clk = jiffies + 1. Now the cpu goes idle: + +idle() + get_next_timer_interrupt() + nextevt = __next_time_interrupt(); + if (time_after(nextevt, base->clk)) + base->clk = jiffies; + +jiffies has not advanced since run_timers(), so this assignment effectively +decrements base->clk by one. + +base->clk is the index into the timer wheel arrays. So let's assume the +following state after the base->clk increment in run_timers(): + + jiffies = 0 + base->clk = 1 + +A timer gets enqueued with an expiry delta of 63 ticks (which is the case +with the USB timeout and HZ=250) so the resulting bucket index is: + + base->clk + delta = 1 + 63 = 64 + +The timer goes into the first wheel level. The array size is 64 so it ends +up in bucket 0, which is correct as it takes 63 ticks to advance base->clk +to index into bucket 0 again. + +If the cpu goes idle before jiffies advance, then the bug in the forwarding +mechanism sets base->clk back to 0, so the next invocation of run_timers() +at the next tick will index into bucket 0 and therefore expire the timer 62 +ticks too early. + +Instead of blindly setting base->clk to jiffies we must make the forwarding +conditional on jiffies > base->clk, but we cannot use jiffies for this as +we might run into the following issue: + + if (time_after(jiffies, base->clk) { + if (time_after(nextevt, base->clk)) + base->clk = jiffies; + +jiffies can increment between the check and the assigment far enough to +advance beyond nextevt. So we need to use a stable value for checking. + +get_next_timer_interrupt() has the basej argument which is the jiffies +value snapshot taken in the calling code. So we can just that. + +Thanks to Ashton for bisecting and providing trace data! + +Fixes: a683f390b93f ("timers: Forward the wheel clock whenever possible") +Reported-by: Ashton Holmes +Reported-by: Michael Thayer +Signed-off-by: Thomas Gleixner +Cc: Michal Necasek +Cc: Peter Zijlstra +Cc: knut.osmundsen@oracle.com +Cc: stern@rowland.harvard.edu +Cc: rt@linutronix.de +Link: http://lkml.kernel.org/r/20161022110552.175308322@linutronix.de +Signed-off-by: Thomas Gleixner +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/time/timer.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1510,12 +1510,16 @@ u64 get_next_timer_interrupt(unsigned lo + is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA); + base->next_expiry = nextevt; + /* +- * We have a fresh next event. Check whether we can forward the base: ++ * We have a fresh next event. Check whether we can forward the ++ * base. We can only do that when @basej is past base->clk ++ * otherwise we might rewind base->clk. + */ +- if (time_after(nextevt, jiffies)) +- base->clk = jiffies; +- else if (time_after(nextevt, base->clk)) +- base->clk = nextevt; ++ if (time_after(basej, base->clk)) { ++ if (time_after(nextevt, basej)) ++ base->clk = basej; ++ else if (time_after(nextevt, base->clk)) ++ base->clk = nextevt; ++ } + + if (time_before_eq(nextevt, basej)) { + expires = basem; diff --git a/queue-4.8/ubifs-abort-readdir-upon-error.patch b/queue-4.8/ubifs-abort-readdir-upon-error.patch new file mode 100644 index 00000000000..3159ea12b6e --- /dev/null +++ b/queue-4.8/ubifs-abort-readdir-upon-error.patch @@ -0,0 +1,59 @@ +From c83ed4c9dbb358b9e7707486e167e940d48bfeed Mon Sep 17 00:00:00 2001 +From: Richard Weinberger +Date: Wed, 19 Oct 2016 12:43:07 +0200 +Subject: ubifs: Abort readdir upon error + +From: Richard Weinberger + +commit c83ed4c9dbb358b9e7707486e167e940d48bfeed upstream. + +If UBIFS is facing an error while walking a directory, it reports this +error and ubifs_readdir() returns the error code. But the VFS readdir +logic does not make the getdents system call fail in all cases. When the +readdir cursor indicates that more entries are present, the system call +will just return and the libc wrapper will try again since it also +knows that more entries are present. +This causes the libc wrapper to busy loop for ever when a directory is +corrupted on UBIFS. +A common approach do deal with corrupted directory entries is +skipping them by setting the cursor to the next entry. On UBIFS this +approach is not possible since we cannot compute the next directory +entry cursor position without reading the current entry. So all we can +do is setting the cursor to the "no more entries" position and make +getdents exit. + +Signed-off-by: Richard Weinberger +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ubifs/dir.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +--- a/fs/ubifs/dir.c ++++ b/fs/ubifs/dir.c +@@ -350,7 +350,7 @@ static unsigned int vfs_dent_type(uint8_ + */ + static int ubifs_readdir(struct file *file, struct dir_context *ctx) + { +- int err; ++ int err = 0; + struct qstr nm; + union ubifs_key key; + struct ubifs_dent_node *dent; +@@ -452,14 +452,12 @@ out: + kfree(file->private_data); + file->private_data = NULL; + +- if (err != -ENOENT) { ++ if (err != -ENOENT) + ubifs_err(c, "cannot find next direntry, error %d", err); +- return err; +- } + + /* 2 is a special value indicating that there are no more direntries */ + ctx->pos = 2; +- return 0; ++ return err; + } + + /* Free saved readdir() state when the directory is closed */ diff --git a/queue-4.8/ubifs-fix-regression-in-ubifs_readdir.patch b/queue-4.8/ubifs-fix-regression-in-ubifs_readdir.patch new file mode 100644 index 00000000000..e4ca8288243 --- /dev/null +++ b/queue-4.8/ubifs-fix-regression-in-ubifs_readdir.patch @@ -0,0 +1,42 @@ +From a00052a296e54205cf238c75bd98d17d5d02a6db Mon Sep 17 00:00:00 2001 +From: Richard Weinberger +Date: Fri, 28 Oct 2016 11:49:03 +0200 +Subject: ubifs: Fix regression in ubifs_readdir() + +From: Richard Weinberger + +commit a00052a296e54205cf238c75bd98d17d5d02a6db upstream. + +Commit c83ed4c9dbb35 ("ubifs: Abort readdir upon error") broke +overlayfs support because the fix exposed an internal error +code to VFS. + +Reported-by: Peter Rosin +Tested-by: Peter Rosin +Reported-by: Ralph Sennhauser +Tested-by: Ralph Sennhauser +Fixes: c83ed4c9dbb35 ("ubifs: Abort readdir upon error") +Signed-off-by: Richard Weinberger +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ubifs/dir.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/fs/ubifs/dir.c ++++ b/fs/ubifs/dir.c +@@ -454,6 +454,14 @@ out: + + if (err != -ENOENT) + ubifs_err(c, "cannot find next direntry, error %d", err); ++ else ++ /* ++ * -ENOENT is a non-fatal error in this context, the TNC uses ++ * it to indicate that the cursor moved past the current directory ++ * and readdir() has to stop. ++ */ ++ err = 0; ++ + + /* 2 is a special value indicating that there are no more direntries */ + ctx->pos = 2;