From: Greg Kroah-Hartman
Date: Fri, 17 Apr 2015 13:22:59 +0000 (+0200)
Subject: 3.19-stable patches
X-Git-Tag: v3.10.75~1
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=43ae4d066438144a91a288f98c594cdb4c47d9ad;p=thirdparty%2Fkernel%2Fstable-queue.git

3.19-stable patches

added patches:
    drm-i915-push-vblank-enable-disable-past-encoder-enable-disable.patch
    ext4-fix-indirect-punch-hole-corruption.patch
    kvm-avoid-page-allocation-failure-in-kvm_set_memory_region.patch
    timers-tick-broadcast-hrtimer-fix-suspicious-rcu-usage-in-idle-loop.patch
    xfs-ensure-truncate-forces-zeroed-blocks-to-disk.patch
---

diff --git a/queue-3.19/drm-i915-push-vblank-enable-disable-past-encoder-enable-disable.patch b/queue-3.19/drm-i915-push-vblank-enable-disable-past-encoder-enable-disable.patch
new file mode 100644
index 00000000000..7138df71c8a
--- /dev/null
+++ b/queue-3.19/drm-i915-push-vblank-enable-disable-past-encoder-enable-disable.patch
@@ -0,0 +1,176 @@
+From f9b61ff6bce9a44555324b29e593fdffc9a115bc Mon Sep 17 00:00:00 2001
+From: Daniel Vetter
+Date: Wed, 7 Jan 2015 13:54:39 +0100
+Subject: drm/i915: Push vblank enable/disable past encoder->enable/disable
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Daniel Vetter
+
+commit f9b61ff6bce9a44555324b29e593fdffc9a115bc upstream.
+
+It is platform/output depenedent when exactly the pipe will start
+running. Sometimes we just need the (cpu) pipe enabled, in other cases
+the pch transcoder is enough and in yet other cases the (DP) port is
+sending the frame start signal.
+
+In a perfect world we'd put the drm_crtc_vblank_on call exactly where
+the pipe starts running, but due to cloning and similar things this
+will get messy. And the current approach of picking the most
+conservative place for all combinations also doesn't work since that
+results in legit vblank waits (in encoder->enable hooks, e.g. the 2
+vblank waits for sdvo) failing.
+
+Completely going back to the old world before
+
+commit 51e31d49c89055299e34b8f44d13f70e19aaaad1
+Author: Daniel Vetter
+Date:   Mon Sep 15 12:36:02 2014 +0200
+
+    drm/i915: Use generic vblank wait
+
+isn't great either since screaming when the vblank wait work because
+the pipe is off is kinda nice.
+
+Pick a compromise and move the drm_crtc_vblank_on right before the
+encoder->enable call. This is a lie on some outputs/platforms, but
+after the ->enable callback the pipe is guaranteed to run everywhere.
+So not that bad really. Suggested by Ville.
+
+v2: Same treatment for drm_crtc_vblank_off and encoder->disable: I've
+missed the ibx pipe B select w/a, which also has a vblank wait in the
+disable function (while the pipe is obviously still running).
+ +Cc: Ville Syrjälä +Cc: Chris Wilson +Acked-by: Ville Syrjälä +Signed-off-by: Daniel Vetter +Cc: Jani Nikula +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/gpu/drm/i915/intel_display.c | 42 +++++++++++++++++------------------ + 1 file changed, 21 insertions(+), 21 deletions(-) + +--- a/drivers/gpu/drm/i915/intel_display.c ++++ b/drivers/gpu/drm/i915/intel_display.c +@@ -4372,15 +4372,15 @@ static void ironlake_crtc_enable(struct + if (intel_crtc->config.has_pch_encoder) + ironlake_pch_enable(crtc); + ++ assert_vblank_disabled(crtc); ++ drm_crtc_vblank_on(crtc); ++ + for_each_encoder_on_crtc(dev, crtc, encoder) + encoder->enable(encoder); + + if (HAS_PCH_CPT(dev)) + cpt_verify_modeset(dev, intel_crtc->pipe); + +- assert_vblank_disabled(crtc); +- drm_crtc_vblank_on(crtc); +- + intel_crtc_enable_planes(crtc); + } + +@@ -4492,14 +4492,14 @@ static void haswell_crtc_enable(struct d + if (intel_crtc->config.dp_encoder_is_mst) + intel_ddi_set_vc_payload_alloc(crtc, true); + ++ assert_vblank_disabled(crtc); ++ drm_crtc_vblank_on(crtc); ++ + for_each_encoder_on_crtc(dev, crtc, encoder) { + encoder->enable(encoder); + intel_opregion_notify_encoder(encoder, true); + } + +- assert_vblank_disabled(crtc); +- drm_crtc_vblank_on(crtc); +- + /* If we change the relative order between pipe/planes enabling, we need + * to change the workaround. */ + haswell_mode_set_planes_workaround(intel_crtc); +@@ -4550,12 +4550,12 @@ static void ironlake_crtc_disable(struct + + intel_crtc_disable_planes(crtc); + +- drm_crtc_vblank_off(crtc); +- assert_vblank_disabled(crtc); +- + for_each_encoder_on_crtc(dev, crtc, encoder) + encoder->disable(encoder); + ++ drm_crtc_vblank_off(crtc); ++ assert_vblank_disabled(crtc); ++ + if (intel_crtc->config.has_pch_encoder) + intel_set_pch_fifo_underrun_reporting(dev_priv, pipe, false); + +@@ -4614,14 +4614,14 @@ static void haswell_crtc_disable(struct + + intel_crtc_disable_planes(crtc); + +- drm_crtc_vblank_off(crtc); +- assert_vblank_disabled(crtc); +- + for_each_encoder_on_crtc(dev, crtc, encoder) { + intel_opregion_notify_encoder(encoder, false); + encoder->disable(encoder); + } + ++ drm_crtc_vblank_off(crtc); ++ assert_vblank_disabled(crtc); ++ + if (intel_crtc->config.has_pch_encoder) + intel_set_pch_fifo_underrun_reporting(dev_priv, TRANSCODER_A, + false); +@@ -5089,12 +5089,12 @@ static void valleyview_crtc_enable(struc + intel_update_watermarks(crtc); + intel_enable_pipe(intel_crtc); + +- for_each_encoder_on_crtc(dev, crtc, encoder) +- encoder->enable(encoder); +- + assert_vblank_disabled(crtc); + drm_crtc_vblank_on(crtc); + ++ for_each_encoder_on_crtc(dev, crtc, encoder) ++ encoder->enable(encoder); ++ + intel_crtc_enable_planes(crtc); + + /* Underruns don't raise interrupts, so check manually. 
*/ +@@ -5150,12 +5150,12 @@ static void i9xx_crtc_enable(struct drm_ + intel_update_watermarks(crtc); + intel_enable_pipe(intel_crtc); + +- for_each_encoder_on_crtc(dev, crtc, encoder) +- encoder->enable(encoder); +- + assert_vblank_disabled(crtc); + drm_crtc_vblank_on(crtc); + ++ for_each_encoder_on_crtc(dev, crtc, encoder) ++ encoder->enable(encoder); ++ + intel_crtc_enable_planes(crtc); + + /* +@@ -5227,12 +5227,12 @@ static void i9xx_crtc_disable(struct drm + */ + intel_wait_for_vblank(dev, pipe); + +- drm_crtc_vblank_off(crtc); +- assert_vblank_disabled(crtc); +- + for_each_encoder_on_crtc(dev, crtc, encoder) + encoder->disable(encoder); + ++ drm_crtc_vblank_off(crtc); ++ assert_vblank_disabled(crtc); ++ + intel_disable_pipe(intel_crtc); + + i9xx_pfit_disable(intel_crtc); diff --git a/queue-3.19/ext4-fix-indirect-punch-hole-corruption.patch b/queue-3.19/ext4-fix-indirect-punch-hole-corruption.patch new file mode 100644 index 00000000000..e6b18792f52 --- /dev/null +++ b/queue-3.19/ext4-fix-indirect-punch-hole-corruption.patch @@ -0,0 +1,191 @@ +From 6f30b7e37a8239f9d27db626a1d3427bc7951908 Mon Sep 17 00:00:00 2001 +From: Omar Sandoval +Date: Sat, 14 Feb 2015 20:08:51 -0500 +Subject: ext4: fix indirect punch hole corruption + +From: Omar Sandoval + +commit 6f30b7e37a8239f9d27db626a1d3427bc7951908 upstream. + +Commit 4f579ae7de56 (ext4: fix punch hole on files with indirect +mapping) rewrote FALLOC_FL_PUNCH_HOLE for ext4 files with indirect +mapping. However, there are bugs in several corner cases. This fixes 5 +distinct bugs: + +1. When there is at least one entire level of indirection between the +start and end of the punch range and the end of the punch range is the +first block of its level, we can't return early; we have to free the +intervening levels. + +2. When the end is at a higher level of indirection than the start and +ext4_find_shared returns a top branch for the end, we still need to free +the rest of the shared branch it returns; we can't decrement partial2. + +3. When a punch happens within one level of indirection, we need to +converge on an indirect block that contains the start and end. However, +because the branches returned from ext4_find_shared do not necessarily +start at the same level (e.g., the partial2 chain will be shallower if +the last block occurs at the beginning of an indirect group), the walk +of the two chains can end up "missing" each other and freeing a bunch of +extra blocks in the process. This mismatch can be handled by first +making sure that the chains are at the same level, then walking them +together until they converge. + +4. When the punch happens within one level of indirection and +ext4_find_shared returns a top branch for the start, we must free it, +but only if the end does not occur within that branch. + +5. When the punch happens within one level of indirection and +ext4_find_shared returns a top branch for the end, then we shouldn't +free the block referenced by the end of the returned chain (this mirrors +the different levels case). + +Signed-off-by: Omar Sandoval +Cc: Chris J Arges +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/indirect.c | 105 +++++++++++++++++++++++++++++++++++------------------ + 1 file changed, 71 insertions(+), 34 deletions(-) + +--- a/fs/ext4/indirect.c ++++ b/fs/ext4/indirect.c +@@ -1393,10 +1393,7 @@ end_range: + * to free. Everything was covered by the start + * of the range. 
+ */ +- return 0; +- } else { +- /* Shared branch grows from an indirect block */ +- partial2--; ++ goto do_indirects; + } + } else { + /* +@@ -1427,56 +1424,96 @@ end_range: + /* Punch happened within the same level (n == n2) */ + partial = ext4_find_shared(inode, n, offsets, chain, &nr); + partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); +- /* +- * ext4_find_shared returns Indirect structure which +- * points to the last element which should not be +- * removed by truncate. But this is end of the range +- * in punch_hole so we need to point to the next element +- */ +- partial2->p++; +- while ((partial > chain) || (partial2 > chain2)) { +- /* We're at the same block, so we're almost finished */ +- if ((partial->bh && partial2->bh) && +- (partial->bh->b_blocknr == partial2->bh->b_blocknr)) { +- if ((partial > chain) && (partial2 > chain2)) { ++ ++ /* Free top, but only if partial2 isn't its subtree. */ ++ if (nr) { ++ int level = min(partial - chain, partial2 - chain2); ++ int i; ++ int subtree = 1; ++ ++ for (i = 0; i <= level; i++) { ++ if (offsets[i] != offsets2[i]) { ++ subtree = 0; ++ break; ++ } ++ } ++ ++ if (!subtree) { ++ if (partial == chain) { ++ /* Shared branch grows from the inode */ ++ ext4_free_branches(handle, inode, NULL, ++ &nr, &nr+1, ++ (chain+n-1) - partial); ++ *partial->p = 0; ++ } else { ++ /* Shared branch grows from an indirect block */ ++ BUFFER_TRACE(partial->bh, "get_write_access"); + ext4_free_branches(handle, inode, partial->bh, +- partial->p + 1, +- partial2->p, ++ partial->p, ++ partial->p+1, + (chain+n-1) - partial); +- BUFFER_TRACE(partial->bh, "call brelse"); +- brelse(partial->bh); +- BUFFER_TRACE(partial2->bh, "call brelse"); +- brelse(partial2->bh); + } +- return 0; + } ++ } ++ ++ if (!nr2) { + /* +- * Clear the ends of indirect blocks on the shared branch +- * at the start of the range ++ * ext4_find_shared returns Indirect structure which ++ * points to the last element which should not be ++ * removed by truncate. But this is end of the range ++ * in punch_hole so we need to point to the next element + */ +- if (partial > chain) { ++ partial2->p++; ++ } ++ ++ while (partial > chain || partial2 > chain2) { ++ int depth = (chain+n-1) - partial; ++ int depth2 = (chain2+n2-1) - partial2; ++ ++ if (partial > chain && partial2 > chain2 && ++ partial->bh->b_blocknr == partial2->bh->b_blocknr) { ++ /* ++ * We've converged on the same block. Clear the range, ++ * then we're done. ++ */ + ext4_free_branches(handle, inode, partial->bh, +- partial->p + 1, +- (__le32 *)partial->bh->b_data+addr_per_block, +- (chain+n-1) - partial); ++ partial->p + 1, ++ partial2->p, ++ (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); +- partial--; ++ BUFFER_TRACE(partial2->bh, "call brelse"); ++ brelse(partial2->bh); ++ return 0; + } ++ + /* +- * Clear the ends of indirect blocks on the shared branch +- * at the end of the range ++ * The start and end partial branches may not be at the same ++ * level even though the punch happened within one level. So, we ++ * give them a chance to arrive at the same level, then walk ++ * them in step with each other until we converge on the same ++ * block. 
+ */ +- if (partial2 > chain2) { ++ if (partial > chain && depth <= depth2) { ++ ext4_free_branches(handle, inode, partial->bh, ++ partial->p + 1, ++ (__le32 *)partial->bh->b_data+addr_per_block, ++ (chain+n-1) - partial); ++ BUFFER_TRACE(partial->bh, "call brelse"); ++ brelse(partial->bh); ++ partial--; ++ } ++ if (partial2 > chain2 && depth2 <= depth) { + ext4_free_branches(handle, inode, partial2->bh, + (__le32 *)partial2->bh->b_data, + partial2->p, +- (chain2+n-1) - partial2); ++ (chain2+n2-1) - partial2); + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + partial2--; + } + } ++ return 0; + + do_indirects: + /* Kill the remaining (whole) subtrees */ diff --git a/queue-3.19/kvm-avoid-page-allocation-failure-in-kvm_set_memory_region.patch b/queue-3.19/kvm-avoid-page-allocation-failure-in-kvm_set_memory_region.patch new file mode 100644 index 00000000000..edc99468b87 --- /dev/null +++ b/queue-3.19/kvm-avoid-page-allocation-failure-in-kvm_set_memory_region.patch @@ -0,0 +1,99 @@ +From 744961341d472db6272ed9b42319a90f5a2aa7c4 Mon Sep 17 00:00:00 2001 +From: Igor Mammedov +Date: Fri, 20 Mar 2015 12:21:37 +0000 +Subject: kvm: avoid page allocation failure in kvm_set_memory_region() + +From: Igor Mammedov + +commit 744961341d472db6272ed9b42319a90f5a2aa7c4 upstream. + +KVM guest can fail to startup with following trace on host: + +qemu-system-x86: page allocation failure: order:4, mode:0x40d0 +Call Trace: + dump_stack+0x47/0x67 + warn_alloc_failed+0xee/0x150 + __alloc_pages_direct_compact+0x14a/0x150 + __alloc_pages_nodemask+0x776/0xb80 + alloc_kmem_pages+0x3a/0x110 + kmalloc_order+0x13/0x50 + kmemdup+0x1b/0x40 + __kvm_set_memory_region+0x24a/0x9f0 [kvm] + kvm_set_ioapic+0x130/0x130 [kvm] + kvm_set_memory_region+0x21/0x40 [kvm] + kvm_vm_ioctl+0x43f/0x750 [kvm] + +Failure happens when attempting to allocate pages for +'struct kvm_memslots', however it doesn't have to be +present in physically contiguous (kmalloc-ed) address +space, change allocation to kvm_kvzalloc() so that +it will be vmalloc-ed when its size is more then a page. 
+ +Signed-off-by: Igor Mammedov +Signed-off-by: Marcelo Tosatti +Signed-off-by: Greg Kroah-Hartman + +--- + virt/kvm/kvm_main.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +--- a/virt/kvm/kvm_main.c ++++ b/virt/kvm/kvm_main.c +@@ -466,7 +466,7 @@ static struct kvm *kvm_create_vm(unsigne + BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); + + r = -ENOMEM; +- kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); ++ kvm->memslots = kvm_kvzalloc(sizeof(struct kvm_memslots)); + if (!kvm->memslots) + goto out_err_no_srcu; + +@@ -517,7 +517,7 @@ out_err_no_srcu: + out_err_no_disable: + for (i = 0; i < KVM_NR_BUSES; i++) + kfree(kvm->buses[i]); +- kfree(kvm->memslots); ++ kvfree(kvm->memslots); + kvm_arch_free_vm(kvm); + return ERR_PTR(r); + } +@@ -573,7 +573,7 @@ static void kvm_free_physmem(struct kvm + kvm_for_each_memslot(memslot, slots) + kvm_free_physmem_slot(kvm, memslot, NULL); + +- kfree(kvm->memslots); ++ kvfree(kvm->memslots); + } + + static void kvm_destroy_devices(struct kvm *kvm) +@@ -865,10 +865,10 @@ int __kvm_set_memory_region(struct kvm * + goto out_free; + } + +- slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots), +- GFP_KERNEL); ++ slots = kvm_kvzalloc(sizeof(struct kvm_memslots)); + if (!slots) + goto out_free; ++ memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + + if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { + slot = id_to_memslot(slots, mem->slot); +@@ -911,7 +911,7 @@ int __kvm_set_memory_region(struct kvm * + kvm_arch_commit_memory_region(kvm, mem, &old, change); + + kvm_free_physmem_slot(kvm, &old, &new); +- kfree(old_memslots); ++ kvfree(old_memslots); + + /* + * IOMMU mapping: New slots need to be mapped. Old slots need to be +@@ -930,7 +930,7 @@ int __kvm_set_memory_region(struct kvm * + return 0; + + out_slots: +- kfree(slots); ++ kvfree(slots); + out_free: + kvm_free_physmem_slot(kvm, &new, &old); + out: diff --git a/queue-3.19/series b/queue-3.19/series index b7f3d42e24e..2e1bf37e375 100644 --- a/queue-3.19/series +++ b/queue-3.19/series @@ -94,3 +94,8 @@ vb2-fix-unbalanced-warnings-when-calling-vb2_thread_stop.patch clk-divider-fix-selection-of-divider-when-rounding-to-closest.patch clk-divider-fix-calculation-of-maximal-parent-rate-for-a-given-divider.patch ib-mlx4-saturate-roce-port-pma-counters-in-case-of-overflow.patch +timers-tick-broadcast-hrtimer-fix-suspicious-rcu-usage-in-idle-loop.patch +ext4-fix-indirect-punch-hole-corruption.patch +xfs-ensure-truncate-forces-zeroed-blocks-to-disk.patch +drm-i915-push-vblank-enable-disable-past-encoder-enable-disable.patch +kvm-avoid-page-allocation-failure-in-kvm_set_memory_region.patch diff --git a/queue-3.19/timers-tick-broadcast-hrtimer-fix-suspicious-rcu-usage-in-idle-loop.patch b/queue-3.19/timers-tick-broadcast-hrtimer-fix-suspicious-rcu-usage-in-idle-loop.patch new file mode 100644 index 00000000000..3e7577822d3 --- /dev/null +++ b/queue-3.19/timers-tick-broadcast-hrtimer-fix-suspicious-rcu-usage-in-idle-loop.patch @@ -0,0 +1,71 @@ +From a127d2bcf1fbc8c8e0b5cf0dab54f7d3ff50ce47 Mon Sep 17 00:00:00 2001 +From: Preeti U Murthy +Date: Wed, 18 Mar 2015 16:19:27 +0530 +Subject: timers/tick/broadcast-hrtimer: Fix suspicious RCU usage in idle loop + +From: Preeti U Murthy + +commit a127d2bcf1fbc8c8e0b5cf0dab54f7d3ff50ce47 upstream. + +The hrtimer mode of broadcast queues hrtimers in the idle entry +path so as to wakeup cpus in deep idle states. 
The associated +call graph is : + + cpuidle_idle_call() + |____ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, ....)) + |_____tick_broadcast_set_event() + |____clockevents_program_event() + |____bc_set_next() + +The hrtimer_{start/cancel} functions call into tracing which uses RCU. +But it is not legal to call into RCU in cpuidle because it is one of the +quiescent states. Hence protect this region with RCU_NONIDLE which informs +RCU that the cpu is momentarily non-idle. + +As an aside it is helpful to point out that the clock event device that is +programmed here is not a per-cpu clock device; it is a +pseudo clock device, used by the broadcast framework alone. +The per-cpu clock device programming never goes through bc_set_next(). + +Signed-off-by: Preeti U Murthy +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Paul E. McKenney +Cc: linuxppc-dev@ozlabs.org +Cc: mpe@ellerman.id.au +Cc: tglx@linutronix.de +Link: http://lkml.kernel.org/r/20150318104705.17763.56668.stgit@preeti.in.ibm.com +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/time/tick-broadcast-hrtimer.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +--- a/kernel/time/tick-broadcast-hrtimer.c ++++ b/kernel/time/tick-broadcast-hrtimer.c +@@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event + */ + static int bc_set_next(ktime_t expires, struct clock_event_device *bc) + { ++ int bc_moved; + /* + * We try to cancel the timer first. If the callback is on + * flight on some other cpu then we let it handle it. If we +@@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, + * restart the timer because we are in the callback, but we + * can set the expiry time and let the callback return + * HRTIMER_RESTART. ++ * ++ * Since we are in the idle loop at this point and because ++ * hrtimer_{start/cancel} functions call into tracing, ++ * calls to these functions must be bound within RCU_NONIDLE. + */ +- if (hrtimer_try_to_cancel(&bctimer) >= 0) { +- hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); ++ RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? ++ !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : ++ 0); ++ if (bc_moved) { + /* Bind the "device" to the cpu */ + bc->bound_on = smp_processor_id(); + } else if (bc->bound_on == smp_processor_id()) { diff --git a/queue-3.19/xfs-ensure-truncate-forces-zeroed-blocks-to-disk.patch b/queue-3.19/xfs-ensure-truncate-forces-zeroed-blocks-to-disk.patch new file mode 100644 index 00000000000..b5642a52a59 --- /dev/null +++ b/queue-3.19/xfs-ensure-truncate-forces-zeroed-blocks-to-disk.patch @@ -0,0 +1,186 @@ +From 5885ebda878b47c4b4602d4b0410cb4b282af024 Mon Sep 17 00:00:00 2001 +From: Dave Chinner +Date: Mon, 23 Feb 2015 22:37:08 +1100 +Subject: xfs: ensure truncate forces zeroed blocks to disk + +From: Dave Chinner + +commit 5885ebda878b47c4b4602d4b0410cb4b282af024 upstream. + +A new fsync vs power fail test in xfstests indicated that XFS can +have unreliable data consistency when doing extending truncates that +require block zeroing. The blocks beyond EOF get zeroed in memory, +but we never force those changes to disk before we run the +transaction that extends the file size and exposes those blocks to +userspace. This can result in the blocks not being correctly zeroed +after a crash. 
+ +Because in-memory behaviour is correct, tools like fsx don't pick up +any coherency problems - it's not until the filesystem is shutdown +or the system crashes after writing the truncate transaction to the +journal but before the zeroed data in the page cache is flushed that +the issue is exposed. + +Fix this by also flushing the dirty data in memory region between +the old size and new size when we've found blocks that need zeroing +in the truncate process. + +Reported-by: Liu Bo +Signed-off-by: Dave Chinner +Reviewed-by: Brian Foster +Signed-off-by: Dave Chinner +Signed-off-by: Greg Kroah-Hartman + + +--- + fs/xfs/xfs_file.c | 14 ++++++++++---- + fs/xfs/xfs_inode.h | 5 +++-- + fs/xfs/xfs_iops.c | 36 ++++++++++++++---------------------- + 3 files changed, 27 insertions(+), 28 deletions(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -360,7 +360,8 @@ STATIC int /* error (positive) */ + xfs_zero_last_block( + struct xfs_inode *ip, + xfs_fsize_t offset, +- xfs_fsize_t isize) ++ xfs_fsize_t isize, ++ bool *did_zeroing) + { + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); +@@ -388,6 +389,7 @@ xfs_zero_last_block( + zero_len = mp->m_sb.sb_blocksize - zero_offset; + if (isize + zero_len > offset) + zero_len = offset - isize; ++ *did_zeroing = true; + return xfs_iozero(ip, isize, zero_len); + } + +@@ -406,7 +408,8 @@ int /* error (positive) */ + xfs_zero_eof( + struct xfs_inode *ip, + xfs_off_t offset, /* starting I/O offset */ +- xfs_fsize_t isize) /* current inode size */ ++ xfs_fsize_t isize, /* current inode size */ ++ bool *did_zeroing) + { + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_zero_fsb; +@@ -428,7 +431,7 @@ xfs_zero_eof( + * We only zero a part of that block so it is handled specially. + */ + if (XFS_B_FSB_OFFSET(mp, isize) != 0) { +- error = xfs_zero_last_block(ip, offset, isize); ++ error = xfs_zero_last_block(ip, offset, isize, did_zeroing); + if (error) + return error; + } +@@ -488,6 +491,7 @@ xfs_zero_eof( + if (error) + return error; + ++ *did_zeroing = true; + start_zero_fsb = imap.br_startoff + imap.br_blockcount; + ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); + } +@@ -526,13 +530,15 @@ restart: + * having to redo all checks before. + */ + if (*pos > i_size_read(inode)) { ++ bool zero = false; ++ + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + goto restart; + } +- error = xfs_zero_eof(ip, *pos, i_size_read(inode)); ++ error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero); + if (error) + return error; + } +--- a/fs/xfs/xfs_inode.h ++++ b/fs/xfs/xfs_inode.h +@@ -377,8 +377,9 @@ int xfs_droplink(struct xfs_trans *, st + int xfs_bumplink(struct xfs_trans *, struct xfs_inode *); + + /* from xfs_file.c */ +-int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); +-int xfs_iozero(struct xfs_inode *, loff_t, size_t); ++int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, ++ xfs_fsize_t isize, bool *did_zeroing); ++int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); + + + #define IHOLD(ip) \ +--- a/fs/xfs/xfs_iops.c ++++ b/fs/xfs/xfs_iops.c +@@ -741,6 +741,7 @@ xfs_setattr_size( + int error; + uint lock_flags = 0; + uint commit_flags = 0; ++ bool did_zeroing = false; + + trace_xfs_setattr(ip); + +@@ -784,20 +785,16 @@ xfs_setattr_size( + return error; + + /* +- * Now we can make the changes. 
Before we join the inode to the +- * transaction, take care of the part of the truncation that must be +- * done without the inode lock. This needs to be done before joining +- * the inode to the transaction, because the inode cannot be unlocked +- * once it is a part of the transaction. ++ * File data changes must be complete before we start the transaction to ++ * modify the inode. This needs to be done before joining the inode to ++ * the transaction because the inode cannot be unlocked once it is a ++ * part of the transaction. ++ * ++ * Start with zeroing any data block beyond EOF that we may expose on ++ * file extension. + */ + if (newsize > oldsize) { +- /* +- * Do the first part of growing a file: zero any data in the +- * last block that is beyond the old EOF. We need to do this +- * before the inode is joined to the transaction to modify +- * i_size. +- */ +- error = xfs_zero_eof(ip, newsize, oldsize); ++ error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); + if (error) + return error; + } +@@ -807,23 +804,18 @@ xfs_setattr_size( + * any previous writes that are beyond the on disk EOF and the new + * EOF that have not been written out need to be written here. If we + * do not write the data out, we expose ourselves to the null files +- * problem. +- * +- * Only flush from the on disk size to the smaller of the in memory +- * file size or the new size as that's the range we really care about +- * here and prevents waiting for other data not within the range we +- * care about here. ++ * problem. Note that this includes any block zeroing we did above; ++ * otherwise those blocks may not be zeroed after a crash. + */ +- if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { ++ if (newsize > ip->i_d.di_size && ++ (oldsize != ip->i_d.di_size || did_zeroing)) { + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + ip->i_d.di_size, newsize); + if (error) + return error; + } + +- /* +- * Wait for all direct I/O to complete. +- */ ++ /* Now wait for all direct I/O to complete. */ + inode_dio_wait(inode); + + /*