From: Greg Kroah-Hartman
Date: Fri, 17 Feb 2023 14:25:05 +0000 (+0100)
Subject: 5.4-stable patches
X-Git-Tag: v4.14.306~54
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=4c7248e6b8c83b14422749b5a2b3c5da6b995c4e;p=thirdparty%2Fkernel%2Fstable-queue.git

5.4-stable patches

added patches:
      ipv4-fix-incorrect-route-flushing-when-source-address-is-deleted.patch
      revert-ipv4-fix-incorrect-route-flushing-when-source-address-is-deleted.patch
      xfs-change-the-order-in-which-child-and-parent-defer-ops-are-finished.patch
      xfs-clean-up-bmap-intent-item-recovery-checking.patch
      xfs-clean-up-xfs_bui_item_recover-iget-trans_alloc-ilock-ordering.patch
      xfs-ensure-inobt-record-walks-always-make-forward-progress.patch
      xfs-expose-the-log-push-threshold.patch
      xfs-factor-out-a-xfs_defer_create_intent-helper.patch
      xfs-fix-an-incore-inode-uaf-in-xfs_bui_recover.patch
      xfs-fix-finobt-btree-block-recovery-ordering.patch
      xfs-fix-missing-cow-blocks-writeback-conversion-retry.patch
      xfs-fix-the-forward-progress-assertion-in-xfs_iwalk_run_callbacks.patch
      xfs-log-new-intent-items-created-as-part-of-finishing-recovered-intent-items.patch
      xfs-merge-the-diff_items-defer-op-into-create_intent.patch
      xfs-merge-the-log_item-defer-op-into-create_intent.patch
      xfs-only-relog-deferred-intent-items-if-free-space-in-the-log-gets-low.patch
      xfs-periodically-relog-deferred-intent-items.patch
      xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch
      xfs-proper-replay-of-deferred-ops-queued-during-log-recovery.patch
      xfs-refactor-xfs_defer_finish_noroll.patch
      xfs-remove-the-xfs_efd_log_item_t-typedef.patch
      xfs-remove-the-xfs_efi_log_item_t-typedef.patch
      xfs-remove-the-xfs_inode_log_item_t-typedef.patch
      xfs-sync-lazy-sb-accounting-on-quiesce-of-read-only-mounts.patch
      xfs-turn-dfp_intent-into-a-xfs_log_item.patch
      xfs-xfs_defer_capture-should-absorb-remaining-block-reservations.patch
      xfs-xfs_defer_capture-should-absorb-remaining-transaction-reservation.patch
---

diff --git a/queue-5.4/ipv4-fix-incorrect-route-flushing-when-source-address-is-deleted.patch b/queue-5.4/ipv4-fix-incorrect-route-flushing-when-source-address-is-deleted.patch
new file mode 100644
index 00000000000..702e31a49e5
--- /dev/null
+++ b/queue-5.4/ipv4-fix-incorrect-route-flushing-when-source-address-is-deleted.patch
@@ -0,0 +1,81 @@
+From shaoyi@amazon.com Fri Feb 17 15:23:36 2023
+From: Shaoying Xu
+Date: Tue, 7 Feb 2023 18:28:20 +0000
+Subject: ipv4: Fix incorrect route flushing when source address is deleted
+To:
+Cc: , , , , , ,
+Message-ID: <20230207182820.4959-2-shaoyi@amazon.com>
+
+From: Ido Schimmel
+
+[ Upstream commit f96a3d74554df537b6db5c99c27c80e7afadc8d1 ]
+
+Cited commit added the table ID to the FIB info structure, but did not
+prevent structures with different table IDs from being consolidated.
+This can lead to routes being flushed from a VRF when an address is
+deleted from a different VRF.
+
+Fix by taking the table ID into account when looking for a matching FIB
+info. This is already done for FIB info structures backed by a nexthop
+object in fib_find_info_nh().
+
+Add test cases that fail before the fix:
+
+ # ./fib_tests.sh -t ipv4_del_addr
+
+ IPv4 delete address route tests
+     Regular FIB info
+     TEST: Route removed from VRF when source address deleted      [ OK ]
+     TEST: Route in default VRF not removed                        [ OK ]
+     TEST: Route removed in default VRF when source address deleted [ OK ]
+     TEST: Route in VRF is not removed by address delete           [ OK ]
+     Identical FIB info with different table ID
+     TEST: Route removed from VRF when source address deleted      [FAIL]
+     TEST: Route in default VRF not removed                        [ OK ]
+ RTNETLINK answers: File exists
+     TEST: Route removed in default VRF when source address deleted [ OK ]
+     TEST: Route in VRF is not removed by address delete           [FAIL]
+
+ Tests passed: 6
+ Tests failed: 2
+
+And pass after:
+
+ # ./fib_tests.sh -t ipv4_del_addr
+
+ IPv4 delete address route tests
+     Regular FIB info
+     TEST: Route removed from VRF when source address deleted      [ OK ]
+     TEST: Route in default VRF not removed                        [ OK ]
+     TEST: Route removed in default VRF when source address deleted [ OK ]
+     TEST: Route in VRF is not removed by address delete           [ OK ]
+     Identical FIB info with different table ID
+     TEST: Route removed from VRF when source address deleted      [ OK ]
+     TEST: Route in default VRF not removed                        [ OK ]
+     TEST: Route removed in default VRF when source address deleted [ OK ]
+     TEST: Route in VRF is not removed by address delete           [ OK ]
+
+ Tests passed: 8
+ Tests failed: 0
+
+Fixes: 5a56a0b3a45d ("net: Don't delete routes in different VRFs")
+Signed-off-by: Ido Schimmel
+Reviewed-by: David Ahern
+Signed-off-by: Jakub Kicinski
+Signed-off-by: Sasha Levin
+Signed-off-by: Shaoying Xu
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/ipv4/fib_semantics.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ipv4/fib_semantics.c
++++ b/net/ipv4/fib_semantics.c
+@@ -421,6 +421,7 @@ static struct fib_info *fib_find_info(st
+	    nfi->fib_prefsrc == fi->fib_prefsrc &&
+	    nfi->fib_priority == fi->fib_priority &&
+	    nfi->fib_type == fi->fib_type &&
++	    nfi->fib_tb_id == fi->fib_tb_id &&
+	    memcmp(nfi->fib_metrics, fi->fib_metrics,
+		   sizeof(u32) * RTAX_MAX) == 0 &&
+	    !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
diff --git a/queue-5.4/revert-ipv4-fix-incorrect-route-flushing-when-source-address-is-deleted.patch b/queue-5.4/revert-ipv4-fix-incorrect-route-flushing-when-source-address-is-deleted.patch
new file mode 100644
index 00000000000..b2c4a7eb23a
--- /dev/null
+++ b/queue-5.4/revert-ipv4-fix-incorrect-route-flushing-when-source-address-is-deleted.patch
@@ -0,0 +1,1762 @@
+From shaoyi@amazon.com Fri Feb 17 15:23:11 2023
+From: Shaoying Xu
+Date: Tue, 7 Feb 2023 18:28:19 +0000
+Subject: Revert "ipv4: Fix incorrect route flushing when source address is deleted"
+To:
+Cc: , , , , , ,
+Message-ID: <20230207182820.4959-1-shaoyi@amazon.com>
+
+From: Shaoying Xu
+
+This reverts commit 2537b637eac0bd546f63e1492a34edd30878e8d4 that
+deleted the whole fib_tests.sh by mistake and caused fib_tests failure
+in kselftests run.
+
+Signed-off-by: Shaoying Xu
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/ipv4/fib_semantics.c                 |    1
+ tools/testing/selftests/net/fib_tests.sh | 1727 +++++++++++++++++++++++++++++++
+ 2 files changed, 1727 insertions(+), 1 deletion(-)
+ create mode 100755 tools/testing/selftests/net/fib_tests.sh
+
+--- a/net/ipv4/fib_semantics.c
++++ b/net/ipv4/fib_semantics.c
+@@ -421,7 +421,6 @@ static struct fib_info *fib_find_info(st
+	    nfi->fib_prefsrc == fi->fib_prefsrc &&
+	    nfi->fib_priority == fi->fib_priority &&
+	    nfi->fib_type == fi->fib_type &&
+-	    nfi->fib_tb_id == fi->fib_tb_id &&
+	    memcmp(nfi->fib_metrics, fi->fib_metrics,
+		   sizeof(u32) * RTAX_MAX) == 0 &&
+	    !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
+--- /dev/null
++++ b/tools/testing/selftests/net/fib_tests.sh
+@@ -0,0 +1,1727 @@
++#!/bin/bash
++# SPDX-License-Identifier: GPL-2.0
++
++# This test is for checking IPv4 and IPv6 FIB behavior in response to
++# different events.
++ ++ret=0 ++# Kselftest framework requirement - SKIP code is 4. ++ksft_skip=4 ++ ++# all tests in this script. Can be overridden with -t option ++TESTS="unregister down carrier nexthop suppress ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter" ++ ++VERBOSE=0 ++PAUSE_ON_FAIL=no ++PAUSE=no ++IP="ip -netns ns1" ++NS_EXEC="ip netns exec ns1" ++ ++which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping) ++ ++log_test() ++{ ++ local rc=$1 ++ local expected=$2 ++ local msg="$3" ++ ++ if [ ${rc} -eq ${expected} ]; then ++ printf " TEST: %-60s [ OK ]\n" "${msg}" ++ nsuccess=$((nsuccess+1)) ++ else ++ ret=1 ++ nfail=$((nfail+1)) ++ printf " TEST: %-60s [FAIL]\n" "${msg}" ++ if [ "${PAUSE_ON_FAIL}" = "yes" ]; then ++ echo ++ echo "hit enter to continue, 'q' to quit" ++ read a ++ [ "$a" = "q" ] && exit 1 ++ fi ++ fi ++ ++ if [ "${PAUSE}" = "yes" ]; then ++ echo ++ echo "hit enter to continue, 'q' to quit" ++ read a ++ [ "$a" = "q" ] && exit 1 ++ fi ++} ++ ++setup() ++{ ++ set -e ++ ip netns add ns1 ++ ip netns set ns1 auto ++ $IP link set dev lo up ++ ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1 ++ ip netns exec ns1 sysctl -qw net.ipv6.conf.all.forwarding=1 ++ ++ $IP link add dummy0 type dummy ++ $IP link set dev dummy0 up ++ $IP address add 198.51.100.1/24 dev dummy0 ++ $IP -6 address add 2001:db8:1::1/64 dev dummy0 ++ set +e ++ ++} ++ ++cleanup() ++{ ++ $IP link del dev dummy0 &> /dev/null ++ ip netns del ns1 ++ ip netns del ns2 &> /dev/null ++} ++ ++get_linklocal() ++{ ++ local dev=$1 ++ local addr ++ ++ addr=$($IP -6 -br addr show dev ${dev} | \ ++ awk '{ ++ for (i = 3; i <= NF; ++i) { ++ if ($i ~ /^fe80/) ++ print $i ++ } ++ }' ++ ) ++ addr=${addr/\/*} ++ ++ [ -z "$addr" ] && return 1 ++ ++ echo $addr ++ ++ return 0 ++} ++ ++fib_unreg_unicast_test() ++{ ++ echo ++ echo "Single path route test" ++ ++ setup ++ ++ echo " Start point" ++ $IP route get fibmatch 198.51.100.2 &> /dev/null 
++ log_test $? 0 "IPv4 fibmatch" ++ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null ++ log_test $? 0 "IPv6 fibmatch" ++ ++ set -e ++ $IP link del dev dummy0 ++ set +e ++ ++ echo " Nexthop device deleted" ++ $IP route get fibmatch 198.51.100.2 &> /dev/null ++ log_test $? 2 "IPv4 fibmatch - no route" ++ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null ++ log_test $? 2 "IPv6 fibmatch - no route" ++ ++ cleanup ++} ++ ++fib_unreg_multipath_test() ++{ ++ ++ echo ++ echo "Multipath route test" ++ ++ setup ++ ++ set -e ++ $IP link add dummy1 type dummy ++ $IP link set dev dummy1 up ++ $IP address add 192.0.2.1/24 dev dummy1 ++ $IP -6 address add 2001:db8:2::1/64 dev dummy1 ++ ++ $IP route add 203.0.113.0/24 \ ++ nexthop via 198.51.100.2 dev dummy0 \ ++ nexthop via 192.0.2.2 dev dummy1 ++ $IP -6 route add 2001:db8:3::/64 \ ++ nexthop via 2001:db8:1::2 dev dummy0 \ ++ nexthop via 2001:db8:2::2 dev dummy1 ++ set +e ++ ++ echo " Start point" ++ $IP route get fibmatch 203.0.113.1 &> /dev/null ++ log_test $? 0 "IPv4 fibmatch" ++ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null ++ log_test $? 0 "IPv6 fibmatch" ++ ++ set -e ++ $IP link del dev dummy0 ++ set +e ++ ++ echo " One nexthop device deleted" ++ $IP route get fibmatch 203.0.113.1 &> /dev/null ++ log_test $? 2 "IPv4 - multipath route removed on delete" ++ ++ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null ++ # In IPv6 we do not flush the entire multipath route. ++ log_test $? 0 "IPv6 - multipath down to single path" ++ ++ set -e ++ $IP link del dev dummy1 ++ set +e ++ ++ echo " Second nexthop device deleted" ++ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null ++ log_test $? 2 "IPv6 - no route" ++ ++ cleanup ++} ++ ++fib_unreg_test() ++{ ++ fib_unreg_unicast_test ++ fib_unreg_multipath_test ++} ++ ++fib_down_unicast_test() ++{ ++ echo ++ echo "Single path, admin down" ++ ++ setup ++ ++ echo " Start point" ++ $IP route get fibmatch 198.51.100.2 &> /dev/null ++ log_test $? 
0 "IPv4 fibmatch" ++ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null ++ log_test $? 0 "IPv6 fibmatch" ++ ++ set -e ++ $IP link set dev dummy0 down ++ set +e ++ ++ echo " Route deleted on down" ++ $IP route get fibmatch 198.51.100.2 &> /dev/null ++ log_test $? 2 "IPv4 fibmatch" ++ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null ++ log_test $? 2 "IPv6 fibmatch" ++ ++ cleanup ++} ++ ++fib_down_multipath_test_do() ++{ ++ local down_dev=$1 ++ local up_dev=$2 ++ ++ $IP route get fibmatch 203.0.113.1 \ ++ oif $down_dev &> /dev/null ++ log_test $? 2 "IPv4 fibmatch on down device" ++ $IP -6 route get fibmatch 2001:db8:3::1 \ ++ oif $down_dev &> /dev/null ++ log_test $? 2 "IPv6 fibmatch on down device" ++ ++ $IP route get fibmatch 203.0.113.1 \ ++ oif $up_dev &> /dev/null ++ log_test $? 0 "IPv4 fibmatch on up device" ++ $IP -6 route get fibmatch 2001:db8:3::1 \ ++ oif $up_dev &> /dev/null ++ log_test $? 0 "IPv6 fibmatch on up device" ++ ++ $IP route get fibmatch 203.0.113.1 | \ ++ grep $down_dev | grep -q "dead linkdown" ++ log_test $? 0 "IPv4 flags on down device" ++ $IP -6 route get fibmatch 2001:db8:3::1 | \ ++ grep $down_dev | grep -q "dead linkdown" ++ log_test $? 0 "IPv6 flags on down device" ++ ++ $IP route get fibmatch 203.0.113.1 | \ ++ grep $up_dev | grep -q "dead linkdown" ++ log_test $? 1 "IPv4 flags on up device" ++ $IP -6 route get fibmatch 2001:db8:3::1 | \ ++ grep $up_dev | grep -q "dead linkdown" ++ log_test $? 
1 "IPv6 flags on up device" ++} ++ ++fib_down_multipath_test() ++{ ++ echo ++ echo "Admin down multipath" ++ ++ setup ++ ++ set -e ++ $IP link add dummy1 type dummy ++ $IP link set dev dummy1 up ++ ++ $IP address add 192.0.2.1/24 dev dummy1 ++ $IP -6 address add 2001:db8:2::1/64 dev dummy1 ++ ++ $IP route add 203.0.113.0/24 \ ++ nexthop via 198.51.100.2 dev dummy0 \ ++ nexthop via 192.0.2.2 dev dummy1 ++ $IP -6 route add 2001:db8:3::/64 \ ++ nexthop via 2001:db8:1::2 dev dummy0 \ ++ nexthop via 2001:db8:2::2 dev dummy1 ++ set +e ++ ++ echo " Verify start point" ++ $IP route get fibmatch 203.0.113.1 &> /dev/null ++ log_test $? 0 "IPv4 fibmatch" ++ ++ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null ++ log_test $? 0 "IPv6 fibmatch" ++ ++ set -e ++ $IP link set dev dummy0 down ++ set +e ++ ++ echo " One device down, one up" ++ fib_down_multipath_test_do "dummy0" "dummy1" ++ ++ set -e ++ $IP link set dev dummy0 up ++ $IP link set dev dummy1 down ++ set +e ++ ++ echo " Other device down and up" ++ fib_down_multipath_test_do "dummy1" "dummy0" ++ ++ set -e ++ $IP link set dev dummy0 down ++ set +e ++ ++ echo " Both devices down" ++ $IP route get fibmatch 203.0.113.1 &> /dev/null ++ log_test $? 2 "IPv4 fibmatch" ++ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null ++ log_test $? 2 "IPv6 fibmatch" ++ ++ $IP link del dev dummy1 ++ cleanup ++} ++ ++fib_down_test() ++{ ++ fib_down_unicast_test ++ fib_down_multipath_test ++} ++ ++# Local routes should not be affected when carrier changes. ++fib_carrier_local_test() ++{ ++ echo ++ echo "Local carrier tests - single path" ++ ++ setup ++ ++ set -e ++ $IP link set dev dummy0 carrier on ++ set +e ++ ++ echo " Start point" ++ $IP route get fibmatch 198.51.100.1 &> /dev/null ++ log_test $? 0 "IPv4 fibmatch" ++ $IP -6 route get fibmatch 2001:db8:1::1 &> /dev/null ++ log_test $? 0 "IPv6 fibmatch" ++ ++ $IP route get fibmatch 198.51.100.1 | \ ++ grep -q "linkdown" ++ log_test $? 
1 "IPv4 - no linkdown flag" ++ $IP -6 route get fibmatch 2001:db8:1::1 | \ ++ grep -q "linkdown" ++ log_test $? 1 "IPv6 - no linkdown flag" ++ ++ set -e ++ $IP link set dev dummy0 carrier off ++ sleep 1 ++ set +e ++ ++ echo " Carrier off on nexthop" ++ $IP route get fibmatch 198.51.100.1 &> /dev/null ++ log_test $? 0 "IPv4 fibmatch" ++ $IP -6 route get fibmatch 2001:db8:1::1 &> /dev/null ++ log_test $? 0 "IPv6 fibmatch" ++ ++ $IP route get fibmatch 198.51.100.1 | \ ++ grep -q "linkdown" ++ log_test $? 1 "IPv4 - linkdown flag set" ++ $IP -6 route get fibmatch 2001:db8:1::1 | \ ++ grep -q "linkdown" ++ log_test $? 1 "IPv6 - linkdown flag set" ++ ++ set -e ++ $IP address add 192.0.2.1/24 dev dummy0 ++ $IP -6 address add 2001:db8:2::1/64 dev dummy0 ++ set +e ++ ++ echo " Route to local address with carrier down" ++ $IP route get fibmatch 192.0.2.1 &> /dev/null ++ log_test $? 0 "IPv4 fibmatch" ++ $IP -6 route get fibmatch 2001:db8:2::1 &> /dev/null ++ log_test $? 0 "IPv6 fibmatch" ++ ++ $IP route get fibmatch 192.0.2.1 | \ ++ grep -q "linkdown" ++ log_test $? 1 "IPv4 linkdown flag set" ++ $IP -6 route get fibmatch 2001:db8:2::1 | \ ++ grep -q "linkdown" ++ log_test $? 1 "IPv6 linkdown flag set" ++ ++ cleanup ++} ++ ++fib_carrier_unicast_test() ++{ ++ ret=0 ++ ++ echo ++ echo "Single path route carrier test" ++ ++ setup ++ ++ set -e ++ $IP link set dev dummy0 carrier on ++ set +e ++ ++ echo " Start point" ++ $IP route get fibmatch 198.51.100.2 &> /dev/null ++ log_test $? 0 "IPv4 fibmatch" ++ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null ++ log_test $? 0 "IPv6 fibmatch" ++ ++ $IP route get fibmatch 198.51.100.2 | \ ++ grep -q "linkdown" ++ log_test $? 1 "IPv4 no linkdown flag" ++ $IP -6 route get fibmatch 2001:db8:1::2 | \ ++ grep -q "linkdown" ++ log_test $? 1 "IPv6 no linkdown flag" ++ ++ set -e ++ $IP link set dev dummy0 carrier off ++ sleep 1 ++ set +e ++ ++ echo " Carrier down" ++ $IP route get fibmatch 198.51.100.2 &> /dev/null ++ log_test $? 
0 "IPv4 fibmatch" ++ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null ++ log_test $? 0 "IPv6 fibmatch" ++ ++ $IP route get fibmatch 198.51.100.2 | \ ++ grep -q "linkdown" ++ log_test $? 0 "IPv4 linkdown flag set" ++ $IP -6 route get fibmatch 2001:db8:1::2 | \ ++ grep -q "linkdown" ++ log_test $? 0 "IPv6 linkdown flag set" ++ ++ set -e ++ $IP address add 192.0.2.1/24 dev dummy0 ++ $IP -6 address add 2001:db8:2::1/64 dev dummy0 ++ set +e ++ ++ echo " Second address added with carrier down" ++ $IP route get fibmatch 192.0.2.2 &> /dev/null ++ log_test $? 0 "IPv4 fibmatch" ++ $IP -6 route get fibmatch 2001:db8:2::2 &> /dev/null ++ log_test $? 0 "IPv6 fibmatch" ++ ++ $IP route get fibmatch 192.0.2.2 | \ ++ grep -q "linkdown" ++ log_test $? 0 "IPv4 linkdown flag set" ++ $IP -6 route get fibmatch 2001:db8:2::2 | \ ++ grep -q "linkdown" ++ log_test $? 0 "IPv6 linkdown flag set" ++ ++ cleanup ++} ++ ++fib_carrier_test() ++{ ++ fib_carrier_local_test ++ fib_carrier_unicast_test ++} ++ ++fib_rp_filter_test() ++{ ++ echo ++ echo "IPv4 rp_filter tests" ++ ++ setup ++ ++ set -e ++ ip netns add ns2 ++ ip netns set ns2 auto ++ ++ ip -netns ns2 link set dev lo up ++ ++ $IP link add name veth1 type veth peer name veth2 ++ $IP link set dev veth2 netns ns2 ++ $IP address add 192.0.2.1/24 dev veth1 ++ ip -netns ns2 address add 192.0.2.1/24 dev veth2 ++ $IP link set dev veth1 up ++ ip -netns ns2 link set dev veth2 up ++ ++ $IP link set dev lo address 52:54:00:6a:c7:5e ++ $IP link set dev veth1 address 52:54:00:6a:c7:5e ++ ip -netns ns2 link set dev lo address 52:54:00:6a:c7:5e ++ ip -netns ns2 link set dev veth2 address 52:54:00:6a:c7:5e ++ ++ # 1. 
(ns2) redirect lo's egress to veth2's egress ++ ip netns exec ns2 tc qdisc add dev lo parent root handle 1: fq_codel ++ ip netns exec ns2 tc filter add dev lo parent 1: protocol arp basic \ ++ action mirred egress redirect dev veth2 ++ ip netns exec ns2 tc filter add dev lo parent 1: protocol ip basic \ ++ action mirred egress redirect dev veth2 ++ ++ # 2. (ns1) redirect veth1's ingress to lo's ingress ++ $NS_EXEC tc qdisc add dev veth1 ingress ++ $NS_EXEC tc filter add dev veth1 ingress protocol arp basic \ ++ action mirred ingress redirect dev lo ++ $NS_EXEC tc filter add dev veth1 ingress protocol ip basic \ ++ action mirred ingress redirect dev lo ++ ++ # 3. (ns1) redirect lo's egress to veth1's egress ++ $NS_EXEC tc qdisc add dev lo parent root handle 1: fq_codel ++ $NS_EXEC tc filter add dev lo parent 1: protocol arp basic \ ++ action mirred egress redirect dev veth1 ++ $NS_EXEC tc filter add dev lo parent 1: protocol ip basic \ ++ action mirred egress redirect dev veth1 ++ ++ # 4. (ns2) redirect veth2's ingress to lo's ingress ++ ip netns exec ns2 tc qdisc add dev veth2 ingress ++ ip netns exec ns2 tc filter add dev veth2 ingress protocol arp basic \ ++ action mirred ingress redirect dev lo ++ ip netns exec ns2 tc filter add dev veth2 ingress protocol ip basic \ ++ action mirred ingress redirect dev lo ++ ++ $NS_EXEC sysctl -qw net.ipv4.conf.all.rp_filter=1 ++ $NS_EXEC sysctl -qw net.ipv4.conf.all.accept_local=1 ++ $NS_EXEC sysctl -qw net.ipv4.conf.all.route_localnet=1 ++ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=1 ++ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.accept_local=1 ++ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.route_localnet=1 ++ set +e ++ ++ run_cmd "ip netns exec ns2 ping -w1 -c1 192.0.2.1" ++ log_test $? 0 "rp_filter passes local packets" ++ ++ run_cmd "ip netns exec ns2 ping -w1 -c1 127.0.0.1" ++ log_test $? 
0 "rp_filter passes loopback packets" ++ ++ cleanup ++} ++ ++################################################################################ ++# Tests on nexthop spec ++ ++# run 'ip route add' with given spec ++add_rt() ++{ ++ local desc="$1" ++ local erc=$2 ++ local vrf=$3 ++ local pfx=$4 ++ local gw=$5 ++ local dev=$6 ++ local cmd out rc ++ ++ [ "$vrf" = "-" ] && vrf="default" ++ [ -n "$gw" ] && gw="via $gw" ++ [ -n "$dev" ] && dev="dev $dev" ++ ++ cmd="$IP route add vrf $vrf $pfx $gw $dev" ++ if [ "$VERBOSE" = "1" ]; then ++ printf "\n COMMAND: $cmd\n" ++ fi ++ ++ out=$(eval $cmd 2>&1) ++ rc=$? ++ if [ "$VERBOSE" = "1" -a -n "$out" ]; then ++ echo " $out" ++ fi ++ log_test $rc $erc "$desc" ++} ++ ++fib4_nexthop() ++{ ++ echo ++ echo "IPv4 nexthop tests" ++ ++ echo "<<< write me >>>" ++} ++ ++fib6_nexthop() ++{ ++ local lldummy=$(get_linklocal dummy0) ++ local llv1=$(get_linklocal dummy0) ++ ++ if [ -z "$lldummy" ]; then ++ echo "Failed to get linklocal address for dummy0" ++ return 1 ++ fi ++ if [ -z "$llv1" ]; then ++ echo "Failed to get linklocal address for veth1" ++ return 1 ++ fi ++ ++ echo ++ echo "IPv6 nexthop tests" ++ ++ add_rt "Directly connected nexthop, unicast address" 0 \ ++ - 2001:db8:101::/64 2001:db8:1::2 ++ add_rt "Directly connected nexthop, unicast address with device" 0 \ ++ - 2001:db8:102::/64 2001:db8:1::2 "dummy0" ++ add_rt "Gateway is linklocal address" 0 \ ++ - 2001:db8:103::1/64 $llv1 "veth0" ++ ++ # fails because LL address requires a device ++ add_rt "Gateway is linklocal address, no device" 2 \ ++ - 2001:db8:104::1/64 $llv1 ++ ++ # local address can not be a gateway ++ add_rt "Gateway can not be local unicast address" 2 \ ++ - 2001:db8:105::/64 2001:db8:1::1 ++ add_rt "Gateway can not be local unicast address, with device" 2 \ ++ - 2001:db8:106::/64 2001:db8:1::1 "dummy0" ++ add_rt "Gateway can not be a local linklocal address" 2 \ ++ - 2001:db8:107::1/64 $lldummy "dummy0" ++ ++ # VRF tests ++ add_rt "Gateway can be local address 
in a VRF" 0 \ ++ - 2001:db8:108::/64 2001:db8:51::2 ++ add_rt "Gateway can be local address in a VRF, with device" 0 \ ++ - 2001:db8:109::/64 2001:db8:51::2 "veth0" ++ add_rt "Gateway can be local linklocal address in a VRF" 0 \ ++ - 2001:db8:110::1/64 $llv1 "veth0" ++ ++ add_rt "Redirect to VRF lookup" 0 \ ++ - 2001:db8:111::/64 "" "red" ++ ++ add_rt "VRF route, gateway can be local address in default VRF" 0 \ ++ red 2001:db8:112::/64 2001:db8:51::1 ++ ++ # local address in same VRF fails ++ add_rt "VRF route, gateway can not be a local address" 2 \ ++ red 2001:db8:113::1/64 2001:db8:2::1 ++ add_rt "VRF route, gateway can not be a local addr with device" 2 \ ++ red 2001:db8:114::1/64 2001:db8:2::1 "dummy1" ++} ++ ++# Default VRF: ++# dummy0 - 198.51.100.1/24 2001:db8:1::1/64 ++# veth0 - 192.0.2.1/24 2001:db8:51::1/64 ++# ++# VRF red: ++# dummy1 - 192.168.2.1/24 2001:db8:2::1/64 ++# veth1 - 192.0.2.2/24 2001:db8:51::2/64 ++# ++# [ dummy0 veth0 ]--[ veth1 dummy1 ] ++ ++fib_nexthop_test() ++{ ++ setup ++ ++ set -e ++ ++ $IP -4 rule add pref 32765 table local ++ $IP -4 rule del pref 0 ++ $IP -6 rule add pref 32765 table local ++ $IP -6 rule del pref 0 ++ ++ $IP link add red type vrf table 1 ++ $IP link set red up ++ $IP -4 route add vrf red unreachable default metric 4278198272 ++ $IP -6 route add vrf red unreachable default metric 4278198272 ++ ++ $IP link add veth0 type veth peer name veth1 ++ $IP link set dev veth0 up ++ $IP address add 192.0.2.1/24 dev veth0 ++ $IP -6 address add 2001:db8:51::1/64 dev veth0 ++ ++ $IP link set dev veth1 vrf red up ++ $IP address add 192.0.2.2/24 dev veth1 ++ $IP -6 address add 2001:db8:51::2/64 dev veth1 ++ ++ $IP link add dummy1 type dummy ++ $IP link set dev dummy1 vrf red up ++ $IP address add 192.168.2.1/24 dev dummy1 ++ $IP -6 address add 2001:db8:2::1/64 dev dummy1 ++ set +e ++ ++ sleep 1 ++ fib4_nexthop ++ fib6_nexthop ++ ++ ( ++ $IP link del dev dummy1 ++ $IP link del veth0 ++ $IP link del red ++ ) 2>/dev/null ++ cleanup 
++} ++ ++fib_suppress_test() ++{ ++ echo ++ echo "FIB rule with suppress_prefixlength" ++ setup ++ ++ $IP link add dummy1 type dummy ++ $IP link set dummy1 up ++ $IP -6 route add default dev dummy1 ++ $IP -6 rule add table main suppress_prefixlength 0 ++ ping -f -c 1000 -W 1 1234::1 >/dev/null 2>&1 ++ $IP -6 rule del table main suppress_prefixlength 0 ++ $IP link del dummy1 ++ ++ # If we got here without crashing, we're good. ++ log_test 0 0 "FIB rule suppress test" ++ ++ cleanup ++} ++ ++################################################################################ ++# Tests on route add and replace ++ ++run_cmd() ++{ ++ local cmd="$1" ++ local out ++ local stderr="2>/dev/null" ++ ++ if [ "$VERBOSE" = "1" ]; then ++ printf " COMMAND: $cmd\n" ++ stderr= ++ fi ++ ++ out=$(eval $cmd $stderr) ++ rc=$? ++ if [ "$VERBOSE" = "1" -a -n "$out" ]; then ++ echo " $out" ++ fi ++ ++ [ "$VERBOSE" = "1" ] && echo ++ ++ return $rc ++} ++ ++check_expected() ++{ ++ local out="$1" ++ local expected="$2" ++ local rc=0 ++ ++ [ "${out}" = "${expected}" ] && return 0 ++ ++ if [ -z "${out}" ]; then ++ if [ "$VERBOSE" = "1" ]; then ++ printf "\nNo route entry found\n" ++ printf "Expected:\n" ++ printf " ${expected}\n" ++ fi ++ return 1 ++ fi ++ ++ # tricky way to convert output to 1-line without ip's ++ # messy '\'; this drops all extra white space ++ out=$(echo ${out}) ++ if [ "${out}" != "${expected}" ]; then ++ rc=1 ++ if [ "${VERBOSE}" = "1" ]; then ++ printf " Unexpected route entry. Have:\n" ++ printf " ${out}\n" ++ printf " Expected:\n" ++ printf " ${expected}\n\n" ++ fi ++ fi ++ ++ return $rc ++} ++ ++# add route for a prefix, flushing any existing routes first ++# expected to be the first step of a test ++add_route6() ++{ ++ local pfx="$1" ++ local nh="$2" ++ local out ++ ++ if [ "$VERBOSE" = "1" ]; then ++ echo ++ echo " ##################################################" ++ echo ++ fi ++ ++ run_cmd "$IP -6 ro flush ${pfx}" ++ [ $? 
-ne 0 ] && exit 1 ++ ++ out=$($IP -6 ro ls match ${pfx}) ++ if [ -n "$out" ]; then ++ echo "Failed to flush routes for prefix used for tests." ++ exit 1 ++ fi ++ ++ run_cmd "$IP -6 ro add ${pfx} ${nh}" ++ if [ $? -ne 0 ]; then ++ echo "Failed to add initial route for test." ++ exit 1 ++ fi ++} ++ ++# add initial route - used in replace route tests ++add_initial_route6() ++{ ++ add_route6 "2001:db8:104::/64" "$1" ++} ++ ++check_route6() ++{ ++ local pfx ++ local expected="$1" ++ local out ++ local rc=0 ++ ++ set -- $expected ++ pfx=$1 ++ ++ out=$($IP -6 ro ls match ${pfx} | sed -e 's/ pref medium//') ++ check_expected "${out}" "${expected}" ++} ++ ++route_cleanup() ++{ ++ $IP li del red 2>/dev/null ++ $IP li del dummy1 2>/dev/null ++ $IP li del veth1 2>/dev/null ++ $IP li del veth3 2>/dev/null ++ ++ cleanup &> /dev/null ++} ++ ++route_setup() ++{ ++ route_cleanup ++ setup ++ ++ [ "${VERBOSE}" = "1" ] && set -x ++ set -e ++ ++ ip netns add ns2 ++ ip netns set ns2 auto ++ ip -netns ns2 link set dev lo up ++ ip netns exec ns2 sysctl -qw net.ipv4.ip_forward=1 ++ ip netns exec ns2 sysctl -qw net.ipv6.conf.all.forwarding=1 ++ ++ $IP li add veth1 type veth peer name veth2 ++ $IP li add veth3 type veth peer name veth4 ++ ++ $IP li set veth1 up ++ $IP li set veth3 up ++ $IP li set veth2 netns ns2 up ++ $IP li set veth4 netns ns2 up ++ ip -netns ns2 li add dummy1 type dummy ++ ip -netns ns2 li set dummy1 up ++ ++ $IP -6 addr add 2001:db8:101::1/64 dev veth1 nodad ++ $IP -6 addr add 2001:db8:103::1/64 dev veth3 nodad ++ $IP addr add 172.16.101.1/24 dev veth1 ++ $IP addr add 172.16.103.1/24 dev veth3 ++ ++ ip -netns ns2 -6 addr add 2001:db8:101::2/64 dev veth2 nodad ++ ip -netns ns2 -6 addr add 2001:db8:103::2/64 dev veth4 nodad ++ ip -netns ns2 -6 addr add 2001:db8:104::1/64 dev dummy1 nodad ++ ++ ip -netns ns2 addr add 172.16.101.2/24 dev veth2 ++ ip -netns ns2 addr add 172.16.103.2/24 dev veth4 ++ ip -netns ns2 addr add 172.16.104.1/24 dev dummy1 ++ ++ set +e ++} ++ ++# 
assumption is that basic add of a single path route works ++# otherwise just adding an address on an interface is broken ++ipv6_rt_add() ++{ ++ local rc ++ ++ echo ++ echo "IPv6 route add / append tests" ++ ++ # route add same prefix - fails with EEXISTS b/c ip adds NLM_F_EXCL ++ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" ++ run_cmd "$IP -6 ro add 2001:db8:104::/64 via 2001:db8:103::2" ++ log_test $? 2 "Attempt to add duplicate route - gw" ++ ++ # route add same prefix - fails with EEXISTS b/c ip adds NLM_F_EXCL ++ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" ++ run_cmd "$IP -6 ro add 2001:db8:104::/64 dev veth3" ++ log_test $? 2 "Attempt to add duplicate route - dev only" ++ ++ # route add same prefix - fails with EEXISTS b/c ip adds NLM_F_EXCL ++ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" ++ run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64" ++ log_test $? 2 "Attempt to add duplicate route - reject route" ++ ++ # route append with same prefix adds a new route ++ # - iproute2 sets NLM_F_CREATE | NLM_F_APPEND ++ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" ++ run_cmd "$IP -6 ro append 2001:db8:104::/64 via 2001:db8:103::2" ++ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" ++ log_test $? 0 "Append nexthop to existing route - gw" ++ ++ # insert mpath directly ++ add_route6 "2001:db8:104::/64" "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" ++ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" ++ log_test $? 0 "Add multipath route" ++ ++ add_route6 "2001:db8:104::/64" "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" ++ run_cmd "$IP -6 ro add 2001:db8:104::/64 nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" ++ log_test $? 
2 "Attempt to add duplicate multipath route" ++ ++ # insert of a second route without append but different metric ++ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2" ++ run_cmd "$IP -6 ro add 2001:db8:104::/64 via 2001:db8:103::2 metric 512" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ run_cmd "$IP -6 ro add 2001:db8:104::/64 via 2001:db8:103::3 metric 256" ++ rc=$? ++ fi ++ log_test $rc 0 "Route add with different metrics" ++ ++ run_cmd "$IP -6 ro del 2001:db8:104::/64 metric 512" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route6 "2001:db8:104::/64 via 2001:db8:103::3 dev veth3 metric 256 2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024" ++ rc=$? ++ fi ++ log_test $rc 0 "Route delete with metric" ++} ++ ++ipv6_rt_replace_single() ++{ ++ # single path with single path ++ # ++ add_initial_route6 "via 2001:db8:101::2" ++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 via 2001:db8:103::2" ++ check_route6 "2001:db8:104::/64 via 2001:db8:103::2 dev veth3 metric 1024" ++ log_test $? 0 "Single path with single path" ++ ++ # single path with multipath ++ # ++ add_initial_route6 "nexthop via 2001:db8:101::2" ++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:103::2" ++ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::3 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" ++ log_test $? 0 "Single path with multipath" ++ ++ # single path with single path using MULTIPATH attribute ++ # ++ add_initial_route6 "via 2001:db8:101::2" ++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:103::2" ++ check_route6 "2001:db8:104::/64 via 2001:db8:103::2 dev veth3 metric 1024" ++ log_test $? 0 "Single path with single path via multipath attribute" ++ ++ # route replace fails - invalid nexthop ++ add_initial_route6 "via 2001:db8:101::2" ++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 via 2001:db8:104::2" ++ if [ $? 
-eq 0 ]; then ++ # previous command is expected to fail so if it returns 0 ++ # that means the test failed. ++ log_test 0 1 "Invalid nexthop" ++ else ++ check_route6 "2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024" ++ log_test $? 0 "Invalid nexthop" ++ fi ++ ++ # replace non-existent route ++ # - note use of change versus replace since ip adds NLM_F_CREATE ++ # for replace ++ add_initial_route6 "via 2001:db8:101::2" ++ run_cmd "$IP -6 ro change 2001:db8:105::/64 via 2001:db8:101::2" ++ log_test $? 2 "Single path - replace of non-existent route" ++} ++ ++ipv6_rt_replace_mpath() ++{ ++ # multipath with multipath ++ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" ++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:103::3" ++ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::3 dev veth1 weight 1 nexthop via 2001:db8:103::3 dev veth3 weight 1" ++ log_test $? 0 "Multipath with multipath" ++ ++ # multipath with single ++ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" ++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 via 2001:db8:101::3" ++ check_route6 "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024" ++ log_test $? 0 "Multipath with single path" ++ ++ # multipath with single ++ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" ++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3" ++ check_route6 "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024" ++ log_test $? 0 "Multipath with single path via multipath attribute" ++ ++ # multipath with dev-only ++ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" ++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 dev veth1" ++ check_route6 "2001:db8:104::/64 dev veth1 metric 1024" ++ log_test $? 
0 "Multipath with dev-only" ++ ++ # route replace fails - invalid nexthop 1 ++ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" ++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:111::3 nexthop via 2001:db8:103::3" ++ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" ++ log_test $? 0 "Multipath - invalid first nexthop" ++ ++ # route replace fails - invalid nexthop 2 ++ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" ++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:113::3" ++ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" ++ log_test $? 0 "Multipath - invalid second nexthop" ++ ++ # multipath non-existent route ++ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" ++ run_cmd "$IP -6 ro change 2001:db8:105::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:103::3" ++ log_test $? 2 "Multipath - replace of non-existent route" ++} ++ ++ipv6_rt_replace() ++{ ++ echo ++ echo "IPv6 route replace tests" ++ ++ ipv6_rt_replace_single ++ ipv6_rt_replace_mpath ++} ++ ++ipv6_route_test() ++{ ++ route_setup ++ ++ ipv6_rt_add ++ ipv6_rt_replace ++ ++ route_cleanup ++} ++ ++ip_addr_metric_check() ++{ ++ ip addr help 2>&1 | grep -q metric ++ if [ $? -ne 0 ]; then ++ echo "iproute2 command does not support metric for addresses. 
Skipping test" ++ return 1 ++ fi ++ ++ return 0 ++} ++ ++ipv6_addr_metric_test() ++{ ++ local rc ++ ++ echo ++ echo "IPv6 prefix route tests" ++ ++ ip_addr_metric_check || return 1 ++ ++ setup ++ ++ set -e ++ $IP li add dummy1 type dummy ++ $IP li add dummy2 type dummy ++ $IP li set dummy1 up ++ $IP li set dummy2 up ++ ++ # default entry is metric 256 ++ run_cmd "$IP -6 addr add dev dummy1 2001:db8:104::1/64" ++ run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::2/64" ++ set +e ++ ++ check_route6 "2001:db8:104::/64 dev dummy1 proto kernel metric 256 2001:db8:104::/64 dev dummy2 proto kernel metric 256" ++ log_test $? 0 "Default metric" ++ ++ set -e ++ run_cmd "$IP -6 addr flush dev dummy1" ++ run_cmd "$IP -6 addr add dev dummy1 2001:db8:104::1/64 metric 257" ++ set +e ++ ++ check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 256 2001:db8:104::/64 dev dummy1 proto kernel metric 257" ++ log_test $? 0 "User specified metric on first device" ++ ++ set -e ++ run_cmd "$IP -6 addr flush dev dummy2" ++ run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::2/64 metric 258" ++ set +e ++ ++ check_route6 "2001:db8:104::/64 dev dummy1 proto kernel metric 257 2001:db8:104::/64 dev dummy2 proto kernel metric 258" ++ log_test $? 0 "User specified metric on second device" ++ ++ run_cmd "$IP -6 addr del dev dummy1 2001:db8:104::1/64 metric 257" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 258" ++ rc=$? ++ fi ++ log_test $rc 0 "Delete of address on first device" ++ ++ run_cmd "$IP -6 addr change dev dummy2 2001:db8:104::2/64 metric 259" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 259" ++ rc=$? ++ fi ++ log_test $rc 0 "Modify metric of address" ++ ++ # verify prefix route removed on down ++ run_cmd "ip netns exec ns1 sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1" ++ run_cmd "$IP li set dev dummy2 down" ++ rc=$? 
++ if [ $rc -eq 0 ]; then ++ out=$($IP -6 ro ls match 2001:db8:104::/64) ++ check_expected "${out}" "" ++ rc=$? ++ fi ++ log_test $rc 0 "Prefix route removed on link down" ++ ++ # verify prefix route re-inserted with assigned metric ++ run_cmd "$IP li set dev dummy2 up" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 259" ++ rc=$? ++ fi ++ log_test $rc 0 "Prefix route with metric on link up" ++ ++ # verify peer metric added correctly ++ set -e ++ run_cmd "$IP -6 addr flush dev dummy2" ++ run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::1 peer 2001:db8:104::2 metric 260" ++ set +e ++ ++ check_route6 "2001:db8:104::1 dev dummy2 proto kernel metric 260" ++ log_test $? 0 "Set metric with peer route on local side" ++ check_route6 "2001:db8:104::2 dev dummy2 proto kernel metric 260" ++ log_test $? 0 "Set metric with peer route on peer side" ++ ++ set -e ++ run_cmd "$IP -6 addr change dev dummy2 2001:db8:104::1 peer 2001:db8:104::3 metric 261" ++ set +e ++ ++ check_route6 "2001:db8:104::1 dev dummy2 proto kernel metric 261" ++ log_test $? 0 "Modify metric and peer address on local side" ++ check_route6 "2001:db8:104::3 dev dummy2 proto kernel metric 261" ++ log_test $? 0 "Modify metric and peer address on peer side" ++ ++ $IP li del dummy1 ++ $IP li del dummy2 ++ cleanup ++} ++ ++ipv6_route_metrics_test() ++{ ++ local rc ++ ++ echo ++ echo "IPv6 routes with metrics" ++ ++ route_setup ++ ++ # ++ # single path with metrics ++ # ++ run_cmd "$IP -6 ro add 2001:db8:111::/64 via 2001:db8:101::2 mtu 1400" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route6 "2001:db8:111::/64 via 2001:db8:101::2 dev veth1 metric 1024 mtu 1400" ++ rc=$? ++ fi ++ log_test $rc 0 "Single path route with mtu metric" ++ ++ ++ # ++ # multipath via separate routes with metrics ++ # ++ run_cmd "$IP -6 ro add 2001:db8:112::/64 via 2001:db8:101::2 mtu 1400" ++ run_cmd "$IP -6 ro append 2001:db8:112::/64 via 2001:db8:103::2" ++ rc=$? 
++ if [ $rc -eq 0 ]; then ++ check_route6 "2001:db8:112::/64 metric 1024 mtu 1400 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" ++ rc=$? ++ fi ++ log_test $rc 0 "Multipath route via 2 single routes with mtu metric on first" ++ ++ # second route is coalesced to first to make a multipath route. ++ # MTU of the second path is hidden from display! ++ run_cmd "$IP -6 ro add 2001:db8:113::/64 via 2001:db8:101::2" ++ run_cmd "$IP -6 ro append 2001:db8:113::/64 via 2001:db8:103::2 mtu 1400" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route6 "2001:db8:113::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" ++ rc=$? ++ fi ++ log_test $rc 0 "Multipath route via 2 single routes with mtu metric on 2nd" ++ ++ run_cmd "$IP -6 ro del 2001:db8:113::/64 via 2001:db8:101::2" ++ if [ $? -eq 0 ]; then ++ check_route6 "2001:db8:113::/64 via 2001:db8:103::2 dev veth3 metric 1024 mtu 1400" ++ log_test $? 0 " MTU of second leg" ++ fi ++ ++ # ++ # multipath with metrics ++ # ++ run_cmd "$IP -6 ro add 2001:db8:115::/64 mtu 1400 nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route6 "2001:db8:115::/64 metric 1024 mtu 1400 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1" ++ rc=$? ++ fi ++ log_test $rc 0 "Multipath route with mtu metric" ++ ++ $IP -6 ro add 2001:db8:104::/64 via 2001:db8:101::2 mtu 1300 ++ run_cmd "ip netns exec ns1 ${ping6} -w1 -c1 -s 1500 2001:db8:104::1" ++ log_test $? 0 "Using route with mtu metric" ++ ++ run_cmd "$IP -6 ro add 2001:db8:114::/64 via 2001:db8:101::2 congctl lock foo" ++ log_test $? 
2 "Invalid metric (fails metric_convert)" ++ ++ route_cleanup ++} ++ ++# add route for a prefix, flushing any existing routes first ++# expected to be the first step of a test ++add_route() ++{ ++ local pfx="$1" ++ local nh="$2" ++ local out ++ ++ if [ "$VERBOSE" = "1" ]; then ++ echo ++ echo " ##################################################" ++ echo ++ fi ++ ++ run_cmd "$IP ro flush ${pfx}" ++ [ $? -ne 0 ] && exit 1 ++ ++ out=$($IP ro ls match ${pfx}) ++ if [ -n "$out" ]; then ++ echo "Failed to flush routes for prefix used for tests." ++ exit 1 ++ fi ++ ++ run_cmd "$IP ro add ${pfx} ${nh}" ++ if [ $? -ne 0 ]; then ++ echo "Failed to add initial route for test." ++ exit 1 ++ fi ++} ++ ++# add initial route - used in replace route tests ++add_initial_route() ++{ ++ add_route "172.16.104.0/24" "$1" ++} ++ ++check_route() ++{ ++ local pfx ++ local expected="$1" ++ local out ++ ++ set -- $expected ++ pfx=$1 ++ [ "${pfx}" = "unreachable" ] && pfx=$2 ++ ++ out=$($IP ro ls match ${pfx}) ++ check_expected "${out}" "${expected}" ++} ++ ++# assumption is that basic add of a single path route works ++# otherwise just adding an address on an interface is broken ++ipv4_rt_add() ++{ ++ local rc ++ ++ echo ++ echo "IPv4 route add / append tests" ++ ++ # route add same prefix - fails with EEXISTS b/c ip adds NLM_F_EXCL ++ add_route "172.16.104.0/24" "via 172.16.101.2" ++ run_cmd "$IP ro add 172.16.104.0/24 via 172.16.103.2" ++ log_test $? 2 "Attempt to add duplicate route - gw" ++ ++ # route add same prefix - fails with EEXISTS b/c ip adds NLM_F_EXCL ++ add_route "172.16.104.0/24" "via 172.16.101.2" ++ run_cmd "$IP ro add 172.16.104.0/24 dev veth3" ++ log_test $? 2 "Attempt to add duplicate route - dev only" ++ ++ # route add same prefix - fails with EEXISTS b/c ip adds NLM_F_EXCL ++ add_route "172.16.104.0/24" "via 172.16.101.2" ++ run_cmd "$IP ro add unreachable 172.16.104.0/24" ++ log_test $? 
2 "Attempt to add duplicate route - reject route" ++ ++ # iproute2 prepend only sets NLM_F_CREATE ++ # - adds a new route; does NOT convert existing route to ECMP ++ add_route "172.16.104.0/24" "via 172.16.101.2" ++ run_cmd "$IP ro prepend 172.16.104.0/24 via 172.16.103.2" ++ check_route "172.16.104.0/24 via 172.16.103.2 dev veth3 172.16.104.0/24 via 172.16.101.2 dev veth1" ++ log_test $? 0 "Add new nexthop for existing prefix" ++ ++ # route append with same prefix adds a new route ++ # - iproute2 sets NLM_F_CREATE | NLM_F_APPEND ++ add_route "172.16.104.0/24" "via 172.16.101.2" ++ run_cmd "$IP ro append 172.16.104.0/24 via 172.16.103.2" ++ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 172.16.104.0/24 via 172.16.103.2 dev veth3" ++ log_test $? 0 "Append nexthop to existing route - gw" ++ ++ add_route "172.16.104.0/24" "via 172.16.101.2" ++ run_cmd "$IP ro append 172.16.104.0/24 dev veth3" ++ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 172.16.104.0/24 dev veth3 scope link" ++ log_test $? 0 "Append nexthop to existing route - dev only" ++ ++ add_route "172.16.104.0/24" "via 172.16.101.2" ++ run_cmd "$IP ro append unreachable 172.16.104.0/24" ++ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 unreachable 172.16.104.0/24" ++ log_test $? 0 "Append nexthop to existing route - reject route" ++ ++ run_cmd "$IP ro flush 172.16.104.0/24" ++ run_cmd "$IP ro add unreachable 172.16.104.0/24" ++ run_cmd "$IP ro append 172.16.104.0/24 via 172.16.103.2" ++ check_route "unreachable 172.16.104.0/24 172.16.104.0/24 via 172.16.103.2 dev veth3" ++ log_test $? 0 "Append nexthop to existing reject route - gw" ++ ++ run_cmd "$IP ro flush 172.16.104.0/24" ++ run_cmd "$IP ro add unreachable 172.16.104.0/24" ++ run_cmd "$IP ro append 172.16.104.0/24 dev veth3" ++ check_route "unreachable 172.16.104.0/24 172.16.104.0/24 dev veth3 scope link" ++ log_test $? 
0 "Append nexthop to existing reject route - dev only" ++ ++ # insert mpath directly ++ add_route "172.16.104.0/24" "nexthop via 172.16.101.2 nexthop via 172.16.103.2" ++ check_route "172.16.104.0/24 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1" ++ log_test $? 0 "add multipath route" ++ ++ add_route "172.16.104.0/24" "nexthop via 172.16.101.2 nexthop via 172.16.103.2" ++ run_cmd "$IP ro add 172.16.104.0/24 nexthop via 172.16.101.2 nexthop via 172.16.103.2" ++ log_test $? 2 "Attempt to add duplicate multipath route" ++ ++ # insert of a second route without append but different metric ++ add_route "172.16.104.0/24" "via 172.16.101.2" ++ run_cmd "$IP ro add 172.16.104.0/24 via 172.16.103.2 metric 512" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ run_cmd "$IP ro add 172.16.104.0/24 via 172.16.103.3 metric 256" ++ rc=$? ++ fi ++ log_test $rc 0 "Route add with different metrics" ++ ++ run_cmd "$IP ro del 172.16.104.0/24 metric 512" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 172.16.104.0/24 via 172.16.103.3 dev veth3 metric 256" ++ rc=$? ++ fi ++ log_test $rc 0 "Route delete with metric" ++} ++ ++ipv4_rt_replace_single() ++{ ++ # single path with single path ++ # ++ add_initial_route "via 172.16.101.2" ++ run_cmd "$IP ro replace 172.16.104.0/24 via 172.16.103.2" ++ check_route "172.16.104.0/24 via 172.16.103.2 dev veth3" ++ log_test $? 0 "Single path with single path" ++ ++ # single path with multipath ++ # ++ add_initial_route "nexthop via 172.16.101.2" ++ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3 nexthop via 172.16.103.2" ++ check_route "172.16.104.0/24 nexthop via 172.16.101.3 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1" ++ log_test $? 
0 "Single path with multipath" ++ ++ # single path with reject ++ # ++ add_initial_route "nexthop via 172.16.101.2" ++ run_cmd "$IP ro replace unreachable 172.16.104.0/24" ++ check_route "unreachable 172.16.104.0/24" ++ log_test $? 0 "Single path with reject route" ++ ++ # single path with single path using MULTIPATH attribute ++ # ++ add_initial_route "via 172.16.101.2" ++ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.103.2" ++ check_route "172.16.104.0/24 via 172.16.103.2 dev veth3" ++ log_test $? 0 "Single path with single path via multipath attribute" ++ ++ # route replace fails - invalid nexthop ++ add_initial_route "via 172.16.101.2" ++ run_cmd "$IP ro replace 172.16.104.0/24 via 2001:db8:104::2" ++ if [ $? -eq 0 ]; then ++ # previous command is expected to fail so if it returns 0 ++ # that means the test failed. ++ log_test 0 1 "Invalid nexthop" ++ else ++ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1" ++ log_test $? 0 "Invalid nexthop" ++ fi ++ ++ # replace non-existent route ++ # - note use of change versus replace since ip adds NLM_F_CREATE ++ # for replace ++ add_initial_route "via 172.16.101.2" ++ run_cmd "$IP ro change 172.16.105.0/24 via 172.16.101.2" ++ log_test $? 2 "Single path - replace of non-existent route" ++} ++ ++ipv4_rt_replace_mpath() ++{ ++ # multipath with multipath ++ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2" ++ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3 nexthop via 172.16.103.3" ++ check_route "172.16.104.0/24 nexthop via 172.16.101.3 dev veth1 weight 1 nexthop via 172.16.103.3 dev veth3 weight 1" ++ log_test $? 0 "Multipath with multipath" ++ ++ # multipath with single ++ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2" ++ run_cmd "$IP ro replace 172.16.104.0/24 via 172.16.101.3" ++ check_route "172.16.104.0/24 via 172.16.101.3 dev veth1" ++ log_test $? 
0 "Multipath with single path" ++ ++ # multipath with single ++ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2" ++ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3" ++ check_route "172.16.104.0/24 via 172.16.101.3 dev veth1" ++ log_test $? 0 "Multipath with single path via multipath attribute" ++ ++ # multipath with reject ++ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2" ++ run_cmd "$IP ro replace unreachable 172.16.104.0/24" ++ check_route "unreachable 172.16.104.0/24" ++ log_test $? 0 "Multipath with reject route" ++ ++ # route replace fails - invalid nexthop 1 ++ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2" ++ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.111.3 nexthop via 172.16.103.3" ++ check_route "172.16.104.0/24 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1" ++ log_test $? 0 "Multipath - invalid first nexthop" ++ ++ # route replace fails - invalid nexthop 2 ++ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2" ++ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3 nexthop via 172.16.113.3" ++ check_route "172.16.104.0/24 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1" ++ log_test $? 0 "Multipath - invalid second nexthop" ++ ++ # multipath non-existent route ++ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2" ++ run_cmd "$IP ro change 172.16.105.0/24 nexthop via 172.16.101.3 nexthop via 172.16.103.3" ++ log_test $? 
2 "Multipath - replace of non-existent route" ++} ++ ++ipv4_rt_replace() ++{ ++ echo ++ echo "IPv4 route replace tests" ++ ++ ipv4_rt_replace_single ++ ipv4_rt_replace_mpath ++} ++ ++ipv4_route_test() ++{ ++ route_setup ++ ++ ipv4_rt_add ++ ipv4_rt_replace ++ ++ route_cleanup ++} ++ ++ipv4_addr_metric_test() ++{ ++ local rc ++ ++ echo ++ echo "IPv4 prefix route tests" ++ ++ ip_addr_metric_check || return 1 ++ ++ setup ++ ++ set -e ++ $IP li add dummy1 type dummy ++ $IP li add dummy2 type dummy ++ $IP li set dummy1 up ++ $IP li set dummy2 up ++ ++ # default entry is metric 256 ++ run_cmd "$IP addr add dev dummy1 172.16.104.1/24" ++ run_cmd "$IP addr add dev dummy2 172.16.104.2/24" ++ set +e ++ ++ check_route "172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2" ++ log_test $? 0 "Default metric" ++ ++ set -e ++ run_cmd "$IP addr flush dev dummy1" ++ run_cmd "$IP addr add dev dummy1 172.16.104.1/24 metric 257" ++ set +e ++ ++ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 metric 257" ++ log_test $? 0 "User specified metric on first device" ++ ++ set -e ++ run_cmd "$IP addr flush dev dummy2" ++ run_cmd "$IP addr add dev dummy2 172.16.104.2/24 metric 258" ++ set +e ++ ++ check_route "172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 metric 257 172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 258" ++ log_test $? 0 "User specified metric on second device" ++ ++ run_cmd "$IP addr del dev dummy1 172.16.104.1/24 metric 257" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 258" ++ rc=$? ++ fi ++ log_test $rc 0 "Delete of address on first device" ++ ++ run_cmd "$IP addr change dev dummy2 172.16.104.2/24 metric 259" ++ rc=$? 
++ if [ $rc -eq 0 ]; then ++ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 259" ++ rc=$? ++ fi ++ log_test $rc 0 "Modify metric of address" ++ ++ # verify prefix route removed on down ++ run_cmd "$IP li set dev dummy2 down" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ out=$($IP ro ls match 172.16.104.0/24) ++ check_expected "${out}" "" ++ rc=$? ++ fi ++ log_test $rc 0 "Prefix route removed on link down" ++ ++ # verify prefix route re-inserted with assigned metric ++ run_cmd "$IP li set dev dummy2 up" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 259" ++ rc=$? ++ fi ++ log_test $rc 0 "Prefix route with metric on link up" ++ ++ # explicitly check for metric changes on edge scenarios ++ run_cmd "$IP addr flush dev dummy2" ++ run_cmd "$IP addr add dev dummy2 172.16.104.0/24 metric 259" ++ run_cmd "$IP addr change dev dummy2 172.16.104.0/24 metric 260" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.0 metric 260" ++ rc=$? ++ fi ++ log_test $rc 0 "Modify metric of .0/24 address" ++ ++ run_cmd "$IP addr flush dev dummy2" ++ run_cmd "$IP addr add dev dummy2 172.16.104.1/32 peer 172.16.104.2 metric 260" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route "172.16.104.2 dev dummy2 proto kernel scope link src 172.16.104.1 metric 260" ++ rc=$? ++ fi ++ log_test $rc 0 "Set metric of address with peer route" ++ ++ run_cmd "$IP addr change dev dummy2 172.16.104.1/32 peer 172.16.104.3 metric 261" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route "172.16.104.3 dev dummy2 proto kernel scope link src 172.16.104.1 metric 261" ++ rc=$? 
++ fi ++ log_test $rc 0 "Modify metric and peer address for peer route" ++ ++ $IP li del dummy1 ++ $IP li del dummy2 ++ cleanup ++} ++ ++ipv4_route_metrics_test() ++{ ++ local rc ++ ++ echo ++ echo "IPv4 route add / append tests" ++ ++ route_setup ++ ++ run_cmd "$IP ro add 172.16.111.0/24 via 172.16.101.2 mtu 1400" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route "172.16.111.0/24 via 172.16.101.2 dev veth1 mtu 1400" ++ rc=$? ++ fi ++ log_test $rc 0 "Single path route with mtu metric" ++ ++ ++ run_cmd "$IP ro add 172.16.112.0/24 mtu 1400 nexthop via 172.16.101.2 nexthop via 172.16.103.2" ++ rc=$? ++ if [ $rc -eq 0 ]; then ++ check_route "172.16.112.0/24 mtu 1400 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1" ++ rc=$? ++ fi ++ log_test $rc 0 "Multipath route with mtu metric" ++ ++ $IP ro add 172.16.104.0/24 via 172.16.101.2 mtu 1300 ++ run_cmd "ip netns exec ns1 ping -w1 -c1 -s 1500 172.16.104.1" ++ log_test $? 0 "Using route with mtu metric" ++ ++ run_cmd "$IP ro add 172.16.111.0/24 via 172.16.101.2 congctl lock foo" ++ log_test $? 2 "Invalid metric (fails metric_convert)" ++ ++ route_cleanup ++} ++ ++ipv4_route_v6_gw_test() ++{ ++ local rc ++ ++ echo ++ echo "IPv4 route with IPv6 gateway tests" ++ ++ route_setup ++ sleep 2 ++ ++ # ++ # single path route ++ # ++ run_cmd "$IP ro add 172.16.104.0/24 via inet6 2001:db8:101::2" ++ rc=$? ++ log_test $rc 0 "Single path route with IPv6 gateway" ++ if [ $rc -eq 0 ]; then ++ check_route "172.16.104.0/24 via inet6 2001:db8:101::2 dev veth1" ++ fi ++ ++ run_cmd "ip netns exec ns1 ping -w1 -c1 172.16.104.1" ++ log_test $rc 0 "Single path route with IPv6 gateway - ping" ++ ++ run_cmd "$IP ro del 172.16.104.0/24 via inet6 2001:db8:101::2" ++ rc=$? 
++ log_test $rc 0 "Single path route delete" ++ if [ $rc -eq 0 ]; then ++ check_route "172.16.112.0/24" ++ fi ++ ++ # ++ # multipath - v6 then v4 ++ # ++ run_cmd "$IP ro add 172.16.104.0/24 nexthop via inet6 2001:db8:101::2 dev veth1 nexthop via 172.16.103.2 dev veth3" ++ rc=$? ++ log_test $rc 0 "Multipath route add - v6 nexthop then v4" ++ if [ $rc -eq 0 ]; then ++ check_route "172.16.104.0/24 nexthop via inet6 2001:db8:101::2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1" ++ fi ++ ++ run_cmd "$IP ro del 172.16.104.0/24 nexthop via 172.16.103.2 dev veth3 nexthop via inet6 2001:db8:101::2 dev veth1" ++ log_test $? 2 " Multipath route delete - nexthops in wrong order" ++ ++ run_cmd "$IP ro del 172.16.104.0/24 nexthop via inet6 2001:db8:101::2 dev veth1 nexthop via 172.16.103.2 dev veth3" ++ log_test $? 0 " Multipath route delete exact match" ++ ++ # ++ # multipath - v4 then v6 ++ # ++ run_cmd "$IP ro add 172.16.104.0/24 nexthop via 172.16.103.2 dev veth3 nexthop via inet6 2001:db8:101::2 dev veth1" ++ rc=$? ++ log_test $rc 0 "Multipath route add - v4 nexthop then v6" ++ if [ $rc -eq 0 ]; then ++ check_route "172.16.104.0/24 nexthop via 172.16.103.2 dev veth3 weight 1 nexthop via inet6 2001:db8:101::2 dev veth1 weight 1" ++ fi ++ ++ run_cmd "$IP ro del 172.16.104.0/24 nexthop via inet6 2001:db8:101::2 dev veth1 nexthop via 172.16.103.2 dev veth3" ++ log_test $? 2 " Multipath route delete - nexthops in wrong order" ++ ++ run_cmd "$IP ro del 172.16.104.0/24 nexthop via 172.16.103.2 dev veth3 nexthop via inet6 2001:db8:101::2 dev veth1" ++ log_test $? 
0 " Multipath route delete exact match" ++ ++ route_cleanup ++} ++ ++################################################################################ ++# usage ++ ++usage() ++{ ++ cat <<EOF ++usage: ${0##*/} OPTS ++ ++ -t <test> Test(s) to run (default: all) ++ (options: $TESTS) ++ -p Pause on fail ++ -P Pause after each test before cleanup ++ -v verbose mode (show commands and output) ++EOF ++} ++ ++################################################################################ ++# main ++ ++while getopts :t:pPhv o ++do ++ case $o in ++ t) TESTS=$OPTARG;; ++ p) PAUSE_ON_FAIL=yes;; ++ P) PAUSE=yes;; ++ v) VERBOSE=$(($VERBOSE + 1));; ++ h) usage; exit 0;; ++ *) usage; exit 1;; ++ esac ++done ++ ++PEER_CMD="ip netns exec ${PEER_NS}" ++ ++# make sure we don't pause twice ++[ "${PAUSE}" = "yes" ] && PAUSE_ON_FAIL=no ++ ++if [ "$(id -u)" -ne 0 ];then ++ echo "SKIP: Need root privileges" ++ exit $ksft_skip; ++fi ++ ++if [ ! -x "$(command -v ip)" ]; then ++ echo "SKIP: Could not run test without ip tool" ++ exit $ksft_skip ++fi ++ ++ip route help 2>&1 | grep -q fibmatch ++if [ $?
-ne 0 ]; then ++ echo "SKIP: iproute2 too old, missing fibmatch" ++ exit $ksft_skip ++fi ++ ++# start clean ++cleanup &> /dev/null ++ ++for t in $TESTS ++do ++ case $t in ++ fib_unreg_test|unregister) fib_unreg_test;; ++ fib_down_test|down) fib_down_test;; ++ fib_carrier_test|carrier) fib_carrier_test;; ++ fib_rp_filter_test|rp_filter) fib_rp_filter_test;; ++ fib_nexthop_test|nexthop) fib_nexthop_test;; ++ fib_suppress_test|suppress) fib_suppress_test;; ++ ipv6_route_test|ipv6_rt) ipv6_route_test;; ++ ipv4_route_test|ipv4_rt) ipv4_route_test;; ++ ipv6_addr_metric) ipv6_addr_metric_test;; ++ ipv4_addr_metric) ipv4_addr_metric_test;; ++ ipv6_route_metrics) ipv6_route_metrics_test;; ++ ipv4_route_metrics) ipv4_route_metrics_test;; ++ ipv4_route_v6_gw) ipv4_route_v6_gw_test;; ++ ++ help) echo "Test names: $TESTS"; exit 0;; ++ esac ++done ++ ++if [ "$TESTS" != "none" ]; then ++ printf "\nTests passed: %3d\n" ${nsuccess} ++ printf "Tests failed: %3d\n" ${nfail} ++fi ++ ++exit $ret diff --git a/queue-5.4/series b/queue-5.4/series index ce8a39f8697..9a7c49c5872 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -99,3 +99,30 @@ nvme-fc-fix-a-missing-queue-put-in-nvmet_fc_ls_creat.patch aio-fix-mremap-after-fork-null-deref.patch btrfs-free-device-in-btrfs_close_devices-for-a-single-device-filesystem.patch netfilter-nft_tproxy-restrict-to-prerouting-hook.patch +xfs-remove-the-xfs_efi_log_item_t-typedef.patch +xfs-remove-the-xfs_efd_log_item_t-typedef.patch +xfs-remove-the-xfs_inode_log_item_t-typedef.patch +xfs-factor-out-a-xfs_defer_create_intent-helper.patch +xfs-merge-the-log_item-defer-op-into-create_intent.patch +xfs-merge-the-diff_items-defer-op-into-create_intent.patch +xfs-turn-dfp_intent-into-a-xfs_log_item.patch +xfs-refactor-xfs_defer_finish_noroll.patch +xfs-log-new-intent-items-created-as-part-of-finishing-recovered-intent-items.patch +xfs-fix-finobt-btree-block-recovery-ordering.patch +xfs-proper-replay-of-deferred-ops-queued-during-log-recovery.patch 
+xfs-xfs_defer_capture-should-absorb-remaining-block-reservations.patch +xfs-xfs_defer_capture-should-absorb-remaining-transaction-reservation.patch +xfs-clean-up-bmap-intent-item-recovery-checking.patch +xfs-clean-up-xfs_bui_item_recover-iget-trans_alloc-ilock-ordering.patch +xfs-fix-an-incore-inode-uaf-in-xfs_bui_recover.patch +xfs-change-the-order-in-which-child-and-parent-defer-ops-are-finished.patch +xfs-periodically-relog-deferred-intent-items.patch +xfs-expose-the-log-push-threshold.patch +xfs-only-relog-deferred-intent-items-if-free-space-in-the-log-gets-low.patch +xfs-fix-missing-cow-blocks-writeback-conversion-retry.patch +xfs-ensure-inobt-record-walks-always-make-forward-progress.patch +xfs-fix-the-forward-progress-assertion-in-xfs_iwalk_run_callbacks.patch +xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch +xfs-sync-lazy-sb-accounting-on-quiesce-of-read-only-mounts.patch +revert-ipv4-fix-incorrect-route-flushing-when-source-address-is-deleted.patch +ipv4-fix-incorrect-route-flushing-when-source-address-is-deleted.patch diff --git a/queue-5.4/xfs-change-the-order-in-which-child-and-parent-defer-ops-are-finished.patch b/queue-5.4/xfs-change-the-order-in-which-child-and-parent-defer-ops-are-finished.patch new file mode 100644 index 00000000000..a5d5505e8ae --- /dev/null +++ b/queue-5.4/xfs-change-the-order-in-which-child-and-parent-defer-ops-are-finished.patch @@ -0,0 +1,216 @@ +From chandan.babu@oracle.com Thu Feb 16 06:22:40 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:11 +0530 +Subject: xfs: change the order in which child and parent defer ops are finished +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-18-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit 27dada070d59c28a441f1907d2cec891b17dcb26 upstream. 
+ +The defer ops code has been finishing items in the wrong order -- if a +top level defer op creates items A and B, and finishing item A creates +more defer ops A1 and A2, we'll put the new items on the end of the +chain and process them in the order A B A1 A2. This is kind of weird, +since it's convenient for programmers to be able to think of A and B as +an ordered sequence where all the sub-tasks for A must finish before we +move on to B, e.g. A A1 A2 B. + +Right now, our log intent items are not so complex that this matters, +but this will become important for the atomic extent swapping patchset. +In order to maintain correct reference counting of extents, we have to +unmap and remap extents in that order, and we want to complete that work +before moving on to the next range that the user wants to swap. This +patch fixes defer ops to satisfy that requirement. + +The primary symptom of the incorrect order was noticed in an early +performance analysis of the atomic extent swap code. An astonishingly +large number of deferred work items accumulated when userspace requested +an atomic update of two very fragmented files. The cause of this was +traced to the same ordering bug in the inner loop of +xfs_defer_finish_noroll. + +If the ->finish_item method of a deferred operation queues new deferred +operations, those new deferred ops are appended to the tail of the +pending work list. To illustrate, say that a caller creates a +transaction t0 with four deferred operations D0-D3. The first thing +defer ops does is roll the transaction to t1, leaving us with: + +t1: D0(t0), D1(t0), D2(t0), D3(t0) + +Let's say that finishing each of D0-D3 will create two new deferred ops. +After finishing D0 and rolling, we'll have the following chain: + +t2: D1(t0), D2(t0), D3(t0), d4(t1), d5(t1) + +d4 and d5 were logged to t1. Notice that while we're about to start +work on D1, we haven't actually completed all the work implied by D0 +being finished.
So far we've been careful (or lucky) to structure the +dfops callers such that D1 doesn't depend on d4 or d5 being finished, +but this is a potential logic bomb. + +There's a second problem lurking. Let's see what happens as we finish +D1-D3: + +t3: D2(t0), D3(t0), d4(t1), d5(t1), d6(t2), d7(t2) +t4: D3(t0), d4(t1), d5(t1), d6(t2), d7(t2), d8(t3), d9(t3) +t5: d4(t1), d5(t1), d6(t2), d7(t2), d8(t3), d9(t3), d10(t4), d11(t4) + +Let's say that d4-d11 are simple work items that don't queue any other +operations, which means that we can complete each d4 and roll to t6: + +t6: d5(t1), d6(t2), d7(t2), d8(t3), d9(t3), d10(t4), d11(t4) +t7: d6(t2), d7(t2), d8(t3), d9(t3), d10(t4), d11(t4) +... +t11: d10(t4), d11(t4) +t12: d11(t4) + + +When we try to roll to transaction #12, we're holding defer op d11, +which we logged way back in t4. This means that the tail of the log is +pinned at t4. If the log is very small or there are a lot of other +threads updating metadata, this means that we might have wrapped the log +and cannot roll to t11 because there isn't enough space left before +we'd run into t4. + +Let's shift back to the original failure. I mentioned before that I +discovered this flaw while developing the atomic file update code. In +that scenario, we have a defer op (D0) that finds a range of file blocks +to remap, creates a handful of new defer ops to do that, and then asks +to be continued with however much work remains. + +So, D0 is the original swapext deferred op. The first thing defer ops +does is roll to t1: + +t1: D0(t0) + +We try to finish D0, logging d1 and d2 in the process, but can't get all +the work done.
We log a done item and a new intent item for the work +that D0 still has to do, and roll to t2: + +t2: D0'(t1), d1(t1), d2(t1) + +We roll and try to finish D0', but still can't get all the work done, so +we log a done item and a new intent item for it, requeue D0 a second +time, and roll to t3: + +t3: D0''(t2), d1(t1), d2(t1), d3(t2), d4(t2) + +If it takes 48 more rolls to complete D0, then we'll finally dispense +with D0 in t50: + +t50: D(t49), d1(t1), ..., d102(t50) + +We then try to roll again to get a chain like this: + +t51: d1(t1), d2(t1), ..., d101(t50), d102(t50) +... +t152: d102(t50) + + +Notice that in rolling to transaction #51, we're holding on to a log +intent item for d1 that was logged in transaction #1. This means that +the tail of the log is pinned at t1. If the log is very small or there +are a lot of other threads updating metadata, this means that we might +have wrapped the log and cannot roll to t51 because there isn't enough +space left before we'd run into t1. This is of course problem #2 again. + +But notice the third problem with this scenario: we have 102 defer ops +tied to this transaction! Each of these items are backed by pinned +kernel memory, which means that we risk OOM if the chains get too long. + +Yikes. Problem #1 is a subtle logic bomb that could hit someone in the +future; problem #2 applies (rarely) to the current upstream, and problem + +This is not how incremental deferred operations were supposed to work. +The dfops design of logging in the same transaction an intent-done item +and a new intent item for the work remaining was to make it so that we +only have to juggle enough deferred work items to finish that one small +piece of work. Deferred log item recovery will find that first +unfinished work item and restart it, no matter how many other intent +items might follow it in the log. Therefore, it's ok to put the new +intents at the start of the dfops chain. 
+ +For the first example, the chains look like this: + +t2: d4(t1), d5(t1), D1(t0), D2(t0), D3(t0) +t3: d5(t1), D1(t0), D2(t0), D3(t0) +... +t9: d9(t7), D3(t0) +t10: D3(t0) +t11: d10(t10), d11(t10) +t12: d11(t10) + +For the second example, the chains look like this: + +t1: D0(t0) +t2: d1(t1), d2(t1), D0'(t1) +t3: d2(t1), D0'(t1) +t4: D0'(t1) +t5: d1(t4), d2(t4), D0''(t4) +... +t148: D0<50 primes>(t147) +t149: d101(t148), d102(t148) +t150: d102(t148) + + +This actually sucks more for pinning the log tail (we try to roll to t10 +while holding an intent item that was logged in t1) but we've solved +problem #1. We've also reduced the maximum chain length from: + + sum(all the new items) + nr_original_items + +to: + + max(new items that each original item creates) + nr_original_items + +This solves problem #3 by sharply reducing the number of defer ops that +can be attached to a transaction at any given time. The change makes +the problem of log tail pinning worse, but it is an improvement we need to +solve problem #2. Actually solving #2, however, is left to the next +patch. + +Note that a subsequent analysis of some hard-to-trigger reflink and COW +livelocks on extremely fragmented filesystems (or systems running a lot +of IO threads) showed the same symptoms -- uncomfortably large numbers +of incore deferred work items and occasional stalls in the transaction +grant code while waiting for log reservations. I think this patch and +the next one will also solve these problems. + +As originally written, the code used list_splice_tail_init instead of +list_splice_init, so change that, and leave a short comment explaining +our actions. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Reviewed-by: Brian Foster +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_defer.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_defer.c ++++ b/fs/xfs/libxfs/xfs_defer.c +@@ -431,8 +431,17 @@ xfs_defer_finish_noroll( + + /* Until we run out of pending work to finish... */ + while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { ++ /* ++ * Deferred items that are created in the process of finishing ++ * other deferred work items should be queued at the head of ++ * the pending list, which puts them ahead of the deferred work ++ * that was created by the caller. This keeps the number of ++ * pending work items to a minimum, which decreases the amount ++ * of time that any one intent item can stick around in memory, ++ * pinning the log tail. ++ */ + xfs_defer_create_intents(*tp); +- list_splice_tail_init(&(*tp)->t_dfops, &dop_pending); ++ list_splice_init(&(*tp)->t_dfops, &dop_pending); + + error = xfs_defer_trans_roll(tp); + if (error) diff --git a/queue-5.4/xfs-clean-up-bmap-intent-item-recovery-checking.patch b/queue-5.4/xfs-clean-up-bmap-intent-item-recovery-checking.patch new file mode 100644 index 00000000000..8864a53d473 --- /dev/null +++ b/queue-5.4/xfs-clean-up-bmap-intent-item-recovery-checking.patch @@ -0,0 +1,100 @@ +From chandan.babu@oracle.com Thu Feb 16 06:22:15 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:08 +0530 +Subject: xfs: clean up bmap intent item recovery checking +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-15-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit 919522e89f8e71fc6a8f8abe17be4011573c6ea0 upstream. + +The bmap intent item checking code in xfs_bui_item_recover is spread all +over the function. 
We should check the recovered log item at the top +before we allocate any resources or do anything else, so do that. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_bmap_item.c | 38 ++++++++++++-------------------------- + 1 file changed, 12 insertions(+), 26 deletions(-) + +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -434,9 +434,7 @@ xfs_bui_recover( + xfs_fsblock_t startblock_fsb; + xfs_fsblock_t inode_fsb; + xfs_filblks_t count; +- bool op_ok; + struct xfs_bud_log_item *budp; +- enum xfs_bmap_intent_type type; + int whichfork; + xfs_exntst_t state; + struct xfs_trans *tp; +@@ -462,16 +460,19 @@ xfs_bui_recover( + XFS_FSB_TO_DADDR(mp, bmap->me_startblock)); + inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp, + XFS_INO_TO_FSB(mp, bmap->me_owner))); +- switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { ++ state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? ++ XFS_EXT_UNWRITTEN : XFS_EXT_NORM; ++ whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? ++ XFS_ATTR_FORK : XFS_DATA_FORK; ++ bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; ++ switch (bui_type) { + case XFS_BMAP_MAP: + case XFS_BMAP_UNMAP: +- op_ok = true; + break; + default: +- op_ok = false; +- break; ++ return -EFSCORRUPTED; + } +- if (!op_ok || startblock_fsb == 0 || ++ if (startblock_fsb == 0 || + bmap->me_len == 0 || + inode_fsb == 0 || + startblock_fsb >= mp->m_sb.sb_dblocks || +@@ -502,32 +503,17 @@ xfs_bui_recover( + if (VFS_I(ip)->i_nlink == 0) + xfs_iflags_set(ip, XFS_IRECOVERY); + +- /* Process deferred bmap item. */ +- state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? +- XFS_EXT_UNWRITTEN : XFS_EXT_NORM; +- whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? 
+- XFS_ATTR_FORK : XFS_DATA_FORK; +- bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; +- switch (bui_type) { +- case XFS_BMAP_MAP: +- case XFS_BMAP_UNMAP: +- type = bui_type; +- break; +- default: +- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); +- error = -EFSCORRUPTED; +- goto err_inode; +- } + xfs_trans_ijoin(tp, ip, 0); + + count = bmap->me_len; +- error = xfs_trans_log_finish_bmap_update(tp, budp, type, ip, whichfork, +- bmap->me_startoff, bmap->me_startblock, &count, state); ++ error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, ++ whichfork, bmap->me_startoff, bmap->me_startblock, ++ &count, state); + if (error) + goto err_inode; + + if (count > 0) { +- ASSERT(type == XFS_BMAP_UNMAP); ++ ASSERT(bui_type == XFS_BMAP_UNMAP); + irec.br_startblock = bmap->me_startblock; + irec.br_blockcount = count; + irec.br_startoff = bmap->me_startoff; diff --git a/queue-5.4/xfs-clean-up-xfs_bui_item_recover-iget-trans_alloc-ilock-ordering.patch b/queue-5.4/xfs-clean-up-xfs_bui_item_recover-iget-trans_alloc-ilock-ordering.patch new file mode 100644 index 00000000000..09c140b8daf --- /dev/null +++ b/queue-5.4/xfs-clean-up-xfs_bui_item_recover-iget-trans_alloc-ilock-ordering.patch @@ -0,0 +1,108 @@ +From chandan.babu@oracle.com Thu Feb 16 06:22:24 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:09 +0530 +Subject: xfs: clean up xfs_bui_item_recover iget/trans_alloc/ilock ordering +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-16-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit 64a3f3315bc60f710a0a25c1798ac0ea58c6fa1f upstream. + +In most places in XFS, we have a specific order in which we gather +resources: grab the inode, allocate a transaction, then lock the inode. 
+xfs_bui_item_recover doesn't do it in that order, so fix it to be more +consistent. This also makes the error bailout code a bit less weird. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Reviewed-by: Brian Foster +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_bmap_item.c | 38 ++++++++++++++++++++++++-------------- + 1 file changed, 24 insertions(+), 14 deletions(-) + +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -22,6 +22,7 @@ + #include "xfs_bmap_btree.h" + #include "xfs_trans_space.h" + #include "xfs_error.h" ++#include "xfs_quota.h" + + kmem_zone_t *xfs_bui_zone; + kmem_zone_t *xfs_bud_zone; +@@ -488,21 +489,26 @@ xfs_bui_recover( + return -EFSCORRUPTED; + } + +- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, +- XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); ++ /* Grab the inode. */ ++ error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip); + if (error) + return error; + +- budp = xfs_trans_get_bud(tp, buip); +- +- /* Grab the inode. */ +- error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip); ++ error = xfs_qm_dqattach(ip); + if (error) +- goto err_inode; ++ goto err_rele; + + if (VFS_I(ip)->i_nlink == 0) + xfs_iflags_set(ip, XFS_IRECOVERY); + ++ /* Allocate transaction and do the work. 
*/ ++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, ++ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); ++ if (error) ++ goto err_rele; ++ ++ budp = xfs_trans_get_bud(tp, buip); ++ xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + count = bmap->me_len; +@@ -510,7 +516,7 @@ xfs_bui_recover( + whichfork, bmap->me_startoff, bmap->me_startblock, + &count, state); + if (error) +- goto err_inode; ++ goto err_cancel; + + if (count > 0) { + ASSERT(bui_type == XFS_BMAP_UNMAP); +@@ -522,16 +528,20 @@ xfs_bui_recover( + } + + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); ++ /* Commit transaction, which frees the transaction. */ + error = xfs_defer_ops_capture_and_commit(tp, capture_list); ++ if (error) ++ goto err_unlock; ++ + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); +- return error; ++ return 0; + +-err_inode: ++err_cancel: + xfs_trans_cancel(tp); +- if (ip) { +- xfs_iunlock(ip, XFS_ILOCK_EXCL); +- xfs_irele(ip); +- } ++err_unlock: ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); ++err_rele: ++ xfs_irele(ip); + return error; + } diff --git a/queue-5.4/xfs-ensure-inobt-record-walks-always-make-forward-progress.patch b/queue-5.4/xfs-ensure-inobt-record-walks-always-make-forward-progress.patch new file mode 100644 index 00000000000..b670d4ce02e --- /dev/null +++ b/queue-5.4/xfs-ensure-inobt-record-walks-always-make-forward-progress.patch @@ -0,0 +1,142 @@ +From stable-owner@vger.kernel.org Thu Feb 16 07:28:47 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:16 +0530 +Subject: xfs: ensure inobt record walks always make forward progress +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-23-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit 27c14b5daa82861220d6fa6e27b51f05f21ffaa7 upstream. 
+ +[ In xfs_iwalk_ag(), replace a call to XFS_IS_CORRUPT() with a call to + ASSERT() ] + +The aim of the inode btree record iterator function is to call a +callback on every record in the btree. To avoid having to tear down and +recreate the inode btree cursor around every callback, it caches a +certain number of records in a memory buffer. After each batch of +callback invocations, we have to perform a btree lookup to find the +next record after where we left off. + +However, if the keys of the inode btree are corrupt, the lookup might +put us in the wrong part of the inode btree, causing the walk function +to loop forever. Therefore, we add extra cursor tracking to make sure +that we never go backwards, either when performing the lookup or when +jumping to the next inobt record. This also fixes an off-by-one error +where upon resume the lookup should have been for the inode /after/ the +point at which we stopped. + +Found by fuzzing xfs/460 with keys[2].startino = ones causing bulkstat +and quotacheck to hang. + +Fixes: a211432c27ff ("xfs: create simplified inode walk function") +Signed-off-by: Darrick J. Wong +Reviewed-by: Chandan Babu R +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_iwalk.c | 27 ++++++++++++++++++++++++--- + 1 file changed, 24 insertions(+), 3 deletions(-) + +--- a/fs/xfs/xfs_iwalk.c ++++ b/fs/xfs/xfs_iwalk.c +@@ -55,6 +55,9 @@ struct xfs_iwalk_ag { + /* Where do we start the traversal? */ + xfs_ino_t startino; + ++ /* What was the last inode number we saw when iterating the inobt? */ ++ xfs_ino_t lastino; ++ + /* Array of inobt records we cache. 
*/ + struct xfs_inobt_rec_incore *recs; + +@@ -300,6 +303,9 @@ xfs_iwalk_ag_start( + return error; + XFS_WANT_CORRUPTED_RETURN(mp, *has_more == 1); + ++ iwag->lastino = XFS_AGINO_TO_INO(mp, agno, ++ irec->ir_startino + XFS_INODES_PER_CHUNK - 1); ++ + /* + * If the LE lookup yielded an inobt record before the cursor position, + * skip it and see if there's another one after it. +@@ -346,15 +352,17 @@ xfs_iwalk_run_callbacks( + struct xfs_mount *mp = iwag->mp; + struct xfs_trans *tp = iwag->tp; + struct xfs_inobt_rec_incore *irec; +- xfs_agino_t restart; ++ xfs_agino_t next_agino; + int error; + ++ next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1; ++ + ASSERT(iwag->nr_recs > 0); + + /* Delete cursor but remember the last record we cached... */ + xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); + irec = &iwag->recs[iwag->nr_recs - 1]; +- restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1; ++ ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK); + + error = xfs_iwalk_ag_recs(iwag); + if (error) +@@ -371,7 +379,7 @@ xfs_iwalk_run_callbacks( + if (error) + return error; + +- return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more); ++ return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more); + } + + /* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */ +@@ -395,6 +403,7 @@ xfs_iwalk_ag( + + while (!error && has_more) { + struct xfs_inobt_rec_incore *irec; ++ xfs_ino_t rec_fsino; + + cond_resched(); + if (xfs_pwork_want_abort(&iwag->pwork)) +@@ -406,6 +415,15 @@ xfs_iwalk_ag( + if (error || !has_more) + break; + ++ /* Make sure that we always move forward. */ ++ rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino); ++ if (iwag->lastino != NULLFSINO && iwag->lastino >= rec_fsino) { ++ ASSERT(iwag->lastino < rec_fsino); ++ error = -EFSCORRUPTED; ++ goto out; ++ } ++ iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1; ++ + /* No allocated inodes in this chunk; skip it. 
*/ + if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) { + error = xfs_btree_increment(cur, 0, &has_more); +@@ -534,6 +552,7 @@ xfs_iwalk( + .trim_start = 1, + .skip_empty = 1, + .pwork = XFS_PWORK_SINGLE_THREADED, ++ .lastino = NULLFSINO, + }; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); + int error; +@@ -622,6 +641,7 @@ xfs_iwalk_threaded( + iwag->data = data; + iwag->startino = startino; + iwag->sz_recs = xfs_iwalk_prefetch(inode_records); ++ iwag->lastino = NULLFSINO; + xfs_pwork_queue(&pctl, &iwag->pwork); + startino = XFS_AGINO_TO_INO(mp, agno + 1, 0); + if (flags & XFS_INOBT_WALK_SAME_AG) +@@ -695,6 +715,7 @@ xfs_inobt_walk( + .startino = startino, + .sz_recs = xfs_inobt_walk_prefetch(inobt_records), + .pwork = XFS_PWORK_SINGLE_THREADED, ++ .lastino = NULLFSINO, + }; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino); + int error; diff --git a/queue-5.4/xfs-expose-the-log-push-threshold.patch b/queue-5.4/xfs-expose-the-log-push-threshold.patch new file mode 100644 index 00000000000..8ab410b30cd --- /dev/null +++ b/queue-5.4/xfs-expose-the-log-push-threshold.patch @@ -0,0 +1,118 @@ +From stable-owner@vger.kernel.org Thu Feb 16 07:30:39 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:13 +0530 +Subject: xfs: expose the log push threshold +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-20-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit ed1575daf71e4e21d8ae735b6e687c95454aaa17 upstream. + +Separate the computation of the log push threshold and the push logic in +xlog_grant_push_ail. This enables higher level code to determine (for +example) that it is holding on to a logged intent item and the log is so +busy that it is more than 75% full. 
In that case, it would be desirable +to move the log item towards the head to release the tail, which we will +cover in the next patch. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_icreate_item.c | 1 + + fs/xfs/xfs_log.c | 40 ++++++++++++++++++++++++++++++---------- + fs/xfs/xfs_log.h | 2 ++ + 3 files changed, 33 insertions(+), 10 deletions(-) + +--- a/fs/xfs/xfs_icreate_item.c ++++ b/fs/xfs/xfs_icreate_item.c +@@ -10,6 +10,7 @@ + #include "xfs_trans.h" + #include "xfs_trans_priv.h" + #include "xfs_icreate_item.h" ++#include "xfs_log_priv.h" + #include "xfs_log.h" + + kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -1537,14 +1537,14 @@ xlog_commit_record( + } + + /* +- * Push on the buffer cache code if we ever use more than 75% of the on-disk +- * log space. This code pushes on the lsn which would supposedly free up +- * the 25% which we want to leave free. We may need to adopt a policy which +- * pushes on an lsn which is further along in the log once we reach the high +- * water mark. In this manner, we would be creating a low water mark. ++ * Compute the LSN that we'd need to push the log tail towards in order to have ++ * (a) enough on-disk log space to log the number of bytes specified, (b) at ++ * least 25% of the log space free, and (c) at least 256 blocks free. If the ++ * log free space already meets all three thresholds, this function returns ++ * NULLCOMMITLSN. 
+ */ +-STATIC void +-xlog_grant_push_ail( ++xfs_lsn_t ++xlog_grant_push_threshold( + struct xlog *log, + int need_bytes) + { +@@ -1570,7 +1570,7 @@ xlog_grant_push_ail( + free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); + free_threshold = max(free_threshold, 256); + if (free_blocks >= free_threshold) +- return; ++ return NULLCOMMITLSN; + + xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, + &threshold_block); +@@ -1590,13 +1590,33 @@ xlog_grant_push_ail( + if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) + threshold_lsn = last_sync_lsn; + ++ return threshold_lsn; ++} ++ ++/* ++ * Push the tail of the log if we need to do so to maintain the free log space ++ * thresholds set out by xlog_grant_push_threshold. We may need to adopt a ++ * policy which pushes on an lsn which is further along in the log once we ++ * reach the high water mark. In this manner, we would be creating a low water ++ * mark. ++ */ ++STATIC void ++xlog_grant_push_ail( ++ struct xlog *log, ++ int need_bytes) ++{ ++ xfs_lsn_t threshold_lsn; ++ ++ threshold_lsn = xlog_grant_push_threshold(log, need_bytes); ++ if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log)) ++ return; ++ + /* + * Get the transaction layer to kick the dirty buffers out to + * disk asynchronously. No point in trying to do this if + * the filesystem is shutting down. 
*/ +- if (!XLOG_FORCED_SHUTDOWN(log)) +- xfs_ail_push(log->l_ailp, threshold_lsn); ++ xfs_ail_push(log->l_ailp, threshold_lsn); + } + + /* +--- a/fs/xfs/xfs_log.h ++++ b/fs/xfs/xfs_log.h +@@ -146,4 +146,6 @@ void xfs_log_quiesce(struct xfs_mount *m + bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); + bool xfs_log_in_recovery(struct xfs_mount *); + ++xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); ++ + #endif /* __XFS_LOG_H__ */ diff --git a/queue-5.4/xfs-factor-out-a-xfs_defer_create_intent-helper.patch b/queue-5.4/xfs-factor-out-a-xfs_defer_create_intent-helper.patch new file mode 100644 index 00000000000..051cdcba794 --- /dev/null +++ b/queue-5.4/xfs-factor-out-a-xfs_defer_create_intent-helper.patch @@ -0,0 +1,96 @@ +From chandan.babu@oracle.com Thu Feb 16 06:21:02 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:49:58 +0530 +Subject: xfs: factor out a xfs_defer_create_intent helper +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-5-chandan.babu@oracle.com> + +From: Christoph Hellwig + +commit e046e949486ec92d83b2ccdf0e7e9144f74ef028 upstream. + +Create a helper that encapsulates the whole logic to create a defer +intent. This reorders some of the work that was done, but none of +that has an effect on the operation as only fields that don't directly +interact are affected. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_defer.c | 39 +++++++++++++++++++++++---------------- + 1 file changed, 23 insertions(+), 16 deletions(-) + +--- a/fs/xfs/libxfs/xfs_defer.c ++++ b/fs/xfs/libxfs/xfs_defer.c +@@ -178,6 +178,23 @@ static const struct xfs_defer_op_type *d + [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type, + }; + ++static void ++xfs_defer_create_intent( ++ struct xfs_trans *tp, ++ struct xfs_defer_pending *dfp, ++ bool sort) ++{ ++ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; ++ struct list_head *li; ++ ++ if (sort) ++ list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items); ++ ++ dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count); ++ list_for_each(li, &dfp->dfp_work) ++ ops->log_item(tp, dfp->dfp_intent, li); ++} ++ + /* + * For each pending item in the intake list, log its intent item and the + * associated extents, then add the entire intake list to the end of +@@ -187,17 +204,11 @@ STATIC void + xfs_defer_create_intents( + struct xfs_trans *tp) + { +- struct list_head *li; + struct xfs_defer_pending *dfp; +- const struct xfs_defer_op_type *ops; + + list_for_each_entry(dfp, &tp->t_dfops, dfp_list) { +- ops = defer_op_types[dfp->dfp_type]; +- dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count); + trace_xfs_defer_create_intent(tp->t_mountp, dfp); +- list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items); +- list_for_each(li, &dfp->dfp_work) +- ops->log_item(tp, dfp->dfp_intent, li); ++ xfs_defer_create_intent(tp, dfp, true); + } + } + +@@ -427,17 +438,13 @@ xfs_defer_finish_noroll( + } + if (error == -EAGAIN) { + /* +- * Caller wants a fresh transaction, so log a +- * new log intent item to replace the old one +- * and roll the transaction. See "Requesting +- * a Fresh Transaction while Finishing +- * Deferred Work" above. ++ * Caller wants a fresh transaction, so log a new log ++ * intent item to replace the old one and roll the ++ * transaction. 
See "Requesting a Fresh Transaction ++ * while Finishing Deferred Work" above. + */ +- dfp->dfp_intent = ops->create_intent(*tp, +- dfp->dfp_count); + dfp->dfp_done = NULL; +- list_for_each(li, &dfp->dfp_work) +- ops->log_item(*tp, dfp->dfp_intent, li); ++ xfs_defer_create_intent(*tp, dfp, false); + } else { + /* Done with the dfp, free it. */ + list_del(&dfp->dfp_list); diff --git a/queue-5.4/xfs-fix-an-incore-inode-uaf-in-xfs_bui_recover.patch b/queue-5.4/xfs-fix-an-incore-inode-uaf-in-xfs_bui_recover.patch new file mode 100644 index 00000000000..ff22f6a28f0 --- /dev/null +++ b/queue-5.4/xfs-fix-an-incore-inode-uaf-in-xfs_bui_recover.patch @@ -0,0 +1,251 @@ +From stable-owner@vger.kernel.org Thu Feb 16 08:00:00 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:10 +0530 +Subject: xfs: fix an incore inode UAF in xfs_bui_recover +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-17-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit ff4ab5e02a0447dd1e290883eb6cd7d94848e590 upstream. + +In xfs_bui_item_recover, there exists a use-after-free bug with regards +to the inode that is involved in the bmap replay operation. If the +mapping operation does not complete, we call xfs_bmap_unmap_extent to +create a deferred op to finish the unmapping work, and we retain a +pointer to the incore inode. + +Unfortunately, the very next thing we do is commit the transaction and +drop the inode. If reclaim tears down the inode before we try to finish +the defer ops, we dereference garbage and blow up. Therefore, create a +way to join inodes to the defer ops freezer so that we can maintain the +xfs_inode reference until we're done with the inode. + +Note: This imposes the requirement that there be enough memory to keep +every incore inode in memory throughout recovery. 
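
The lifetime problem and its fix can be sketched outside the kernel: the capture structure takes its own reference on the inode so the object survives the transaction commit, and the reference is dropped only after the deferred work completes. The following userspace C sketch illustrates the pattern; all names here (inode_like, capture_and_commit, continue_and_finish) are invented stand-ins, not the kernel API:

```c
#include <assert.h>
#include <stdlib.h>

/* Minimal stand-in for an incore inode with a reference count. */
struct inode_like {
    int refcount;
    int reclaimed;          /* set when the last reference is dropped */
};

/* Stand-in for struct xfs_defer_capture holding dfc_capture_ip. */
struct capture {
    struct inode_like *ip;  /* counted reference owned by the capture */
};

static void igrab(struct inode_like *ip) { ip->refcount++; }

static void irele(struct inode_like *ip)
{
    if (--ip->refcount == 0)
        ip->reclaimed = 1;  /* stand-in for inode reclaim tearing it down */
}

/* Commit the transaction, but keep the inode pinned via the capture
 * (the step the buggy code skipped: it committed and dropped the inode
 * while still holding a bare pointer to it). */
static struct capture *capture_and_commit(struct inode_like *ip)
{
    struct capture *dfc = malloc(sizeof(*dfc));

    igrab(ip);              /* extra reference, like ihold() on capture */
    dfc->ip = ip;
    irele(ip);              /* caller's reference dies with the commit */
    return dfc;
}

/* Later: finish the deferred work, then release the captured reference. */
static int continue_and_finish(struct capture *dfc)
{
    int still_live = !dfc->ip->reclaimed; /* must not be a dangling object */

    irele(dfc->ip);
    free(dfc);
    return still_live;
}
```

Without the igrab() in capture_and_commit(), the irele() at commit time would drop the last reference and the later deferred-work step would operate on a reclaimed object, which is exactly the use-after-free the patch closes.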
+ +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Reviewed-by: Christoph Hellwig +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_defer.c | 43 ++++++++++++++++++++++++++++++++++++++----- + fs/xfs/libxfs/xfs_defer.h | 11 +++++++++-- + fs/xfs/xfs_bmap_item.c | 7 +++++-- + fs/xfs/xfs_extfree_item.c | 2 +- + fs/xfs/xfs_log_recover.c | 7 ++++++- + fs/xfs/xfs_refcount_item.c | 2 +- + fs/xfs/xfs_rmap_item.c | 2 +- + 7 files changed, 61 insertions(+), 13 deletions(-) + +--- a/fs/xfs/libxfs/xfs_defer.c ++++ b/fs/xfs/libxfs/xfs_defer.c +@@ -16,6 +16,7 @@ + #include "xfs_inode.h" + #include "xfs_inode_item.h" + #include "xfs_trace.h" ++#include "xfs_icache.h" + + /* + * Deferred Operations in XFS +@@ -567,10 +568,14 @@ xfs_defer_move( + * deferred ops state is transferred to the capture structure and the + * transaction is then ready for the caller to commit it. If there are no + * intent items to capture, this function returns NULL. ++ * ++ * If capture_ip is not NULL, the capture structure will obtain an extra ++ * reference to the inode. + */ + static struct xfs_defer_capture * + xfs_defer_ops_capture( +- struct xfs_trans *tp) ++ struct xfs_trans *tp, ++ struct xfs_inode *capture_ip) + { + struct xfs_defer_capture *dfc; + +@@ -596,6 +601,15 @@ xfs_defer_ops_capture( + /* Preserve the log reservation size. */ + dfc->dfc_logres = tp->t_log_res; + ++ /* ++ * Grab an extra reference to this inode and attach it to the capture ++ * structure. ++ */ ++ if (capture_ip) { ++ ihold(VFS_I(capture_ip)); ++ dfc->dfc_capture_ip = capture_ip; ++ } ++ + return dfc; + } + +@@ -606,24 +620,33 @@ xfs_defer_ops_release( + struct xfs_defer_capture *dfc) + { + xfs_defer_cancel_list(mp, &dfc->dfc_dfops); ++ if (dfc->dfc_capture_ip) ++ xfs_irele(dfc->dfc_capture_ip); + kmem_free(dfc); + } + + /* + * Capture any deferred ops and commit the transaction. 
This is the last step +- * needed to finish a log intent item that we recovered from the log. ++ * needed to finish a log intent item that we recovered from the log. If any ++ * of the deferred ops operate on an inode, the caller must pass in that inode ++ * so that the reference can be transferred to the capture structure. The ++ * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling ++ * xfs_defer_ops_continue. + */ + int + xfs_defer_ops_capture_and_commit( + struct xfs_trans *tp, ++ struct xfs_inode *capture_ip, + struct list_head *capture_list) + { + struct xfs_mount *mp = tp->t_mountp; + struct xfs_defer_capture *dfc; + int error; + ++ ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL)); ++ + /* If we don't capture anything, commit transaction and exit. */ +- dfc = xfs_defer_ops_capture(tp); ++ dfc = xfs_defer_ops_capture(tp, capture_ip); + if (!dfc) + return xfs_trans_commit(tp); + +@@ -640,16 +663,26 @@ xfs_defer_ops_capture_and_commit( + + /* + * Attach a chain of captured deferred ops to a new transaction and free the +- * capture structure. ++ * capture structure. If an inode was captured, it will be passed back to the ++ * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0. ++ * The caller now owns the inode reference. + */ + void + xfs_defer_ops_continue( + struct xfs_defer_capture *dfc, +- struct xfs_trans *tp) ++ struct xfs_trans *tp, ++ struct xfs_inode **captured_ipp) + { + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); + ++ /* Lock and join the captured inode to the new transaction. */ ++ if (dfc->dfc_capture_ip) { ++ xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL); ++ xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0); ++ } ++ *captured_ipp = dfc->dfc_capture_ip; ++ + /* Move captured dfops chain and state to the transaction. 
*/ + list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); + tp->t_flags |= dfc->dfc_tpflags; +--- a/fs/xfs/libxfs/xfs_defer.h ++++ b/fs/xfs/libxfs/xfs_defer.h +@@ -80,6 +80,12 @@ struct xfs_defer_capture { + + /* Log reservation saved from the transaction. */ + unsigned int dfc_logres; ++ ++ /* ++ * An inode reference that must be maintained to complete the deferred ++ * work. ++ */ ++ struct xfs_inode *dfc_capture_ip; + }; + + /* +@@ -87,8 +93,9 @@ struct xfs_defer_capture { + * This doesn't normally happen except log recovery. + */ + int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, +- struct list_head *capture_list); +-void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp); ++ struct xfs_inode *capture_ip, struct list_head *capture_list); ++void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, ++ struct xfs_inode **captured_ipp); + void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d); + + #endif /* __XFS_DEFER_H__ */ +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -528,8 +528,11 @@ xfs_bui_recover( + } + + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); +- /* Commit transaction, which frees the transaction. */ +- error = xfs_defer_ops_capture_and_commit(tp, capture_list); ++ /* ++ * Commit transaction, which frees the transaction and saves the inode ++ * for later replay activities. 
++ */ ++ error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list); + if (error) + goto err_unlock; + +--- a/fs/xfs/xfs_extfree_item.c ++++ b/fs/xfs/xfs_extfree_item.c +@@ -639,7 +639,7 @@ xfs_efi_recover( + + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); + +- return xfs_defer_ops_capture_and_commit(tp, capture_list); ++ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); + + abort_error: + xfs_trans_cancel(tp); +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -4766,6 +4766,7 @@ xlog_finish_defer_ops( + { + struct xfs_defer_capture *dfc, *next; + struct xfs_trans *tp; ++ struct xfs_inode *ip; + int error = 0; + + list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { +@@ -4791,9 +4792,13 @@ xlog_finish_defer_ops( + * from recovering a single intent item. + */ + list_del_init(&dfc->dfc_list); +- xfs_defer_ops_continue(dfc, tp); ++ xfs_defer_ops_continue(dfc, tp, &ip); + + error = xfs_trans_commit(tp); ++ if (ip) { ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ xfs_irele(ip); ++ } + if (error) + return error; + } +--- a/fs/xfs/xfs_refcount_item.c ++++ b/fs/xfs/xfs_refcount_item.c +@@ -569,7 +569,7 @@ xfs_cui_recover( + + xfs_refcount_finish_one_cleanup(tp, rcur, error); + set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); +- return xfs_defer_ops_capture_and_commit(tp, capture_list); ++ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); + + abort_error: + xfs_refcount_finish_one_cleanup(tp, rcur, error); +--- a/fs/xfs/xfs_rmap_item.c ++++ b/fs/xfs/xfs_rmap_item.c +@@ -593,7 +593,7 @@ xfs_rui_recover( + + xfs_rmap_finish_one_cleanup(tp, rcur, error); + set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); +- return xfs_defer_ops_capture_and_commit(tp, capture_list); ++ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); + + abort_error: + xfs_rmap_finish_one_cleanup(tp, rcur, error); diff --git a/queue-5.4/xfs-fix-finobt-btree-block-recovery-ordering.patch b/queue-5.4/xfs-fix-finobt-btree-block-recovery-ordering.patch 
new file mode 100644 index 00000000000..6ac6b43c8ce --- /dev/null +++ b/queue-5.4/xfs-fix-finobt-btree-block-recovery-ordering.patch @@ -0,0 +1,52 @@ +From chandan.babu@oracle.com Thu Feb 16 06:21:43 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:04 +0530 +Subject: xfs: fix finobt btree block recovery ordering +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-11-chandan.babu@oracle.com> + +From: Dave Chinner + +commit 671459676ab0e1d371c8d6b184ad1faa05b6941e upstream. + +[ In 5.4.y, xlog_recover_get_buf_lsn() is defined inside + fs/xfs/xfs_log_recover.c ] + +Nathan popped up on #xfs and pointed out that we fail to handle +finobt btree blocks in xlog_recover_get_buf_lsn(). This means they +always fall through the entire magic number matching code to "recover +immediately". Whilst most of the time this is the correct behaviour, +occasionally it will be incorrect and could potentially overwrite +more recent metadata because we don't check the LSN in the on disk +metadata at all. + +This bug has been present since the finobt was first introduced, and +is a potential cause of the occasional xfs_iget_check_free_state() +failures we see that indicate that the inode btree state does not +match the on disk inode state. + +Fixes: aafc3c246529 ("xfs: support the XFS_BTNUM_FINOBT free inode btree type") +Reported-by: Nathan Scott +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_recover.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -2206,6 +2206,8 @@ xlog_recover_get_buf_lsn( + case XFS_ABTC_MAGIC: + case XFS_RMAP_CRC_MAGIC: + case XFS_REFC_CRC_MAGIC: ++ case XFS_FIBT_CRC_MAGIC: ++ case XFS_FIBT_MAGIC: + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: { + struct xfs_btree_block *btb = blk; diff --git a/queue-5.4/xfs-fix-missing-cow-blocks-writeback-conversion-retry.patch b/queue-5.4/xfs-fix-missing-cow-blocks-writeback-conversion-retry.patch new file mode 100644 index 00000000000..62b05de47d9 --- /dev/null +++ b/queue-5.4/xfs-fix-missing-cow-blocks-writeback-conversion-retry.patch @@ -0,0 +1,72 @@ +From chandan.babu@oracle.com Thu Feb 16 06:23:08 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:15 +0530 +Subject: xfs: fix missing CoW blocks writeback conversion retry +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-22-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit c2f09217a4305478c55adc9a98692488dd19cd32 upstream. + +[ Set xfs_writepage_ctx->fork to XFS_DATA_FORK since 5.4.y tracks current + extent's fork in this variable ] + +In commit 7588cbeec6df, we tried to fix a race stemming from the lack of +coordination between higher level code that wants to allocate and remap +CoW fork extents into the data fork. Christoph cites as examples the +always_cow mode, and a directio write completion racing with writeback. + +According to the comments before the goto retry, we want to restart the +lookup to catch the extent in the data fork, but we don't actually reset +whichfork or cow_fsb, which means the second try executes using stale +information. 
Up until now I think we've gotten lucky that either +there's something left in the CoW fork to cause cow_fsb to be reset, or +either data/cow fork sequence numbers have advanced enough to force a +fresh lookup from the data fork. However, if we reach the retry with an +empty stable CoW fork and a stable data fork, neither of those things +happens. The retry foolishly re-calls xfs_convert_blocks on the CoW +fork which fails again. This time, we toss the write. + +I've recently been working on extending reflink to the realtime device. +When the realtime extent size is larger than a single block, we have to +force the page cache to CoW the entire rt extent if a write (or +fallocate) are not aligned with the rt extent size. The strategy I've +chosen to deal with this is derived from Dave's blocksize > pagesize +series: dirtying around the write range, and ensuring that writeback +always starts mapping on an rt extent boundary. This has brought this +race front and center, since generic/522 blows up immediately. + +However, I'm pretty sure this is a bug outright, independent of that. + +Fixes: 7588cbeec6df ("xfs: retry COW fork delalloc conversion when no extent was found") +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_aops.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -495,7 +495,7 @@ xfs_map_blocks( + ssize_t count = i_blocksize(inode); + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); +- xfs_fileoff_t cow_fsb = NULLFILEOFF; ++ xfs_fileoff_t cow_fsb; + struct xfs_bmbt_irec imap; + struct xfs_iext_cursor icur; + int retries = 0; +@@ -529,6 +529,8 @@ xfs_map_blocks( + * landed in a hole and we skip the block. 
+ */ + retry: ++ cow_fsb = NULLFILEOFF; ++ wpc->fork = XFS_DATA_FORK; + xfs_ilock(ip, XFS_ILOCK_SHARED); + ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || + (ip->i_df.if_flags & XFS_IFEXTENTS)); diff --git a/queue-5.4/xfs-fix-the-forward-progress-assertion-in-xfs_iwalk_run_callbacks.patch b/queue-5.4/xfs-fix-the-forward-progress-assertion-in-xfs_iwalk_run_callbacks.patch new file mode 100644 index 00000000000..2b10e0781eb --- /dev/null +++ b/queue-5.4/xfs-fix-the-forward-progress-assertion-in-xfs_iwalk_run_callbacks.patch @@ -0,0 +1,43 @@ +From chandan.babu@oracle.com Thu Feb 16 06:23:20 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:17 +0530 +Subject: xfs: fix the forward progress assertion in xfs_iwalk_run_callbacks +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-24-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit a5336d6bb2d02d0e9d4d3c8be04b80b8b68d56c8 upstream. + +In commit 27c14b5daa82 we started tracking the last inode seen during an +inode walk to avoid infinite loops if a corrupt inobt record happens to +have a lower ir_startino than the record preceding it. Unfortunately, +the assertion trips over the case where there are completely empty inobt +records (which can happen quite easily on 64k page filesystems) because +we advance the tracking cursor without actually putting the empty record +into the processing buffer. Fix the assert to allow for this case. + +Reported-by: zlang@redhat.com +Fixes: 27c14b5daa82 ("xfs: ensure inobt record walks always make forward progress") +Signed-off-by: Darrick J. Wong +Reviewed-by: Zorro Lang +Reviewed-by: Dave Chinner +Signed-off-by: Chandan Babu R +Acked-by: Darrick J.
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_iwalk.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/xfs_iwalk.c ++++ b/fs/xfs/xfs_iwalk.c +@@ -362,7 +362,7 @@ xfs_iwalk_run_callbacks( + /* Delete cursor but remember the last record we cached... */ + xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); + irec = &iwag->recs[iwag->nr_recs - 1]; +- ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK); ++ ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK); + + error = xfs_iwalk_ag_recs(iwag); + if (error) diff --git a/queue-5.4/xfs-log-new-intent-items-created-as-part-of-finishing-recovered-intent-items.patch b/queue-5.4/xfs-log-new-intent-items-created-as-part-of-finishing-recovered-intent-items.patch new file mode 100644 index 00000000000..1e545fc2ad5 --- /dev/null +++ b/queue-5.4/xfs-log-new-intent-items-created-as-part-of-finishing-recovered-intent-items.patch @@ -0,0 +1,130 @@ +From stable-owner@vger.kernel.org Thu Feb 16 07:27:30 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:03 +0530 +Subject: xfs: log new intent items created as part of finishing recovered intent items +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-10-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit 93293bcbde93567efaf4e6bcd58cad270e1fcbf5 upstream. + +[Slightly edit fs/xfs/xfs_bmap_item.c & fs/xfs/xfs_refcount_item.c to resolve +merge conflicts] + +During a code inspection, I found a serious bug in the log intent item +recovery code when an intent item cannot complete all the work and +decides to requeue itself to get that done. When this happens, the +item recovery creates a new incore deferred op representing the +remaining work and attaches it to the transaction that it allocated. 
At +the end of _item_recover, it moves the entire chain of deferred ops to +the dummy parent_tp that xlog_recover_process_intents passed to it, but +fails to log a new intent item for the remaining work before committing +the transaction for the single unit of work. + +xlog_finish_defer_ops logs those new intent items once recovery has +finished dealing with the intent items that it recovered, but this isn't +sufficient. If the log is forced to disk after a recovered log item +decides to requeue itself and the system goes down before we call +xlog_finish_defer_ops, the second log recovery will never see the new +intent item and therefore has no idea that there was more work to do. +It will finish recovery leaving the filesystem in a corrupted state. + +The same logic applies to /any/ deferred ops added during intent item +recovery, not just the one handling the remaining work. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Reviewed-by: Dave Chinner +Signed-off-by: Chandan Babu R +Acked-by: Darrick J.
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_defer.c | 26 ++++++++++++++++++++++++-- + fs/xfs/libxfs/xfs_defer.h | 6 ++++++ + fs/xfs/xfs_bmap_item.c | 2 +- + fs/xfs/xfs_refcount_item.c | 2 +- + 4 files changed, 32 insertions(+), 4 deletions(-) + +--- a/fs/xfs/libxfs/xfs_defer.c ++++ b/fs/xfs/libxfs/xfs_defer.c +@@ -186,8 +186,9 @@ xfs_defer_create_intent( + { + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + +- dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, +- dfp->dfp_count, sort); ++ if (!dfp->dfp_intent) ++ dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, ++ dfp->dfp_count, sort); + } + + /* +@@ -390,6 +391,7 @@ xfs_defer_finish_one( + list_add(li, &dfp->dfp_work); + dfp->dfp_count++; + dfp->dfp_done = NULL; ++ dfp->dfp_intent = NULL; + xfs_defer_create_intent(tp, dfp, false); + } + +@@ -552,3 +554,23 @@ xfs_defer_move( + + xfs_defer_reset(stp); + } ++ ++/* ++ * Prepare a chain of fresh deferred ops work items to be completed later. Log ++ * recovery requires the ability to put off until later the actual finishing ++ * work so that it can process unfinished items recovered from the log in ++ * correct order. ++ * ++ * Create and log intent items for all the work that we're capturing so that we ++ * can be assured that the items will get replayed if the system goes down ++ * before log recovery gets a chance to finish the work it put off. Then we ++ * move the chain from stp to dtp. ++ */ ++void ++xfs_defer_capture( ++ struct xfs_trans *dtp, ++ struct xfs_trans *stp) ++{ ++ xfs_defer_create_intents(stp); ++ xfs_defer_move(dtp, stp); ++} +--- a/fs/xfs/libxfs/xfs_defer.h ++++ b/fs/xfs/libxfs/xfs_defer.h +@@ -61,4 +61,10 @@ extern const struct xfs_defer_op_type xf + extern const struct xfs_defer_op_type xfs_extent_free_defer_type; + extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; + ++/* ++ * Functions to capture a chain of deferred operations and continue them later. 
++ * This doesn't normally happen except log recovery. ++ */ ++void xfs_defer_capture(struct xfs_trans *dtp, struct xfs_trans *stp); ++ + #endif /* __XFS_DEFER_H__ */ +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -541,7 +541,7 @@ xfs_bui_recover( + } + + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); +- xfs_defer_move(parent_tp, tp); ++ xfs_defer_capture(parent_tp, tp); + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); +--- a/fs/xfs/xfs_refcount_item.c ++++ b/fs/xfs/xfs_refcount_item.c +@@ -574,7 +574,7 @@ xfs_cui_recover( + + xfs_refcount_finish_one_cleanup(tp, rcur, error); + set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); +- xfs_defer_move(parent_tp, tp); ++ xfs_defer_capture(parent_tp, tp); + error = xfs_trans_commit(tp); + return error; + diff --git a/queue-5.4/xfs-merge-the-diff_items-defer-op-into-create_intent.patch b/queue-5.4/xfs-merge-the-diff_items-defer-op-into-create_intent.patch new file mode 100644 index 00000000000..fb9cc67770d --- /dev/null +++ b/queue-5.4/xfs-merge-the-diff_items-defer-op-into-create_intent.patch @@ -0,0 +1,185 @@ +From chandan.babu@oracle.com Thu Feb 16 06:21:18 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:00 +0530 +Subject: xfs: merge the ->diff_items defer op into ->create_intent +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-7-chandan.babu@oracle.com> + +From: Christoph Hellwig + +commit d367a868e46b025a8ced8e00ef2b3a3c2f3bf732 upstream. + +This avoids a per-item indirect call, and also simplifies the interface +a bit. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_defer.c | 5 +---- + fs/xfs/libxfs/xfs_defer.h | 3 +-- + fs/xfs/xfs_bmap_item.c | 9 ++++++--- + fs/xfs/xfs_extfree_item.c | 7 ++++--- + fs/xfs/xfs_refcount_item.c | 6 ++++-- + fs/xfs/xfs_rmap_item.c | 6 ++++-- + 6 files changed, 20 insertions(+), 16 deletions(-) + +--- a/fs/xfs/libxfs/xfs_defer.c ++++ b/fs/xfs/libxfs/xfs_defer.c +@@ -186,11 +186,8 @@ xfs_defer_create_intent( + { + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; + +- if (sort) +- list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items); +- + dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, +- dfp->dfp_count); ++ dfp->dfp_count, sort); + } + + /* +--- a/fs/xfs/libxfs/xfs_defer.h ++++ b/fs/xfs/libxfs/xfs_defer.h +@@ -49,9 +49,8 @@ struct xfs_defer_op_type { + void **); + void (*finish_cleanup)(struct xfs_trans *, void *, int); + void (*cancel_item)(struct list_head *); +- int (*diff_items)(void *, struct list_head *, struct list_head *); + void *(*create_intent)(struct xfs_trans *tp, struct list_head *items, +- unsigned int count); ++ unsigned int count, bool sort); + unsigned int max_items; + }; + +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -334,14 +334,18 @@ STATIC void * + xfs_bmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, +- unsigned int count) ++ unsigned int count, ++ bool sort) + { +- struct xfs_bui_log_item *buip = xfs_bui_init(tp->t_mountp); ++ struct xfs_mount *mp = tp->t_mountp; ++ struct xfs_bui_log_item *buip = xfs_bui_init(mp); + struct xfs_bmap_intent *bmap; + + ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); + + xfs_trans_add_item(tp, &buip->bui_item); ++ if (sort) ++ list_sort(mp, items, xfs_bmap_update_diff_items); + list_for_each_entry(bmap, items, bi_list) + xfs_bmap_update_log_item(tp, buip, bmap); + return buip; +@@ -408,7 +412,6 @@ xfs_bmap_update_cancel_item( + + const struct xfs_defer_op_type xfs_bmap_update_defer_type = { + .max_items = 
XFS_BUI_MAX_FAST_EXTENTS, +- .diff_items = xfs_bmap_update_diff_items, + .create_intent = xfs_bmap_update_create_intent, + .abort_intent = xfs_bmap_update_abort_intent, + .create_done = xfs_bmap_update_create_done, +--- a/fs/xfs/xfs_extfree_item.c ++++ b/fs/xfs/xfs_extfree_item.c +@@ -441,7 +441,8 @@ STATIC void * + xfs_extent_free_create_intent( + struct xfs_trans *tp, + struct list_head *items, +- unsigned int count) ++ unsigned int count, ++ bool sort) + { + struct xfs_mount *mp = tp->t_mountp; + struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); +@@ -450,6 +451,8 @@ xfs_extent_free_create_intent( + ASSERT(count > 0); + + xfs_trans_add_item(tp, &efip->efi_item); ++ if (sort) ++ list_sort(mp, items, xfs_extent_free_diff_items); + list_for_each_entry(free, items, xefi_list) + xfs_extent_free_log_item(tp, efip, free); + return efip; +@@ -506,7 +509,6 @@ xfs_extent_free_cancel_item( + + const struct xfs_defer_op_type xfs_extent_free_defer_type = { + .max_items = XFS_EFI_MAX_FAST_EXTENTS, +- .diff_items = xfs_extent_free_diff_items, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, +@@ -571,7 +573,6 @@ xfs_agfl_free_finish_item( + /* sub-type with special handling for AGFL deferred frees */ + const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .max_items = XFS_EFI_MAX_FAST_EXTENTS, +- .diff_items = xfs_extent_free_diff_items, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .create_done = xfs_extent_free_create_done, +--- a/fs/xfs/xfs_refcount_item.c ++++ b/fs/xfs/xfs_refcount_item.c +@@ -333,7 +333,8 @@ STATIC void * + xfs_refcount_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, +- unsigned int count) ++ unsigned int count, ++ bool sort) + { + struct xfs_mount *mp = tp->t_mountp; + struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); +@@ -342,6 +343,8 @@ 
xfs_refcount_update_create_intent( + ASSERT(count > 0); + + xfs_trans_add_item(tp, &cuip->cui_item); ++ if (sort) ++ list_sort(mp, items, xfs_refcount_update_diff_items); + list_for_each_entry(refc, items, ri_list) + xfs_refcount_update_log_item(tp, cuip, refc); + return cuip; +@@ -422,7 +425,6 @@ xfs_refcount_update_cancel_item( + + const struct xfs_defer_op_type xfs_refcount_update_defer_type = { + .max_items = XFS_CUI_MAX_FAST_EXTENTS, +- .diff_items = xfs_refcount_update_diff_items, + .create_intent = xfs_refcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, + .create_done = xfs_refcount_update_create_done, +--- a/fs/xfs/xfs_rmap_item.c ++++ b/fs/xfs/xfs_rmap_item.c +@@ -385,7 +385,8 @@ STATIC void * + xfs_rmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, +- unsigned int count) ++ unsigned int count, ++ bool sort) + { + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); +@@ -394,6 +395,8 @@ xfs_rmap_update_create_intent( + ASSERT(count > 0); + + xfs_trans_add_item(tp, &ruip->rui_item); ++ if (sort) ++ list_sort(mp, items, xfs_rmap_update_diff_items); + list_for_each_entry(rmap, items, ri_list) + xfs_rmap_update_log_item(tp, ruip, rmap); + return ruip; +@@ -466,7 +469,6 @@ xfs_rmap_update_cancel_item( + + const struct xfs_defer_op_type xfs_rmap_update_defer_type = { + .max_items = XFS_RUI_MAX_FAST_EXTENTS, +- .diff_items = xfs_rmap_update_diff_items, + .create_intent = xfs_rmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, + .create_done = xfs_rmap_update_create_done, diff --git a/queue-5.4/xfs-merge-the-log_item-defer-op-into-create_intent.patch b/queue-5.4/xfs-merge-the-log_item-defer-op-into-create_intent.patch new file mode 100644 index 00000000000..93a4d18055e --- /dev/null +++ b/queue-5.4/xfs-merge-the-log_item-defer-op-into-create_intent.patch @@ -0,0 +1,393 @@ +From stable-owner@vger.kernel.org Thu Feb 16 07:13:43 2023 +From: 
Chandan Babu R +Date: Thu, 16 Feb 2023 10:49:59 +0530 +Subject: xfs: merge the ->log_item defer op into ->create_intent +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-6-chandan.babu@oracle.com> + +From: Christoph Hellwig + +commit c1f09188e8de0ae65433cb9c8ace4feb66359bcc upstream. + +These are always called together, and by merging them we reduce the amount +of indirect calls, improve type safety and in general clean up the code +a bit. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_defer.c | 6 +---- + fs/xfs/libxfs/xfs_defer.h | 4 +-- + fs/xfs/xfs_bmap_item.c | 47 +++++++++++++++++-------------------------- + fs/xfs/xfs_extfree_item.c | 49 ++++++++++++++++++--------------------------- + fs/xfs/xfs_refcount_item.c | 48 ++++++++++++++++++-------------------------- + fs/xfs/xfs_rmap_item.c | 48 ++++++++++++++++++-------------------------- + 6 files changed, 83 insertions(+), 119 deletions(-) + +--- a/fs/xfs/libxfs/xfs_defer.c ++++ b/fs/xfs/libxfs/xfs_defer.c +@@ -185,14 +185,12 @@ xfs_defer_create_intent( + bool sort) + { + const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; +- struct list_head *li; + + if (sort) + list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items); + +- dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count); +- list_for_each(li, &dfp->dfp_work) +- ops->log_item(tp, dfp->dfp_intent, li); ++ dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, ++ dfp->dfp_count); + } + + /* +--- a/fs/xfs/libxfs/xfs_defer.h ++++ b/fs/xfs/libxfs/xfs_defer.h +@@ -50,8 +50,8 @@ struct xfs_defer_op_type { + void (*finish_cleanup)(struct xfs_trans *, void
*, int); + void (*cancel_item)(struct list_head *); + int (*diff_items)(void *, struct list_head *, struct list_head *); +- void *(*create_intent)(struct xfs_trans *, uint); +- void (*log_item)(struct xfs_trans *, void *, struct list_head *); ++ void *(*create_intent)(struct xfs_trans *tp, struct list_head *items, ++ unsigned int count); + unsigned int max_items; + }; + +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -278,27 +278,6 @@ xfs_bmap_update_diff_items( + return ba->bi_owner->i_ino - bb->bi_owner->i_ino; + } + +-/* Get an BUI. */ +-STATIC void * +-xfs_bmap_update_create_intent( +- struct xfs_trans *tp, +- unsigned int count) +-{ +- struct xfs_bui_log_item *buip; +- +- ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); +- ASSERT(tp != NULL); +- +- buip = xfs_bui_init(tp->t_mountp); +- ASSERT(buip != NULL); +- +- /* +- * Get a log_item_desc to point at the new item. +- */ +- xfs_trans_add_item(tp, &buip->bui_item); +- return buip; +-} +- + /* Set the map extent flags for this mapping. 
*/ + static void + xfs_trans_set_bmap_flags( +@@ -326,16 +305,12 @@ xfs_trans_set_bmap_flags( + STATIC void + xfs_bmap_update_log_item( + struct xfs_trans *tp, +- void *intent, +- struct list_head *item) ++ struct xfs_bui_log_item *buip, ++ struct xfs_bmap_intent *bmap) + { +- struct xfs_bui_log_item *buip = intent; +- struct xfs_bmap_intent *bmap; + uint next_extent; + struct xfs_map_extent *map; + +- bmap = container_of(item, struct xfs_bmap_intent, bi_list); +- + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); + +@@ -355,6 +330,23 @@ xfs_bmap_update_log_item( + bmap->bi_bmap.br_state); + } + ++STATIC void * ++xfs_bmap_update_create_intent( ++ struct xfs_trans *tp, ++ struct list_head *items, ++ unsigned int count) ++{ ++ struct xfs_bui_log_item *buip = xfs_bui_init(tp->t_mountp); ++ struct xfs_bmap_intent *bmap; ++ ++ ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS); ++ ++ xfs_trans_add_item(tp, &buip->bui_item); ++ list_for_each_entry(bmap, items, bi_list) ++ xfs_bmap_update_log_item(tp, buip, bmap); ++ return buip; ++} ++ + /* Get an BUD so we can process all the deferred rmap updates. */ + STATIC void * + xfs_bmap_update_create_done( +@@ -419,7 +411,6 @@ const struct xfs_defer_op_type xfs_bmap_ + .diff_items = xfs_bmap_update_diff_items, + .create_intent = xfs_bmap_update_create_intent, + .abort_intent = xfs_bmap_update_abort_intent, +- .log_item = xfs_bmap_update_log_item, + .create_done = xfs_bmap_update_create_done, + .finish_item = xfs_bmap_update_finish_item, + .cancel_item = xfs_bmap_update_cancel_item, +--- a/fs/xfs/xfs_extfree_item.c ++++ b/fs/xfs/xfs_extfree_item.c +@@ -412,41 +412,16 @@ xfs_extent_free_diff_items( + XFS_FSB_TO_AGNO(mp, rb->xefi_startblock); + } + +-/* Get an EFI. 
*/ +-STATIC void * +-xfs_extent_free_create_intent( +- struct xfs_trans *tp, +- unsigned int count) +-{ +- struct xfs_efi_log_item *efip; +- +- ASSERT(tp != NULL); +- ASSERT(count > 0); +- +- efip = xfs_efi_init(tp->t_mountp, count); +- ASSERT(efip != NULL); +- +- /* +- * Get a log_item_desc to point at the new item. +- */ +- xfs_trans_add_item(tp, &efip->efi_item); +- return efip; +-} +- + /* Log a free extent to the intent item. */ + STATIC void + xfs_extent_free_log_item( + struct xfs_trans *tp, +- void *intent, +- struct list_head *item) ++ struct xfs_efi_log_item *efip, ++ struct xfs_extent_free_item *free) + { +- struct xfs_efi_log_item *efip = intent; +- struct xfs_extent_free_item *free; + uint next_extent; + struct xfs_extent *extp; + +- free = container_of(item, struct xfs_extent_free_item, xefi_list); +- + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); + +@@ -462,6 +437,24 @@ xfs_extent_free_log_item( + extp->ext_len = free->xefi_blockcount; + } + ++STATIC void * ++xfs_extent_free_create_intent( ++ struct xfs_trans *tp, ++ struct list_head *items, ++ unsigned int count) ++{ ++ struct xfs_mount *mp = tp->t_mountp; ++ struct xfs_efi_log_item *efip = xfs_efi_init(mp, count); ++ struct xfs_extent_free_item *free; ++ ++ ASSERT(count > 0); ++ ++ xfs_trans_add_item(tp, &efip->efi_item); ++ list_for_each_entry(free, items, xefi_list) ++ xfs_extent_free_log_item(tp, efip, free); ++ return efip; ++} ++ + /* Get an EFD so we can process all the free extents. 
*/ + STATIC void * + xfs_extent_free_create_done( +@@ -516,7 +509,6 @@ const struct xfs_defer_op_type xfs_exten + .diff_items = xfs_extent_free_diff_items, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, +- .log_item = xfs_extent_free_log_item, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_extent_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, +@@ -582,7 +574,6 @@ const struct xfs_defer_op_type xfs_agfl_ + .diff_items = xfs_extent_free_diff_items, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, +- .log_item = xfs_extent_free_log_item, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_agfl_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, +--- a/fs/xfs/xfs_refcount_item.c ++++ b/fs/xfs/xfs_refcount_item.c +@@ -284,27 +284,6 @@ xfs_refcount_update_diff_items( + XFS_FSB_TO_AGNO(mp, rb->ri_startblock); + } + +-/* Get an CUI. */ +-STATIC void * +-xfs_refcount_update_create_intent( +- struct xfs_trans *tp, +- unsigned int count) +-{ +- struct xfs_cui_log_item *cuip; +- +- ASSERT(tp != NULL); +- ASSERT(count > 0); +- +- cuip = xfs_cui_init(tp->t_mountp, count); +- ASSERT(cuip != NULL); +- +- /* +- * Get a log_item_desc to point at the new item. +- */ +- xfs_trans_add_item(tp, &cuip->cui_item); +- return cuip; +-} +- + /* Set the phys extent flags for this reverse mapping. 
*/ + static void + xfs_trans_set_refcount_flags( +@@ -328,16 +307,12 @@ xfs_trans_set_refcount_flags( + STATIC void + xfs_refcount_update_log_item( + struct xfs_trans *tp, +- void *intent, +- struct list_head *item) ++ struct xfs_cui_log_item *cuip, ++ struct xfs_refcount_intent *refc) + { +- struct xfs_cui_log_item *cuip = intent; +- struct xfs_refcount_intent *refc; + uint next_extent; + struct xfs_phys_extent *ext; + +- refc = container_of(item, struct xfs_refcount_intent, ri_list); +- + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); + +@@ -354,6 +329,24 @@ xfs_refcount_update_log_item( + xfs_trans_set_refcount_flags(ext, refc->ri_type); + } + ++STATIC void * ++xfs_refcount_update_create_intent( ++ struct xfs_trans *tp, ++ struct list_head *items, ++ unsigned int count) ++{ ++ struct xfs_mount *mp = tp->t_mountp; ++ struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count); ++ struct xfs_refcount_intent *refc; ++ ++ ASSERT(count > 0); ++ ++ xfs_trans_add_item(tp, &cuip->cui_item); ++ list_for_each_entry(refc, items, ri_list) ++ xfs_refcount_update_log_item(tp, cuip, refc); ++ return cuip; ++} ++ + /* Get an CUD so we can process all the deferred refcount updates. */ + STATIC void * + xfs_refcount_update_create_done( +@@ -432,7 +425,6 @@ const struct xfs_defer_op_type xfs_refco + .diff_items = xfs_refcount_update_diff_items, + .create_intent = xfs_refcount_update_create_intent, + .abort_intent = xfs_refcount_update_abort_intent, +- .log_item = xfs_refcount_update_log_item, + .create_done = xfs_refcount_update_create_done, + .finish_item = xfs_refcount_update_finish_item, + .finish_cleanup = xfs_refcount_update_finish_cleanup, +--- a/fs/xfs/xfs_rmap_item.c ++++ b/fs/xfs/xfs_rmap_item.c +@@ -352,41 +352,16 @@ xfs_rmap_update_diff_items( + XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock); + } + +-/* Get an RUI. 
*/ +-STATIC void * +-xfs_rmap_update_create_intent( +- struct xfs_trans *tp, +- unsigned int count) +-{ +- struct xfs_rui_log_item *ruip; +- +- ASSERT(tp != NULL); +- ASSERT(count > 0); +- +- ruip = xfs_rui_init(tp->t_mountp, count); +- ASSERT(ruip != NULL); +- +- /* +- * Get a log_item_desc to point at the new item. +- */ +- xfs_trans_add_item(tp, &ruip->rui_item); +- return ruip; +-} +- + /* Log rmap updates in the intent item. */ + STATIC void + xfs_rmap_update_log_item( + struct xfs_trans *tp, +- void *intent, +- struct list_head *item) ++ struct xfs_rui_log_item *ruip, ++ struct xfs_rmap_intent *rmap) + { +- struct xfs_rui_log_item *ruip = intent; +- struct xfs_rmap_intent *rmap; + uint next_extent; + struct xfs_map_extent *map; + +- rmap = container_of(item, struct xfs_rmap_intent, ri_list); +- + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); + +@@ -406,6 +381,24 @@ xfs_rmap_update_log_item( + rmap->ri_bmap.br_state); + } + ++STATIC void * ++xfs_rmap_update_create_intent( ++ struct xfs_trans *tp, ++ struct list_head *items, ++ unsigned int count) ++{ ++ struct xfs_mount *mp = tp->t_mountp; ++ struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count); ++ struct xfs_rmap_intent *rmap; ++ ++ ASSERT(count > 0); ++ ++ xfs_trans_add_item(tp, &ruip->rui_item); ++ list_for_each_entry(rmap, items, ri_list) ++ xfs_rmap_update_log_item(tp, ruip, rmap); ++ return ruip; ++} ++ + /* Get an RUD so we can process all the deferred rmap updates. 
*/ + STATIC void * + xfs_rmap_update_create_done( +@@ -476,7 +469,6 @@ const struct xfs_defer_op_type xfs_rmap_ + .diff_items = xfs_rmap_update_diff_items, + .create_intent = xfs_rmap_update_create_intent, + .abort_intent = xfs_rmap_update_abort_intent, +- .log_item = xfs_rmap_update_log_item, + .create_done = xfs_rmap_update_create_done, + .finish_item = xfs_rmap_update_finish_item, + .finish_cleanup = xfs_rmap_update_finish_cleanup, diff --git a/queue-5.4/xfs-only-relog-deferred-intent-items-if-free-space-in-the-log-gets-low.patch b/queue-5.4/xfs-only-relog-deferred-intent-items-if-free-space-in-the-log-gets-low.patch new file mode 100644 index 00000000000..c439b009bec --- /dev/null +++ b/queue-5.4/xfs-only-relog-deferred-intent-items-if-free-space-in-the-log-gets-low.patch @@ -0,0 +1,61 @@ +From chandan.babu@oracle.com Thu Feb 16 06:23:01 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:14 +0530 +Subject: xfs: only relog deferred intent items if free space in the log gets low +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-21-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit 74f4d6a1e065c92428c5b588099e307a582d79d9 upstream. + +Now that we have the ability to ask the log how far the tail needs to be +pushed to maintain its free space targets, augment the decision to relog +an intent item so that we only do it if the log has hit the 75% full +threshold. There's no point in relogging an intent into the same +checkpoint, and there's no need to relog if there's plenty of free space +in the log. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_defer.c | 16 ++++++++++++++++ + 1 file changed, 16 insertions(+) + +--- a/fs/xfs/libxfs/xfs_defer.c ++++ b/fs/xfs/libxfs/xfs_defer.c +@@ -372,7 +372,10 @@ xfs_defer_relog( + struct xfs_trans **tpp, + struct list_head *dfops) + { ++ struct xlog *log = (*tpp)->t_mountp->m_log; + struct xfs_defer_pending *dfp; ++ xfs_lsn_t threshold_lsn = NULLCOMMITLSN; ++ + + ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); + +@@ -388,6 +391,19 @@ xfs_defer_relog( + xfs_log_item_in_current_chkpt(dfp->dfp_intent)) + continue; + ++ /* ++ * Figure out where we need the tail to be in order to maintain ++ * the minimum required free space in the log. Only sample ++ * the log threshold once per call. ++ */ ++ if (threshold_lsn == NULLCOMMITLSN) { ++ threshold_lsn = xlog_grant_push_threshold(log, 0); ++ if (threshold_lsn == NULLCOMMITLSN) ++ break; ++ } ++ if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0) ++ continue; ++ + trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); + XFS_STATS_INC((*tpp)->t_mountp, defer_relog); + dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); diff --git a/queue-5.4/xfs-periodically-relog-deferred-intent-items.patch b/queue-5.4/xfs-periodically-relog-deferred-intent-items.patch new file mode 100644 index 00000000000..49ee58f5c96 --- /dev/null +++ b/queue-5.4/xfs-periodically-relog-deferred-intent-items.patch @@ -0,0 +1,628 @@ +From stable-owner@vger.kernel.org Thu Feb 16 07:39:54 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:12 +0530 +Subject: xfs: periodically relog deferred intent items +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-19-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit 4e919af7827a6adfc28e82cd6c4ffcfcc3dd6118 upstream. 
+ +[ Modify xfs_{bmap|extfree|refcount|rmap}_item.c to fix merge conflicts ] + +There's a subtle design flaw in the deferred log item code that can lead +to pinning the log tail. Taking up the defer ops chain examples from +the previous commit, we can get trapped in sequences like this: + +Caller hands us a transaction t0 with D0-D3 attached. The defer ops +chain will look like the following if the transaction rolls succeed: + +t1: D0(t0), D1(t0), D2(t0), D3(t0) +t2: d4(t1), d5(t1), D1(t0), D2(t0), D3(t0) +t3: d5(t1), D1(t0), D2(t0), D3(t0) +... +t9: d9(t7), D3(t0) +t10: D3(t0) +t11: d10(t10), d11(t10) +t12: d11(t10) + +In transaction 9, we finish d9 and try to roll to t10 while holding onto +an intent item for D3 that we logged in t0. + +The previous commit changed the order in which we place new defer ops in +the defer ops processing chain to reduce the maximum chain length. Now +make xfs_defer_finish_noroll capable of relogging the entire chain +periodically so that we can always move the log tail forward. Most +chains will never get relogged, except for operations that generate very +long chains (large extents containing many blocks with different sharing +levels) or are on filesystems with small logs and a lot of ongoing +metadata updates. + +Callers are now required to ensure that the transaction reservation is +large enough to handle logging done items and new intent items for the +maximum possible chain length. Most callers are careful to keep the +chain lengths low, so the overhead should be minimal. + +The decision to relog an intent item is made based on whether the intent +was logged in a previous checkpoint, since there's no point in relogging +an intent into the same checkpoint. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_defer.c | 42 ++++++++++++++++++ + fs/xfs/xfs_bmap_item.c | 83 +++++++++++++++++++++++------------ + fs/xfs/xfs_extfree_item.c | 104 ++++++++++++++++++++++++++++----------------- + fs/xfs/xfs_refcount_item.c | 95 ++++++++++++++++++++++++++--------------- + fs/xfs/xfs_rmap_item.c | 93 +++++++++++++++++++++++++--------------- + fs/xfs/xfs_stats.c | 4 + + fs/xfs/xfs_stats.h | 1 + fs/xfs/xfs_trace.h | 1 + fs/xfs/xfs_trans.h | 10 ++++ + 9 files changed, 300 insertions(+), 133 deletions(-) + +--- a/fs/xfs/libxfs/xfs_defer.c ++++ b/fs/xfs/libxfs/xfs_defer.c +@@ -17,6 +17,7 @@ + #include "xfs_inode_item.h" + #include "xfs_trace.h" + #include "xfs_icache.h" ++#include "xfs_log.h" + + /* + * Deferred Operations in XFS +@@ -362,6 +363,42 @@ xfs_defer_cancel_list( + } + + /* ++ * Prevent a log intent item from pinning the tail of the log by logging a ++ * done item to release the intent item; and then log a new intent item. ++ * The caller should provide a fresh transaction and roll it after we're done. ++ */ ++static int ++xfs_defer_relog( ++ struct xfs_trans **tpp, ++ struct list_head *dfops) ++{ ++ struct xfs_defer_pending *dfp; ++ ++ ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); ++ ++ list_for_each_entry(dfp, dfops, dfp_list) { ++ /* ++ * If the log intent item for this deferred op is not a part of ++ * the current log checkpoint, relog the intent item to keep ++ * the log tail moving forward. We're ok with this being racy ++ * because an incorrect decision means we'll be a little slower ++ * at pushing the tail. 
++ */ ++ if (dfp->dfp_intent == NULL || ++ xfs_log_item_in_current_chkpt(dfp->dfp_intent)) ++ continue; ++ ++ trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); ++ XFS_STATS_INC((*tpp)->t_mountp, defer_relog); ++ dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); ++ } ++ ++ if ((*tpp)->t_flags & XFS_TRANS_DIRTY) ++ return xfs_defer_trans_roll(tpp); ++ return 0; ++} ++ ++/* + * Log an intent-done item for the first pending intent, and finish the work + * items. + */ +@@ -447,6 +484,11 @@ xfs_defer_finish_noroll( + if (error) + goto out_shutdown; + ++ /* Possibly relog intent items to keep the log moving. */ ++ error = xfs_defer_relog(tp, &dop_pending); ++ if (error) ++ goto out_shutdown; ++ + dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, + dfp_list); + error = xfs_defer_finish_one(*tp, dfp); +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -125,34 +125,6 @@ xfs_bui_item_release( + xfs_bui_release(BUI_ITEM(lip)); + } + +-static const struct xfs_item_ops xfs_bui_item_ops = { +- .iop_size = xfs_bui_item_size, +- .iop_format = xfs_bui_item_format, +- .iop_unpin = xfs_bui_item_unpin, +- .iop_release = xfs_bui_item_release, +-}; +- +-/* +- * Allocate and initialize an bui item with the given number of extents. 
+- */ +-struct xfs_bui_log_item * +-xfs_bui_init( +- struct xfs_mount *mp) +- +-{ +- struct xfs_bui_log_item *buip; +- +- buip = kmem_zone_zalloc(xfs_bui_zone, 0); +- +- xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); +- buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; +- buip->bui_format.bui_id = (uintptr_t)(void *)buip; +- atomic_set(&buip->bui_next_extent, 0); +- atomic_set(&buip->bui_refcount, 2); +- +- return buip; +-} +- + static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip) + { + return container_of(lip, struct xfs_bud_log_item, bud_item); +@@ -548,3 +520,58 @@ err_rele: + xfs_irele(ip); + return error; + } ++ ++/* Relog an intent item to push the log tail forward. */ ++static struct xfs_log_item * ++xfs_bui_item_relog( ++ struct xfs_log_item *intent, ++ struct xfs_trans *tp) ++{ ++ struct xfs_bud_log_item *budp; ++ struct xfs_bui_log_item *buip; ++ struct xfs_map_extent *extp; ++ unsigned int count; ++ ++ count = BUI_ITEM(intent)->bui_format.bui_nextents; ++ extp = BUI_ITEM(intent)->bui_format.bui_extents; ++ ++ tp->t_flags |= XFS_TRANS_DIRTY; ++ budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); ++ set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); ++ ++ buip = xfs_bui_init(tp->t_mountp); ++ memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp)); ++ atomic_set(&buip->bui_next_extent, count); ++ xfs_trans_add_item(tp, &buip->bui_item); ++ set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); ++ return &buip->bui_item; ++} ++ ++static const struct xfs_item_ops xfs_bui_item_ops = { ++ .iop_size = xfs_bui_item_size, ++ .iop_format = xfs_bui_item_format, ++ .iop_unpin = xfs_bui_item_unpin, ++ .iop_release = xfs_bui_item_release, ++ .iop_relog = xfs_bui_item_relog, ++}; ++ ++/* ++ * Allocate and initialize an bui item with the given number of extents. 
++ */ ++struct xfs_bui_log_item * ++xfs_bui_init( ++ struct xfs_mount *mp) ++ ++{ ++ struct xfs_bui_log_item *buip; ++ ++ buip = kmem_zone_zalloc(xfs_bui_zone, 0); ++ ++ xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); ++ buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; ++ buip->bui_format.bui_id = (uintptr_t)(void *)buip; ++ atomic_set(&buip->bui_next_extent, 0); ++ atomic_set(&buip->bui_refcount, 2); ++ ++ return buip; ++} +--- a/fs/xfs/xfs_extfree_item.c ++++ b/fs/xfs/xfs_extfree_item.c +@@ -139,44 +139,6 @@ xfs_efi_item_release( + xfs_efi_release(EFI_ITEM(lip)); + } + +-static const struct xfs_item_ops xfs_efi_item_ops = { +- .iop_size = xfs_efi_item_size, +- .iop_format = xfs_efi_item_format, +- .iop_unpin = xfs_efi_item_unpin, +- .iop_release = xfs_efi_item_release, +-}; +- +- +-/* +- * Allocate and initialize an efi item with the given number of extents. +- */ +-struct xfs_efi_log_item * +-xfs_efi_init( +- struct xfs_mount *mp, +- uint nextents) +- +-{ +- struct xfs_efi_log_item *efip; +- uint size; +- +- ASSERT(nextents > 0); +- if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { +- size = (uint)(sizeof(struct xfs_efi_log_item) + +- ((nextents - 1) * sizeof(xfs_extent_t))); +- efip = kmem_zalloc(size, 0); +- } else { +- efip = kmem_zone_zalloc(xfs_efi_zone, 0); +- } +- +- xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); +- efip->efi_format.efi_nextents = nextents; +- efip->efi_format.efi_id = (uintptr_t)(void *)efip; +- atomic_set(&efip->efi_next_extent, 0); +- atomic_set(&efip->efi_refcount, 2); +- +- return efip; +-} +- + /* + * Copy an EFI format buffer from the given buf, and into the destination + * EFI format structure. +@@ -645,3 +607,69 @@ abort_error: + xfs_trans_cancel(tp); + return error; + } ++ ++/* Relog an intent item to push the log tail forward. 
*/ ++static struct xfs_log_item * ++xfs_efi_item_relog( ++ struct xfs_log_item *intent, ++ struct xfs_trans *tp) ++{ ++ struct xfs_efd_log_item *efdp; ++ struct xfs_efi_log_item *efip; ++ struct xfs_extent *extp; ++ unsigned int count; ++ ++ count = EFI_ITEM(intent)->efi_format.efi_nextents; ++ extp = EFI_ITEM(intent)->efi_format.efi_extents; ++ ++ tp->t_flags |= XFS_TRANS_DIRTY; ++ efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count); ++ efdp->efd_next_extent = count; ++ memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp)); ++ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); ++ ++ efip = xfs_efi_init(tp->t_mountp, count); ++ memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); ++ atomic_set(&efip->efi_next_extent, count); ++ xfs_trans_add_item(tp, &efip->efi_item); ++ set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); ++ return &efip->efi_item; ++} ++ ++static const struct xfs_item_ops xfs_efi_item_ops = { ++ .iop_size = xfs_efi_item_size, ++ .iop_format = xfs_efi_item_format, ++ .iop_unpin = xfs_efi_item_unpin, ++ .iop_release = xfs_efi_item_release, ++ .iop_relog = xfs_efi_item_relog, ++}; ++ ++/* ++ * Allocate and initialize an efi item with the given number of extents. 
++ */ ++struct xfs_efi_log_item * ++xfs_efi_init( ++ struct xfs_mount *mp, ++ uint nextents) ++ ++{ ++ struct xfs_efi_log_item *efip; ++ uint size; ++ ++ ASSERT(nextents > 0); ++ if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { ++ size = (uint)(sizeof(struct xfs_efi_log_item) + ++ ((nextents - 1) * sizeof(xfs_extent_t))); ++ efip = kmem_zalloc(size, 0); ++ } else { ++ efip = kmem_zone_zalloc(xfs_efi_zone, 0); ++ } ++ ++ xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); ++ efip->efi_format.efi_nextents = nextents; ++ efip->efi_format.efi_id = (uintptr_t)(void *)efip; ++ atomic_set(&efip->efi_next_extent, 0); ++ atomic_set(&efip->efi_refcount, 2); ++ ++ return efip; ++} +--- a/fs/xfs/xfs_refcount_item.c ++++ b/fs/xfs/xfs_refcount_item.c +@@ -123,40 +123,6 @@ xfs_cui_item_release( + xfs_cui_release(CUI_ITEM(lip)); + } + +-static const struct xfs_item_ops xfs_cui_item_ops = { +- .iop_size = xfs_cui_item_size, +- .iop_format = xfs_cui_item_format, +- .iop_unpin = xfs_cui_item_unpin, +- .iop_release = xfs_cui_item_release, +-}; +- +-/* +- * Allocate and initialize an cui item with the given number of extents. 
+- */ +-struct xfs_cui_log_item * +-xfs_cui_init( +- struct xfs_mount *mp, +- uint nextents) +- +-{ +- struct xfs_cui_log_item *cuip; +- +- ASSERT(nextents > 0); +- if (nextents > XFS_CUI_MAX_FAST_EXTENTS) +- cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), +- 0); +- else +- cuip = kmem_zone_zalloc(xfs_cui_zone, 0); +- +- xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); +- cuip->cui_format.cui_nextents = nextents; +- cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; +- atomic_set(&cuip->cui_next_extent, 0); +- atomic_set(&cuip->cui_refcount, 2); +- +- return cuip; +-} +- + static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip) + { + return container_of(lip, struct xfs_cud_log_item, cud_item); +@@ -576,3 +542,64 @@ abort_error: + xfs_trans_cancel(tp); + return error; + } ++ ++/* Relog an intent item to push the log tail forward. */ ++static struct xfs_log_item * ++xfs_cui_item_relog( ++ struct xfs_log_item *intent, ++ struct xfs_trans *tp) ++{ ++ struct xfs_cud_log_item *cudp; ++ struct xfs_cui_log_item *cuip; ++ struct xfs_phys_extent *extp; ++ unsigned int count; ++ ++ count = CUI_ITEM(intent)->cui_format.cui_nextents; ++ extp = CUI_ITEM(intent)->cui_format.cui_extents; ++ ++ tp->t_flags |= XFS_TRANS_DIRTY; ++ cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); ++ set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); ++ ++ cuip = xfs_cui_init(tp->t_mountp, count); ++ memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp)); ++ atomic_set(&cuip->cui_next_extent, count); ++ xfs_trans_add_item(tp, &cuip->cui_item); ++ set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); ++ return &cuip->cui_item; ++} ++ ++static const struct xfs_item_ops xfs_cui_item_ops = { ++ .iop_size = xfs_cui_item_size, ++ .iop_format = xfs_cui_item_format, ++ .iop_unpin = xfs_cui_item_unpin, ++ .iop_release = xfs_cui_item_release, ++ .iop_relog = xfs_cui_item_relog, ++}; ++ ++/* ++ * Allocate and initialize an cui item with the given number of extents. 
++ */ ++struct xfs_cui_log_item * ++xfs_cui_init( ++ struct xfs_mount *mp, ++ uint nextents) ++ ++{ ++ struct xfs_cui_log_item *cuip; ++ ++ ASSERT(nextents > 0); ++ if (nextents > XFS_CUI_MAX_FAST_EXTENTS) ++ cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), ++ 0); ++ else ++ cuip = kmem_zone_zalloc(xfs_cui_zone, 0); ++ ++ xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); ++ cuip->cui_format.cui_nextents = nextents; ++ cuip->cui_format.cui_id = (uintptr_t)(void *)cuip; ++ atomic_set(&cuip->cui_next_extent, 0); ++ atomic_set(&cuip->cui_refcount, 2); ++ ++ return cuip; ++} +--- a/fs/xfs/xfs_rmap_item.c ++++ b/fs/xfs/xfs_rmap_item.c +@@ -122,39 +122,6 @@ xfs_rui_item_release( + xfs_rui_release(RUI_ITEM(lip)); + } + +-static const struct xfs_item_ops xfs_rui_item_ops = { +- .iop_size = xfs_rui_item_size, +- .iop_format = xfs_rui_item_format, +- .iop_unpin = xfs_rui_item_unpin, +- .iop_release = xfs_rui_item_release, +-}; +- +-/* +- * Allocate and initialize an rui item with the given number of extents. +- */ +-struct xfs_rui_log_item * +-xfs_rui_init( +- struct xfs_mount *mp, +- uint nextents) +- +-{ +- struct xfs_rui_log_item *ruip; +- +- ASSERT(nextents > 0); +- if (nextents > XFS_RUI_MAX_FAST_EXTENTS) +- ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); +- else +- ruip = kmem_zone_zalloc(xfs_rui_zone, 0); +- +- xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); +- ruip->rui_format.rui_nextents = nextents; +- ruip->rui_format.rui_id = (uintptr_t)(void *)ruip; +- atomic_set(&ruip->rui_next_extent, 0); +- atomic_set(&ruip->rui_refcount, 2); +- +- return ruip; +-} +- + /* + * Copy an RUI format buffer from the given buf, and into the destination + * RUI format structure. The RUI/RUD items were designed not to need any +@@ -600,3 +567,63 @@ abort_error: + xfs_trans_cancel(tp); + return error; + } ++ ++/* Relog an intent item to push the log tail forward. 
*/ ++static struct xfs_log_item * ++xfs_rui_item_relog( ++ struct xfs_log_item *intent, ++ struct xfs_trans *tp) ++{ ++ struct xfs_rud_log_item *rudp; ++ struct xfs_rui_log_item *ruip; ++ struct xfs_map_extent *extp; ++ unsigned int count; ++ ++ count = RUI_ITEM(intent)->rui_format.rui_nextents; ++ extp = RUI_ITEM(intent)->rui_format.rui_extents; ++ ++ tp->t_flags |= XFS_TRANS_DIRTY; ++ rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); ++ set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); ++ ++ ruip = xfs_rui_init(tp->t_mountp, count); ++ memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp)); ++ atomic_set(&ruip->rui_next_extent, count); ++ xfs_trans_add_item(tp, &ruip->rui_item); ++ set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); ++ return &ruip->rui_item; ++} ++ ++static const struct xfs_item_ops xfs_rui_item_ops = { ++ .iop_size = xfs_rui_item_size, ++ .iop_format = xfs_rui_item_format, ++ .iop_unpin = xfs_rui_item_unpin, ++ .iop_release = xfs_rui_item_release, ++ .iop_relog = xfs_rui_item_relog, ++}; ++ ++/* ++ * Allocate and initialize an rui item with the given number of extents. 
++ */ ++struct xfs_rui_log_item * ++xfs_rui_init( ++ struct xfs_mount *mp, ++ uint nextents) ++ ++{ ++ struct xfs_rui_log_item *ruip; ++ ++ ASSERT(nextents > 0); ++ if (nextents > XFS_RUI_MAX_FAST_EXTENTS) ++ ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); ++ else ++ ruip = kmem_zone_zalloc(xfs_rui_zone, 0); ++ ++ xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); ++ ruip->rui_format.rui_nextents = nextents; ++ ruip->rui_format.rui_id = (uintptr_t)(void *)ruip; ++ atomic_set(&ruip->rui_next_extent, 0); ++ atomic_set(&ruip->rui_refcount, 2); ++ ++ return ruip; ++} +--- a/fs/xfs/xfs_stats.c ++++ b/fs/xfs/xfs_stats.c +@@ -23,6 +23,7 @@ int xfs_stats_format(struct xfsstats __p + uint64_t xs_xstrat_bytes = 0; + uint64_t xs_write_bytes = 0; + uint64_t xs_read_bytes = 0; ++ uint64_t defer_relog = 0; + + static const struct xstats_entry { + char *desc; +@@ -70,10 +71,13 @@ int xfs_stats_format(struct xfsstats __p + xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes; + xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes; + xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; ++ defer_relog += per_cpu_ptr(stats, i)->s.defer_relog; + } + + len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", + xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); ++ len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n", ++ defer_relog); + len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n", + #if defined(DEBUG) + 1); +--- a/fs/xfs/xfs_stats.h ++++ b/fs/xfs/xfs_stats.h +@@ -137,6 +137,7 @@ struct __xfsstats { + uint64_t xs_xstrat_bytes; + uint64_t xs_write_bytes; + uint64_t xs_read_bytes; ++ uint64_t defer_relog; + }; + + #define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t)) +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -2418,6 +2418,7 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_cre + DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); + DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); + 
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); ++DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); + + #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT + DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); +--- a/fs/xfs/xfs_trans.h ++++ b/fs/xfs/xfs_trans.h +@@ -77,6 +77,8 @@ struct xfs_item_ops { + void (*iop_release)(struct xfs_log_item *); + xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); + void (*iop_error)(struct xfs_log_item *, xfs_buf_t *); ++ struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, ++ struct xfs_trans *tp); + }; + + /* +@@ -244,4 +246,12 @@ void xfs_trans_buf_copy_type(struct xfs + + extern kmem_zone_t *xfs_trans_zone; + ++static inline struct xfs_log_item * ++xfs_trans_item_relog( ++ struct xfs_log_item *lip, ++ struct xfs_trans *tp) ++{ ++ return lip->li_ops->iop_relog(lip, tp); ++} ++ + #endif /* __XFS_TRANS_H__ */ diff --git a/queue-5.4/xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch b/queue-5.4/xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch new file mode 100644 index 00000000000..3cf2ef1b6c8 --- /dev/null +++ b/queue-5.4/xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch @@ -0,0 +1,162 @@ +From stable-owner@vger.kernel.org Thu Feb 16 08:02:00 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:18 +0530 +Subject: xfs: prevent UAF in xfs_log_item_in_current_chkpt +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-25-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit f8d92a66e810acbef6ddbc0bd0cbd9b117ce8acd upstream. + +[ Continue to interpret xfs_log_item->li_seq as an LSN rather than a CIL sequence + number. ] + +While I was running with KASAN and lockdep enabled, I stumbled upon an +KASAN report about a UAF to a freed CIL checkpoint. 
Looking at the +comment for xfs_log_item_in_current_chkpt, it seems pretty obvious to me +that the original patch to xfs_defer_finish_noroll should have done +something to lock the CIL to prevent it from switching the CIL contexts +while the predicate runs. + +For upper level code that needs to know if a given log item is new +enough not to need relogging, add a new wrapper that takes the CIL +context lock long enough to sample the current CIL context. This is +kind of racy in that the CIL can switch the contexts immediately after +sampling, but that's ok because the consequence is that the defer ops +code is a little slow to relog items. + + ================================================================== + BUG: KASAN: use-after-free in xfs_log_item_in_current_chkpt+0x139/0x160 [xfs] + Read of size 8 at addr ffff88804ea5f608 by task fsstress/527999 + + CPU: 1 PID: 527999 Comm: fsstress Tainted: G D 5.16.0-rc4-xfsx #rc4 + Call Trace: + + dump_stack_lvl+0x45/0x59 + print_address_description.constprop.0+0x1f/0x140 + kasan_report.cold+0x83/0xdf + xfs_log_item_in_current_chkpt+0x139/0x160 + xfs_defer_finish_noroll+0x3bb/0x1e30 + __xfs_trans_commit+0x6c8/0xcf0 + xfs_reflink_remap_extent+0x66f/0x10e0 + xfs_reflink_remap_blocks+0x2dd/0xa90 + xfs_file_remap_range+0x27b/0xc30 + vfs_dedupe_file_range_one+0x368/0x420 + vfs_dedupe_file_range+0x37c/0x5d0 + do_vfs_ioctl+0x308/0x1260 + __x64_sys_ioctl+0xa1/0x170 + do_syscall_64+0x35/0x80 + entry_SYSCALL_64_after_hwframe+0x44/0xae + RIP: 0033:0x7f2c71a2950b + Code: 0f 1e fa 48 8b 05 85 39 0d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff +ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 +f0 ff ff 73 01 c3 48 8b 0d 55 39 0d 00 f7 d8 64 89 01 48 + RSP: 002b:00007ffe8c0e03c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 + RAX: ffffffffffffffda RBX: 00005600862a8740 RCX: 00007f2c71a2950b + RDX: 00005600862a7be0 RSI: 00000000c0189436 RDI: 0000000000000004 + RBP: 000000000000000b R08: 0000000000000027 R09: 0000000000000003 
+ R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000005a + R13: 00005600862804a8 R14: 0000000000016000 R15: 00005600862a8a20 + + + Allocated by task 464064: + kasan_save_stack+0x1e/0x50 + __kasan_kmalloc+0x81/0xa0 + kmem_alloc+0xcd/0x2c0 [xfs] + xlog_cil_ctx_alloc+0x17/0x1e0 [xfs] + xlog_cil_push_work+0x141/0x13d0 [xfs] + process_one_work+0x7f6/0x1380 + worker_thread+0x59d/0x1040 + kthread+0x3b0/0x490 + ret_from_fork+0x1f/0x30 + + Freed by task 51: + kasan_save_stack+0x1e/0x50 + kasan_set_track+0x21/0x30 + kasan_set_free_info+0x20/0x30 + __kasan_slab_free+0xed/0x130 + slab_free_freelist_hook+0x7f/0x160 + kfree+0xde/0x340 + xlog_cil_committed+0xbfd/0xfe0 [xfs] + xlog_cil_process_committed+0x103/0x1c0 [xfs] + xlog_state_do_callback+0x45d/0xbd0 [xfs] + xlog_ioend_work+0x116/0x1c0 [xfs] + process_one_work+0x7f6/0x1380 + worker_thread+0x59d/0x1040 + kthread+0x3b0/0x490 + ret_from_fork+0x1f/0x30 + + Last potentially related work creation: + kasan_save_stack+0x1e/0x50 + __kasan_record_aux_stack+0xb7/0xc0 + insert_work+0x48/0x2e0 + __queue_work+0x4e7/0xda0 + queue_work_on+0x69/0x80 + xlog_cil_push_now.isra.0+0x16b/0x210 [xfs] + xlog_cil_force_seq+0x1b7/0x850 [xfs] + xfs_log_force_seq+0x1c7/0x670 [xfs] + xfs_file_fsync+0x7c1/0xa60 [xfs] + __x64_sys_fsync+0x52/0x80 + do_syscall_64+0x35/0x80 + entry_SYSCALL_64_after_hwframe+0x44/0xae + + The buggy address belongs to the object at ffff88804ea5f600 + which belongs to the cache kmalloc-256 of size 256 + The buggy address is located 8 bytes inside of + 256-byte region [ffff88804ea5f600, ffff88804ea5f700) + The buggy address belongs to the page: + page:ffffea00013a9780 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88804ea5ea00 pfn:0x4ea5e + head:ffffea00013a9780 order:1 compound_mapcount:0 + flags: 0x4fff80000010200(slab|head|node=1|zone=1|lastcpupid=0xfff) + raw: 04fff80000010200 ffffea0001245908 ffffea00011bd388 ffff888004c42b40 + raw: ffff88804ea5ea00 0000000000100009 00000001ffffffff 0000000000000000 + 
page dumped because: kasan: bad access detected + + Memory state around the buggy address: + ffff88804ea5f500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ffff88804ea5f580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + >ffff88804ea5f600: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ^ + ffff88804ea5f680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ffff88804ea5f700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ================================================================== + +Fixes: 4e919af7827a ("xfs: periodically relog deferred intent items") +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_cil.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +--- a/fs/xfs/xfs_log_cil.c ++++ b/fs/xfs/xfs_log_cil.c +@@ -1178,21 +1178,19 @@ out_shutdown: + */ + bool + xfs_log_item_in_current_chkpt( +- struct xfs_log_item *lip) ++ struct xfs_log_item *lip) + { +- struct xfs_cil_ctx *ctx; ++ struct xfs_cil *cil = lip->li_mountp->m_log->l_cilp; + + if (list_empty(&lip->li_cil)) + return false; + +- ctx = lip->li_mountp->m_log->l_cilp->xc_ctx; +- + /* + * li_seq is written on the first commit of a log item to record the + * first checkpoint it is written to. Hence if it is different to the + * current sequence, we're in a new checkpoint. 
+ */ +- if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0) ++ if (XFS_LSN_CMP(lip->li_seq, READ_ONCE(cil->xc_current_sequence)) != 0) + return false; + return true; + } diff --git a/queue-5.4/xfs-proper-replay-of-deferred-ops-queued-during-log-recovery.patch b/queue-5.4/xfs-proper-replay-of-deferred-ops-queued-during-log-recovery.patch new file mode 100644 index 00000000000..246e9ba5c9f --- /dev/null +++ b/queue-5.4/xfs-proper-replay-of-deferred-ops-queued-during-log-recovery.patch @@ -0,0 +1,666 @@ +From chandan.babu@oracle.com Thu Feb 16 06:21:53 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:05 +0530 +Subject: xfs: proper replay of deferred ops queued during log recovery +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-12-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit e6fff81e487089e47358a028526a9f63cdbcd503 upstream. + +When we replay unfinished intent items that have been recovered from the +log, it's possible that the replay will cause the creation of more +deferred work items. As outlined in commit 509955823cc9c ("xfs: log +recovery should replay deferred ops in order"), later work items have an +implicit ordering dependency on earlier work items. Therefore, recovery +must replay the items (both recovered and created) in the same order +that they would have been during normal operation. + +For log recovery, we enforce this ordering by using an empty transaction +to collect deferred ops that get created in the process of recovering a +log intent item to prevent them from being committed before the rest of +the recovered intent items. 
After we finish committing all the +recovered log items, we allocate a transaction with an enormous block +reservation, splice our huge list of created deferred ops into that +transaction, and commit it, thereby finishing all those ops. + +This is /really/ hokey -- it's the one place in XFS where we allow +nested transactions; the splicing of the defer ops list is is inelegant +and has to be done twice per recovery function; and the broken way we +handle inode pointers and block reservations cause subtle use-after-free +and allocator problems that will be fixed by this patch and the two +patches after it. + +Therefore, replace the hokey empty transaction with a structure designed +to capture each chain of deferred ops that are created as part of +recovering a single unfinished log intent. Finally, refactor the loop +that replays those chains to do so using one transaction per chain. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Reviewed-by: Christoph Hellwig +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_defer.c | 89 ++++++++++++++++++++++++-- + fs/xfs/libxfs/xfs_defer.h | 19 +++++ + fs/xfs/xfs_bmap_item.c | 18 +---- + fs/xfs/xfs_bmap_item.h | 3 + fs/xfs/xfs_extfree_item.c | 9 +- + fs/xfs/xfs_extfree_item.h | 4 - + fs/xfs/xfs_log_recover.c | 151 +++++++++++++++++++++++++-------------------- + fs/xfs/xfs_refcount_item.c | 18 +---- + fs/xfs/xfs_refcount_item.h | 3 + fs/xfs/xfs_rmap_item.c | 8 +- + fs/xfs/xfs_rmap_item.h | 3 + 11 files changed, 213 insertions(+), 112 deletions(-) + +--- a/fs/xfs/libxfs/xfs_defer.c ++++ b/fs/xfs/libxfs/xfs_defer.c +@@ -563,14 +563,89 @@ xfs_defer_move( + * + * Create and log intent items for all the work that we're capturing so that we + * can be assured that the items will get replayed if the system goes down +- * before log recovery gets a chance to finish the work it put off. Then we +- * move the chain from stp to dtp. 
++ * before log recovery gets a chance to finish the work it put off. The entire ++ * deferred ops state is transferred to the capture structure and the ++ * transaction is then ready for the caller to commit it. If there are no ++ * intent items to capture, this function returns NULL. ++ */ ++static struct xfs_defer_capture * ++xfs_defer_ops_capture( ++ struct xfs_trans *tp) ++{ ++ struct xfs_defer_capture *dfc; ++ ++ if (list_empty(&tp->t_dfops)) ++ return NULL; ++ ++ /* Create an object to capture the defer ops. */ ++ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); ++ INIT_LIST_HEAD(&dfc->dfc_list); ++ INIT_LIST_HEAD(&dfc->dfc_dfops); ++ ++ xfs_defer_create_intents(tp); ++ ++ /* Move the dfops chain and transaction state to the capture struct. */ ++ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); ++ dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; ++ tp->t_flags &= ~XFS_TRANS_LOWMODE; ++ ++ return dfc; ++} ++ ++/* Release all resources that we used to capture deferred ops. */ ++void ++xfs_defer_ops_release( ++ struct xfs_mount *mp, ++ struct xfs_defer_capture *dfc) ++{ ++ xfs_defer_cancel_list(mp, &dfc->dfc_dfops); ++ kmem_free(dfc); ++} ++ ++/* ++ * Capture any deferred ops and commit the transaction. This is the last step ++ * needed to finish a log intent item that we recovered from the log. ++ */ ++int ++xfs_defer_ops_capture_and_commit( ++ struct xfs_trans *tp, ++ struct list_head *capture_list) ++{ ++ struct xfs_mount *mp = tp->t_mountp; ++ struct xfs_defer_capture *dfc; ++ int error; ++ ++ /* If we don't capture anything, commit transaction and exit. */ ++ dfc = xfs_defer_ops_capture(tp); ++ if (!dfc) ++ return xfs_trans_commit(tp); ++ ++ /* Commit the transaction and add the capture structure to the list. 
*/ ++ error = xfs_trans_commit(tp); ++ if (error) { ++ xfs_defer_ops_release(mp, dfc); ++ return error; ++ } ++ ++ list_add_tail(&dfc->dfc_list, capture_list); ++ return 0; ++} ++ ++/* ++ * Attach a chain of captured deferred ops to a new transaction and free the ++ * capture structure. + */ + void +-xfs_defer_capture( +- struct xfs_trans *dtp, +- struct xfs_trans *stp) ++xfs_defer_ops_continue( ++ struct xfs_defer_capture *dfc, ++ struct xfs_trans *tp) + { +- xfs_defer_create_intents(stp); +- xfs_defer_move(dtp, stp); ++ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ++ ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); ++ ++ /* Move captured dfops chain and state to the transaction. */ ++ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); ++ tp->t_flags |= dfc->dfc_tpflags; ++ ++ kmem_free(dfc); + } +--- a/fs/xfs/libxfs/xfs_defer.h ++++ b/fs/xfs/libxfs/xfs_defer.h +@@ -7,6 +7,7 @@ + #define __XFS_DEFER_H__ + + struct xfs_defer_op_type; ++struct xfs_defer_capture; + + /* + * Header for deferred operation list. +@@ -62,9 +63,25 @@ extern const struct xfs_defer_op_type xf + extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; + + /* ++ * This structure enables a dfops user to detach the chain of deferred ++ * operations from a transaction so that they can be continued later. ++ */ ++struct xfs_defer_capture { ++ /* List of other capture structures. */ ++ struct list_head dfc_list; ++ ++ /* Deferred ops state saved from the transaction. */ ++ struct list_head dfc_dfops; ++ unsigned int dfc_tpflags; ++}; ++ ++/* + * Functions to capture a chain of deferred operations and continue them later. + * This doesn't normally happen except log recovery. 
+ */ +-void xfs_defer_capture(struct xfs_trans *dtp, struct xfs_trans *stp); ++int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, ++ struct list_head *capture_list); ++void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp); ++void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d); + + #endif /* __XFS_DEFER_H__ */ +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -425,8 +425,8 @@ const struct xfs_defer_op_type xfs_bmap_ + */ + int + xfs_bui_recover( +- struct xfs_trans *parent_tp, +- struct xfs_bui_log_item *buip) ++ struct xfs_bui_log_item *buip, ++ struct list_head *capture_list) + { + int error = 0; + unsigned int bui_type; +@@ -442,7 +442,7 @@ xfs_bui_recover( + struct xfs_trans *tp; + struct xfs_inode *ip = NULL; + struct xfs_bmbt_irec irec; +- struct xfs_mount *mp = parent_tp->t_mountp; ++ struct xfs_mount *mp = buip->bui_item.li_mountp; + + ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags)); + +@@ -491,12 +491,7 @@ xfs_bui_recover( + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); + if (error) + return error; +- /* +- * Recovery stashes all deferred ops during intent processing and +- * finishes them on completion. Transfer current dfops state to this +- * transaction and transfer the result back before we return. +- */ +- xfs_defer_move(tp, parent_tp); ++ + budp = xfs_trans_get_bud(tp, buip); + + /* Grab the inode. 
*/ +@@ -541,15 +536,12 @@ xfs_bui_recover( + } + + set_bit(XFS_BUI_RECOVERED, &buip->bui_flags); +- xfs_defer_capture(parent_tp, tp); +- error = xfs_trans_commit(tp); ++ error = xfs_defer_ops_capture_and_commit(tp, capture_list); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); +- + return error; + + err_inode: +- xfs_defer_move(parent_tp, tp); + xfs_trans_cancel(tp); + if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); +--- a/fs/xfs/xfs_bmap_item.h ++++ b/fs/xfs/xfs_bmap_item.h +@@ -77,6 +77,7 @@ extern struct kmem_zone *xfs_bud_zone; + struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *); + void xfs_bui_item_free(struct xfs_bui_log_item *); + void xfs_bui_release(struct xfs_bui_log_item *); +-int xfs_bui_recover(struct xfs_trans *parent_tp, struct xfs_bui_log_item *buip); ++int xfs_bui_recover(struct xfs_bui_log_item *buip, ++ struct list_head *capture_list); + + #endif /* __XFS_BMAP_ITEM_H__ */ +--- a/fs/xfs/xfs_extfree_item.c ++++ b/fs/xfs/xfs_extfree_item.c +@@ -586,9 +586,10 @@ const struct xfs_defer_op_type xfs_agfl_ + */ + int + xfs_efi_recover( +- struct xfs_mount *mp, +- struct xfs_efi_log_item *efip) ++ struct xfs_efi_log_item *efip, ++ struct list_head *capture_list) + { ++ struct xfs_mount *mp = efip->efi_item.li_mountp; + struct xfs_efd_log_item *efdp; + struct xfs_trans *tp; + int i; +@@ -637,8 +638,8 @@ xfs_efi_recover( + } + + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); +- error = xfs_trans_commit(tp); +- return error; ++ ++ return xfs_defer_ops_capture_and_commit(tp, capture_list); + + abort_error: + xfs_trans_cancel(tp); +--- a/fs/xfs/xfs_extfree_item.h ++++ b/fs/xfs/xfs_extfree_item.h +@@ -84,7 +84,7 @@ int xfs_efi_copy_format(xfs_log_iovec_ + void xfs_efi_item_free(struct xfs_efi_log_item *); + void xfs_efi_release(struct xfs_efi_log_item *); + +-int xfs_efi_recover(struct xfs_mount *mp, +- struct xfs_efi_log_item *efip); ++int xfs_efi_recover(struct xfs_efi_log_item *efip, ++ struct list_head *capture_list); + + #endif /* 
__XFS_EXTFREE_ITEM_H__ */ +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -4587,9 +4587,9 @@ xlog_recover_process_data( + /* Recover the EFI if necessary. */ + STATIC int + xlog_recover_process_efi( +- struct xfs_mount *mp, + struct xfs_ail *ailp, +- struct xfs_log_item *lip) ++ struct xfs_log_item *lip, ++ struct list_head *capture_list) + { + struct xfs_efi_log_item *efip; + int error; +@@ -4602,7 +4602,7 @@ xlog_recover_process_efi( + return 0; + + spin_unlock(&ailp->ail_lock); +- error = xfs_efi_recover(mp, efip); ++ error = xfs_efi_recover(efip, capture_list); + spin_lock(&ailp->ail_lock); + + return error; +@@ -4627,9 +4627,9 @@ xlog_recover_cancel_efi( + /* Recover the RUI if necessary. */ + STATIC int + xlog_recover_process_rui( +- struct xfs_mount *mp, + struct xfs_ail *ailp, +- struct xfs_log_item *lip) ++ struct xfs_log_item *lip, ++ struct list_head *capture_list) + { + struct xfs_rui_log_item *ruip; + int error; +@@ -4642,7 +4642,7 @@ xlog_recover_process_rui( + return 0; + + spin_unlock(&ailp->ail_lock); +- error = xfs_rui_recover(mp, ruip); ++ error = xfs_rui_recover(ruip, capture_list); + spin_lock(&ailp->ail_lock); + + return error; +@@ -4667,9 +4667,9 @@ xlog_recover_cancel_rui( + /* Recover the CUI if necessary. */ + STATIC int + xlog_recover_process_cui( +- struct xfs_trans *parent_tp, + struct xfs_ail *ailp, +- struct xfs_log_item *lip) ++ struct xfs_log_item *lip, ++ struct list_head *capture_list) + { + struct xfs_cui_log_item *cuip; + int error; +@@ -4682,7 +4682,7 @@ xlog_recover_process_cui( + return 0; + + spin_unlock(&ailp->ail_lock); +- error = xfs_cui_recover(parent_tp, cuip); ++ error = xfs_cui_recover(cuip, capture_list); + spin_lock(&ailp->ail_lock); + + return error; +@@ -4707,9 +4707,9 @@ xlog_recover_cancel_cui( + /* Recover the BUI if necessary. 
*/ + STATIC int + xlog_recover_process_bui( +- struct xfs_trans *parent_tp, + struct xfs_ail *ailp, +- struct xfs_log_item *lip) ++ struct xfs_log_item *lip, ++ struct list_head *capture_list) + { + struct xfs_bui_log_item *buip; + int error; +@@ -4722,7 +4722,7 @@ xlog_recover_process_bui( + return 0; + + spin_unlock(&ailp->ail_lock); +- error = xfs_bui_recover(parent_tp, buip); ++ error = xfs_bui_recover(buip, capture_list); + spin_lock(&ailp->ail_lock); + + return error; +@@ -4761,37 +4761,65 @@ static inline bool xlog_item_is_intent(s + /* Take all the collected deferred ops and finish them in order. */ + static int + xlog_finish_defer_ops( +- struct xfs_trans *parent_tp) ++ struct xfs_mount *mp, ++ struct list_head *capture_list) + { +- struct xfs_mount *mp = parent_tp->t_mountp; ++ struct xfs_defer_capture *dfc, *next; + struct xfs_trans *tp; + int64_t freeblks; +- uint resblks; +- int error; ++ uint64_t resblks; ++ int error = 0; + +- /* +- * We're finishing the defer_ops that accumulated as a result of +- * recovering unfinished intent items during log recovery. We +- * reserve an itruncate transaction because it is the largest +- * permanent transaction type. Since we're the only user of the fs +- * right now, take 93% (15/16) of the available free blocks. Use +- * weird math to avoid a 64-bit division. +- */ +- freeblks = percpu_counter_sum(&mp->m_fdblocks); +- if (freeblks <= 0) +- return -ENOSPC; +- resblks = min_t(int64_t, UINT_MAX, freeblks); +- resblks = (resblks * 15) >> 4; +- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, +- 0, XFS_TRANS_RESERVE, &tp); +- if (error) +- return error; +- /* transfer all collected dfops to this transaction */ +- xfs_defer_move(tp, parent_tp); ++ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { ++ /* ++ * We're finishing the defer_ops that accumulated as a result ++ * of recovering unfinished intent items during log recovery. 
++ * We reserve an itruncate transaction because it is the ++ * largest permanent transaction type. Since we're the only ++ * user of the fs right now, take 93% (15/16) of the available ++ * free blocks. Use weird math to avoid a 64-bit division. ++ */ ++ freeblks = percpu_counter_sum(&mp->m_fdblocks); ++ if (freeblks <= 0) ++ return -ENOSPC; ++ ++ resblks = min_t(uint64_t, UINT_MAX, freeblks); ++ resblks = (resblks * 15) >> 4; ++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, ++ 0, XFS_TRANS_RESERVE, &tp); ++ if (error) ++ return error; ++ ++ /* ++ * Transfer to this new transaction all the dfops we captured ++ * from recovering a single intent item. ++ */ ++ list_del_init(&dfc->dfc_list); ++ xfs_defer_ops_continue(dfc, tp); ++ ++ error = xfs_trans_commit(tp); ++ if (error) ++ return error; ++ } + +- return xfs_trans_commit(tp); ++ ASSERT(list_empty(capture_list)); ++ return 0; + } + ++/* Release all the captured defer ops and capture structures in this list. */ ++static void ++xlog_abort_defer_ops( ++ struct xfs_mount *mp, ++ struct list_head *capture_list) ++{ ++ struct xfs_defer_capture *dfc; ++ struct xfs_defer_capture *next; ++ ++ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { ++ list_del_init(&dfc->dfc_list); ++ xfs_defer_ops_release(mp, dfc); ++ } ++} + /* + * When this is called, all of the log intent items which did not have + * corresponding log done items should be in the AIL. 
What we do now +@@ -4812,35 +4840,23 @@ STATIC int + xlog_recover_process_intents( + struct xlog *log) + { +- struct xfs_trans *parent_tp; ++ LIST_HEAD(capture_list); + struct xfs_ail_cursor cur; + struct xfs_log_item *lip; + struct xfs_ail *ailp; +- int error; ++ int error = 0; + #if defined(DEBUG) || defined(XFS_WARN) + xfs_lsn_t last_lsn; + #endif + +- /* +- * The intent recovery handlers commit transactions to complete recovery +- * for individual intents, but any new deferred operations that are +- * queued during that process are held off until the very end. The +- * purpose of this transaction is to serve as a container for deferred +- * operations. Each intent recovery handler must transfer dfops here +- * before its local transaction commits, and we'll finish the entire +- * list below. +- */ +- error = xfs_trans_alloc_empty(log->l_mp, &parent_tp); +- if (error) +- return error; +- + ailp = log->l_ailp; + spin_lock(&ailp->ail_lock); +- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + #if defined(DEBUG) || defined(XFS_WARN) + last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); + #endif +- while (lip != NULL) { ++ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); ++ lip != NULL; ++ lip = xfs_trans_ail_cursor_next(ailp, &cur)) { + /* + * We're done when we see something other than an intent. + * There should be no intents left in the AIL now. +@@ -4862,35 +4878,40 @@ xlog_recover_process_intents( + + /* + * NOTE: If your intent processing routine can create more +- * deferred ops, you /must/ attach them to the dfops in this +- * routine or else those subsequent intents will get ++ * deferred ops, you /must/ attach them to the capture list in ++ * the recover routine or else those subsequent intents will be + * replayed in the wrong order! 
+ */ + switch (lip->li_type) { + case XFS_LI_EFI: +- error = xlog_recover_process_efi(log->l_mp, ailp, lip); ++ error = xlog_recover_process_efi(ailp, lip, &capture_list); + break; + case XFS_LI_RUI: +- error = xlog_recover_process_rui(log->l_mp, ailp, lip); ++ error = xlog_recover_process_rui(ailp, lip, &capture_list); + break; + case XFS_LI_CUI: +- error = xlog_recover_process_cui(parent_tp, ailp, lip); ++ error = xlog_recover_process_cui(ailp, lip, &capture_list); + break; + case XFS_LI_BUI: +- error = xlog_recover_process_bui(parent_tp, ailp, lip); ++ error = xlog_recover_process_bui(ailp, lip, &capture_list); + break; + } + if (error) +- goto out; +- lip = xfs_trans_ail_cursor_next(ailp, &cur); ++ break; + } +-out: ++ + xfs_trans_ail_cursor_done(&cur); + spin_unlock(&ailp->ail_lock); +- if (!error) +- error = xlog_finish_defer_ops(parent_tp); +- xfs_trans_cancel(parent_tp); ++ if (error) ++ goto err; ++ ++ error = xlog_finish_defer_ops(log->l_mp, &capture_list); ++ if (error) ++ goto err; + ++ return 0; ++err: ++ xlog_abort_defer_ops(log->l_mp, &capture_list); + return error; + } + +--- a/fs/xfs/xfs_refcount_item.c ++++ b/fs/xfs/xfs_refcount_item.c +@@ -439,8 +439,8 @@ const struct xfs_defer_op_type xfs_refco + */ + int + xfs_cui_recover( +- struct xfs_trans *parent_tp, +- struct xfs_cui_log_item *cuip) ++ struct xfs_cui_log_item *cuip, ++ struct list_head *capture_list) + { + int i; + int error = 0; +@@ -456,7 +456,7 @@ xfs_cui_recover( + xfs_extlen_t new_len; + struct xfs_bmbt_irec irec; + bool requeue_only = false; +- struct xfs_mount *mp = parent_tp->t_mountp; ++ struct xfs_mount *mp = cuip->cui_item.li_mountp; + + ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags)); + +@@ -511,12 +511,7 @@ xfs_cui_recover( + mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp); + if (error) + return error; +- /* +- * Recovery stashes all deferred ops during intent processing and +- * finishes them on completion. 
Transfer current dfops state to this +- * transaction and transfer the result back before we return. +- */ +- xfs_defer_move(tp, parent_tp); ++ + cudp = xfs_trans_get_cud(tp, cuip); + + for (i = 0; i < cuip->cui_format.cui_nextents; i++) { +@@ -574,13 +569,10 @@ xfs_cui_recover( + + xfs_refcount_finish_one_cleanup(tp, rcur, error); + set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); +- xfs_defer_capture(parent_tp, tp); +- error = xfs_trans_commit(tp); +- return error; ++ return xfs_defer_ops_capture_and_commit(tp, capture_list); + + abort_error: + xfs_refcount_finish_one_cleanup(tp, rcur, error); +- xfs_defer_move(parent_tp, tp); + xfs_trans_cancel(tp); + return error; + } +--- a/fs/xfs/xfs_refcount_item.h ++++ b/fs/xfs/xfs_refcount_item.h +@@ -80,6 +80,7 @@ extern struct kmem_zone *xfs_cud_zone; + struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint); + void xfs_cui_item_free(struct xfs_cui_log_item *); + void xfs_cui_release(struct xfs_cui_log_item *); +-int xfs_cui_recover(struct xfs_trans *parent_tp, struct xfs_cui_log_item *cuip); ++int xfs_cui_recover(struct xfs_cui_log_item *cuip, ++ struct list_head *capture_list); + + #endif /* __XFS_REFCOUNT_ITEM_H__ */ +--- a/fs/xfs/xfs_rmap_item.c ++++ b/fs/xfs/xfs_rmap_item.c +@@ -483,9 +483,10 @@ const struct xfs_defer_op_type xfs_rmap_ + */ + int + xfs_rui_recover( +- struct xfs_mount *mp, +- struct xfs_rui_log_item *ruip) ++ struct xfs_rui_log_item *ruip, ++ struct list_head *capture_list) + { ++ struct xfs_mount *mp = ruip->rui_item.li_mountp; + int i; + int error = 0; + struct xfs_map_extent *rmap; +@@ -592,8 +593,7 @@ xfs_rui_recover( + + xfs_rmap_finish_one_cleanup(tp, rcur, error); + set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags); +- error = xfs_trans_commit(tp); +- return error; ++ return xfs_defer_ops_capture_and_commit(tp, capture_list); + + abort_error: + xfs_rmap_finish_one_cleanup(tp, rcur, error); +--- a/fs/xfs/xfs_rmap_item.h ++++ b/fs/xfs/xfs_rmap_item.h +@@ -82,6 +82,7 @@ int 
xfs_rui_copy_format(struct xfs_log_i + struct xfs_rui_log_format *dst_rui_fmt); + void xfs_rui_item_free(struct xfs_rui_log_item *); + void xfs_rui_release(struct xfs_rui_log_item *); +-int xfs_rui_recover(struct xfs_mount *mp, struct xfs_rui_log_item *ruip); ++int xfs_rui_recover(struct xfs_rui_log_item *ruip, ++ struct list_head *capture_list); + + #endif /* __XFS_RMAP_ITEM_H__ */ diff --git a/queue-5.4/xfs-refactor-xfs_defer_finish_noroll.patch b/queue-5.4/xfs-refactor-xfs_defer_finish_noroll.patch new file mode 100644 index 00000000000..7895d34420e --- /dev/null +++ b/queue-5.4/xfs-refactor-xfs_defer_finish_noroll.patch @@ -0,0 +1,190 @@ +From chandan.babu@oracle.com Thu Feb 16 06:21:31 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:02 +0530 +Subject: xfs: refactor xfs_defer_finish_noroll +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-9-chandan.babu@oracle.com> + +From: Christoph Hellwig + +commit bb47d79750f1a68a75d4c7defc2da934ba31de14 upstream. + +Split out a helper that operates on a single xfs_defer_pending structure +to untangle the code. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_defer.c | 128 +++++++++++++++++++++------------------------- + 1 file changed, 59 insertions(+), 69 deletions(-) + +--- a/fs/xfs/libxfs/xfs_defer.c ++++ b/fs/xfs/libxfs/xfs_defer.c +@@ -360,6 +360,53 @@ xfs_defer_cancel_list( + } + + /* ++ * Log an intent-done item for the first pending intent, and finish the work ++ * items. 
++ */ ++static int ++xfs_defer_finish_one( ++ struct xfs_trans *tp, ++ struct xfs_defer_pending *dfp) ++{ ++ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; ++ void *state = NULL; ++ struct list_head *li, *n; ++ int error; ++ ++ trace_xfs_defer_pending_finish(tp->t_mountp, dfp); ++ ++ dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count); ++ list_for_each_safe(li, n, &dfp->dfp_work) { ++ list_del(li); ++ dfp->dfp_count--; ++ error = ops->finish_item(tp, li, dfp->dfp_done, &state); ++ if (error == -EAGAIN) { ++ /* ++ * Caller wants a fresh transaction; put the work item ++ * back on the list and log a new log intent item to ++ * replace the old one. See "Requesting a Fresh ++ * Transaction while Finishing Deferred Work" above. ++ */ ++ list_add(li, &dfp->dfp_work); ++ dfp->dfp_count++; ++ dfp->dfp_done = NULL; ++ xfs_defer_create_intent(tp, dfp, false); ++ } ++ ++ if (error) ++ goto out; ++ } ++ ++ /* Done with the dfp, free it. */ ++ list_del(&dfp->dfp_list); ++ kmem_free(dfp); ++out: ++ if (ops->finish_cleanup) ++ ops->finish_cleanup(tp, state, error); ++ return error; ++} ++ ++/* + * Finish all the pending work. This involves logging intent items for + * any work items that wandered in since the last transaction roll (if + * one has even happened), rolling the transaction, and finishing the +@@ -372,11 +419,7 @@ xfs_defer_finish_noroll( + struct xfs_trans **tp) + { + struct xfs_defer_pending *dfp; +- struct list_head *li; +- struct list_head *n; +- void *state; + int error = 0; +- const struct xfs_defer_op_type *ops; + LIST_HEAD(dop_pending); + + ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); +@@ -385,83 +428,30 @@ xfs_defer_finish_noroll( + + /* Until we run out of pending work to finish... 
*/ + while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { +- /* log intents and pull in intake items */ + xfs_defer_create_intents(*tp); + list_splice_tail_init(&(*tp)->t_dfops, &dop_pending); + +- /* +- * Roll the transaction. +- */ + error = xfs_defer_trans_roll(tp); + if (error) +- goto out; ++ goto out_shutdown; + +- /* Log an intent-done item for the first pending item. */ + dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, + dfp_list); +- ops = defer_op_types[dfp->dfp_type]; +- trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp); +- dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent, +- dfp->dfp_count); +- +- /* Finish the work items. */ +- state = NULL; +- list_for_each_safe(li, n, &dfp->dfp_work) { +- list_del(li); +- dfp->dfp_count--; +- error = ops->finish_item(*tp, li, dfp->dfp_done, +- &state); +- if (error == -EAGAIN) { +- /* +- * Caller wants a fresh transaction; +- * put the work item back on the list +- * and jump out. +- */ +- list_add(li, &dfp->dfp_work); +- dfp->dfp_count++; +- break; +- } else if (error) { +- /* +- * Clean up after ourselves and jump out. +- * xfs_defer_cancel will take care of freeing +- * all these lists and stuff. +- */ +- if (ops->finish_cleanup) +- ops->finish_cleanup(*tp, state, error); +- goto out; +- } +- } +- if (error == -EAGAIN) { +- /* +- * Caller wants a fresh transaction, so log a new log +- * intent item to replace the old one and roll the +- * transaction. See "Requesting a Fresh Transaction +- * while Finishing Deferred Work" above. +- */ +- dfp->dfp_done = NULL; +- xfs_defer_create_intent(*tp, dfp, false); +- } else { +- /* Done with the dfp, free it. 
*/ +- list_del(&dfp->dfp_list); +- kmem_free(dfp); +- } +- +- if (ops->finish_cleanup) +- ops->finish_cleanup(*tp, state, error); +- } +- +-out: +- if (error) { +- xfs_defer_trans_abort(*tp, &dop_pending); +- xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); +- trace_xfs_defer_finish_error(*tp, error); +- xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); +- xfs_defer_cancel(*tp); +- return error; ++ error = xfs_defer_finish_one(*tp, dfp); ++ if (error && error != -EAGAIN) ++ goto out_shutdown; + } + + trace_xfs_defer_finish_done(*tp, _RET_IP_); + return 0; ++ ++out_shutdown: ++ xfs_defer_trans_abort(*tp, &dop_pending); ++ xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE); ++ trace_xfs_defer_finish_error(*tp, error); ++ xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending); ++ xfs_defer_cancel(*tp); ++ return error; + } + + int diff --git a/queue-5.4/xfs-remove-the-xfs_efd_log_item_t-typedef.patch b/queue-5.4/xfs-remove-the-xfs_efd_log_item_t-typedef.patch new file mode 100644 index 00000000000..5379c58812c --- /dev/null +++ b/queue-5.4/xfs-remove-the-xfs_efd_log_item_t-typedef.patch @@ -0,0 +1,52 @@ +From chandan.babu@oracle.com Thu Feb 16 06:20:47 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:49:56 +0530 +Subject: xfs: remove the xfs_efd_log_item_t typedef +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-3-chandan.babu@oracle.com> + +From: Christoph Hellwig + +commit c84e819090f39e96e4d432c9047a50d2424f99e0 upstream. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_extfree_item.h | 4 ++-- + fs/xfs/xfs_super.c | 2 +- + 2 files changed, 3 insertions(+), 3 deletions(-) + +--- a/fs/xfs/xfs_extfree_item.h ++++ b/fs/xfs/xfs_extfree_item.h +@@ -63,12 +63,12 @@ struct xfs_efi_log_item { + * the fact that some extents earlier mentioned in an efi item + * have been freed. + */ +-typedef struct xfs_efd_log_item { ++struct xfs_efd_log_item { + struct xfs_log_item efd_item; + struct xfs_efi_log_item *efd_efip; + uint efd_next_extent; + xfs_efd_log_format_t efd_format; +-} xfs_efd_log_item_t; ++}; + + /* + * Max number of extents in fast allocation path. +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -1914,7 +1914,7 @@ xfs_init_zones(void) + if (!xfs_buf_item_zone) + goto out_destroy_trans_zone; + +- xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + ++ xfs_efd_zone = kmem_zone_init((sizeof(struct xfs_efd_log_item) + + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efd_item"); + if (!xfs_efd_zone) diff --git a/queue-5.4/xfs-remove-the-xfs_efi_log_item_t-typedef.patch b/queue-5.4/xfs-remove-the-xfs_efi_log_item_t-typedef.patch new file mode 100644 index 00000000000..7089906556a --- /dev/null +++ b/queue-5.4/xfs-remove-the-xfs_efi_log_item_t-typedef.patch @@ -0,0 +1,108 @@ +From stable-owner@vger.kernel.org Thu Feb 16 08:04:21 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:49:55 +0530 +Subject: xfs: remove the xfs_efi_log_item_t typedef +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-2-chandan.babu@oracle.com> + +From: Christoph Hellwig + +commit 82ff450b2d936d778361a1de43eb078cc043c7fe upstream. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_extfree_item.c | 2 +- + fs/xfs/xfs_extfree_item.h | 10 +++++----- + fs/xfs/xfs_log_recover.c | 4 ++-- + fs/xfs/xfs_super.c | 2 +- + 4 files changed, 9 insertions(+), 9 deletions(-) + +--- a/fs/xfs/xfs_extfree_item.c ++++ b/fs/xfs/xfs_extfree_item.c +@@ -161,7 +161,7 @@ xfs_efi_init( + + ASSERT(nextents > 0); + if (nextents > XFS_EFI_MAX_FAST_EXTENTS) { +- size = (uint)(sizeof(xfs_efi_log_item_t) + ++ size = (uint)(sizeof(struct xfs_efi_log_item) + + ((nextents - 1) * sizeof(xfs_extent_t))); + efip = kmem_zalloc(size, 0); + } else { +--- a/fs/xfs/xfs_extfree_item.h ++++ b/fs/xfs/xfs_extfree_item.h +@@ -50,13 +50,13 @@ struct kmem_zone; + * of commit failure or log I/O errors. Note that the EFD is not inserted in the + * AIL, so at this point both the EFI and EFD are freed. + */ +-typedef struct xfs_efi_log_item { ++struct xfs_efi_log_item { + struct xfs_log_item efi_item; + atomic_t efi_refcount; + atomic_t efi_next_extent; + unsigned long efi_flags; /* misc flags */ + xfs_efi_log_format_t efi_format; +-} xfs_efi_log_item_t; ++}; + + /* + * This is the "extent free done" log item. 
It is used to log +@@ -65,7 +65,7 @@ typedef struct xfs_efi_log_item { + */ + typedef struct xfs_efd_log_item { + struct xfs_log_item efd_item; +- xfs_efi_log_item_t *efd_efip; ++ struct xfs_efi_log_item *efd_efip; + uint efd_next_extent; + xfs_efd_log_format_t efd_format; + } xfs_efd_log_item_t; +@@ -78,10 +78,10 @@ typedef struct xfs_efd_log_item { + extern struct kmem_zone *xfs_efi_zone; + extern struct kmem_zone *xfs_efd_zone; + +-xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint); ++struct xfs_efi_log_item *xfs_efi_init(struct xfs_mount *, uint); + int xfs_efi_copy_format(xfs_log_iovec_t *buf, + xfs_efi_log_format_t *dst_efi_fmt); +-void xfs_efi_item_free(xfs_efi_log_item_t *); ++void xfs_efi_item_free(struct xfs_efi_log_item *); + void xfs_efi_release(struct xfs_efi_log_item *); + + int xfs_efi_recover(struct xfs_mount *mp, +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -3384,7 +3384,7 @@ xlog_recover_efd_pass2( + struct xlog_recover_item *item) + { + xfs_efd_log_format_t *efd_formatp; +- xfs_efi_log_item_t *efip = NULL; ++ struct xfs_efi_log_item *efip = NULL; + struct xfs_log_item *lip; + uint64_t efi_id; + struct xfs_ail_cursor cur; +@@ -3405,7 +3405,7 @@ xlog_recover_efd_pass2( + lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + while (lip != NULL) { + if (lip->li_type == XFS_LI_EFI) { +- efip = (xfs_efi_log_item_t *)lip; ++ efip = (struct xfs_efi_log_item *)lip; + if (efip->efi_format.efi_id == efi_id) { + /* + * Drop the EFD reference to the EFI. 
This +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -1920,7 +1920,7 @@ xfs_init_zones(void) + if (!xfs_efd_zone) + goto out_destroy_buf_item_zone; + +- xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) + ++ xfs_efi_zone = kmem_zone_init((sizeof(struct xfs_efi_log_item) + + ((XFS_EFI_MAX_FAST_EXTENTS - 1) * + sizeof(xfs_extent_t))), "xfs_efi_item"); + if (!xfs_efi_zone) diff --git a/queue-5.4/xfs-remove-the-xfs_inode_log_item_t-typedef.patch b/queue-5.4/xfs-remove-the-xfs_inode_log_item_t-typedef.patch new file mode 100644 index 00000000000..536a3a35330 --- /dev/null +++ b/queue-5.4/xfs-remove-the-xfs_inode_log_item_t-typedef.patch @@ -0,0 +1,114 @@ +From chandan.babu@oracle.com Thu Feb 16 06:20:54 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:49:57 +0530 +Subject: xfs: remove the xfs_inode_log_item_t typedef +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-4-chandan.babu@oracle.com> + +From: Christoph Hellwig + +commit fd9cbe51215198ccffa64169c98eae35b0916088 upstream. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_inode_fork.c | 2 +- + fs/xfs/libxfs/xfs_trans_inode.c | 2 +- + fs/xfs/xfs_inode.c | 4 ++-- + fs/xfs/xfs_inode_item.c | 2 +- + fs/xfs/xfs_inode_item.h | 4 ++-- + fs/xfs/xfs_super.c | 4 ++-- + 6 files changed, 9 insertions(+), 9 deletions(-) + +--- a/fs/xfs/libxfs/xfs_inode_fork.c ++++ b/fs/xfs/libxfs/xfs_inode_fork.c +@@ -592,7 +592,7 @@ void + xfs_iflush_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip, +- xfs_inode_log_item_t *iip, ++ struct xfs_inode_log_item *iip, + int whichfork) + { + char *cp; +--- a/fs/xfs/libxfs/xfs_trans_inode.c ++++ b/fs/xfs/libxfs/xfs_trans_inode.c +@@ -27,7 +27,7 @@ xfs_trans_ijoin( + struct xfs_inode *ip, + uint lock_flags) + { +- xfs_inode_log_item_t *iip; ++ struct xfs_inode_log_item *iip; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + if (ip->i_itemp == NULL) +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -2555,7 +2555,7 @@ xfs_ifree_cluster( + xfs_daddr_t blkno; + xfs_buf_t *bp; + xfs_inode_t *ip; +- xfs_inode_log_item_t *iip; ++ struct xfs_inode_log_item *iip; + struct xfs_log_item *lip; + struct xfs_perag *pag; + struct xfs_ino_geometry *igeo = M_IGEO(mp); +@@ -2617,7 +2617,7 @@ xfs_ifree_cluster( + */ + list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { + if (lip->li_type == XFS_LI_INODE) { +- iip = (xfs_inode_log_item_t *)lip; ++ iip = (struct xfs_inode_log_item *)lip; + ASSERT(iip->ili_logged == 1); + lip->li_cb = xfs_istale_done; + xfs_trans_ail_copy_lsn(mp->m_ail, +--- a/fs/xfs/xfs_inode_item.c ++++ b/fs/xfs/xfs_inode_item.c +@@ -781,7 +781,7 @@ xfs_iflush_abort( + xfs_inode_t *ip, + bool stale) + { +- xfs_inode_log_item_t *iip = ip->i_itemp; ++ struct xfs_inode_log_item *iip = ip->i_itemp; + + if (iip) { + if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) { +--- a/fs/xfs/xfs_inode_item.h ++++ b/fs/xfs/xfs_inode_item.h +@@ -13,7 +13,7 @@ struct xfs_bmbt_rec; + struct xfs_inode; + struct xfs_mount; + +-typedef struct xfs_inode_log_item { ++struct 
xfs_inode_log_item { + struct xfs_log_item ili_item; /* common portion */ + struct xfs_inode *ili_inode; /* inode ptr */ + xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ +@@ -23,7 +23,7 @@ typedef struct xfs_inode_log_item { + unsigned int ili_last_fields; /* fields when flushed */ + unsigned int ili_fields; /* fields to be logged */ + unsigned int ili_fsync_fields; /* logged since last fsync */ +-} xfs_inode_log_item_t; ++}; + + static inline int xfs_inode_clean(xfs_inode_t *ip) + { +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -1934,8 +1934,8 @@ xfs_init_zones(void) + goto out_destroy_efi_zone; + + xfs_ili_zone = +- kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", +- KM_ZONE_SPREAD, NULL); ++ kmem_zone_init_flags(sizeof(struct xfs_inode_log_item), ++ "xfs_ili", KM_ZONE_SPREAD, NULL); + if (!xfs_ili_zone) + goto out_destroy_inode_zone; + xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item), diff --git a/queue-5.4/xfs-sync-lazy-sb-accounting-on-quiesce-of-read-only-mounts.patch b/queue-5.4/xfs-sync-lazy-sb-accounting-on-quiesce-of-read-only-mounts.patch new file mode 100644 index 00000000000..0fa785df2af --- /dev/null +++ b/queue-5.4/xfs-sync-lazy-sb-accounting-on-quiesce-of-read-only-mounts.patch @@ -0,0 +1,128 @@ +From chandan.babu@oracle.com Thu Feb 16 06:23:36 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:19 +0530 +Subject: xfs: sync lazy sb accounting on quiesce of read-only mounts +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-26-chandan.babu@oracle.com> + +From: Brian Foster + +commit 50d25484bebe94320c49dd1347d3330c7063bbdb upstream. 
+ +[ Modify xfs_log_unmount_write() to return zero when the log is in a read-only +state ] + +xfs_log_sbcount() syncs the superblock specifically to accumulate +the in-core percpu superblock counters and commit them to disk. This +is required to maintain filesystem consistency across quiesce +(freeze, read-only mount/remount) or unmount when lazy superblock +accounting is enabled because individual transactions do not update +the superblock directly. + +This mechanism works as expected for writable mounts, but +xfs_log_sbcount() skips the update for read-only mounts. Read-only +mounts otherwise still allow log recovery and write out an unmount +record during log quiesce. If a read-only mount performs log +recovery, it can modify the in-core superblock counters and write an +unmount record when the filesystem unmounts without ever syncing the +in-core counters. This leaves the filesystem with a clean log but in +an inconsistent state with regard to lazy sb counters. + +Update xfs_log_sbcount() to use the same logic +xfs_log_unmount_write() uses to determine when to write an unmount +record. This ensures that lazy accounting is always synced before +the log is cleaned. Refactor this logic into a new helper to +distinguish between a writable filesystem and a writable log. +Specifically, the log is writable unless the filesystem is mounted +with the norecovery mount option, the underlying log device is +read-only, or the filesystem is shutdown. Drop the freeze state +check because the update is already allowed during the freezing +process and no context calls this function on an already frozen fs. +Also, retain the shutdown check in xfs_log_unmount_write() to catch +the case where the preceding log force might have triggered a +shutdown. + +Signed-off-by: Brian Foster +Reviewed-by: Gao Xiang +Reviewed-by: Allison Henderson +Reviewed-by: Darrick J. Wong +Reviewed-by: Bill O'Donnell +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log.c | 28 ++++++++++++++++++++-------- + fs/xfs/xfs_log.h | 1 + + fs/xfs/xfs_mount.c | 3 +-- + 3 files changed, 22 insertions(+), 10 deletions(-) + +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -369,6 +369,25 @@ xlog_tic_add_region(xlog_ticket_t *tic, + tic->t_res_num++; + } + ++bool ++xfs_log_writable( ++ struct xfs_mount *mp) ++{ ++ /* ++ * Never write to the log on norecovery mounts, if the block device is ++ * read-only, or if the filesystem is shutdown. Read-only mounts still ++ * allow internal writes for log recovery and unmount purposes, so don't ++ * restrict that case here. ++ */ ++ if (mp->m_flags & XFS_MOUNT_NORECOVERY) ++ return false; ++ if (xfs_readonly_buftarg(mp->m_log->l_targ)) ++ return false; ++ if (XFS_FORCED_SHUTDOWN(mp)) ++ return false; ++ return true; ++} ++ + /* + * Replenish the byte reservation required by moving the grant write head. + */ +@@ -895,15 +914,8 @@ xfs_log_unmount_write(xfs_mount_t *mp) + #endif + int error; + +- /* +- * Don't write out unmount record on norecovery mounts or ro devices. +- * Or, if we are doing a forced umount (typically because of IO errors). 
+- */ +- if (mp->m_flags & XFS_MOUNT_NORECOVERY || +- xfs_readonly_buftarg(log->l_targ)) { +- ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); ++ if (!xfs_log_writable(mp)) + return 0; +- } + + error = xfs_log_force(mp, XFS_LOG_SYNC); + ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); +--- a/fs/xfs/xfs_log.h ++++ b/fs/xfs/xfs_log.h +@@ -132,6 +132,7 @@ int xfs_log_reserve(struct xfs_mount * + int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); + void xfs_log_unmount(struct xfs_mount *mp); + int xfs_log_force_umount(struct xfs_mount *mp, int logerror); ++bool xfs_log_writable(struct xfs_mount *mp); + + struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); + void xfs_log_ticket_put(struct xlog_ticket *ticket); +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -1218,8 +1218,7 @@ xfs_fs_writable( + int + xfs_log_sbcount(xfs_mount_t *mp) + { +- /* allow this to proceed during the freeze sequence... */ +- if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) ++ if (!xfs_log_writable(mp)) + return 0; + + /* diff --git a/queue-5.4/xfs-turn-dfp_intent-into-a-xfs_log_item.patch b/queue-5.4/xfs-turn-dfp_intent-into-a-xfs_log_item.patch new file mode 100644 index 00000000000..f4f202ba3fb --- /dev/null +++ b/queue-5.4/xfs-turn-dfp_intent-into-a-xfs_log_item.patch @@ -0,0 +1,238 @@ +From stable-owner@vger.kernel.org Thu Feb 16 06:24:16 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:01 +0530 +Subject: xfs: turn dfp_intent into a xfs_log_item +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-8-chandan.babu@oracle.com> + +From: Christoph Hellwig + +commit 13a8333339072b8654c1d2c75550ee9f41ee15de upstream. + +All defer op instance place their own extension of the log item into +the dfp_intent field. 
Replace that with a xfs_log_item to improve type +safety and make the code easier to follow. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_defer.h | 11 ++++++----- + fs/xfs/xfs_bmap_item.c | 12 ++++++------ + fs/xfs/xfs_extfree_item.c | 12 ++++++------ + fs/xfs/xfs_refcount_item.c | 12 ++++++------ + fs/xfs/xfs_rmap_item.c | 12 ++++++------ + 5 files changed, 30 insertions(+), 29 deletions(-) + +--- a/fs/xfs/libxfs/xfs_defer.h ++++ b/fs/xfs/libxfs/xfs_defer.h +@@ -28,7 +28,7 @@ enum xfs_defer_ops_type { + struct xfs_defer_pending { + struct list_head dfp_list; /* pending items */ + struct list_head dfp_work; /* work items */ +- void *dfp_intent; /* log intent item */ ++ struct xfs_log_item *dfp_intent; /* log intent item */ + void *dfp_done; /* log done item */ + unsigned int dfp_count; /* # extent items */ + enum xfs_defer_ops_type dfp_type; +@@ -43,14 +43,15 @@ void xfs_defer_move(struct xfs_trans *dt + + /* Description of a deferred type. 
*/ + struct xfs_defer_op_type { +- void (*abort_intent)(void *); +- void *(*create_done)(struct xfs_trans *, void *, unsigned int); ++ struct xfs_log_item *(*create_intent)(struct xfs_trans *tp, ++ struct list_head *items, unsigned int count, bool sort); ++ void (*abort_intent)(struct xfs_log_item *intent); ++ void *(*create_done)(struct xfs_trans *tp, struct xfs_log_item *intent, ++ unsigned int count); + int (*finish_item)(struct xfs_trans *, struct list_head *, void *, + void **); + void (*finish_cleanup)(struct xfs_trans *, void *, int); + void (*cancel_item)(struct list_head *); +- void *(*create_intent)(struct xfs_trans *tp, struct list_head *items, +- unsigned int count, bool sort); + unsigned int max_items; + }; + +--- a/fs/xfs/xfs_bmap_item.c ++++ b/fs/xfs/xfs_bmap_item.c +@@ -330,7 +330,7 @@ xfs_bmap_update_log_item( + bmap->bi_bmap.br_state); + } + +-STATIC void * ++static struct xfs_log_item * + xfs_bmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, +@@ -348,17 +348,17 @@ xfs_bmap_update_create_intent( + list_sort(mp, items, xfs_bmap_update_diff_items); + list_for_each_entry(bmap, items, bi_list) + xfs_bmap_update_log_item(tp, buip, bmap); +- return buip; ++ return &buip->bui_item; + } + + /* Get an BUD so we can process all the deferred rmap updates. */ + STATIC void * + xfs_bmap_update_create_done( + struct xfs_trans *tp, +- void *intent, ++ struct xfs_log_item *intent, + unsigned int count) + { +- return xfs_trans_get_bud(tp, intent); ++ return xfs_trans_get_bud(tp, BUI_ITEM(intent)); + } + + /* Process a deferred rmap update. */ +@@ -394,9 +394,9 @@ xfs_bmap_update_finish_item( + /* Abort all pending BUIs. */ + STATIC void + xfs_bmap_update_abort_intent( +- void *intent) ++ struct xfs_log_item *intent) + { +- xfs_bui_release(intent); ++ xfs_bui_release(BUI_ITEM(intent)); + } + + /* Cancel a deferred rmap update. 
*/ +--- a/fs/xfs/xfs_extfree_item.c ++++ b/fs/xfs/xfs_extfree_item.c +@@ -437,7 +437,7 @@ xfs_extent_free_log_item( + extp->ext_len = free->xefi_blockcount; + } + +-STATIC void * ++static struct xfs_log_item * + xfs_extent_free_create_intent( + struct xfs_trans *tp, + struct list_head *items, +@@ -455,17 +455,17 @@ xfs_extent_free_create_intent( + list_sort(mp, items, xfs_extent_free_diff_items); + list_for_each_entry(free, items, xefi_list) + xfs_extent_free_log_item(tp, efip, free); +- return efip; ++ return &efip->efi_item; + } + + /* Get an EFD so we can process all the free extents. */ + STATIC void * + xfs_extent_free_create_done( + struct xfs_trans *tp, +- void *intent, ++ struct xfs_log_item *intent, + unsigned int count) + { +- return xfs_trans_get_efd(tp, intent, count); ++ return xfs_trans_get_efd(tp, EFI_ITEM(intent), count); + } + + /* Process a free extent. */ +@@ -491,9 +491,9 @@ xfs_extent_free_finish_item( + /* Abort all pending EFIs. */ + STATIC void + xfs_extent_free_abort_intent( +- void *intent) ++ struct xfs_log_item *intent) + { +- xfs_efi_release(intent); ++ xfs_efi_release(EFI_ITEM(intent)); + } + + /* Cancel a free extent. */ +--- a/fs/xfs/xfs_refcount_item.c ++++ b/fs/xfs/xfs_refcount_item.c +@@ -329,7 +329,7 @@ xfs_refcount_update_log_item( + xfs_trans_set_refcount_flags(ext, refc->ri_type); + } + +-STATIC void * ++static struct xfs_log_item * + xfs_refcount_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, +@@ -347,17 +347,17 @@ xfs_refcount_update_create_intent( + list_sort(mp, items, xfs_refcount_update_diff_items); + list_for_each_entry(refc, items, ri_list) + xfs_refcount_update_log_item(tp, cuip, refc); +- return cuip; ++ return &cuip->cui_item; + } + + /* Get an CUD so we can process all the deferred refcount updates. 
*/ + STATIC void * + xfs_refcount_update_create_done( + struct xfs_trans *tp, +- void *intent, ++ struct xfs_log_item *intent, + unsigned int count) + { +- return xfs_trans_get_cud(tp, intent); ++ return xfs_trans_get_cud(tp, CUI_ITEM(intent)); + } + + /* Process a deferred refcount update. */ +@@ -407,9 +407,9 @@ xfs_refcount_update_finish_cleanup( + /* Abort all pending CUIs. */ + STATIC void + xfs_refcount_update_abort_intent( +- void *intent) ++ struct xfs_log_item *intent) + { +- xfs_cui_release(intent); ++ xfs_cui_release(CUI_ITEM(intent)); + } + + /* Cancel a deferred refcount update. */ +--- a/fs/xfs/xfs_rmap_item.c ++++ b/fs/xfs/xfs_rmap_item.c +@@ -381,7 +381,7 @@ xfs_rmap_update_log_item( + rmap->ri_bmap.br_state); + } + +-STATIC void * ++static struct xfs_log_item * + xfs_rmap_update_create_intent( + struct xfs_trans *tp, + struct list_head *items, +@@ -399,17 +399,17 @@ xfs_rmap_update_create_intent( + list_sort(mp, items, xfs_rmap_update_diff_items); + list_for_each_entry(rmap, items, ri_list) + xfs_rmap_update_log_item(tp, ruip, rmap); +- return ruip; ++ return &ruip->rui_item; + } + + /* Get an RUD so we can process all the deferred rmap updates. */ + STATIC void * + xfs_rmap_update_create_done( + struct xfs_trans *tp, +- void *intent, ++ struct xfs_log_item *intent, + unsigned int count) + { +- return xfs_trans_get_rud(tp, intent); ++ return xfs_trans_get_rud(tp, RUI_ITEM(intent)); + } + + /* Process a deferred rmap update. */ +@@ -451,9 +451,9 @@ xfs_rmap_update_finish_cleanup( + /* Abort all pending RUIs. */ + STATIC void + xfs_rmap_update_abort_intent( +- void *intent) ++ struct xfs_log_item *intent) + { +- xfs_rui_release(intent); ++ xfs_rui_release(RUI_ITEM(intent)); + } + + /* Cancel a deferred rmap update. 
*/ diff --git a/queue-5.4/xfs-xfs_defer_capture-should-absorb-remaining-block-reservations.patch b/queue-5.4/xfs-xfs_defer_capture-should-absorb-remaining-block-reservations.patch new file mode 100644 index 00000000000..72ef69f1c76 --- /dev/null +++ b/queue-5.4/xfs-xfs_defer_capture-should-absorb-remaining-block-reservations.patch @@ -0,0 +1,95 @@ +From chandan.babu@oracle.com Thu Feb 16 06:21:59 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:06 +0530 +Subject: xfs: xfs_defer_capture should absorb remaining block reservations +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-13-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit 4f9a60c48078c0efa3459678fa8d6e050e8ada5d upstream. + +When xfs_defer_capture extracts the deferred ops and transaction state +from a transaction, it should record the remaining block reservations so +that when we continue the dfops chain, we can reserve the same number of +blocks to use. We capture the reservations for both data and realtime +volumes. + +This adds the requirement that every log intent item recovery function +must be careful to reserve enough blocks to handle both itself and all +defer ops that it can queue. On the other hand, this enables us to do +away with the handwaving block estimation nonsense that was going on in +xlog_finish_defer_ops. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Reviewed-by: Brian Foster +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_defer.c | 4 ++++ + fs/xfs/libxfs/xfs_defer.h | 4 ++++ + fs/xfs/xfs_log_recover.c | 21 +++------------------ + 3 files changed, 11 insertions(+), 18 deletions(-) + +--- a/fs/xfs/libxfs/xfs_defer.c ++++ b/fs/xfs/libxfs/xfs_defer.c +@@ -589,6 +589,10 @@ xfs_defer_ops_capture( + dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; + tp->t_flags &= ~XFS_TRANS_LOWMODE; + ++ /* Capture the remaining block reservations along with the dfops. */ ++ dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; ++ dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; ++ + return dfc; + } + +--- a/fs/xfs/libxfs/xfs_defer.h ++++ b/fs/xfs/libxfs/xfs_defer.h +@@ -73,6 +73,10 @@ struct xfs_defer_capture { + /* Deferred ops state saved from the transaction. */ + struct list_head dfc_dfops; + unsigned int dfc_tpflags; ++ ++ /* Block reservations for the data and rt devices. */ ++ unsigned int dfc_blkres; ++ unsigned int dfc_rtxres; + }; + + /* +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -4766,27 +4766,12 @@ xlog_finish_defer_ops( + { + struct xfs_defer_capture *dfc, *next; + struct xfs_trans *tp; +- int64_t freeblks; +- uint64_t resblks; + int error = 0; + + list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { +- /* +- * We're finishing the defer_ops that accumulated as a result +- * of recovering unfinished intent items during log recovery. +- * We reserve an itruncate transaction because it is the +- * largest permanent transaction type. Since we're the only +- * user of the fs right now, take 93% (15/16) of the available +- * free blocks. Use weird math to avoid a 64-bit division. 
+- */ +- freeblks = percpu_counter_sum(&mp->m_fdblocks); +- if (freeblks <= 0) +- return -ENOSPC; +- +- resblks = min_t(uint64_t, UINT_MAX, freeblks); +- resblks = (resblks * 15) >> 4; +- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, +- 0, XFS_TRANS_RESERVE, &tp); ++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, ++ dfc->dfc_blkres, dfc->dfc_rtxres, ++ XFS_TRANS_RESERVE, &tp); + if (error) + return error; + diff --git a/queue-5.4/xfs-xfs_defer_capture-should-absorb-remaining-transaction-reservation.patch b/queue-5.4/xfs-xfs_defer_capture-should-absorb-remaining-transaction-reservation.patch new file mode 100644 index 00000000000..424378086f1 --- /dev/null +++ b/queue-5.4/xfs-xfs_defer_capture-should-absorb-remaining-transaction-reservation.patch @@ -0,0 +1,83 @@ +From chandan.babu@oracle.com Thu Feb 16 06:22:08 2023 +From: Chandan Babu R +Date: Thu, 16 Feb 2023 10:50:07 +0530 +Subject: xfs: xfs_defer_capture should absorb remaining transaction reservation +To: gregkh@linuxfoundation.org +Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com +Message-ID: <20230216052019.368896-14-chandan.babu@oracle.com> + +From: "Darrick J. Wong" + +commit 929b92f64048d90d23e40a59c47adf59f5026903 upstream. + +When xfs_defer_capture extracts the deferred ops and transaction state +from a transaction, it should record the transaction reservation type +from the old transaction so that when we continue the dfops chain, we +still use the same reservation parameters. + +Doing this means that the log item recovery functions get to determine +the transaction reservation instead of abusing tr_itruncate in yet +another part of xfs. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Reviewed-by: Christoph Hellwig +Signed-off-by: Chandan Babu R +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_defer.c | 3 +++ + fs/xfs/libxfs/xfs_defer.h | 3 +++ + fs/xfs/xfs_log_recover.c | 17 ++++++++++++++--- + 3 files changed, 20 insertions(+), 3 deletions(-) + +--- a/fs/xfs/libxfs/xfs_defer.c ++++ b/fs/xfs/libxfs/xfs_defer.c +@@ -593,6 +593,9 @@ xfs_defer_ops_capture( + dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; + dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; + ++ /* Preserve the log reservation size. */ ++ dfc->dfc_logres = tp->t_log_res; ++ + return dfc; + } + +--- a/fs/xfs/libxfs/xfs_defer.h ++++ b/fs/xfs/libxfs/xfs_defer.h +@@ -77,6 +77,9 @@ struct xfs_defer_capture { + /* Block reservations for the data and rt devices. */ + unsigned int dfc_blkres; + unsigned int dfc_rtxres; ++ ++ /* Log reservation saved from the transaction. */ ++ unsigned int dfc_logres; + }; + + /* +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -4769,9 +4769,20 @@ xlog_finish_defer_ops( + int error = 0; + + list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { +- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, +- dfc->dfc_blkres, dfc->dfc_rtxres, +- XFS_TRANS_RESERVE, &tp); ++ struct xfs_trans_res resv; ++ ++ /* ++ * Create a new transaction reservation from the captured ++ * information. Set logcount to 1 to force the new transaction ++ * to regrant every roll so that we can make forward progress ++ * in recovery no matter how full the log might be. ++ */ ++ resv.tr_logres = dfc->dfc_logres; ++ resv.tr_logcount = 1; ++ resv.tr_logflags = XFS_TRANS_PERM_LOG_RES; ++ ++ error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, ++ dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); + if (error) + return error; +