--- /dev/null
+From shaoyi@amazon.com Fri Feb 17 15:23:36 2023
+From: Shaoying Xu <shaoyi@amazon.com>
+Date: Tue, 7 Feb 2023 18:28:20 +0000
+Subject: ipv4: Fix incorrect route flushing when source address is deleted
+To: <gregkh@linuxfoundation.org>
+Cc: <dsahern@kernel.org>, <idosch@nvidia.com>, <kuba@kernel.org>, <patches@lists.linux.dev>, <sashal@kernel.org>, <shaoyi@amazon.com>, <stable@vger.kernel.org>
+Message-ID: <20230207182820.4959-2-shaoyi@amazon.com>
+
+From: Ido Schimmel <idosch@nvidia.com>
+
+[ Upstream commit f96a3d74554df537b6db5c99c27c80e7afadc8d1 ]
+
+Cited commit added the table ID to the FIB info structure, but did not
+prevent structures with different table IDs from being consolidated.
+This can lead to routes being flushed from a VRF when an address is
+deleted from a different VRF.
+
+Fix by taking the table ID into account when looking for a matching FIB
+info. This is already done for FIB info structures backed by a nexthop
+object in fib_find_info_nh().
+
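+As an illustration of the failure mode, consider two identical routes
+that differ only in their table ID and share the same preferred source
+address. Before this fix, both routes could share a single consolidated
+FIB info, so deleting the source address could flush the route in the
+wrong table. A minimal sketch (hypothetical addresses and table IDs,
+not the exact selftest commands):
+
+ # ip address add 192.0.2.1/24 dev dummy0
+ # ip route add 198.51.100.0/24 dev dummy0 src 192.0.2.1 table 100
+ # ip route add 198.51.100.0/24 dev dummy0 src 192.0.2.1 table 200
+ # ip address del 192.0.2.1/24 dev dummy0
+ # ip route show table 100; ip route show table 200
+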
+Add test cases that fail before the fix:
+
+ # ./fib_tests.sh -t ipv4_del_addr
+
+ IPv4 delete address route tests
+ Regular FIB info
+ TEST: Route removed from VRF when source address deleted [ OK ]
+ TEST: Route in default VRF not removed [ OK ]
+ TEST: Route removed in default VRF when source address deleted [ OK ]
+ TEST: Route in VRF is not removed by address delete [ OK ]
+ Identical FIB info with different table ID
+ TEST: Route removed from VRF when source address deleted [FAIL]
+ TEST: Route in default VRF not removed [ OK ]
+ RTNETLINK answers: File exists
+ TEST: Route removed in default VRF when source address deleted [ OK ]
+ TEST: Route in VRF is not removed by address delete [FAIL]
+
+ Tests passed: 6
+ Tests failed: 2
+
+And pass after:
+
+ # ./fib_tests.sh -t ipv4_del_addr
+
+ IPv4 delete address route tests
+ Regular FIB info
+ TEST: Route removed from VRF when source address deleted [ OK ]
+ TEST: Route in default VRF not removed [ OK ]
+ TEST: Route removed in default VRF when source address deleted [ OK ]
+ TEST: Route in VRF is not removed by address delete [ OK ]
+ Identical FIB info with different table ID
+ TEST: Route removed from VRF when source address deleted [ OK ]
+ TEST: Route in default VRF not removed [ OK ]
+ TEST: Route removed in default VRF when source address deleted [ OK ]
+ TEST: Route in VRF is not removed by address delete [ OK ]
+
+ Tests passed: 8
+ Tests failed: 0
+
+Fixes: 5a56a0b3a45d ("net: Don't delete routes in different VRFs")
+Signed-off-by: Ido Schimmel <idosch@nvidia.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Shaoying Xu <shaoyi@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/fib_semantics.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ipv4/fib_semantics.c
++++ b/net/ipv4/fib_semantics.c
+@@ -421,6 +421,7 @@ static struct fib_info *fib_find_info(st
+ nfi->fib_prefsrc == fi->fib_prefsrc &&
+ nfi->fib_priority == fi->fib_priority &&
+ nfi->fib_type == fi->fib_type &&
++ nfi->fib_tb_id == fi->fib_tb_id &&
+ memcmp(nfi->fib_metrics, fi->fib_metrics,
+ sizeof(u32) * RTAX_MAX) == 0 &&
+ !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
--- /dev/null
+From shaoyi@amazon.com Fri Feb 17 15:23:11 2023
+From: Shaoying Xu <shaoyi@amazon.com>
+Date: Tue, 7 Feb 2023 18:28:19 +0000
+Subject: Revert "ipv4: Fix incorrect route flushing when source address is deleted"
+To: <gregkh@linuxfoundation.org>
+Cc: <dsahern@kernel.org>, <idosch@nvidia.com>, <kuba@kernel.org>, <patches@lists.linux.dev>, <sashal@kernel.org>, <shaoyi@amazon.com>, <stable@vger.kernel.org>
+Message-ID: <20230207182820.4959-1-shaoyi@amazon.com>
+
+From: Shaoying Xu <shaoyi@amazon.com>
+
+This reverts commit 2537b637eac0bd546f63e1492a34edd30878e8d4, which
+mistakenly deleted the whole fib_tests.sh script and caused fib_tests
+failures in kselftest runs.
+
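+With the script restored, the FIB selftests can again be run directly
+from the kernel tree, e.g. (the -t option selects individual tests):
+
+ # cd tools/testing/selftests/net
+ # ./fib_tests.sh -t ipv4_rt
+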
+Signed-off-by: Shaoying Xu <shaoyi@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/fib_semantics.c | 1
+ tools/testing/selftests/net/fib_tests.sh | 1727 +++++++++++++++++++++++++++++++
+ 2 files changed, 1727 insertions(+), 1 deletion(-)
+ create mode 100755 tools/testing/selftests/net/fib_tests.sh
+
+--- a/net/ipv4/fib_semantics.c
++++ b/net/ipv4/fib_semantics.c
+@@ -421,7 +421,6 @@ static struct fib_info *fib_find_info(st
+ nfi->fib_prefsrc == fi->fib_prefsrc &&
+ nfi->fib_priority == fi->fib_priority &&
+ nfi->fib_type == fi->fib_type &&
+- nfi->fib_tb_id == fi->fib_tb_id &&
+ memcmp(nfi->fib_metrics, fi->fib_metrics,
+ sizeof(u32) * RTAX_MAX) == 0 &&
+ !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
+--- /dev/null
++++ b/tools/testing/selftests/net/fib_tests.sh
+@@ -0,0 +1,1727 @@
++#!/bin/bash
++# SPDX-License-Identifier: GPL-2.0
++
++# This test is for checking IPv4 and IPv6 FIB behavior in response to
++# different events.
++
++ret=0
++# Kselftest framework requirement - SKIP code is 4.
++ksft_skip=4
++
++# all tests in this script. Can be overridden with -t option
++TESTS="unregister down carrier nexthop suppress ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter"
++
++VERBOSE=0
++PAUSE_ON_FAIL=no
++PAUSE=no
++IP="ip -netns ns1"
++NS_EXEC="ip netns exec ns1"
++
++which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping)
++
++log_test()
++{
++ local rc=$1
++ local expected=$2
++ local msg="$3"
++
++ if [ ${rc} -eq ${expected} ]; then
++ printf " TEST: %-60s [ OK ]\n" "${msg}"
++ nsuccess=$((nsuccess+1))
++ else
++ ret=1
++ nfail=$((nfail+1))
++ printf " TEST: %-60s [FAIL]\n" "${msg}"
++ if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
++ echo
++ echo "hit enter to continue, 'q' to quit"
++ read a
++ [ "$a" = "q" ] && exit 1
++ fi
++ fi
++
++ if [ "${PAUSE}" = "yes" ]; then
++ echo
++ echo "hit enter to continue, 'q' to quit"
++ read a
++ [ "$a" = "q" ] && exit 1
++ fi
++}
++
++setup()
++{
++ set -e
++ ip netns add ns1
++ ip netns set ns1 auto
++ $IP link set dev lo up
++ ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1
++ ip netns exec ns1 sysctl -qw net.ipv6.conf.all.forwarding=1
++
++ $IP link add dummy0 type dummy
++ $IP link set dev dummy0 up
++ $IP address add 198.51.100.1/24 dev dummy0
++ $IP -6 address add 2001:db8:1::1/64 dev dummy0
++ set +e
++
++}
++
++cleanup()
++{
++ $IP link del dev dummy0 &> /dev/null
++ ip netns del ns1
++ ip netns del ns2 &> /dev/null
++}
++
++get_linklocal()
++{
++ local dev=$1
++ local addr
++
++ addr=$($IP -6 -br addr show dev ${dev} | \
++ awk '{
++ for (i = 3; i <= NF; ++i) {
++ if ($i ~ /^fe80/)
++ print $i
++ }
++ }'
++ )
++ addr=${addr/\/*}
++
++ [ -z "$addr" ] && return 1
++
++ echo $addr
++
++ return 0
++}
++
++fib_unreg_unicast_test()
++{
++ echo
++ echo "Single path route test"
++
++ setup
++
++ echo " Start point"
++ $IP route get fibmatch 198.51.100.2 &> /dev/null
++ log_test $? 0 "IPv4 fibmatch"
++ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null
++ log_test $? 0 "IPv6 fibmatch"
++
++ set -e
++ $IP link del dev dummy0
++ set +e
++
++ echo " Nexthop device deleted"
++ $IP route get fibmatch 198.51.100.2 &> /dev/null
++ log_test $? 2 "IPv4 fibmatch - no route"
++ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null
++ log_test $? 2 "IPv6 fibmatch - no route"
++
++ cleanup
++}
++
++fib_unreg_multipath_test()
++{
++
++ echo
++ echo "Multipath route test"
++
++ setup
++
++ set -e
++ $IP link add dummy1 type dummy
++ $IP link set dev dummy1 up
++ $IP address add 192.0.2.1/24 dev dummy1
++ $IP -6 address add 2001:db8:2::1/64 dev dummy1
++
++ $IP route add 203.0.113.0/24 \
++ nexthop via 198.51.100.2 dev dummy0 \
++ nexthop via 192.0.2.2 dev dummy1
++ $IP -6 route add 2001:db8:3::/64 \
++ nexthop via 2001:db8:1::2 dev dummy0 \
++ nexthop via 2001:db8:2::2 dev dummy1
++ set +e
++
++ echo " Start point"
++ $IP route get fibmatch 203.0.113.1 &> /dev/null
++ log_test $? 0 "IPv4 fibmatch"
++ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null
++ log_test $? 0 "IPv6 fibmatch"
++
++ set -e
++ $IP link del dev dummy0
++ set +e
++
++ echo " One nexthop device deleted"
++ $IP route get fibmatch 203.0.113.1 &> /dev/null
++ log_test $? 2 "IPv4 - multipath route removed on delete"
++
++ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null
++ # In IPv6 we do not flush the entire multipath route.
++ log_test $? 0 "IPv6 - multipath down to single path"
++
++ set -e
++ $IP link del dev dummy1
++ set +e
++
++ echo " Second nexthop device deleted"
++ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null
++ log_test $? 2 "IPv6 - no route"
++
++ cleanup
++}
++
++fib_unreg_test()
++{
++ fib_unreg_unicast_test
++ fib_unreg_multipath_test
++}
++
++fib_down_unicast_test()
++{
++ echo
++ echo "Single path, admin down"
++
++ setup
++
++ echo " Start point"
++ $IP route get fibmatch 198.51.100.2 &> /dev/null
++ log_test $? 0 "IPv4 fibmatch"
++ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null
++ log_test $? 0 "IPv6 fibmatch"
++
++ set -e
++ $IP link set dev dummy0 down
++ set +e
++
++ echo " Route deleted on down"
++ $IP route get fibmatch 198.51.100.2 &> /dev/null
++ log_test $? 2 "IPv4 fibmatch"
++ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null
++ log_test $? 2 "IPv6 fibmatch"
++
++ cleanup
++}
++
++fib_down_multipath_test_do()
++{
++ local down_dev=$1
++ local up_dev=$2
++
++ $IP route get fibmatch 203.0.113.1 \
++ oif $down_dev &> /dev/null
++ log_test $? 2 "IPv4 fibmatch on down device"
++ $IP -6 route get fibmatch 2001:db8:3::1 \
++ oif $down_dev &> /dev/null
++ log_test $? 2 "IPv6 fibmatch on down device"
++
++ $IP route get fibmatch 203.0.113.1 \
++ oif $up_dev &> /dev/null
++ log_test $? 0 "IPv4 fibmatch on up device"
++ $IP -6 route get fibmatch 2001:db8:3::1 \
++ oif $up_dev &> /dev/null
++ log_test $? 0 "IPv6 fibmatch on up device"
++
++ $IP route get fibmatch 203.0.113.1 | \
++ grep $down_dev | grep -q "dead linkdown"
++ log_test $? 0 "IPv4 flags on down device"
++ $IP -6 route get fibmatch 2001:db8:3::1 | \
++ grep $down_dev | grep -q "dead linkdown"
++ log_test $? 0 "IPv6 flags on down device"
++
++ $IP route get fibmatch 203.0.113.1 | \
++ grep $up_dev | grep -q "dead linkdown"
++ log_test $? 1 "IPv4 flags on up device"
++ $IP -6 route get fibmatch 2001:db8:3::1 | \
++ grep $up_dev | grep -q "dead linkdown"
++ log_test $? 1 "IPv6 flags on up device"
++}
++
++fib_down_multipath_test()
++{
++ echo
++ echo "Admin down multipath"
++
++ setup
++
++ set -e
++ $IP link add dummy1 type dummy
++ $IP link set dev dummy1 up
++
++ $IP address add 192.0.2.1/24 dev dummy1
++ $IP -6 address add 2001:db8:2::1/64 dev dummy1
++
++ $IP route add 203.0.113.0/24 \
++ nexthop via 198.51.100.2 dev dummy0 \
++ nexthop via 192.0.2.2 dev dummy1
++ $IP -6 route add 2001:db8:3::/64 \
++ nexthop via 2001:db8:1::2 dev dummy0 \
++ nexthop via 2001:db8:2::2 dev dummy1
++ set +e
++
++ echo " Verify start point"
++ $IP route get fibmatch 203.0.113.1 &> /dev/null
++ log_test $? 0 "IPv4 fibmatch"
++
++ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null
++ log_test $? 0 "IPv6 fibmatch"
++
++ set -e
++ $IP link set dev dummy0 down
++ set +e
++
++ echo " One device down, one up"
++ fib_down_multipath_test_do "dummy0" "dummy1"
++
++ set -e
++ $IP link set dev dummy0 up
++ $IP link set dev dummy1 down
++ set +e
++
++ echo " Other device down and up"
++ fib_down_multipath_test_do "dummy1" "dummy0"
++
++ set -e
++ $IP link set dev dummy0 down
++ set +e
++
++ echo " Both devices down"
++ $IP route get fibmatch 203.0.113.1 &> /dev/null
++ log_test $? 2 "IPv4 fibmatch"
++ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null
++ log_test $? 2 "IPv6 fibmatch"
++
++ $IP link del dev dummy1
++ cleanup
++}
++
++fib_down_test()
++{
++ fib_down_unicast_test
++ fib_down_multipath_test
++}
++
++# Local routes should not be affected when carrier changes.
++fib_carrier_local_test()
++{
++ echo
++ echo "Local carrier tests - single path"
++
++ setup
++
++ set -e
++ $IP link set dev dummy0 carrier on
++ set +e
++
++ echo " Start point"
++ $IP route get fibmatch 198.51.100.1 &> /dev/null
++ log_test $? 0 "IPv4 fibmatch"
++ $IP -6 route get fibmatch 2001:db8:1::1 &> /dev/null
++ log_test $? 0 "IPv6 fibmatch"
++
++ $IP route get fibmatch 198.51.100.1 | \
++ grep -q "linkdown"
++ log_test $? 1 "IPv4 - no linkdown flag"
++ $IP -6 route get fibmatch 2001:db8:1::1 | \
++ grep -q "linkdown"
++ log_test $? 1 "IPv6 - no linkdown flag"
++
++ set -e
++ $IP link set dev dummy0 carrier off
++ sleep 1
++ set +e
++
++ echo " Carrier off on nexthop"
++ $IP route get fibmatch 198.51.100.1 &> /dev/null
++ log_test $? 0 "IPv4 fibmatch"
++ $IP -6 route get fibmatch 2001:db8:1::1 &> /dev/null
++ log_test $? 0 "IPv6 fibmatch"
++
++ $IP route get fibmatch 198.51.100.1 | \
++ grep -q "linkdown"
++ log_test $? 1 "IPv4 - linkdown flag set"
++ $IP -6 route get fibmatch 2001:db8:1::1 | \
++ grep -q "linkdown"
++ log_test $? 1 "IPv6 - linkdown flag set"
++
++ set -e
++ $IP address add 192.0.2.1/24 dev dummy0
++ $IP -6 address add 2001:db8:2::1/64 dev dummy0
++ set +e
++
++ echo " Route to local address with carrier down"
++ $IP route get fibmatch 192.0.2.1 &> /dev/null
++ log_test $? 0 "IPv4 fibmatch"
++ $IP -6 route get fibmatch 2001:db8:2::1 &> /dev/null
++ log_test $? 0 "IPv6 fibmatch"
++
++ $IP route get fibmatch 192.0.2.1 | \
++ grep -q "linkdown"
++ log_test $? 1 "IPv4 linkdown flag set"
++ $IP -6 route get fibmatch 2001:db8:2::1 | \
++ grep -q "linkdown"
++ log_test $? 1 "IPv6 linkdown flag set"
++
++ cleanup
++}
++
++fib_carrier_unicast_test()
++{
++ ret=0
++
++ echo
++ echo "Single path route carrier test"
++
++ setup
++
++ set -e
++ $IP link set dev dummy0 carrier on
++ set +e
++
++ echo " Start point"
++ $IP route get fibmatch 198.51.100.2 &> /dev/null
++ log_test $? 0 "IPv4 fibmatch"
++ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null
++ log_test $? 0 "IPv6 fibmatch"
++
++ $IP route get fibmatch 198.51.100.2 | \
++ grep -q "linkdown"
++ log_test $? 1 "IPv4 no linkdown flag"
++ $IP -6 route get fibmatch 2001:db8:1::2 | \
++ grep -q "linkdown"
++ log_test $? 1 "IPv6 no linkdown flag"
++
++ set -e
++ $IP link set dev dummy0 carrier off
++ sleep 1
++ set +e
++
++ echo " Carrier down"
++ $IP route get fibmatch 198.51.100.2 &> /dev/null
++ log_test $? 0 "IPv4 fibmatch"
++ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null
++ log_test $? 0 "IPv6 fibmatch"
++
++ $IP route get fibmatch 198.51.100.2 | \
++ grep -q "linkdown"
++ log_test $? 0 "IPv4 linkdown flag set"
++ $IP -6 route get fibmatch 2001:db8:1::2 | \
++ grep -q "linkdown"
++ log_test $? 0 "IPv6 linkdown flag set"
++
++ set -e
++ $IP address add 192.0.2.1/24 dev dummy0
++ $IP -6 address add 2001:db8:2::1/64 dev dummy0
++ set +e
++
++ echo " Second address added with carrier down"
++ $IP route get fibmatch 192.0.2.2 &> /dev/null
++ log_test $? 0 "IPv4 fibmatch"
++ $IP -6 route get fibmatch 2001:db8:2::2 &> /dev/null
++ log_test $? 0 "IPv6 fibmatch"
++
++ $IP route get fibmatch 192.0.2.2 | \
++ grep -q "linkdown"
++ log_test $? 0 "IPv4 linkdown flag set"
++ $IP -6 route get fibmatch 2001:db8:2::2 | \
++ grep -q "linkdown"
++ log_test $? 0 "IPv6 linkdown flag set"
++
++ cleanup
++}
++
++fib_carrier_test()
++{
++ fib_carrier_local_test
++ fib_carrier_unicast_test
++}
++
++fib_rp_filter_test()
++{
++ echo
++ echo "IPv4 rp_filter tests"
++
++ setup
++
++ set -e
++ ip netns add ns2
++ ip netns set ns2 auto
++
++ ip -netns ns2 link set dev lo up
++
++ $IP link add name veth1 type veth peer name veth2
++ $IP link set dev veth2 netns ns2
++ $IP address add 192.0.2.1/24 dev veth1
++ ip -netns ns2 address add 192.0.2.1/24 dev veth2
++ $IP link set dev veth1 up
++ ip -netns ns2 link set dev veth2 up
++
++ $IP link set dev lo address 52:54:00:6a:c7:5e
++ $IP link set dev veth1 address 52:54:00:6a:c7:5e
++ ip -netns ns2 link set dev lo address 52:54:00:6a:c7:5e
++ ip -netns ns2 link set dev veth2 address 52:54:00:6a:c7:5e
++
++ # 1. (ns2) redirect lo's egress to veth2's egress
++ ip netns exec ns2 tc qdisc add dev lo parent root handle 1: fq_codel
++ ip netns exec ns2 tc filter add dev lo parent 1: protocol arp basic \
++ action mirred egress redirect dev veth2
++ ip netns exec ns2 tc filter add dev lo parent 1: protocol ip basic \
++ action mirred egress redirect dev veth2
++
++ # 2. (ns1) redirect veth1's ingress to lo's ingress
++ $NS_EXEC tc qdisc add dev veth1 ingress
++ $NS_EXEC tc filter add dev veth1 ingress protocol arp basic \
++ action mirred ingress redirect dev lo
++ $NS_EXEC tc filter add dev veth1 ingress protocol ip basic \
++ action mirred ingress redirect dev lo
++
++ # 3. (ns1) redirect lo's egress to veth1's egress
++ $NS_EXEC tc qdisc add dev lo parent root handle 1: fq_codel
++ $NS_EXEC tc filter add dev lo parent 1: protocol arp basic \
++ action mirred egress redirect dev veth1
++ $NS_EXEC tc filter add dev lo parent 1: protocol ip basic \
++ action mirred egress redirect dev veth1
++
++ # 4. (ns2) redirect veth2's ingress to lo's ingress
++ ip netns exec ns2 tc qdisc add dev veth2 ingress
++ ip netns exec ns2 tc filter add dev veth2 ingress protocol arp basic \
++ action mirred ingress redirect dev lo
++ ip netns exec ns2 tc filter add dev veth2 ingress protocol ip basic \
++ action mirred ingress redirect dev lo
++
++ $NS_EXEC sysctl -qw net.ipv4.conf.all.rp_filter=1
++ $NS_EXEC sysctl -qw net.ipv4.conf.all.accept_local=1
++ $NS_EXEC sysctl -qw net.ipv4.conf.all.route_localnet=1
++ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=1
++ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.accept_local=1
++ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.route_localnet=1
++ set +e
++
++ run_cmd "ip netns exec ns2 ping -w1 -c1 192.0.2.1"
++ log_test $? 0 "rp_filter passes local packets"
++
++ run_cmd "ip netns exec ns2 ping -w1 -c1 127.0.0.1"
++ log_test $? 0 "rp_filter passes loopback packets"
++
++ cleanup
++}
++
++################################################################################
++# Tests on nexthop spec
++
++# run 'ip route add' with given spec
++add_rt()
++{
++ local desc="$1"
++ local erc=$2
++ local vrf=$3
++ local pfx=$4
++ local gw=$5
++ local dev=$6
++ local cmd out rc
++
++ [ "$vrf" = "-" ] && vrf="default"
++ [ -n "$gw" ] && gw="via $gw"
++ [ -n "$dev" ] && dev="dev $dev"
++
++ cmd="$IP route add vrf $vrf $pfx $gw $dev"
++ if [ "$VERBOSE" = "1" ]; then
++ printf "\n COMMAND: $cmd\n"
++ fi
++
++ out=$(eval $cmd 2>&1)
++ rc=$?
++ if [ "$VERBOSE" = "1" -a -n "$out" ]; then
++ echo " $out"
++ fi
++ log_test $rc $erc "$desc"
++}
++
++fib4_nexthop()
++{
++ echo
++ echo "IPv4 nexthop tests"
++
++ echo "<<< write me >>>"
++}
++
++fib6_nexthop()
++{
++ local lldummy=$(get_linklocal dummy0)
++ local llv1=$(get_linklocal veth1)
++
++ if [ -z "$lldummy" ]; then
++ echo "Failed to get linklocal address for dummy0"
++ return 1
++ fi
++ if [ -z "$llv1" ]; then
++ echo "Failed to get linklocal address for veth1"
++ return 1
++ fi
++
++ echo
++ echo "IPv6 nexthop tests"
++
++ add_rt "Directly connected nexthop, unicast address" 0 \
++ - 2001:db8:101::/64 2001:db8:1::2
++ add_rt "Directly connected nexthop, unicast address with device" 0 \
++ - 2001:db8:102::/64 2001:db8:1::2 "dummy0"
++ add_rt "Gateway is linklocal address" 0 \
++ - 2001:db8:103::1/64 $llv1 "veth0"
++
++ # fails because LL address requires a device
++ add_rt "Gateway is linklocal address, no device" 2 \
++ - 2001:db8:104::1/64 $llv1
++
++ # local address can not be a gateway
++ add_rt "Gateway can not be local unicast address" 2 \
++ - 2001:db8:105::/64 2001:db8:1::1
++ add_rt "Gateway can not be local unicast address, with device" 2 \
++ - 2001:db8:106::/64 2001:db8:1::1 "dummy0"
++ add_rt "Gateway can not be a local linklocal address" 2 \
++ - 2001:db8:107::1/64 $lldummy "dummy0"
++
++ # VRF tests
++ add_rt "Gateway can be local address in a VRF" 0 \
++ - 2001:db8:108::/64 2001:db8:51::2
++ add_rt "Gateway can be local address in a VRF, with device" 0 \
++ - 2001:db8:109::/64 2001:db8:51::2 "veth0"
++ add_rt "Gateway can be local linklocal address in a VRF" 0 \
++ - 2001:db8:110::1/64 $llv1 "veth0"
++
++ add_rt "Redirect to VRF lookup" 0 \
++ - 2001:db8:111::/64 "" "red"
++
++ add_rt "VRF route, gateway can be local address in default VRF" 0 \
++ red 2001:db8:112::/64 2001:db8:51::1
++
++ # local address in same VRF fails
++ add_rt "VRF route, gateway can not be a local address" 2 \
++ red 2001:db8:113::1/64 2001:db8:2::1
++ add_rt "VRF route, gateway can not be a local addr with device" 2 \
++ red 2001:db8:114::1/64 2001:db8:2::1 "dummy1"
++}
++
++# Default VRF:
++# dummy0 - 198.51.100.1/24 2001:db8:1::1/64
++# veth0 - 192.0.2.1/24 2001:db8:51::1/64
++#
++# VRF red:
++# dummy1 - 192.168.2.1/24 2001:db8:2::1/64
++# veth1 - 192.0.2.2/24 2001:db8:51::2/64
++#
++# [ dummy0 veth0 ]--[ veth1 dummy1 ]
++
++fib_nexthop_test()
++{
++ setup
++
++ set -e
++
++ $IP -4 rule add pref 32765 table local
++ $IP -4 rule del pref 0
++ $IP -6 rule add pref 32765 table local
++ $IP -6 rule del pref 0
++
++ $IP link add red type vrf table 1
++ $IP link set red up
++ $IP -4 route add vrf red unreachable default metric 4278198272
++ $IP -6 route add vrf red unreachable default metric 4278198272
++
++ $IP link add veth0 type veth peer name veth1
++ $IP link set dev veth0 up
++ $IP address add 192.0.2.1/24 dev veth0
++ $IP -6 address add 2001:db8:51::1/64 dev veth0
++
++ $IP link set dev veth1 vrf red up
++ $IP address add 192.0.2.2/24 dev veth1
++ $IP -6 address add 2001:db8:51::2/64 dev veth1
++
++ $IP link add dummy1 type dummy
++ $IP link set dev dummy1 vrf red up
++ $IP address add 192.168.2.1/24 dev dummy1
++ $IP -6 address add 2001:db8:2::1/64 dev dummy1
++ set +e
++
++ sleep 1
++ fib4_nexthop
++ fib6_nexthop
++
++ (
++ $IP link del dev dummy1
++ $IP link del veth0
++ $IP link del red
++ ) 2>/dev/null
++ cleanup
++}
++
++fib_suppress_test()
++{
++ echo
++ echo "FIB rule with suppress_prefixlength"
++ setup
++
++ $IP link add dummy1 type dummy
++ $IP link set dummy1 up
++ $IP -6 route add default dev dummy1
++ $IP -6 rule add table main suppress_prefixlength 0
++ ping -f -c 1000 -W 1 1234::1 >/dev/null 2>&1
++ $IP -6 rule del table main suppress_prefixlength 0
++ $IP link del dummy1
++
++ # If we got here without crashing, we're good.
++ log_test 0 0 "FIB rule suppress test"
++
++ cleanup
++}
++
++################################################################################
++# Tests on route add and replace
++
++run_cmd()
++{
++ local cmd="$1"
++ local out
++ local stderr="2>/dev/null"
++
++ if [ "$VERBOSE" = "1" ]; then
++ printf " COMMAND: $cmd\n"
++ stderr=
++ fi
++
++ out=$(eval $cmd $stderr)
++ rc=$?
++ if [ "$VERBOSE" = "1" -a -n "$out" ]; then
++ echo " $out"
++ fi
++
++ [ "$VERBOSE" = "1" ] && echo
++
++ return $rc
++}
++
++check_expected()
++{
++ local out="$1"
++ local expected="$2"
++ local rc=0
++
++ [ "${out}" = "${expected}" ] && return 0
++
++ if [ -z "${out}" ]; then
++ if [ "$VERBOSE" = "1" ]; then
++ printf "\nNo route entry found\n"
++ printf "Expected:\n"
++ printf " ${expected}\n"
++ fi
++ return 1
++ fi
++
++ # tricky way to convert output to 1-line without ip's
++ # messy '\'; this drops all extra white space
++ out=$(echo ${out})
++ if [ "${out}" != "${expected}" ]; then
++ rc=1
++ if [ "${VERBOSE}" = "1" ]; then
++ printf " Unexpected route entry. Have:\n"
++ printf " ${out}\n"
++ printf " Expected:\n"
++ printf " ${expected}\n\n"
++ fi
++ fi
++
++ return $rc
++}
++
++# add route for a prefix, flushing any existing routes first
++# expected to be the first step of a test
++add_route6()
++{
++ local pfx="$1"
++ local nh="$2"
++ local out
++
++ if [ "$VERBOSE" = "1" ]; then
++ echo
++ echo " ##################################################"
++ echo
++ fi
++
++ run_cmd "$IP -6 ro flush ${pfx}"
++ [ $? -ne 0 ] && exit 1
++
++ out=$($IP -6 ro ls match ${pfx})
++ if [ -n "$out" ]; then
++ echo "Failed to flush routes for prefix used for tests."
++ exit 1
++ fi
++
++ run_cmd "$IP -6 ro add ${pfx} ${nh}"
++ if [ $? -ne 0 ]; then
++ echo "Failed to add initial route for test."
++ exit 1
++ fi
++}
++
++# add initial route - used in replace route tests
++add_initial_route6()
++{
++ add_route6 "2001:db8:104::/64" "$1"
++}
++
++check_route6()
++{
++ local pfx
++ local expected="$1"
++ local out
++ local rc=0
++
++ set -- $expected
++ pfx=$1
++
++ out=$($IP -6 ro ls match ${pfx} | sed -e 's/ pref medium//')
++ check_expected "${out}" "${expected}"
++}
++
++route_cleanup()
++{
++ $IP li del red 2>/dev/null
++ $IP li del dummy1 2>/dev/null
++ $IP li del veth1 2>/dev/null
++ $IP li del veth3 2>/dev/null
++
++ cleanup &> /dev/null
++}
++
++route_setup()
++{
++ route_cleanup
++ setup
++
++ [ "${VERBOSE}" = "1" ] && set -x
++ set -e
++
++ ip netns add ns2
++ ip netns set ns2 auto
++ ip -netns ns2 link set dev lo up
++ ip netns exec ns2 sysctl -qw net.ipv4.ip_forward=1
++ ip netns exec ns2 sysctl -qw net.ipv6.conf.all.forwarding=1
++
++ $IP li add veth1 type veth peer name veth2
++ $IP li add veth3 type veth peer name veth4
++
++ $IP li set veth1 up
++ $IP li set veth3 up
++ $IP li set veth2 netns ns2 up
++ $IP li set veth4 netns ns2 up
++ ip -netns ns2 li add dummy1 type dummy
++ ip -netns ns2 li set dummy1 up
++
++ $IP -6 addr add 2001:db8:101::1/64 dev veth1 nodad
++ $IP -6 addr add 2001:db8:103::1/64 dev veth3 nodad
++ $IP addr add 172.16.101.1/24 dev veth1
++ $IP addr add 172.16.103.1/24 dev veth3
++
++ ip -netns ns2 -6 addr add 2001:db8:101::2/64 dev veth2 nodad
++ ip -netns ns2 -6 addr add 2001:db8:103::2/64 dev veth4 nodad
++ ip -netns ns2 -6 addr add 2001:db8:104::1/64 dev dummy1 nodad
++
++ ip -netns ns2 addr add 172.16.101.2/24 dev veth2
++ ip -netns ns2 addr add 172.16.103.2/24 dev veth4
++ ip -netns ns2 addr add 172.16.104.1/24 dev dummy1
++
++ set +e
++}
++
++# assumption is that basic add of a single path route works
++# otherwise just adding an address on an interface is broken
++ipv6_rt_add()
++{
++ local rc
++
++ echo
++ echo "IPv6 route add / append tests"
++
++ # route add same prefix - fails with EEXIST b/c ip adds NLM_F_EXCL
++ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
++ run_cmd "$IP -6 ro add 2001:db8:104::/64 via 2001:db8:103::2"
++ log_test $? 2 "Attempt to add duplicate route - gw"
++
++ # route add same prefix - fails with EEXIST b/c ip adds NLM_F_EXCL
++ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
++ run_cmd "$IP -6 ro add 2001:db8:104::/64 dev veth3"
++ log_test $? 2 "Attempt to add duplicate route - dev only"
++
++ # route add same prefix - fails with EEXIST b/c ip adds NLM_F_EXCL
++ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
++ run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64"
++ log_test $? 2 "Attempt to add duplicate route - reject route"
++
++ # route append with same prefix adds a new route
++ # - iproute2 sets NLM_F_CREATE | NLM_F_APPEND
++ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
++ run_cmd "$IP -6 ro append 2001:db8:104::/64 via 2001:db8:103::2"
++ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
++ log_test $? 0 "Append nexthop to existing route - gw"
++
++ # insert mpath directly
++ add_route6 "2001:db8:104::/64" "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
++ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
++ log_test $? 0 "Add multipath route"
++
++ add_route6 "2001:db8:104::/64" "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
++ run_cmd "$IP -6 ro add 2001:db8:104::/64 nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
++ log_test $? 2 "Attempt to add duplicate multipath route"
++
++ # insert of a second route without append but different metric
++ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
++ run_cmd "$IP -6 ro add 2001:db8:104::/64 via 2001:db8:103::2 metric 512"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ run_cmd "$IP -6 ro add 2001:db8:104::/64 via 2001:db8:103::3 metric 256"
++ rc=$?
++ fi
++ log_test $rc 0 "Route add with different metrics"
++
++ run_cmd "$IP -6 ro del 2001:db8:104::/64 metric 512"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route6 "2001:db8:104::/64 via 2001:db8:103::3 dev veth3 metric 256 2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024"
++ rc=$?
++ fi
++ log_test $rc 0 "Route delete with metric"
++}
++
++ipv6_rt_replace_single()
++{
++ # single path with single path
++ #
++ add_initial_route6 "via 2001:db8:101::2"
++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 via 2001:db8:103::2"
++ check_route6 "2001:db8:104::/64 via 2001:db8:103::2 dev veth3 metric 1024"
++ log_test $? 0 "Single path with single path"
++
++ # single path with multipath
++ #
++ add_initial_route6 "nexthop via 2001:db8:101::2"
++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:103::2"
++ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::3 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
++ log_test $? 0 "Single path with multipath"
++
++ # single path with single path using MULTIPATH attribute
++ #
++ add_initial_route6 "via 2001:db8:101::2"
++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:103::2"
++ check_route6 "2001:db8:104::/64 via 2001:db8:103::2 dev veth3 metric 1024"
++ log_test $? 0 "Single path with single path via multipath attribute"
++
++ # route replace fails - invalid nexthop
++ add_initial_route6 "via 2001:db8:101::2"
++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 via 2001:db8:104::2"
++ if [ $? -eq 0 ]; then
++ # previous command is expected to fail so if it returns 0
++ # that means the test failed.
++ log_test 0 1 "Invalid nexthop"
++ else
++ check_route6 "2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024"
++ log_test $? 0 "Invalid nexthop"
++ fi
++
++ # replace non-existent route
++ # - note use of change versus replace since ip adds NLM_F_CREATE
++ # for replace
++ add_initial_route6 "via 2001:db8:101::2"
++ run_cmd "$IP -6 ro change 2001:db8:105::/64 via 2001:db8:101::2"
++ log_test $? 2 "Single path - replace of non-existent route"
++}
++
++ipv6_rt_replace_mpath()
++{
++ # multipath with multipath
++ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:103::3"
++ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::3 dev veth1 weight 1 nexthop via 2001:db8:103::3 dev veth3 weight 1"
++ log_test $? 0 "Multipath with multipath"
++
++ # multipath with single
++ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 via 2001:db8:101::3"
++ check_route6 "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024"
++ log_test $? 0 "Multipath with single path"
++
++ # multipath with single
++ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3"
++ check_route6 "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024"
++ log_test $? 0 "Multipath with single path via multipath attribute"
++
++ # multipath with dev-only
++ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 dev veth1"
++ check_route6 "2001:db8:104::/64 dev veth1 metric 1024"
++ log_test $? 0 "Multipath with dev-only"
++
++ # route replace fails - invalid nexthop 1
++ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:111::3 nexthop via 2001:db8:103::3"
++ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
++ log_test $? 0 "Multipath - invalid first nexthop"
++
++ # route replace fails - invalid nexthop 2
++ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
++ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:113::3"
++ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
++ log_test $? 0 "Multipath - invalid second nexthop"
++
++ # multipath non-existent route
++ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
++ run_cmd "$IP -6 ro change 2001:db8:105::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:103::3"
++ log_test $? 2 "Multipath - replace of non-existent route"
++}
++
++ipv6_rt_replace()
++{
++ echo
++ echo "IPv6 route replace tests"
++
++ ipv6_rt_replace_single
++ ipv6_rt_replace_mpath
++}
++
++ipv6_route_test()
++{
++ route_setup
++
++ ipv6_rt_add
++ ipv6_rt_replace
++
++ route_cleanup
++}
++
++ip_addr_metric_check()
++{
++ ip addr help 2>&1 | grep -q metric
++ if [ $? -ne 0 ]; then
++ echo "iproute2 command does not support metric for addresses. Skipping test"
++ return 1
++ fi
++
++ return 0
++}
++
++ipv6_addr_metric_test()
++{
++ local rc
++
++ echo
++ echo "IPv6 prefix route tests"
++
++ ip_addr_metric_check || return 1
++
++ setup
++
++ set -e
++ $IP li add dummy1 type dummy
++ $IP li add dummy2 type dummy
++ $IP li set dummy1 up
++ $IP li set dummy2 up
++
++ # default entry is metric 256
++ run_cmd "$IP -6 addr add dev dummy1 2001:db8:104::1/64"
++ run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::2/64"
++ set +e
++
++ check_route6 "2001:db8:104::/64 dev dummy1 proto kernel metric 256 2001:db8:104::/64 dev dummy2 proto kernel metric 256"
++ log_test $? 0 "Default metric"
++
++ set -e
++ run_cmd "$IP -6 addr flush dev dummy1"
++ run_cmd "$IP -6 addr add dev dummy1 2001:db8:104::1/64 metric 257"
++ set +e
++
++ check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 256 2001:db8:104::/64 dev dummy1 proto kernel metric 257"
++ log_test $? 0 "User specified metric on first device"
++
++ set -e
++ run_cmd "$IP -6 addr flush dev dummy2"
++ run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::2/64 metric 258"
++ set +e
++
++ check_route6 "2001:db8:104::/64 dev dummy1 proto kernel metric 257 2001:db8:104::/64 dev dummy2 proto kernel metric 258"
++ log_test $? 0 "User specified metric on second device"
++
++ run_cmd "$IP -6 addr del dev dummy1 2001:db8:104::1/64 metric 257"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 258"
++ rc=$?
++ fi
++ log_test $rc 0 "Delete of address on first device"
++
++ run_cmd "$IP -6 addr change dev dummy2 2001:db8:104::2/64 metric 259"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 259"
++ rc=$?
++ fi
++ log_test $rc 0 "Modify metric of address"
++
++ # verify prefix route removed on down
++ run_cmd "ip netns exec ns1 sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1"
++ run_cmd "$IP li set dev dummy2 down"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ out=$($IP -6 ro ls match 2001:db8:104::/64)
++ check_expected "${out}" ""
++ rc=$?
++ fi
++ log_test $rc 0 "Prefix route removed on link down"
++
++ # verify prefix route re-inserted with assigned metric
++ run_cmd "$IP li set dev dummy2 up"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 259"
++ rc=$?
++ fi
++ log_test $rc 0 "Prefix route with metric on link up"
++
++ # verify peer metric added correctly
++ set -e
++ run_cmd "$IP -6 addr flush dev dummy2"
++ run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::1 peer 2001:db8:104::2 metric 260"
++ set +e
++
++ check_route6 "2001:db8:104::1 dev dummy2 proto kernel metric 260"
++ log_test $? 0 "Set metric with peer route on local side"
++ check_route6 "2001:db8:104::2 dev dummy2 proto kernel metric 260"
++ log_test $? 0 "Set metric with peer route on peer side"
++
++ set -e
++ run_cmd "$IP -6 addr change dev dummy2 2001:db8:104::1 peer 2001:db8:104::3 metric 261"
++ set +e
++
++ check_route6 "2001:db8:104::1 dev dummy2 proto kernel metric 261"
++ log_test $? 0 "Modify metric and peer address on local side"
++ check_route6 "2001:db8:104::3 dev dummy2 proto kernel metric 261"
++ log_test $? 0 "Modify metric and peer address on peer side"
++
++ $IP li del dummy1
++ $IP li del dummy2
++ cleanup
++}
++
++ipv6_route_metrics_test()
++{
++ local rc
++
++ echo
++ echo "IPv6 routes with metrics"
++
++ route_setup
++
++ #
++ # single path with metrics
++ #
++ run_cmd "$IP -6 ro add 2001:db8:111::/64 via 2001:db8:101::2 mtu 1400"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route6 "2001:db8:111::/64 via 2001:db8:101::2 dev veth1 metric 1024 mtu 1400"
++ rc=$?
++ fi
++ log_test $rc 0 "Single path route with mtu metric"
++
++
++ #
++ # multipath via separate routes with metrics
++ #
++ run_cmd "$IP -6 ro add 2001:db8:112::/64 via 2001:db8:101::2 mtu 1400"
++ run_cmd "$IP -6 ro append 2001:db8:112::/64 via 2001:db8:103::2"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route6 "2001:db8:112::/64 metric 1024 mtu 1400 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
++ rc=$?
++ fi
++ log_test $rc 0 "Multipath route via 2 single routes with mtu metric on first"
++
++ # second route is coalesced to first to make a multipath route.
++ # MTU of the second path is hidden from display!
++ run_cmd "$IP -6 ro add 2001:db8:113::/64 via 2001:db8:101::2"
++ run_cmd "$IP -6 ro append 2001:db8:113::/64 via 2001:db8:103::2 mtu 1400"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route6 "2001:db8:113::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
++ rc=$?
++ fi
++ log_test $rc 0 "Multipath route via 2 single routes with mtu metric on 2nd"
++
++ run_cmd "$IP -6 ro del 2001:db8:113::/64 via 2001:db8:101::2"
++ if [ $? -eq 0 ]; then
++ check_route6 "2001:db8:113::/64 via 2001:db8:103::2 dev veth3 metric 1024 mtu 1400"
++ log_test $? 0 " MTU of second leg"
++ fi
++
++ #
++ # multipath with metrics
++ #
++ run_cmd "$IP -6 ro add 2001:db8:115::/64 mtu 1400 nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route6 "2001:db8:115::/64 metric 1024 mtu 1400 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
++ rc=$?
++ fi
++ log_test $rc 0 "Multipath route with mtu metric"
++
++ $IP -6 ro add 2001:db8:104::/64 via 2001:db8:101::2 mtu 1300
++ run_cmd "ip netns exec ns1 ${ping6} -w1 -c1 -s 1500 2001:db8:104::1"
++ log_test $? 0 "Using route with mtu metric"
++
++ run_cmd "$IP -6 ro add 2001:db8:114::/64 via 2001:db8:101::2 congctl lock foo"
++ log_test $? 2 "Invalid metric (fails metric_convert)"
++
++ route_cleanup
++}
++
++# add route for a prefix, flushing any existing routes first
++# expected to be the first step of a test
++add_route()
++{
++ local pfx="$1"
++ local nh="$2"
++ local out
++
++ if [ "$VERBOSE" = "1" ]; then
++ echo
++ echo " ##################################################"
++ echo
++ fi
++
++ run_cmd "$IP ro flush ${pfx}"
++ [ $? -ne 0 ] && exit 1
++
++ out=$($IP ro ls match ${pfx})
++ if [ -n "$out" ]; then
++ echo "Failed to flush routes for prefix used for tests."
++ exit 1
++ fi
++
++ run_cmd "$IP ro add ${pfx} ${nh}"
++ if [ $? -ne 0 ]; then
++ echo "Failed to add initial route for test."
++ exit 1
++ fi
++}
++
++# add initial route - used in replace route tests
++add_initial_route()
++{
++ add_route "172.16.104.0/24" "$1"
++}
++
++check_route()
++{
++ local pfx
++ local expected="$1"
++ local out
++
++ set -- $expected
++ pfx=$1
++ [ "${pfx}" = "unreachable" ] && pfx=$2
++
++ out=$($IP ro ls match ${pfx})
++ check_expected "${out}" "${expected}"
++}
++
++# assumption is that basic add of a single path route works
++# otherwise just adding an address on an interface is broken
++ipv4_rt_add()
++{
++ local rc
++
++ echo
++ echo "IPv4 route add / append tests"
++
++ # route add same prefix - fails with EEXIST b/c ip adds NLM_F_EXCL
++ add_route "172.16.104.0/24" "via 172.16.101.2"
++ run_cmd "$IP ro add 172.16.104.0/24 via 172.16.103.2"
++ log_test $? 2 "Attempt to add duplicate route - gw"
++
++ # route add same prefix - fails with EEXIST b/c ip adds NLM_F_EXCL
++ add_route "172.16.104.0/24" "via 172.16.101.2"
++ run_cmd "$IP ro add 172.16.104.0/24 dev veth3"
++ log_test $? 2 "Attempt to add duplicate route - dev only"
++
++ # route add same prefix - fails with EEXIST b/c ip adds NLM_F_EXCL
++ add_route "172.16.104.0/24" "via 172.16.101.2"
++ run_cmd "$IP ro add unreachable 172.16.104.0/24"
++ log_test $? 2 "Attempt to add duplicate route - reject route"
++
++ # iproute2 prepend only sets NLM_F_CREATE
++ # - adds a new route; does NOT convert existing route to ECMP
++ add_route "172.16.104.0/24" "via 172.16.101.2"
++ run_cmd "$IP ro prepend 172.16.104.0/24 via 172.16.103.2"
++ check_route "172.16.104.0/24 via 172.16.103.2 dev veth3 172.16.104.0/24 via 172.16.101.2 dev veth1"
++ log_test $? 0 "Add new nexthop for existing prefix"
++
++ # route append with same prefix adds a new route
++ # - iproute2 sets NLM_F_CREATE | NLM_F_APPEND
++ add_route "172.16.104.0/24" "via 172.16.101.2"
++ run_cmd "$IP ro append 172.16.104.0/24 via 172.16.103.2"
++ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 172.16.104.0/24 via 172.16.103.2 dev veth3"
++ log_test $? 0 "Append nexthop to existing route - gw"
++
++ add_route "172.16.104.0/24" "via 172.16.101.2"
++ run_cmd "$IP ro append 172.16.104.0/24 dev veth3"
++ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 172.16.104.0/24 dev veth3 scope link"
++ log_test $? 0 "Append nexthop to existing route - dev only"
++
++ add_route "172.16.104.0/24" "via 172.16.101.2"
++ run_cmd "$IP ro append unreachable 172.16.104.0/24"
++ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 unreachable 172.16.104.0/24"
++ log_test $? 0 "Append nexthop to existing route - reject route"
++
++ run_cmd "$IP ro flush 172.16.104.0/24"
++ run_cmd "$IP ro add unreachable 172.16.104.0/24"
++ run_cmd "$IP ro append 172.16.104.0/24 via 172.16.103.2"
++ check_route "unreachable 172.16.104.0/24 172.16.104.0/24 via 172.16.103.2 dev veth3"
++ log_test $? 0 "Append nexthop to existing reject route - gw"
++
++ run_cmd "$IP ro flush 172.16.104.0/24"
++ run_cmd "$IP ro add unreachable 172.16.104.0/24"
++ run_cmd "$IP ro append 172.16.104.0/24 dev veth3"
++ check_route "unreachable 172.16.104.0/24 172.16.104.0/24 dev veth3 scope link"
++ log_test $? 0 "Append nexthop to existing reject route - dev only"
++
++ # insert mpath directly
++ add_route "172.16.104.0/24" "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
++ check_route "172.16.104.0/24 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
++ log_test $? 0 "add multipath route"
++
++ add_route "172.16.104.0/24" "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
++ run_cmd "$IP ro add 172.16.104.0/24 nexthop via 172.16.101.2 nexthop via 172.16.103.2"
++ log_test $? 2 "Attempt to add duplicate multipath route"
++
++ # insert of a second route without append but different metric
++ add_route "172.16.104.0/24" "via 172.16.101.2"
++ run_cmd "$IP ro add 172.16.104.0/24 via 172.16.103.2 metric 512"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ run_cmd "$IP ro add 172.16.104.0/24 via 172.16.103.3 metric 256"
++ rc=$?
++ fi
++ log_test $rc 0 "Route add with different metrics"
++
++ run_cmd "$IP ro del 172.16.104.0/24 metric 512"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 172.16.104.0/24 via 172.16.103.3 dev veth3 metric 256"
++ rc=$?
++ fi
++ log_test $rc 0 "Route delete with metric"
++}
++
++ipv4_rt_replace_single()
++{
++ # single path with single path
++ #
++ add_initial_route "via 172.16.101.2"
++ run_cmd "$IP ro replace 172.16.104.0/24 via 172.16.103.2"
++ check_route "172.16.104.0/24 via 172.16.103.2 dev veth3"
++ log_test $? 0 "Single path with single path"
++
++ # single path with multipath
++ #
++ add_initial_route "nexthop via 172.16.101.2"
++ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3 nexthop via 172.16.103.2"
++ check_route "172.16.104.0/24 nexthop via 172.16.101.3 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
++ log_test $? 0 "Single path with multipath"
++
++ # single path with reject
++ #
++ add_initial_route "nexthop via 172.16.101.2"
++ run_cmd "$IP ro replace unreachable 172.16.104.0/24"
++ check_route "unreachable 172.16.104.0/24"
++ log_test $? 0 "Single path with reject route"
++
++ # single path with single path using MULTIPATH attribute
++ #
++ add_initial_route "via 172.16.101.2"
++ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.103.2"
++ check_route "172.16.104.0/24 via 172.16.103.2 dev veth3"
++ log_test $? 0 "Single path with single path via multipath attribute"
++
++ # route replace fails - invalid nexthop
++ add_initial_route "via 172.16.101.2"
++ run_cmd "$IP ro replace 172.16.104.0/24 via 2001:db8:104::2"
++ if [ $? -eq 0 ]; then
++ # previous command is expected to fail so if it returns 0
++ # that means the test failed.
++ log_test 0 1 "Invalid nexthop"
++ else
++ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1"
++ log_test $? 0 "Invalid nexthop"
++ fi
++
++ # replace non-existent route
++ # - note use of change versus replace since ip adds NLM_F_CREATE
++ # for replace
++ add_initial_route "via 172.16.101.2"
++ run_cmd "$IP ro change 172.16.105.0/24 via 172.16.101.2"
++ log_test $? 2 "Single path - replace of non-existent route"
++}
++
++ipv4_rt_replace_mpath()
++{
++ # multipath with multipath
++ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
++ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3 nexthop via 172.16.103.3"
++ check_route "172.16.104.0/24 nexthop via 172.16.101.3 dev veth1 weight 1 nexthop via 172.16.103.3 dev veth3 weight 1"
++ log_test $? 0 "Multipath with multipath"
++
++ # multipath with single
++ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
++ run_cmd "$IP ro replace 172.16.104.0/24 via 172.16.101.3"
++ check_route "172.16.104.0/24 via 172.16.101.3 dev veth1"
++ log_test $? 0 "Multipath with single path"
++
++ # multipath with single
++ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
++ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3"
++ check_route "172.16.104.0/24 via 172.16.101.3 dev veth1"
++ log_test $? 0 "Multipath with single path via multipath attribute"
++
++ # multipath with reject
++ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
++ run_cmd "$IP ro replace unreachable 172.16.104.0/24"
++ check_route "unreachable 172.16.104.0/24"
++ log_test $? 0 "Multipath with reject route"
++
++ # route replace fails - invalid nexthop 1
++ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
++ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.111.3 nexthop via 172.16.103.3"
++ check_route "172.16.104.0/24 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
++ log_test $? 0 "Multipath - invalid first nexthop"
++
++ # route replace fails - invalid nexthop 2
++ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
++ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3 nexthop via 172.16.113.3"
++ check_route "172.16.104.0/24 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
++ log_test $? 0 "Multipath - invalid second nexthop"
++
++ # multipath non-existent route
++ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
++ run_cmd "$IP ro change 172.16.105.0/24 nexthop via 172.16.101.3 nexthop via 172.16.103.3"
++ log_test $? 2 "Multipath - replace of non-existent route"
++}
++
++ipv4_rt_replace()
++{
++ echo
++ echo "IPv4 route replace tests"
++
++ ipv4_rt_replace_single
++ ipv4_rt_replace_mpath
++}
++
++ipv4_route_test()
++{
++ route_setup
++
++ ipv4_rt_add
++ ipv4_rt_replace
++
++ route_cleanup
++}
++
++ipv4_addr_metric_test()
++{
++ local rc
++
++ echo
++ echo "IPv4 prefix route tests"
++
++ ip_addr_metric_check || return 1
++
++ setup
++
++ set -e
++ $IP li add dummy1 type dummy
++ $IP li add dummy2 type dummy
++ $IP li set dummy1 up
++ $IP li set dummy2 up
++
++ # default entry is metric 256
++ run_cmd "$IP addr add dev dummy1 172.16.104.1/24"
++ run_cmd "$IP addr add dev dummy2 172.16.104.2/24"
++ set +e
++
++ check_route "172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2"
++ log_test $? 0 "Default metric"
++
++ set -e
++ run_cmd "$IP addr flush dev dummy1"
++ run_cmd "$IP addr add dev dummy1 172.16.104.1/24 metric 257"
++ set +e
++
++ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 metric 257"
++ log_test $? 0 "User specified metric on first device"
++
++ set -e
++ run_cmd "$IP addr flush dev dummy2"
++ run_cmd "$IP addr add dev dummy2 172.16.104.2/24 metric 258"
++ set +e
++
++ check_route "172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 metric 257 172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 258"
++ log_test $? 0 "User specified metric on second device"
++
++ run_cmd "$IP addr del dev dummy1 172.16.104.1/24 metric 257"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 258"
++ rc=$?
++ fi
++ log_test $rc 0 "Delete of address on first device"
++
++ run_cmd "$IP addr change dev dummy2 172.16.104.2/24 metric 259"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 259"
++ rc=$?
++ fi
++ log_test $rc 0 "Modify metric of address"
++
++ # verify prefix route removed on down
++ run_cmd "$IP li set dev dummy2 down"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ out=$($IP ro ls match 172.16.104.0/24)
++ check_expected "${out}" ""
++ rc=$?
++ fi
++ log_test $rc 0 "Prefix route removed on link down"
++
++ # verify prefix route re-inserted with assigned metric
++ run_cmd "$IP li set dev dummy2 up"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 259"
++ rc=$?
++ fi
++ log_test $rc 0 "Prefix route with metric on link up"
++
++ # explicitly check for metric changes on edge scenarios
++ run_cmd "$IP addr flush dev dummy2"
++ run_cmd "$IP addr add dev dummy2 172.16.104.0/24 metric 259"
++ run_cmd "$IP addr change dev dummy2 172.16.104.0/24 metric 260"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.0 metric 260"
++ rc=$?
++ fi
++ log_test $rc 0 "Modify metric of .0/24 address"
++
++ run_cmd "$IP addr flush dev dummy2"
++ run_cmd "$IP addr add dev dummy2 172.16.104.1/32 peer 172.16.104.2 metric 260"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route "172.16.104.2 dev dummy2 proto kernel scope link src 172.16.104.1 metric 260"
++ rc=$?
++ fi
++ log_test $rc 0 "Set metric of address with peer route"
++
++ run_cmd "$IP addr change dev dummy2 172.16.104.1/32 peer 172.16.104.3 metric 261"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route "172.16.104.3 dev dummy2 proto kernel scope link src 172.16.104.1 metric 261"
++ rc=$?
++ fi
++ log_test $rc 0 "Modify metric and peer address for peer route"
++
++ $IP li del dummy1
++ $IP li del dummy2
++ cleanup
++}
++
++ipv4_route_metrics_test()
++{
++ local rc
++
++ echo
++ echo "IPv4 route add / append tests"
++
++ route_setup
++
++ run_cmd "$IP ro add 172.16.111.0/24 via 172.16.101.2 mtu 1400"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route "172.16.111.0/24 via 172.16.101.2 dev veth1 mtu 1400"
++ rc=$?
++ fi
++ log_test $rc 0 "Single path route with mtu metric"
++
++
++ run_cmd "$IP ro add 172.16.112.0/24 mtu 1400 nexthop via 172.16.101.2 nexthop via 172.16.103.2"
++ rc=$?
++ if [ $rc -eq 0 ]; then
++ check_route "172.16.112.0/24 mtu 1400 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
++ rc=$?
++ fi
++ log_test $rc 0 "Multipath route with mtu metric"
++
++ $IP ro add 172.16.104.0/24 via 172.16.101.2 mtu 1300
++ run_cmd "ip netns exec ns1 ping -w1 -c1 -s 1500 172.16.104.1"
++ log_test $? 0 "Using route with mtu metric"
++
++ run_cmd "$IP ro add 172.16.111.0/24 via 172.16.101.2 congctl lock foo"
++ log_test $? 2 "Invalid metric (fails metric_convert)"
++
++ route_cleanup
++}
++
++ipv4_route_v6_gw_test()
++{
++ local rc
++
++ echo
++ echo "IPv4 route with IPv6 gateway tests"
++
++ route_setup
++ sleep 2
++
++ #
++ # single path route
++ #
++ run_cmd "$IP ro add 172.16.104.0/24 via inet6 2001:db8:101::2"
++ rc=$?
++ log_test $rc 0 "Single path route with IPv6 gateway"
++ if [ $rc -eq 0 ]; then
++ check_route "172.16.104.0/24 via inet6 2001:db8:101::2 dev veth1"
++ fi
++
++ run_cmd "ip netns exec ns1 ping -w1 -c1 172.16.104.1"
++ log_test $rc 0 "Single path route with IPv6 gateway - ping"
++
++ run_cmd "$IP ro del 172.16.104.0/24 via inet6 2001:db8:101::2"
++ rc=$?
++ log_test $rc 0 "Single path route delete"
++ if [ $rc -eq 0 ]; then
++ check_route "172.16.112.0/24"
++ fi
++
++ #
++ # multipath - v6 then v4
++ #
++ run_cmd "$IP ro add 172.16.104.0/24 nexthop via inet6 2001:db8:101::2 dev veth1 nexthop via 172.16.103.2 dev veth3"
++ rc=$?
++ log_test $rc 0 "Multipath route add - v6 nexthop then v4"
++ if [ $rc -eq 0 ]; then
++ check_route "172.16.104.0/24 nexthop via inet6 2001:db8:101::2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
++ fi
++
++ run_cmd "$IP ro del 172.16.104.0/24 nexthop via 172.16.103.2 dev veth3 nexthop via inet6 2001:db8:101::2 dev veth1"
++ log_test $? 2 " Multipath route delete - nexthops in wrong order"
++
++ run_cmd "$IP ro del 172.16.104.0/24 nexthop via inet6 2001:db8:101::2 dev veth1 nexthop via 172.16.103.2 dev veth3"
++ log_test $? 0 " Multipath route delete exact match"
++
++ #
++ # multipath - v4 then v6
++ #
++ run_cmd "$IP ro add 172.16.104.0/24 nexthop via 172.16.103.2 dev veth3 nexthop via inet6 2001:db8:101::2 dev veth1"
++ rc=$?
++ log_test $rc 0 "Multipath route add - v4 nexthop then v6"
++ if [ $rc -eq 0 ]; then
++ check_route "172.16.104.0/24 nexthop via 172.16.103.2 dev veth3 weight 1 nexthop via inet6 2001:db8:101::2 dev veth1 weight 1"
++ fi
++
++ run_cmd "$IP ro del 172.16.104.0/24 nexthop via inet6 2001:db8:101::2 dev veth1 nexthop via 172.16.103.2 dev veth3"
++ log_test $? 2 " Multipath route delete - nexthops in wrong order"
++
++ run_cmd "$IP ro del 172.16.104.0/24 nexthop via 172.16.103.2 dev veth3 nexthop via inet6 2001:db8:101::2 dev veth1"
++ log_test $? 0 " Multipath route delete exact match"
++
++ route_cleanup
++}
++
++################################################################################
++# usage
++
++usage()
++{
++ cat <<EOF
++usage: ${0##*/} OPTS
++
++ -t <test> Test(s) to run (default: all)
++ (options: $TESTS)
++ -p Pause on fail
++ -P Pause after each test before cleanup
++ -v verbose mode (show commands and output)
++EOF
++}
++
++################################################################################
++# main
++
++while getopts :t:pPhv o
++do
++ case $o in
++ t) TESTS=$OPTARG;;
++ p) PAUSE_ON_FAIL=yes;;
++ P) PAUSE=yes;;
++ v) VERBOSE=$(($VERBOSE + 1));;
++ h) usage; exit 0;;
++ *) usage; exit 1;;
++ esac
++done
++
++PEER_CMD="ip netns exec ${PEER_NS}"
++
++# make sure we don't pause twice
++[ "${PAUSE}" = "yes" ] && PAUSE_ON_FAIL=no
++
++if [ "$(id -u)" -ne 0 ];then
++ echo "SKIP: Need root privileges"
++ exit $ksft_skip;
++fi
++
++if [ ! -x "$(command -v ip)" ]; then
++ echo "SKIP: Could not run test without ip tool"
++ exit $ksft_skip
++fi
++
++ip route help 2>&1 | grep -q fibmatch
++if [ $? -ne 0 ]; then
++ echo "SKIP: iproute2 too old, missing fibmatch"
++ exit $ksft_skip
++fi
++
++# start clean
++cleanup &> /dev/null
++
++for t in $TESTS
++do
++ case $t in
++ fib_unreg_test|unregister) fib_unreg_test;;
++ fib_down_test|down) fib_down_test;;
++ fib_carrier_test|carrier) fib_carrier_test;;
++ fib_rp_filter_test|rp_filter) fib_rp_filter_test;;
++ fib_nexthop_test|nexthop) fib_nexthop_test;;
++ fib_suppress_test|suppress) fib_suppress_test;;
++ ipv6_route_test|ipv6_rt) ipv6_route_test;;
++ ipv4_route_test|ipv4_rt) ipv4_route_test;;
++ ipv6_addr_metric) ipv6_addr_metric_test;;
++ ipv4_addr_metric) ipv4_addr_metric_test;;
++ ipv6_route_metrics) ipv6_route_metrics_test;;
++ ipv4_route_metrics) ipv4_route_metrics_test;;
++ ipv4_route_v6_gw) ipv4_route_v6_gw_test;;
++
++ help) echo "Test names: $TESTS"; exit 0;;
++ esac
++done
++
++if [ "$TESTS" != "none" ]; then
++ printf "\nTests passed: %3d\n" ${nsuccess}
++ printf "Tests failed: %3d\n" ${nfail}
++fi
++
++exit $ret
aio-fix-mremap-after-fork-null-deref.patch
btrfs-free-device-in-btrfs_close_devices-for-a-single-device-filesystem.patch
netfilter-nft_tproxy-restrict-to-prerouting-hook.patch
+xfs-remove-the-xfs_efi_log_item_t-typedef.patch
+xfs-remove-the-xfs_efd_log_item_t-typedef.patch
+xfs-remove-the-xfs_inode_log_item_t-typedef.patch
+xfs-factor-out-a-xfs_defer_create_intent-helper.patch
+xfs-merge-the-log_item-defer-op-into-create_intent.patch
+xfs-merge-the-diff_items-defer-op-into-create_intent.patch
+xfs-turn-dfp_intent-into-a-xfs_log_item.patch
+xfs-refactor-xfs_defer_finish_noroll.patch
+xfs-log-new-intent-items-created-as-part-of-finishing-recovered-intent-items.patch
+xfs-fix-finobt-btree-block-recovery-ordering.patch
+xfs-proper-replay-of-deferred-ops-queued-during-log-recovery.patch
+xfs-xfs_defer_capture-should-absorb-remaining-block-reservations.patch
+xfs-xfs_defer_capture-should-absorb-remaining-transaction-reservation.patch
+xfs-clean-up-bmap-intent-item-recovery-checking.patch
+xfs-clean-up-xfs_bui_item_recover-iget-trans_alloc-ilock-ordering.patch
+xfs-fix-an-incore-inode-uaf-in-xfs_bui_recover.patch
+xfs-change-the-order-in-which-child-and-parent-defer-ops-are-finished.patch
+xfs-periodically-relog-deferred-intent-items.patch
+xfs-expose-the-log-push-threshold.patch
+xfs-only-relog-deferred-intent-items-if-free-space-in-the-log-gets-low.patch
+xfs-fix-missing-cow-blocks-writeback-conversion-retry.patch
+xfs-ensure-inobt-record-walks-always-make-forward-progress.patch
+xfs-fix-the-forward-progress-assertion-in-xfs_iwalk_run_callbacks.patch
+xfs-prevent-uaf-in-xfs_log_item_in_current_chkpt.patch
+xfs-sync-lazy-sb-accounting-on-quiesce-of-read-only-mounts.patch
+revert-ipv4-fix-incorrect-route-flushing-when-source-address-is-deleted.patch
+ipv4-fix-incorrect-route-flushing-when-source-address-is-deleted.patch
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:22:40 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:11 +0530
+Subject: xfs: change the order in which child and parent defer ops are finished
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-18-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 27dada070d59c28a441f1907d2cec891b17dcb26 upstream.
+
+The defer ops code has been finishing items in the wrong order -- if a
+top level defer op creates items A and B, and finishing item A creates
+more defer ops A1 and A2, we'll put the new items on the end of the
+chain and process them in the order A B A1 A2. This is kind of weird,
+since it's convenient for programmers to be able to think of A and B as
+an ordered sequence where all the sub-tasks for A must finish before we
+move on to B, e.g. A A1 A2 B.
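+
+To make the two orderings concrete, here is a small stand-alone C
+simulation -- purely illustrative, not kernel code -- that replays the
+A/B example above under both queueing policies:
+
+ #include <stdio.h>
+ #include <string.h>
+
+ /* Finishing "A" creates children "A1" and "A2". */
+ static void run(int head_insert)
+ {
+         char pending[8][4] = { "A", "B" };
+         int n = 2;
+
+         while (n > 0) {
+                 char cur[4];
+
+                 /* pop the first pending item and "finish" it */
+                 strcpy(cur, pending[0]);
+                 memmove(pending[0], pending[1],
+                         sizeof(pending[0]) * (n - 1));
+                 n--;
+                 printf("%s ", cur);
+
+                 if (strcmp(cur, "A") != 0)
+                         continue;
+                 if (head_insert) {
+                         /* splice children ahead of the pending work */
+                         memmove(pending[2], pending[0],
+                                 sizeof(pending[0]) * n);
+                         strcpy(pending[0], "A1");
+                         strcpy(pending[1], "A2");
+                 } else {
+                         /* append children after the pending work */
+                         strcpy(pending[n], "A1");
+                         strcpy(pending[n + 1], "A2");
+                 }
+                 n += 2;
+         }
+         printf("\n");
+ }
+
+ int main(void)
+ {
+         run(0);         /* prints: A B A1 A2 */
+         run(1);         /* prints: A A1 A2 B */
+         return 0;
+ }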
+
+Right now, our log intent items are not so complex that this matters,
+but this will become important for the atomic extent swapping patchset.
+In order to maintain correct reference counting of extents, we have to
+unmap and remap extents in that order, and we want to complete that work
+before moving on to the next range that the user wants to swap. This
+patch fixes defer ops to satisfy that requirement.
+
+The primary symptom of the incorrect order was noticed in an early
+performance analysis of the atomic extent swap code. An astonishingly
+large number of deferred work items accumulated when userspace requested
+an atomic update of two very fragmented files. The cause of this was
+traced to the same ordering bug in the inner loop of
+xfs_defer_finish_noroll.
+
+If the ->finish_item method of a deferred operation queues new deferred
+operations, those new deferred ops are appended to the tail of the
+pending work list. To illustrate, say that a caller creates a
+transaction t0 with four deferred operations D0-D3. The first thing
+defer ops does is roll the transaction to t1, leaving us with:
+
+t1: D0(t0), D1(t0), D2(t0), D3(t0)
+
+Let's say that finishing each of D0-D3 will create two new deferred ops.
+After finish D0 and roll, we'll have the following chain:
+
+t2: D1(t0), D2(t0), D3(t0), d4(t1), d5(t1)
+
+d4 and d5 were logged to t1. Notice that while we're about to start
+work on D1, we haven't actually completed all the work implied by D0
+being finished. So far we've been careful (or lucky) to structure the
+dfops callers such that D1 doesn't depend on d4 or d5 being finished,
+but this is a potential logic bomb.
+
+There's a second problem lurking. Let's see what happens as we finish
+D1-D3:
+
+t3: D2(t0), D3(t0), d4(t1), d5(t1), d6(t2), d7(t2)
+t4: D3(t0), d4(t1), d5(t1), d6(t2), d7(t2), d8(t3), d9(t3)
+t5: d4(t1), d5(t1), d6(t2), d7(t2), d8(t3), d9(t3), d10(t4), d11(t4)
+
+Let's say that d4-d11 are simple work items that don't queue any other
+operations, which means that we can complete d4 and roll to t6:
+
+t6: d5(t1), d6(t2), d7(t2), d8(t3), d9(t3), d10(t4), d11(t4)
+t7: d6(t2), d7(t2), d8(t3), d9(t3), d10(t4), d11(t4)
+...
+t11: d10(t4), d11(t4)
+t12: d11(t4)
+<done>
+
+When we try to roll to transaction #12, we're holding defer op d11,
+which we logged way back in t4. This means that the tail of the log is
+pinned at t4. If the log is very small or there are a lot of other
+threads updating metadata, this means that we might have wrapped the log
+and cannot roll to t12 because there isn't enough space left before
+we'd run into t4.
+
+Let's shift back to the original failure. I mentioned before that I
+discovered this flaw while developing the atomic file update code. In
+that scenario, we have a defer op (D0) that finds a range of file blocks
+to remap, creates a handful of new defer ops to do that, and then asks
+to be continued with however much work remains.
+
+So, D0 is the original swapext deferred op. The first thing defer ops
+does is rolls to t1:
+
+t1: D0(t0)
+
+We try to finish D0, logging d1 and d2 in the process, but can't get all
+the work done. We log a done item and a new intent item for the work
+that D0 still has to do, and roll to t2:
+
+t2: D0'(t1), d1(t1), d2(t1)
+
+We roll and try to finish D0', but still can't get all the work done, so
+we log a done item and a new intent item for it, requeue D0 a second
+time, and roll to t3:
+
+t3: D0''(t2), d1(t1), d2(t1), d3(t2), d4(t2)
+
+If it takes 48 more rolls to complete D0, then we'll finally dispense
+with D0 in t50:
+
+t50: D<fifty primes>(t49), d1(t1), ..., d102(t50)
+
+We then try to roll again to get a chain like this:
+
+t51: d1(t1), d2(t1), ..., d101(t50), d102(t50)
+...
+t152: d102(t50)
+<done>
+
+Notice that in rolling to transaction #51, we're holding on to a log
+intent item for d1 that was logged in transaction #1. This means that
+the tail of the log is pinned at t1. If the log is very small or there
+are a lot of other threads updating metadata, this means that we might
+have wrapped the log and cannot roll to t51 because there isn't enough
+space left before we'd run into t1. This is of course problem #2 again.
+
+But notice the third problem with this scenario: we have 102 defer ops
+tied to this transaction! Each of these items are backed by pinned
+kernel memory, which means that we risk OOM if the chains get too long.
+
+Yikes. Problem #1 is a subtle logic bomb that could hit someone in the
+future; problem #2 applies (rarely) to the current upstream, and problem
+#3 applies to work in progress.
+
+This is not how incremental deferred operations were supposed to work.
+The dfops design of logging in the same transaction an intent-done item
+and a new intent item for the work remaining was to make it so that we
+only have to juggle enough deferred work items to finish that one small
+piece of work. Deferred log item recovery will find that first
+unfinished work item and restart it, no matter how many other intent
+items might follow it in the log. Therefore, it's ok to put the new
+intents at the start of the dfops chain.
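+
+In list_head terms, the fix boils down to which splice primitive feeds
+the pending list. A sketch of the two alternatives (each call shown in
+isolation, mirroring the hunk below):
+
+ /* suppose dop_pending = [D1, D2, D3] and t_dfops = [d4, d5] */
+
+ list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);
+ /* old: dop_pending = [D1, D2, D3, d4, d5] -- new work runs last */
+
+ list_splice_init(&(*tp)->t_dfops, &dop_pending);
+ /* new: dop_pending = [d4, d5, D1, D2, D3] -- new work runs first */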
+
+For the first example, the chains look like this:
+
+t2: d4(t1), d5(t1), D1(t0), D2(t0), D3(t0)
+t3: d5(t1), D1(t0), D2(t0), D3(t0)
+...
+t9: d9(t7), D3(t0)
+t10: D3(t0)
+t11: d10(t10), d11(t10)
+t12: d11(t10)
+
+For the second example, the chains look like this:
+
+t1: D0(t0)
+t2: d1(t1), d2(t1), D0'(t1)
+t3: d2(t1), D0'(t1)
+t4: D0'(t1)
+t5: d1(t4), d2(t4), D0''(t4)
+...
+t148: D0<50 primes>(t147)
+t149: d101(t148), d102(t148)
+t150: d102(t148)
+<done>
+
+This actually sucks more for pinning the log tail (we try to roll to t10
+while holding an intent item that was logged in t1) but we've solved
+problem #1. We've also reduced the maximum chain length from:
+
+ sum(all the new items) + nr_original_items
+
+to:
+
+ max(new items that each original item creates) + nr_original_items
+
+This solves problem #3 by sharply reducing the number of defer ops that
+can be attached to a transaction at any given time. The change makes
+the problem of log tail pinning worse, but is an improvement we need to
+solve problem #2. Actually solving #2, however, is left to the next
+patch.
+
+Note that a subsequent analysis of some hard-to-trigger reflink and COW
+livelocks on extremely fragmented filesystems (or systems running a lot
+of IO threads) showed the same symptoms -- uncomfortably large numbers
+of incore deferred work items and occasional stalls in the transaction
+grant code while waiting for log reservations. I think this patch and
+the next one will also solve these problems.
+
+As originally written, the code used list_splice_tail_init instead of
+list_splice_init, so change that, and leave a short comment explaining
+our actions.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_defer.c | 11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_defer.c
++++ b/fs/xfs/libxfs/xfs_defer.c
+@@ -431,8 +431,17 @@ xfs_defer_finish_noroll(
+
+ /* Until we run out of pending work to finish... */
+ while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
++ /*
++ * Deferred items that are created in the process of finishing
++ * other deferred work items should be queued at the head of
++ * the pending list, which puts them ahead of the deferred work
++ * that was created by the caller. This keeps the number of
++ * pending work items to a minimum, which decreases the amount
++ * of time that any one intent item can stick around in memory,
++ * pinning the log tail.
++ */
+ xfs_defer_create_intents(*tp);
+- list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);
++ list_splice_init(&(*tp)->t_dfops, &dop_pending);
+
+ error = xfs_defer_trans_roll(tp);
+ if (error)
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:22:15 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:08 +0530
+Subject: xfs: clean up bmap intent item recovery checking
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-15-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 919522e89f8e71fc6a8f8abe17be4011573c6ea0 upstream.
+
+The bmap intent item checking code in xfs_bui_item_recover is spread all
+over the function. We should check the recovered log item at the top
+before we allocate any resources or do anything else, so do that.
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_bmap_item.c | 38 ++++++++++++--------------------------
+ 1 file changed, 12 insertions(+), 26 deletions(-)
+
+--- a/fs/xfs/xfs_bmap_item.c
++++ b/fs/xfs/xfs_bmap_item.c
+@@ -434,9 +434,7 @@ xfs_bui_recover(
+ xfs_fsblock_t startblock_fsb;
+ xfs_fsblock_t inode_fsb;
+ xfs_filblks_t count;
+- bool op_ok;
+ struct xfs_bud_log_item *budp;
+- enum xfs_bmap_intent_type type;
+ int whichfork;
+ xfs_exntst_t state;
+ struct xfs_trans *tp;
+@@ -462,16 +460,19 @@ xfs_bui_recover(
+ XFS_FSB_TO_DADDR(mp, bmap->me_startblock));
+ inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp,
+ XFS_INO_TO_FSB(mp, bmap->me_owner)));
+- switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) {
++ state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
++ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
++ whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
++ XFS_ATTR_FORK : XFS_DATA_FORK;
++ bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
++ switch (bui_type) {
+ case XFS_BMAP_MAP:
+ case XFS_BMAP_UNMAP:
+- op_ok = true;
+ break;
+ default:
+- op_ok = false;
+- break;
++ return -EFSCORRUPTED;
+ }
+- if (!op_ok || startblock_fsb == 0 ||
++ if (startblock_fsb == 0 ||
+ bmap->me_len == 0 ||
+ inode_fsb == 0 ||
+ startblock_fsb >= mp->m_sb.sb_dblocks ||
+@@ -502,32 +503,17 @@ xfs_bui_recover(
+ if (VFS_I(ip)->i_nlink == 0)
+ xfs_iflags_set(ip, XFS_IRECOVERY);
+
+- /* Process deferred bmap item. */
+- state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
+- XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+- whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
+- XFS_ATTR_FORK : XFS_DATA_FORK;
+- bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
+- switch (bui_type) {
+- case XFS_BMAP_MAP:
+- case XFS_BMAP_UNMAP:
+- type = bui_type;
+- break;
+- default:
+- XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
+- error = -EFSCORRUPTED;
+- goto err_inode;
+- }
+ xfs_trans_ijoin(tp, ip, 0);
+
+ count = bmap->me_len;
+- error = xfs_trans_log_finish_bmap_update(tp, budp, type, ip, whichfork,
+- bmap->me_startoff, bmap->me_startblock, &count, state);
++ error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip,
++ whichfork, bmap->me_startoff, bmap->me_startblock,
++ &count, state);
+ if (error)
+ goto err_inode;
+
+ if (count > 0) {
+- ASSERT(type == XFS_BMAP_UNMAP);
++ ASSERT(bui_type == XFS_BMAP_UNMAP);
+ irec.br_startblock = bmap->me_startblock;
+ irec.br_blockcount = count;
+ irec.br_startoff = bmap->me_startoff;
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:22:24 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:09 +0530
+Subject: xfs: clean up xfs_bui_item_recover iget/trans_alloc/ilock ordering
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-16-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 64a3f3315bc60f710a0a25c1798ac0ea58c6fa1f upstream.
+
+In most places in XFS, we have a specific order in which we gather
+resources: grab the inode, allocate a transaction, then lock the inode.
+xfs_bui_item_recover doesn't do it in that order, so fix it to be more
+consistent. This also makes the error bailout code a bit less weird.
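+
+Condensed from the hunks below, the order the function now follows is
+(sketch only; most arguments elided):
+
+ error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip);      /* 1: grab inode */
+ ...
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, ...); /* 2: transaction */
+ ...
+ xfs_ilock(ip, XFS_ILOCK_EXCL);                              /* 3: lock inode */
+ xfs_trans_ijoin(tp, ip, 0);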
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_bmap_item.c | 38 ++++++++++++++++++++++++--------------
+ 1 file changed, 24 insertions(+), 14 deletions(-)
+
+--- a/fs/xfs/xfs_bmap_item.c
++++ b/fs/xfs/xfs_bmap_item.c
+@@ -22,6 +22,7 @@
+ #include "xfs_bmap_btree.h"
+ #include "xfs_trans_space.h"
+ #include "xfs_error.h"
++#include "xfs_quota.h"
+
+ kmem_zone_t *xfs_bui_zone;
+ kmem_zone_t *xfs_bud_zone;
+@@ -488,21 +489,26 @@ xfs_bui_recover(
+ return -EFSCORRUPTED;
+ }
+
+- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+- XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
++ /* Grab the inode. */
++ error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip);
+ if (error)
+ return error;
+
+- budp = xfs_trans_get_bud(tp, buip);
+-
+- /* Grab the inode. */
+- error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip);
++ error = xfs_qm_dqattach(ip);
+ if (error)
+- goto err_inode;
++ goto err_rele;
+
+ if (VFS_I(ip)->i_nlink == 0)
+ xfs_iflags_set(ip, XFS_IRECOVERY);
+
++ /* Allocate transaction and do the work. */
++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
++ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
++ if (error)
++ goto err_rele;
++
++ budp = xfs_trans_get_bud(tp, buip);
++ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ count = bmap->me_len;
+@@ -510,7 +516,7 @@ xfs_bui_recover(
+ whichfork, bmap->me_startoff, bmap->me_startblock,
+ &count, state);
+ if (error)
+- goto err_inode;
++ goto err_cancel;
+
+ if (count > 0) {
+ ASSERT(bui_type == XFS_BMAP_UNMAP);
+@@ -522,16 +528,20 @@ xfs_bui_recover(
+ }
+
+ set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
++ /* Commit transaction, which frees the transaction. */
+ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
++ if (error)
++ goto err_unlock;
++
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+- return error;
++ return 0;
+
+-err_inode:
++err_cancel:
+ xfs_trans_cancel(tp);
+- if (ip) {
+- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+- xfs_irele(ip);
+- }
++err_unlock:
++ xfs_iunlock(ip, XFS_ILOCK_EXCL);
++err_rele:
++ xfs_irele(ip);
+ return error;
+ }
--- /dev/null
+From stable-owner@vger.kernel.org Thu Feb 16 07:28:47 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:16 +0530
+Subject: xfs: ensure inobt record walks always make forward progress
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-23-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 27c14b5daa82861220d6fa6e27b51f05f21ffaa7 upstream.
+
+[ In xfs_iwalk_ag(), replace a call to XFS_IS_CORRUPT() with a call to
+ ASSERT() ]
+
+The aim of the inode btree record iterator function is to call a
+callback on every record in the btree. To avoid having to tear down and
+recreate the inode btree cursor around every callback, it caches a
+certain number of records in a memory buffer. After each batch of
+callback invocations, we have to perform a btree lookup to find the
+next record after where we left off.
+
+However, if the keys of the inode btree are corrupt, the lookup might
+put us in the wrong part of the inode btree, causing the walk function
+to loop forever. Therefore, we add extra cursor tracking to make sure
+that we never go backwards, either when performing the lookup or when
+jumping to the next inobt record. This also fixes an off by one error
+where upon resume the lookup should have been for the inode /after/ the
+point at which we stopped.
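+
+The guard is a simple monotonicity check on the inode numbers the walk
+visits, condensed from the xfs_iwalk_ag() hunk below (error unwinding
+elided):
+
+ rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino);
+ if (iwag->lastino != NULLFSINO && iwag->lastino >= rec_fsino)
+         return -EFSCORRUPTED;   /* keys went backwards: corrupt inobt */
+ iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;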
+
+Found by fuzzing xfs/460 with keys[2].startino = ones causing bulkstat
+and quotacheck to hang.
+
+Fixes: a211432c27ff ("xfs: create simplified inode walk function")
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Chandan Babu R <chandanrlinux@gmail.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_iwalk.c | 27 ++++++++++++++++++++++++---
+ 1 file changed, 24 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_iwalk.c
++++ b/fs/xfs/xfs_iwalk.c
+@@ -55,6 +55,9 @@ struct xfs_iwalk_ag {
+ /* Where do we start the traversal? */
+ xfs_ino_t startino;
+
++ /* What was the last inode number we saw when iterating the inobt? */
++ xfs_ino_t lastino;
++
+ /* Array of inobt records we cache. */
+ struct xfs_inobt_rec_incore *recs;
+
+@@ -300,6 +303,9 @@ xfs_iwalk_ag_start(
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(mp, *has_more == 1);
+
++ iwag->lastino = XFS_AGINO_TO_INO(mp, agno,
++ irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
++
+ /*
+ * If the LE lookup yielded an inobt record before the cursor position,
+ * skip it and see if there's another one after it.
+@@ -346,15 +352,17 @@ xfs_iwalk_run_callbacks(
+ struct xfs_mount *mp = iwag->mp;
+ struct xfs_trans *tp = iwag->tp;
+ struct xfs_inobt_rec_incore *irec;
+- xfs_agino_t restart;
++ xfs_agino_t next_agino;
+ int error;
+
++ next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1;
++
+ ASSERT(iwag->nr_recs > 0);
+
+ /* Delete cursor but remember the last record we cached... */
+ xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
+ irec = &iwag->recs[iwag->nr_recs - 1];
+- restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;
++ ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK);
+
+ error = xfs_iwalk_ag_recs(iwag);
+ if (error)
+@@ -371,7 +379,7 @@ xfs_iwalk_run_callbacks(
+ if (error)
+ return error;
+
+- return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
++ return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
+ }
+
+ /* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
+@@ -395,6 +403,7 @@ xfs_iwalk_ag(
+
+ while (!error && has_more) {
+ struct xfs_inobt_rec_incore *irec;
++ xfs_ino_t rec_fsino;
+
+ cond_resched();
+ if (xfs_pwork_want_abort(&iwag->pwork))
+@@ -406,6 +415,15 @@ xfs_iwalk_ag(
+ if (error || !has_more)
+ break;
+
++ /* Make sure that we always move forward. */
++ rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino);
++ if (iwag->lastino != NULLFSINO && iwag->lastino >= rec_fsino) {
++ ASSERT(iwag->lastino < rec_fsino);
++ error = -EFSCORRUPTED;
++ goto out;
++ }
++ iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;
++
+ /* No allocated inodes in this chunk; skip it. */
+ if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
+ error = xfs_btree_increment(cur, 0, &has_more);
+@@ -534,6 +552,7 @@ xfs_iwalk(
+ .trim_start = 1,
+ .skip_empty = 1,
+ .pwork = XFS_PWORK_SINGLE_THREADED,
++ .lastino = NULLFSINO,
+ };
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ int error;
+@@ -622,6 +641,7 @@ xfs_iwalk_threaded(
+ iwag->data = data;
+ iwag->startino = startino;
+ iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
++ iwag->lastino = NULLFSINO;
+ xfs_pwork_queue(&pctl, &iwag->pwork);
+ startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
+ if (flags & XFS_INOBT_WALK_SAME_AG)
+@@ -695,6 +715,7 @@ xfs_inobt_walk(
+ .startino = startino,
+ .sz_recs = xfs_inobt_walk_prefetch(inobt_records),
+ .pwork = XFS_PWORK_SINGLE_THREADED,
++ .lastino = NULLFSINO,
+ };
+ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
+ int error;
--- /dev/null
+From stable-owner@vger.kernel.org Thu Feb 16 07:30:39 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:13 +0530
+Subject: xfs: expose the log push threshold
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-20-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit ed1575daf71e4e21d8ae735b6e687c95454aaa17 upstream.
+
+Separate the computation of the log push threshold and the push logic in
+xlog_grant_push_ail. This enables higher level code to determine (for
+example) that it is holding on to a logged intent item and the log is so
+busy that it is more than 75% full. In that case, it would be desirable
+to move the log item towards the head to release the tail, which we will
+cover in the next patch.
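+
+A caller-side sketch, mirroring the reworked xlog_grant_push_ail() in
+the hunk below:
+
+ threshold_lsn = xlog_grant_push_threshold(log, need_bytes);
+ if (threshold_lsn != NULLCOMMITLSN && !XLOG_FORCED_SHUTDOWN(log))
+         /* more than 75% of the log is in use; push on the AIL */
+         xfs_ail_push(log->l_ailp, threshold_lsn);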
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_icreate_item.c | 1 +
+ fs/xfs/xfs_log.c | 40 ++++++++++++++++++++++++++++++----------
+ fs/xfs/xfs_log.h | 2 ++
+ 3 files changed, 33 insertions(+), 10 deletions(-)
+
+--- a/fs/xfs/xfs_icreate_item.c
++++ b/fs/xfs/xfs_icreate_item.c
+@@ -10,6 +10,7 @@
+ #include "xfs_trans.h"
+ #include "xfs_trans_priv.h"
+ #include "xfs_icreate_item.h"
++#include "xfs_log_priv.h"
+ #include "xfs_log.h"
+
+ kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -1537,14 +1537,14 @@ xlog_commit_record(
+ }
+
+ /*
+- * Push on the buffer cache code if we ever use more than 75% of the on-disk
+- * log space. This code pushes on the lsn which would supposedly free up
+- * the 25% which we want to leave free. We may need to adopt a policy which
+- * pushes on an lsn which is further along in the log once we reach the high
+- * water mark. In this manner, we would be creating a low water mark.
++ * Compute the LSN that we'd need to push the log tail towards in order to have
++ * (a) enough on-disk log space to log the number of bytes specified, (b) at
++ * least 25% of the log space free, and (c) at least 256 blocks free. If the
++ * log free space already meets all three thresholds, this function returns
++ * NULLCOMMITLSN.
+ */
+-STATIC void
+-xlog_grant_push_ail(
++xfs_lsn_t
++xlog_grant_push_threshold(
+ struct xlog *log,
+ int need_bytes)
+ {
+@@ -1570,7 +1570,7 @@ xlog_grant_push_ail(
+ free_threshold = max(free_threshold, (log->l_logBBsize >> 2));
+ free_threshold = max(free_threshold, 256);
+ if (free_blocks >= free_threshold)
+- return;
++ return NULLCOMMITLSN;
+
+ xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
+ &threshold_block);
+@@ -1590,13 +1590,33 @@ xlog_grant_push_ail(
+ if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
+ threshold_lsn = last_sync_lsn;
+
++ return threshold_lsn;
++}
++
++/*
++ * Push the tail of the log if we need to do so to maintain the free log space
++ * thresholds set out by xlog_grant_push_threshold. We may need to adopt a
++ * policy which pushes on an lsn which is further along in the log once we
++ * reach the high water mark. In this manner, we would be creating a low water
++ * mark.
++ */
++STATIC void
++xlog_grant_push_ail(
++ struct xlog *log,
++ int need_bytes)
++{
++ xfs_lsn_t threshold_lsn;
++
++ threshold_lsn = xlog_grant_push_threshold(log, need_bytes);
++ if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log))
++ return;
++
+ /*
+ * Get the transaction layer to kick the dirty buffers out to
+ * disk asynchronously. No point in trying to do this if
+ * the filesystem is shutting down.
+ */
+- if (!XLOG_FORCED_SHUTDOWN(log))
+- xfs_ail_push(log->l_ailp, threshold_lsn);
++ xfs_ail_push(log->l_ailp, threshold_lsn);
+ }
+
+ /*
+--- a/fs/xfs/xfs_log.h
++++ b/fs/xfs/xfs_log.h
+@@ -146,4 +146,6 @@ void xfs_log_quiesce(struct xfs_mount *m
+ bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
+ bool xfs_log_in_recovery(struct xfs_mount *);
+
++xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
++
+ #endif /* __XFS_LOG_H__ */
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:21:02 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:49:58 +0530
+Subject: xfs: factor out a xfs_defer_create_intent helper
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-5-chandan.babu@oracle.com>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit e046e949486ec92d83b2ccdf0e7e9144f74ef028 upstream.
+
+Create a helper that encapsulates the whole logic to create a defer
+intent. This reorders some of the work that was done, but none of
+that has an effect on the operation, as only fields that don't directly
+interact are affected.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_defer.c | 39 +++++++++++++++++++++++----------------
+ 1 file changed, 23 insertions(+), 16 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_defer.c
++++ b/fs/xfs/libxfs/xfs_defer.c
+@@ -178,6 +178,23 @@ static const struct xfs_defer_op_type *d
+ [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
+ };
+
++static void
++xfs_defer_create_intent(
++ struct xfs_trans *tp,
++ struct xfs_defer_pending *dfp,
++ bool sort)
++{
++ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
++ struct list_head *li;
++
++ if (sort)
++ list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
++
++ dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
++ list_for_each(li, &dfp->dfp_work)
++ ops->log_item(tp, dfp->dfp_intent, li);
++}
++
+ /*
+ * For each pending item in the intake list, log its intent item and the
+ * associated extents, then add the entire intake list to the end of
+@@ -187,17 +204,11 @@ STATIC void
+ xfs_defer_create_intents(
+ struct xfs_trans *tp)
+ {
+- struct list_head *li;
+ struct xfs_defer_pending *dfp;
+- const struct xfs_defer_op_type *ops;
+
+ list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
+- ops = defer_op_types[dfp->dfp_type];
+- dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
+ trace_xfs_defer_create_intent(tp->t_mountp, dfp);
+- list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
+- list_for_each(li, &dfp->dfp_work)
+- ops->log_item(tp, dfp->dfp_intent, li);
++ xfs_defer_create_intent(tp, dfp, true);
+ }
+ }
+
+@@ -427,17 +438,13 @@ xfs_defer_finish_noroll(
+ }
+ if (error == -EAGAIN) {
+ /*
+- * Caller wants a fresh transaction, so log a
+- * new log intent item to replace the old one
+- * and roll the transaction. See "Requesting
+- * a Fresh Transaction while Finishing
+- * Deferred Work" above.
++ * Caller wants a fresh transaction, so log a new log
++ * intent item to replace the old one and roll the
++ * transaction. See "Requesting a Fresh Transaction
++ * while Finishing Deferred Work" above.
+ */
+- dfp->dfp_intent = ops->create_intent(*tp,
+- dfp->dfp_count);
+ dfp->dfp_done = NULL;
+- list_for_each(li, &dfp->dfp_work)
+- ops->log_item(*tp, dfp->dfp_intent, li);
++ xfs_defer_create_intent(*tp, dfp, false);
+ } else {
+ /* Done with the dfp, free it. */
+ list_del(&dfp->dfp_list);
--- /dev/null
+From stable-owner@vger.kernel.org Thu Feb 16 08:00:00 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:10 +0530
+Subject: xfs: fix an incore inode UAF in xfs_bui_recover
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-17-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit ff4ab5e02a0447dd1e290883eb6cd7d94848e590 upstream.
+
+In xfs_bui_item_recover, there exists a use-after-free bug with regard
+to the inode that is involved in the bmap replay operation. If the
+mapping operation does not complete, we call xfs_bmap_unmap_extent to
+create a deferred op to finish the unmapping work, and we retain a
+pointer to the incore inode.
+
+Unfortunately, the very next thing we do is commit the transaction and
+drop the inode. If reclaim tears down the inode before we try to finish
+the defer ops, we dereference garbage and blow up. Therefore, create a
+way to join inodes to the defer ops freezer so that we can maintain the
+xfs_inode reference until we're done with the inode.
+
+Note: This imposes the requirement that there be enough memory to keep
+every incore inode in memory throughout recovery.
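+
+The reference lifetime rule, condensed from the hunks below:
+
+ /* capture (ILOCK_EXCL held): pin the inode beyond the commit */
+ ihold(VFS_I(capture_ip));
+ dfc->dfc_capture_ip = capture_ip;
+
+ /* release: drop the extra reference only once replay is done */
+ xfs_irele(dfc->dfc_capture_ip);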
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_defer.c | 43 ++++++++++++++++++++++++++++++++++++++-----
+ fs/xfs/libxfs/xfs_defer.h | 11 +++++++++--
+ fs/xfs/xfs_bmap_item.c | 7 +++++--
+ fs/xfs/xfs_extfree_item.c | 2 +-
+ fs/xfs/xfs_log_recover.c | 7 ++++++-
+ fs/xfs/xfs_refcount_item.c | 2 +-
+ fs/xfs/xfs_rmap_item.c | 2 +-
+ 7 files changed, 61 insertions(+), 13 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_defer.c
++++ b/fs/xfs/libxfs/xfs_defer.c
+@@ -16,6 +16,7 @@
+ #include "xfs_inode.h"
+ #include "xfs_inode_item.h"
+ #include "xfs_trace.h"
++#include "xfs_icache.h"
+
+ /*
+ * Deferred Operations in XFS
+@@ -567,10 +568,14 @@ xfs_defer_move(
+ * deferred ops state is transferred to the capture structure and the
+ * transaction is then ready for the caller to commit it. If there are no
+ * intent items to capture, this function returns NULL.
++ *
++ * If capture_ip is not NULL, the capture structure will obtain an extra
++ * reference to the inode.
+ */
+ static struct xfs_defer_capture *
+ xfs_defer_ops_capture(
+- struct xfs_trans *tp)
++ struct xfs_trans *tp,
++ struct xfs_inode *capture_ip)
+ {
+ struct xfs_defer_capture *dfc;
+
+@@ -596,6 +601,15 @@ xfs_defer_ops_capture(
+ /* Preserve the log reservation size. */
+ dfc->dfc_logres = tp->t_log_res;
+
++ /*
++ * Grab an extra reference to this inode and attach it to the capture
++ * structure.
++ */
++ if (capture_ip) {
++ ihold(VFS_I(capture_ip));
++ dfc->dfc_capture_ip = capture_ip;
++ }
++
+ return dfc;
+ }
+
+@@ -606,24 +620,33 @@ xfs_defer_ops_release(
+ struct xfs_defer_capture *dfc)
+ {
+ xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
++ if (dfc->dfc_capture_ip)
++ xfs_irele(dfc->dfc_capture_ip);
+ kmem_free(dfc);
+ }
+
+ /*
+ * Capture any deferred ops and commit the transaction. This is the last step
+- * needed to finish a log intent item that we recovered from the log.
++ * needed to finish a log intent item that we recovered from the log. If any
++ * of the deferred ops operate on an inode, the caller must pass in that inode
++ * so that the reference can be transferred to the capture structure. The
++ * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling
++ * xfs_defer_ops_continue.
+ */
+ int
+ xfs_defer_ops_capture_and_commit(
+ struct xfs_trans *tp,
++ struct xfs_inode *capture_ip,
+ struct list_head *capture_list)
+ {
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_defer_capture *dfc;
+ int error;
+
++ ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL));
++
+ /* If we don't capture anything, commit transaction and exit. */
+- dfc = xfs_defer_ops_capture(tp);
++ dfc = xfs_defer_ops_capture(tp, capture_ip);
+ if (!dfc)
+ return xfs_trans_commit(tp);
+
+@@ -640,16 +663,26 @@ xfs_defer_ops_capture_and_commit(
+
+ /*
+ * Attach a chain of captured deferred ops to a new transaction and free the
+- * capture structure.
++ * capture structure. If an inode was captured, it will be passed back to the
++ * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0.
++ * The caller now owns the inode reference.
+ */
+ void
+ xfs_defer_ops_continue(
+ struct xfs_defer_capture *dfc,
+- struct xfs_trans *tp)
++ struct xfs_trans *tp,
++ struct xfs_inode **captured_ipp)
+ {
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
+
++ /* Lock and join the captured inode to the new transaction. */
++ if (dfc->dfc_capture_ip) {
++ xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL);
++ xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0);
++ }
++ *captured_ipp = dfc->dfc_capture_ip;
++
+ /* Move captured dfops chain and state to the transaction. */
+ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
+ tp->t_flags |= dfc->dfc_tpflags;
+--- a/fs/xfs/libxfs/xfs_defer.h
++++ b/fs/xfs/libxfs/xfs_defer.h
+@@ -80,6 +80,12 @@ struct xfs_defer_capture {
+
+ /* Log reservation saved from the transaction. */
+ unsigned int dfc_logres;
++
++ /*
++ * An inode reference that must be maintained to complete the deferred
++ * work.
++ */
++ struct xfs_inode *dfc_capture_ip;
+ };
+
+ /*
+@@ -87,8 +93,9 @@ struct xfs_defer_capture {
+ * This doesn't normally happen except log recovery.
+ */
+ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
+- struct list_head *capture_list);
+-void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp);
++ struct xfs_inode *capture_ip, struct list_head *capture_list);
++void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
++ struct xfs_inode **captured_ipp);
+ void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d);
+
+ #endif /* __XFS_DEFER_H__ */
+--- a/fs/xfs/xfs_bmap_item.c
++++ b/fs/xfs/xfs_bmap_item.c
+@@ -528,8 +528,11 @@ xfs_bui_recover(
+ }
+
+ set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
+- /* Commit transaction, which frees the transaction. */
+- error = xfs_defer_ops_capture_and_commit(tp, capture_list);
++ /*
++ * Commit transaction, which frees the transaction and saves the inode
++ * for later replay activities.
++ */
++ error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list);
+ if (error)
+ goto err_unlock;
+
+--- a/fs/xfs/xfs_extfree_item.c
++++ b/fs/xfs/xfs_extfree_item.c
+@@ -639,7 +639,7 @@ xfs_efi_recover(
+
+ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
+
+- return xfs_defer_ops_capture_and_commit(tp, capture_list);
++ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
+
+ abort_error:
+ xfs_trans_cancel(tp);
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -4766,6 +4766,7 @@ xlog_finish_defer_ops(
+ {
+ struct xfs_defer_capture *dfc, *next;
+ struct xfs_trans *tp;
++ struct xfs_inode *ip;
+ int error = 0;
+
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+@@ -4791,9 +4792,13 @@ xlog_finish_defer_ops(
+ * from recovering a single intent item.
+ */
+ list_del_init(&dfc->dfc_list);
+- xfs_defer_ops_continue(dfc, tp);
++ xfs_defer_ops_continue(dfc, tp, &ip);
+
+ error = xfs_trans_commit(tp);
++ if (ip) {
++ xfs_iunlock(ip, XFS_ILOCK_EXCL);
++ xfs_irele(ip);
++ }
+ if (error)
+ return error;
+ }
+--- a/fs/xfs/xfs_refcount_item.c
++++ b/fs/xfs/xfs_refcount_item.c
+@@ -569,7 +569,7 @@ xfs_cui_recover(
+
+ xfs_refcount_finish_one_cleanup(tp, rcur, error);
+ set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
+- return xfs_defer_ops_capture_and_commit(tp, capture_list);
++ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
+
+ abort_error:
+ xfs_refcount_finish_one_cleanup(tp, rcur, error);
+--- a/fs/xfs/xfs_rmap_item.c
++++ b/fs/xfs/xfs_rmap_item.c
+@@ -593,7 +593,7 @@ xfs_rui_recover(
+
+ xfs_rmap_finish_one_cleanup(tp, rcur, error);
+ set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags);
+- return xfs_defer_ops_capture_and_commit(tp, capture_list);
++ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
+
+ abort_error:
+ xfs_rmap_finish_one_cleanup(tp, rcur, error);
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:21:43 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:04 +0530
+Subject: xfs: fix finobt btree block recovery ordering
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-11-chandan.babu@oracle.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 671459676ab0e1d371c8d6b184ad1faa05b6941e upstream.
+
+[ In 5.4.y, xlog_recover_get_buf_lsn() is defined inside
+ fs/xfs/xfs_log_recover.c ]
+
+Nathan popped up on #xfs and pointed out that we fail to handle
+finobt btree blocks in xlog_recover_get_buf_lsn(). This means they
+always fall through the entire magic number matching code to "recover
+immediately". Whilst most of the time this is the correct behaviour,
+occasionally it will be incorrect and could potentially overwrite
+more recent metadata because we don't check the LSN in the on disk
+metadata at all.
+
+This bug has been present since the finobt was first introduced, and
+is a potential cause of the occasional xfs_iget_check_free_state()
+failures we see that indicate that the inode btree state does not
+match the on disk inode state.
+
+Fixes: aafc3c246529 ("xfs: support the XFS_BTNUM_FINOBT free inode btree type")
+Reported-by: Nathan Scott <nathans@redhat.com>
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -2206,6 +2206,8 @@ xlog_recover_get_buf_lsn(
+ case XFS_ABTC_MAGIC:
+ case XFS_RMAP_CRC_MAGIC:
+ case XFS_REFC_CRC_MAGIC:
++ case XFS_FIBT_CRC_MAGIC:
++ case XFS_FIBT_MAGIC:
+ case XFS_IBT_CRC_MAGIC:
+ case XFS_IBT_MAGIC: {
+ struct xfs_btree_block *btb = blk;
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:23:08 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:15 +0530
+Subject: xfs: fix missing CoW blocks writeback conversion retry
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-22-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit c2f09217a4305478c55adc9a98692488dd19cd32 upstream.
+
+[ Set xfs_writepage_ctx->fork to XFS_DATA_FORK since 5.4.y tracks current
+ extent's fork in this variable ]
+
+In commit 7588cbeec6df, we tried to fix a race stemming from the lack of
+coordination between higher level code that wants to allocate and remap
+CoW fork extents into the data fork. Christoph cites as examples the
+always_cow mode, and a directio write completion racing with writeback.
+
+According to the comments before the goto retry, we want to restart the
+lookup to catch the extent in the data fork, but we don't actually reset
+whichfork or cow_fsb, which means the second try executes using stale
+information. Up until now I think we've gotten lucky that either
+there's something left in the CoW fork to cause cow_fsb to be reset, or
+either data/cow fork sequence numbers have advanced enough to force a
+fresh lookup from the data fork. However, if we reach the retry with an
+empty stable CoW fork and a stable data fork, neither of those things
+happens. The retry foolishly re-calls xfs_convert_blocks on the CoW
+fork which fails again. This time, we toss the write.
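+
+After the fix, every pass through the retry label starts from a clean
+slate instead of reusing the CoW-fork state of the failed attempt
+(condensed from the hunk below; in 5.4.y the current fork is tracked in
+wpc->fork):
+
+ retry:
+         cow_fsb = NULLFILEOFF;          /* forget the stale CoW mapping */
+         wpc->fork = XFS_DATA_FORK;      /* restart lookup in the data fork */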
+
+I've recently been working on extending reflink to the realtime device.
+When the realtime extent size is larger than a single block, we have to
+force the page cache to CoW the entire rt extent if a write (or
+fallocate) are not aligned with the rt extent size. The strategy I've
+chosen to deal with this is derived from Dave's blocksize > pagesize
+series: dirtying around the write range, and ensuring that writeback
+always starts mapping on an rt extent boundary. This has brought this
+race front and center, since generic/522 blows up immediately.
+
+However, I'm pretty sure this is a bug outright, independent of that.
+
+Fixes: 7588cbeec6df ("xfs: retry COW fork delalloc conversion when no extent was found")
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_aops.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -495,7 +495,7 @@ xfs_map_blocks(
+ ssize_t count = i_blocksize(inode);
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
+- xfs_fileoff_t cow_fsb = NULLFILEOFF;
++ xfs_fileoff_t cow_fsb;
+ struct xfs_bmbt_irec imap;
+ struct xfs_iext_cursor icur;
+ int retries = 0;
+@@ -529,6 +529,8 @@ xfs_map_blocks(
+ * landed in a hole and we skip the block.
+ */
+ retry:
++ cow_fsb = NULLFILEOFF;
++ wpc->fork = XFS_DATA_FORK;
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+ (ip->i_df.if_flags & XFS_IFEXTENTS));
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:23:20 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:17 +0530
+Subject: xfs: fix the forward progress assertion in xfs_iwalk_run_callbacks
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-24-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit a5336d6bb2d02d0e9d4d3c8be04b80b8b68d56c8 upstream.
+
+In commit 27c14b5daa82 we started tracking the last inode seen during an
+inode walk to avoid infinite loops if a corrupt inobt record happens to
+have a lower ir_startino than the record preceding it. Unfortunately,
+the assertion trips over the case where there are completely empty inobt
+records (which can happen quite easily on 64k page filesystems) because
+we advance the tracking cursor without actually putting the empty record
+into the processing buffer. Fix the assert to allow for this case.
+
+Reported-by: zlang@redhat.com
+Fixes: 27c14b5daa82 ("xfs: ensure inobt record walks always make forward progress")
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Zorro Lang <zlang@redhat.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_iwalk.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_iwalk.c
++++ b/fs/xfs/xfs_iwalk.c
+@@ -362,7 +362,7 @@ xfs_iwalk_run_callbacks(
+ /* Delete cursor but remember the last record we cached... */
+ xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
+ irec = &iwag->recs[iwag->nr_recs - 1];
+- ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK);
++ ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK);
+
+ error = xfs_iwalk_ag_recs(iwag);
+ if (error)
--- /dev/null
+From stable-owner@vger.kernel.org Thu Feb 16 07:27:30 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:03 +0530
+Subject: xfs: log new intent items created as part of finishing recovered intent items
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-10-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 93293bcbde93567efaf4e6bcd58cad270e1fcbf5 upstream.
+
+[Slightly edit fs/xfs/xfs_bmap_item.c & fs/xfs/xfs_refcount_item.c to resolve
+merge conflicts]
+
+During a code inspection, I found a serious bug in the log intent item
+recovery code when an intent item cannot complete all the work and
+decides to requeue itself to get that done. When this happens, the
+item recovery creates a new incore deferred op representing the
+remaining work and attaches it to the transaction that it allocated. At
+the end of _item_recover, it moves the entire chain of deferred ops to
+the dummy parent_tp that xlog_recover_process_intents passed to it, but
+fail to log a new intent item for the remaining work before committing
+the transaction for the single unit of work.
+
+xlog_finish_defer_ops logs those new intent items once recovery has
+finished dealing with the intent items that it recovered, but this isn't
+sufficient. If the log is forced to disk after a recovered log item
+decides to requeue itself and the system goes down before we call
+xlog_finish_defer_ops, the second log recovery will never see the new
+intent item and therefore has no idea that there was more work to do.
+It will finish recovery leaving the filesystem in a corrupted state.
+
+The same logic applies to /any/ deferred ops added during intent item
+recovery, not just the one handling the remaining work.
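+
+The fix, condensed from the hunks below: log fresh intent items for all
+captured work before handing the chain to the parent transaction:
+
+ void
+ xfs_defer_capture(
+         struct xfs_trans        *dtp,
+         struct xfs_trans        *stp)
+ {
+         xfs_defer_create_intents(stp);  /* log intents for pending work */
+         xfs_defer_move(dtp, stp);       /* then hand the chain over */
+ }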
+
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_defer.c | 26 ++++++++++++++++++++++++--
+ fs/xfs/libxfs/xfs_defer.h | 6 ++++++
+ fs/xfs/xfs_bmap_item.c | 2 +-
+ fs/xfs/xfs_refcount_item.c | 2 +-
+ 4 files changed, 32 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_defer.c
++++ b/fs/xfs/libxfs/xfs_defer.c
+@@ -186,8 +186,9 @@ xfs_defer_create_intent(
+ {
+ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+
+- dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
+- dfp->dfp_count, sort);
++ if (!dfp->dfp_intent)
++ dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
++ dfp->dfp_count, sort);
+ }
+
+ /*
+@@ -390,6 +391,7 @@ xfs_defer_finish_one(
+ list_add(li, &dfp->dfp_work);
+ dfp->dfp_count++;
+ dfp->dfp_done = NULL;
++ dfp->dfp_intent = NULL;
+ xfs_defer_create_intent(tp, dfp, false);
+ }
+
+@@ -552,3 +554,23 @@ xfs_defer_move(
+
+ xfs_defer_reset(stp);
+ }
++
++/*
++ * Prepare a chain of fresh deferred ops work items to be completed later. Log
++ * recovery requires the ability to put off until later the actual finishing
++ * work so that it can process unfinished items recovered from the log in
++ * correct order.
++ *
++ * Create and log intent items for all the work that we're capturing so that we
++ * can be assured that the items will get replayed if the system goes down
++ * before log recovery gets a chance to finish the work it put off. Then we
++ * move the chain from stp to dtp.
++ */
++void
++xfs_defer_capture(
++ struct xfs_trans *dtp,
++ struct xfs_trans *stp)
++{
++ xfs_defer_create_intents(stp);
++ xfs_defer_move(dtp, stp);
++}
+--- a/fs/xfs/libxfs/xfs_defer.h
++++ b/fs/xfs/libxfs/xfs_defer.h
+@@ -61,4 +61,10 @@ extern const struct xfs_defer_op_type xf
+ extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
+ extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+
++/*
++ * Functions to capture a chain of deferred operations and continue them later.
++ * This doesn't normally happen except log recovery.
++ */
++void xfs_defer_capture(struct xfs_trans *dtp, struct xfs_trans *stp);
++
+ #endif /* __XFS_DEFER_H__ */
+--- a/fs/xfs/xfs_bmap_item.c
++++ b/fs/xfs/xfs_bmap_item.c
+@@ -541,7 +541,7 @@ xfs_bui_recover(
+ }
+
+ set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
+- xfs_defer_move(parent_tp, tp);
++ xfs_defer_capture(parent_tp, tp);
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+--- a/fs/xfs/xfs_refcount_item.c
++++ b/fs/xfs/xfs_refcount_item.c
+@@ -574,7 +574,7 @@ xfs_cui_recover(
+
+ xfs_refcount_finish_one_cleanup(tp, rcur, error);
+ set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
+- xfs_defer_move(parent_tp, tp);
++ xfs_defer_capture(parent_tp, tp);
+ error = xfs_trans_commit(tp);
+ return error;
+
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:21:18 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:00 +0530
+Subject: xfs: merge the ->diff_items defer op into ->create_intent
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-7-chandan.babu@oracle.com>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit d367a868e46b025a8ced8e00ef2b3a3c2f3bf732 upstream.
+
+This avoids a per-item indirect call, and also simplifies the interface
+a bit.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_defer.c | 5 +----
+ fs/xfs/libxfs/xfs_defer.h | 3 +--
+ fs/xfs/xfs_bmap_item.c | 9 ++++++---
+ fs/xfs/xfs_extfree_item.c | 7 ++++---
+ fs/xfs/xfs_refcount_item.c | 6 ++++--
+ fs/xfs/xfs_rmap_item.c | 6 ++++--
+ 6 files changed, 20 insertions(+), 16 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_defer.c
++++ b/fs/xfs/libxfs/xfs_defer.c
+@@ -186,11 +186,8 @@ xfs_defer_create_intent(
+ {
+ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+
+- if (sort)
+- list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
+-
+ dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
+- dfp->dfp_count);
++ dfp->dfp_count, sort);
+ }
+
+ /*
+--- a/fs/xfs/libxfs/xfs_defer.h
++++ b/fs/xfs/libxfs/xfs_defer.h
+@@ -49,9 +49,8 @@ struct xfs_defer_op_type {
+ void **);
+ void (*finish_cleanup)(struct xfs_trans *, void *, int);
+ void (*cancel_item)(struct list_head *);
+- int (*diff_items)(void *, struct list_head *, struct list_head *);
+ void *(*create_intent)(struct xfs_trans *tp, struct list_head *items,
+- unsigned int count);
++ unsigned int count, bool sort);
+ unsigned int max_items;
+ };
+
+--- a/fs/xfs/xfs_bmap_item.c
++++ b/fs/xfs/xfs_bmap_item.c
+@@ -334,14 +334,18 @@ STATIC void *
+ xfs_bmap_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+- unsigned int count)
++ unsigned int count,
++ bool sort)
+ {
+- struct xfs_bui_log_item *buip = xfs_bui_init(tp->t_mountp);
++ struct xfs_mount *mp = tp->t_mountp;
++ struct xfs_bui_log_item *buip = xfs_bui_init(mp);
+ struct xfs_bmap_intent *bmap;
+
+ ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+
+ xfs_trans_add_item(tp, &buip->bui_item);
++ if (sort)
++ list_sort(mp, items, xfs_bmap_update_diff_items);
+ list_for_each_entry(bmap, items, bi_list)
+ xfs_bmap_update_log_item(tp, buip, bmap);
+ return buip;
+@@ -408,7 +412,6 @@ xfs_bmap_update_cancel_item(
+
+ const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
+ .max_items = XFS_BUI_MAX_FAST_EXTENTS,
+- .diff_items = xfs_bmap_update_diff_items,
+ .create_intent = xfs_bmap_update_create_intent,
+ .abort_intent = xfs_bmap_update_abort_intent,
+ .create_done = xfs_bmap_update_create_done,
+--- a/fs/xfs/xfs_extfree_item.c
++++ b/fs/xfs/xfs_extfree_item.c
+@@ -441,7 +441,8 @@ STATIC void *
+ xfs_extent_free_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+- unsigned int count)
++ unsigned int count,
++ bool sort)
+ {
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_efi_log_item *efip = xfs_efi_init(mp, count);
+@@ -450,6 +451,8 @@ xfs_extent_free_create_intent(
+ ASSERT(count > 0);
+
+ xfs_trans_add_item(tp, &efip->efi_item);
++ if (sort)
++ list_sort(mp, items, xfs_extent_free_diff_items);
+ list_for_each_entry(free, items, xefi_list)
+ xfs_extent_free_log_item(tp, efip, free);
+ return efip;
+@@ -506,7 +509,6 @@ xfs_extent_free_cancel_item(
+
+ const struct xfs_defer_op_type xfs_extent_free_defer_type = {
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+- .diff_items = xfs_extent_free_diff_items,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .create_done = xfs_extent_free_create_done,
+@@ -571,7 +573,6 @@ xfs_agfl_free_finish_item(
+ /* sub-type with special handling for AGFL deferred frees */
+ const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+- .diff_items = xfs_extent_free_diff_items,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .create_done = xfs_extent_free_create_done,
+--- a/fs/xfs/xfs_refcount_item.c
++++ b/fs/xfs/xfs_refcount_item.c
+@@ -333,7 +333,8 @@ STATIC void *
+ xfs_refcount_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+- unsigned int count)
++ unsigned int count,
++ bool sort)
+ {
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count);
+@@ -342,6 +343,8 @@ xfs_refcount_update_create_intent(
+ ASSERT(count > 0);
+
+ xfs_trans_add_item(tp, &cuip->cui_item);
++ if (sort)
++ list_sort(mp, items, xfs_refcount_update_diff_items);
+ list_for_each_entry(refc, items, ri_list)
+ xfs_refcount_update_log_item(tp, cuip, refc);
+ return cuip;
+@@ -422,7 +425,6 @@ xfs_refcount_update_cancel_item(
+
+ const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
+ .max_items = XFS_CUI_MAX_FAST_EXTENTS,
+- .diff_items = xfs_refcount_update_diff_items,
+ .create_intent = xfs_refcount_update_create_intent,
+ .abort_intent = xfs_refcount_update_abort_intent,
+ .create_done = xfs_refcount_update_create_done,
+--- a/fs/xfs/xfs_rmap_item.c
++++ b/fs/xfs/xfs_rmap_item.c
+@@ -385,7 +385,8 @@ STATIC void *
+ xfs_rmap_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+- unsigned int count)
++ unsigned int count,
++ bool sort)
+ {
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count);
+@@ -394,6 +395,8 @@ xfs_rmap_update_create_intent(
+ ASSERT(count > 0);
+
+ xfs_trans_add_item(tp, &ruip->rui_item);
++ if (sort)
++ list_sort(mp, items, xfs_rmap_update_diff_items);
+ list_for_each_entry(rmap, items, ri_list)
+ xfs_rmap_update_log_item(tp, ruip, rmap);
+ return ruip;
+@@ -466,7 +469,6 @@ xfs_rmap_update_cancel_item(
+
+ const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
+ .max_items = XFS_RUI_MAX_FAST_EXTENTS,
+- .diff_items = xfs_rmap_update_diff_items,
+ .create_intent = xfs_rmap_update_create_intent,
+ .abort_intent = xfs_rmap_update_abort_intent,
+ .create_done = xfs_rmap_update_create_done,
--- /dev/null
+From stable-owner@vger.kernel.org Thu Feb 16 07:13:43 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:49:59 +0530
+Subject: xfs: merge the ->log_item defer op into ->create_intent
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-6-chandan.babu@oracle.com>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit c1f09188e8de0ae65433cb9c8ace4feb66359bcc upstream.
+
+These are always called together, and by merging them we reduce the number
+of indirect calls, improve type safety, and in general clean up the code
+a bit.
+
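+To illustrate the new calling convention, here is the shape each defer op
+type is converted to in the hunks below, written with placeholder "foo"
+names rather than any real XFS type:
+
+	STATIC void *
+	xfs_foo_create_intent(
+		struct xfs_trans	*tp,
+		struct list_head	*items,
+		unsigned int		count)
+	{
+		struct xfs_foi_log_item	*foip = xfs_foi_init(tp->t_mountp, count);
+		struct xfs_foo_intent	*foo;
+
+		/* Attach the intent item to the transaction... */
+		xfs_trans_add_item(tp, &foip->foi_item);
+		/* ...and log every queued work item into it in one pass. */
+		list_for_each_entry(foo, items, foo_list)
+			xfs_foo_log_item(tp, foip, foo);
+		return foip;
+	}
+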
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_defer.c | 6 +----
+ fs/xfs/libxfs/xfs_defer.h | 4 +--
+ fs/xfs/xfs_bmap_item.c | 47 +++++++++++++++++--------------------------
+ fs/xfs/xfs_extfree_item.c | 49 ++++++++++++++++++---------------------------
+ fs/xfs/xfs_refcount_item.c | 48 ++++++++++++++++++--------------------------
+ fs/xfs/xfs_rmap_item.c | 48 ++++++++++++++++++--------------------------
+ 6 files changed, 83 insertions(+), 119 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_defer.c
++++ b/fs/xfs/libxfs/xfs_defer.c
+@@ -185,14 +185,12 @@ xfs_defer_create_intent(
+ bool sort)
+ {
+ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+- struct list_head *li;
+
+ if (sort)
+ list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
+
+- dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
+- list_for_each(li, &dfp->dfp_work)
+- ops->log_item(tp, dfp->dfp_intent, li);
++ dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
++ dfp->dfp_count);
+ }
+
+ /*
+--- a/fs/xfs/libxfs/xfs_defer.h
++++ b/fs/xfs/libxfs/xfs_defer.h
+@@ -50,8 +50,8 @@ struct xfs_defer_op_type {
+ void (*finish_cleanup)(struct xfs_trans *, void *, int);
+ void (*cancel_item)(struct list_head *);
+ int (*diff_items)(void *, struct list_head *, struct list_head *);
+- void *(*create_intent)(struct xfs_trans *, uint);
+- void (*log_item)(struct xfs_trans *, void *, struct list_head *);
++ void *(*create_intent)(struct xfs_trans *tp, struct list_head *items,
++ unsigned int count);
+ unsigned int max_items;
+ };
+
+--- a/fs/xfs/xfs_bmap_item.c
++++ b/fs/xfs/xfs_bmap_item.c
+@@ -278,27 +278,6 @@ xfs_bmap_update_diff_items(
+ return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
+ }
+
+-/* Get an BUI. */
+-STATIC void *
+-xfs_bmap_update_create_intent(
+- struct xfs_trans *tp,
+- unsigned int count)
+-{
+- struct xfs_bui_log_item *buip;
+-
+- ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+- ASSERT(tp != NULL);
+-
+- buip = xfs_bui_init(tp->t_mountp);
+- ASSERT(buip != NULL);
+-
+- /*
+- * Get a log_item_desc to point at the new item.
+- */
+- xfs_trans_add_item(tp, &buip->bui_item);
+- return buip;
+-}
+-
+ /* Set the map extent flags for this mapping. */
+ static void
+ xfs_trans_set_bmap_flags(
+@@ -326,16 +305,12 @@ xfs_trans_set_bmap_flags(
+ STATIC void
+ xfs_bmap_update_log_item(
+ struct xfs_trans *tp,
+- void *intent,
+- struct list_head *item)
++ struct xfs_bui_log_item *buip,
++ struct xfs_bmap_intent *bmap)
+ {
+- struct xfs_bui_log_item *buip = intent;
+- struct xfs_bmap_intent *bmap;
+ uint next_extent;
+ struct xfs_map_extent *map;
+
+- bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+-
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
+
+@@ -355,6 +330,23 @@ xfs_bmap_update_log_item(
+ bmap->bi_bmap.br_state);
+ }
+
++STATIC void *
++xfs_bmap_update_create_intent(
++ struct xfs_trans *tp,
++ struct list_head *items,
++ unsigned int count)
++{
++ struct xfs_bui_log_item *buip = xfs_bui_init(tp->t_mountp);
++ struct xfs_bmap_intent *bmap;
++
++ ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
++
++ xfs_trans_add_item(tp, &buip->bui_item);
++ list_for_each_entry(bmap, items, bi_list)
++ xfs_bmap_update_log_item(tp, buip, bmap);
++ return buip;
++}
++
+ /* Get an BUD so we can process all the deferred rmap updates. */
+ STATIC void *
+ xfs_bmap_update_create_done(
+@@ -419,7 +411,6 @@ const struct xfs_defer_op_type xfs_bmap_
+ .diff_items = xfs_bmap_update_diff_items,
+ .create_intent = xfs_bmap_update_create_intent,
+ .abort_intent = xfs_bmap_update_abort_intent,
+- .log_item = xfs_bmap_update_log_item,
+ .create_done = xfs_bmap_update_create_done,
+ .finish_item = xfs_bmap_update_finish_item,
+ .cancel_item = xfs_bmap_update_cancel_item,
+--- a/fs/xfs/xfs_extfree_item.c
++++ b/fs/xfs/xfs_extfree_item.c
+@@ -412,41 +412,16 @@ xfs_extent_free_diff_items(
+ XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
+ }
+
+-/* Get an EFI. */
+-STATIC void *
+-xfs_extent_free_create_intent(
+- struct xfs_trans *tp,
+- unsigned int count)
+-{
+- struct xfs_efi_log_item *efip;
+-
+- ASSERT(tp != NULL);
+- ASSERT(count > 0);
+-
+- efip = xfs_efi_init(tp->t_mountp, count);
+- ASSERT(efip != NULL);
+-
+- /*
+- * Get a log_item_desc to point at the new item.
+- */
+- xfs_trans_add_item(tp, &efip->efi_item);
+- return efip;
+-}
+-
+ /* Log a free extent to the intent item. */
+ STATIC void
+ xfs_extent_free_log_item(
+ struct xfs_trans *tp,
+- void *intent,
+- struct list_head *item)
++ struct xfs_efi_log_item *efip,
++ struct xfs_extent_free_item *free)
+ {
+- struct xfs_efi_log_item *efip = intent;
+- struct xfs_extent_free_item *free;
+ uint next_extent;
+ struct xfs_extent *extp;
+
+- free = container_of(item, struct xfs_extent_free_item, xefi_list);
+-
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
+
+@@ -462,6 +437,24 @@ xfs_extent_free_log_item(
+ extp->ext_len = free->xefi_blockcount;
+ }
+
++STATIC void *
++xfs_extent_free_create_intent(
++ struct xfs_trans *tp,
++ struct list_head *items,
++ unsigned int count)
++{
++ struct xfs_mount *mp = tp->t_mountp;
++ struct xfs_efi_log_item *efip = xfs_efi_init(mp, count);
++ struct xfs_extent_free_item *free;
++
++ ASSERT(count > 0);
++
++ xfs_trans_add_item(tp, &efip->efi_item);
++ list_for_each_entry(free, items, xefi_list)
++ xfs_extent_free_log_item(tp, efip, free);
++ return efip;
++}
++
+ /* Get an EFD so we can process all the free extents. */
+ STATIC void *
+ xfs_extent_free_create_done(
+@@ -516,7 +509,6 @@ const struct xfs_defer_op_type xfs_exten
+ .diff_items = xfs_extent_free_diff_items,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+- .log_item = xfs_extent_free_log_item,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_extent_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+@@ -582,7 +574,6 @@ const struct xfs_defer_op_type xfs_agfl_
+ .diff_items = xfs_extent_free_diff_items,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+- .log_item = xfs_extent_free_log_item,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_agfl_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+--- a/fs/xfs/xfs_refcount_item.c
++++ b/fs/xfs/xfs_refcount_item.c
+@@ -284,27 +284,6 @@ xfs_refcount_update_diff_items(
+ XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
+ }
+
+-/* Get an CUI. */
+-STATIC void *
+-xfs_refcount_update_create_intent(
+- struct xfs_trans *tp,
+- unsigned int count)
+-{
+- struct xfs_cui_log_item *cuip;
+-
+- ASSERT(tp != NULL);
+- ASSERT(count > 0);
+-
+- cuip = xfs_cui_init(tp->t_mountp, count);
+- ASSERT(cuip != NULL);
+-
+- /*
+- * Get a log_item_desc to point at the new item.
+- */
+- xfs_trans_add_item(tp, &cuip->cui_item);
+- return cuip;
+-}
+-
+ /* Set the phys extent flags for this reverse mapping. */
+ static void
+ xfs_trans_set_refcount_flags(
+@@ -328,16 +307,12 @@ xfs_trans_set_refcount_flags(
+ STATIC void
+ xfs_refcount_update_log_item(
+ struct xfs_trans *tp,
+- void *intent,
+- struct list_head *item)
++ struct xfs_cui_log_item *cuip,
++ struct xfs_refcount_intent *refc)
+ {
+- struct xfs_cui_log_item *cuip = intent;
+- struct xfs_refcount_intent *refc;
+ uint next_extent;
+ struct xfs_phys_extent *ext;
+
+- refc = container_of(item, struct xfs_refcount_intent, ri_list);
+-
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
+
+@@ -354,6 +329,24 @@ xfs_refcount_update_log_item(
+ xfs_trans_set_refcount_flags(ext, refc->ri_type);
+ }
+
++STATIC void *
++xfs_refcount_update_create_intent(
++ struct xfs_trans *tp,
++ struct list_head *items,
++ unsigned int count)
++{
++ struct xfs_mount *mp = tp->t_mountp;
++ struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count);
++ struct xfs_refcount_intent *refc;
++
++ ASSERT(count > 0);
++
++ xfs_trans_add_item(tp, &cuip->cui_item);
++ list_for_each_entry(refc, items, ri_list)
++ xfs_refcount_update_log_item(tp, cuip, refc);
++ return cuip;
++}
++
+ /* Get an CUD so we can process all the deferred refcount updates. */
+ STATIC void *
+ xfs_refcount_update_create_done(
+@@ -432,7 +425,6 @@ const struct xfs_defer_op_type xfs_refco
+ .diff_items = xfs_refcount_update_diff_items,
+ .create_intent = xfs_refcount_update_create_intent,
+ .abort_intent = xfs_refcount_update_abort_intent,
+- .log_item = xfs_refcount_update_log_item,
+ .create_done = xfs_refcount_update_create_done,
+ .finish_item = xfs_refcount_update_finish_item,
+ .finish_cleanup = xfs_refcount_update_finish_cleanup,
+--- a/fs/xfs/xfs_rmap_item.c
++++ b/fs/xfs/xfs_rmap_item.c
+@@ -352,41 +352,16 @@ xfs_rmap_update_diff_items(
+ XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
+ }
+
+-/* Get an RUI. */
+-STATIC void *
+-xfs_rmap_update_create_intent(
+- struct xfs_trans *tp,
+- unsigned int count)
+-{
+- struct xfs_rui_log_item *ruip;
+-
+- ASSERT(tp != NULL);
+- ASSERT(count > 0);
+-
+- ruip = xfs_rui_init(tp->t_mountp, count);
+- ASSERT(ruip != NULL);
+-
+- /*
+- * Get a log_item_desc to point at the new item.
+- */
+- xfs_trans_add_item(tp, &ruip->rui_item);
+- return ruip;
+-}
+-
+ /* Log rmap updates in the intent item. */
+ STATIC void
+ xfs_rmap_update_log_item(
+ struct xfs_trans *tp,
+- void *intent,
+- struct list_head *item)
++ struct xfs_rui_log_item *ruip,
++ struct xfs_rmap_intent *rmap)
+ {
+- struct xfs_rui_log_item *ruip = intent;
+- struct xfs_rmap_intent *rmap;
+ uint next_extent;
+ struct xfs_map_extent *map;
+
+- rmap = container_of(item, struct xfs_rmap_intent, ri_list);
+-
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
+
+@@ -406,6 +381,24 @@ xfs_rmap_update_log_item(
+ rmap->ri_bmap.br_state);
+ }
+
++STATIC void *
++xfs_rmap_update_create_intent(
++ struct xfs_trans *tp,
++ struct list_head *items,
++ unsigned int count)
++{
++ struct xfs_mount *mp = tp->t_mountp;
++ struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count);
++ struct xfs_rmap_intent *rmap;
++
++ ASSERT(count > 0);
++
++ xfs_trans_add_item(tp, &ruip->rui_item);
++ list_for_each_entry(rmap, items, ri_list)
++ xfs_rmap_update_log_item(tp, ruip, rmap);
++ return ruip;
++}
++
+ /* Get an RUD so we can process all the deferred rmap updates. */
+ STATIC void *
+ xfs_rmap_update_create_done(
+@@ -476,7 +469,6 @@ const struct xfs_defer_op_type xfs_rmap_
+ .diff_items = xfs_rmap_update_diff_items,
+ .create_intent = xfs_rmap_update_create_intent,
+ .abort_intent = xfs_rmap_update_abort_intent,
+- .log_item = xfs_rmap_update_log_item,
+ .create_done = xfs_rmap_update_create_done,
+ .finish_item = xfs_rmap_update_finish_item,
+ .finish_cleanup = xfs_rmap_update_finish_cleanup,
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:23:01 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:14 +0530
+Subject: xfs: only relog deferred intent items if free space in the log gets low
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-21-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 74f4d6a1e065c92428c5b588099e307a582d79d9 upstream.
+
+Now that we have the ability to ask the log how far the tail needs to be
+pushed to maintain its free space targets, augment the decision to relog
+an intent item so that we only do it if the log has hit the 75% full
+threshold. There's no point in relogging an intent into the same
+checkpoint, and there's no need to relog if there's plenty of free space
+in the log.
+
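+For readability, here is the added check assembled from the hunk below:
+sample the push threshold once per xfs_defer_relog() call, and skip any
+intent that already sits ahead of where the tail needs to be:
+
+	if (threshold_lsn == NULLCOMMITLSN) {
+		threshold_lsn = xlog_grant_push_threshold(log, 0);
+		if (threshold_lsn == NULLCOMMITLSN)
+			break;		/* plenty of free space */
+	}
+	if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0)
+		continue;		/* intent is not pinning the tail */
+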
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_defer.c | 16 ++++++++++++++++
+ 1 file changed, 16 insertions(+)
+
+--- a/fs/xfs/libxfs/xfs_defer.c
++++ b/fs/xfs/libxfs/xfs_defer.c
+@@ -372,7 +372,10 @@ xfs_defer_relog(
+ struct xfs_trans **tpp,
+ struct list_head *dfops)
+ {
++ struct xlog *log = (*tpp)->t_mountp->m_log;
+ struct xfs_defer_pending *dfp;
++ xfs_lsn_t threshold_lsn = NULLCOMMITLSN;
++
+
+ ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+@@ -388,6 +391,19 @@ xfs_defer_relog(
+ xfs_log_item_in_current_chkpt(dfp->dfp_intent))
+ continue;
+
++ /*
++ * Figure out where we need the tail to be in order to maintain
++ * the minimum required free space in the log. Only sample
++ * the log threshold once per call.
++ */
++ if (threshold_lsn == NULLCOMMITLSN) {
++ threshold_lsn = xlog_grant_push_threshold(log, 0);
++ if (threshold_lsn == NULLCOMMITLSN)
++ break;
++ }
++ if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0)
++ continue;
++
+ trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
+ XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
+ dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
--- /dev/null
+From stable-owner@vger.kernel.org Thu Feb 16 07:39:54 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:12 +0530
+Subject: xfs: periodically relog deferred intent items
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-19-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 4e919af7827a6adfc28e82cd6c4ffcfcc3dd6118 upstream.
+
+[ Modify xfs_{bmap|extfree|refcount|rmap}_item.c to fix merge conflicts ]
+
+There's a subtle design flaw in the deferred log item code that can lead
+to pinning the log tail. Taking up the defer ops chain examples from
+the previous commit, we can get trapped in sequences like this:
+
+Caller hands us a transaction t0 with D0-D3 attached. The defer ops
+chain will look like the following if the transaction rolls succeed:
+
+t1: D0(t0), D1(t0), D2(t0), D3(t0)
+t2: d4(t1), d5(t1), D1(t0), D2(t0), D3(t0)
+t3: d5(t1), D1(t0), D2(t0), D3(t0)
+...
+t9: d9(t7), D3(t0)
+t10: D3(t0)
+t11: d10(t10), d11(t10)
+t12: d11(t10)
+
+In transaction 9, we finish d9 and try to roll to t10 while holding onto
+an intent item for D3 that we logged in t0.
+
+The previous commit changed the order in which we place new defer ops in
+the defer ops processing chain to reduce the maximum chain length. Now
+make xfs_defer_finish_noroll capable of relogging the entire chain
+periodically so that we can always move the log tail forward. Most
+chains will never get relogged, except for operations that generate very
+long chains (large extents containing many blocks with different sharing
+levels) or are on filesystems with small logs and a lot of ongoing
+metadata updates.
+
+Callers are now required to ensure that the transaction reservation is
+large enough to handle logging done items and new intent items for the
+maximum possible chain length. Most callers are careful to keep the
+chain lengths low, so the overhead should be minimal.
+
+The decision to relog an intent item is based on whether the intent
+was logged in a previous checkpoint, since there's no point in relogging
+an intent into the same checkpoint.
+
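+Each intent type gains an ->iop_relog method with the same shape; here it
+is reduced to a sketch with hypothetical "foo" names (the four real
+implementations are in the hunks below):
+
+	static struct xfs_log_item *
+	xfs_foi_item_relog(
+		struct xfs_log_item	*intent,
+		struct xfs_trans	*tp)
+	{
+		struct xfs_fod_log_item	*fodp;
+		struct xfs_foi_log_item	*foip;
+		struct xfs_foo_extent	*extp;
+		unsigned int		count;
+
+		count = FOI_ITEM(intent)->foi_format.foi_nextents;
+		extp = FOI_ITEM(intent)->foi_format.foi_extents;
+
+		/* Log a done item so the old intent stops pinning the tail. */
+		tp->t_flags |= XFS_TRANS_DIRTY;
+		fodp = xfs_trans_get_fod(tp, FOI_ITEM(intent));
+		set_bit(XFS_LI_DIRTY, &fodp->fod_item.li_flags);
+
+		/* Log a fresh intent carrying the same extent records. */
+		foip = xfs_foi_init(tp->t_mountp, count);
+		memcpy(foip->foi_format.foi_extents, extp, count * sizeof(*extp));
+		atomic_set(&foip->foi_next_extent, count);
+		xfs_trans_add_item(tp, &foip->foi_item);
+		set_bit(XFS_LI_DIRTY, &foip->foi_item.li_flags);
+		return &foip->foi_item;
+	}
+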
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_defer.c | 42 ++++++++++++++++++
+ fs/xfs/xfs_bmap_item.c | 83 +++++++++++++++++++++++------------
+ fs/xfs/xfs_extfree_item.c | 104 ++++++++++++++++++++++++++++-----------------
+ fs/xfs/xfs_refcount_item.c | 95 ++++++++++++++++++++++++++---------------
+ fs/xfs/xfs_rmap_item.c | 93 +++++++++++++++++++++++++---------------
+ fs/xfs/xfs_stats.c | 4 +
+ fs/xfs/xfs_stats.h | 1
+ fs/xfs/xfs_trace.h | 1
+ fs/xfs/xfs_trans.h | 10 ++++
+ 9 files changed, 300 insertions(+), 133 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_defer.c
++++ b/fs/xfs/libxfs/xfs_defer.c
+@@ -17,6 +17,7 @@
+ #include "xfs_inode_item.h"
+ #include "xfs_trace.h"
+ #include "xfs_icache.h"
++#include "xfs_log.h"
+
+ /*
+ * Deferred Operations in XFS
+@@ -362,6 +363,42 @@ xfs_defer_cancel_list(
+ }
+
+ /*
++ * Prevent a log intent item from pinning the tail of the log by logging a
++ * done item to release the intent item; and then log a new intent item.
++ * The caller should provide a fresh transaction and roll it after we're done.
++ */
++static int
++xfs_defer_relog(
++ struct xfs_trans **tpp,
++ struct list_head *dfops)
++{
++ struct xfs_defer_pending *dfp;
++
++ ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);
++
++ list_for_each_entry(dfp, dfops, dfp_list) {
++ /*
++ * If the log intent item for this deferred op is not a part of
++ * the current log checkpoint, relog the intent item to keep
++ * the log tail moving forward. We're ok with this being racy
++ * because an incorrect decision means we'll be a little slower
++ * at pushing the tail.
++ */
++ if (dfp->dfp_intent == NULL ||
++ xfs_log_item_in_current_chkpt(dfp->dfp_intent))
++ continue;
++
++ trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
++ XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
++ dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
++ }
++
++ if ((*tpp)->t_flags & XFS_TRANS_DIRTY)
++ return xfs_defer_trans_roll(tpp);
++ return 0;
++}
++
++/*
+ * Log an intent-done item for the first pending intent, and finish the work
+ * items.
+ */
+@@ -447,6 +484,11 @@ xfs_defer_finish_noroll(
+ if (error)
+ goto out_shutdown;
+
++ /* Possibly relog intent items to keep the log moving. */
++ error = xfs_defer_relog(tp, &dop_pending);
++ if (error)
++ goto out_shutdown;
++
+ dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
+ dfp_list);
+ error = xfs_defer_finish_one(*tp, dfp);
+--- a/fs/xfs/xfs_bmap_item.c
++++ b/fs/xfs/xfs_bmap_item.c
+@@ -125,34 +125,6 @@ xfs_bui_item_release(
+ xfs_bui_release(BUI_ITEM(lip));
+ }
+
+-static const struct xfs_item_ops xfs_bui_item_ops = {
+- .iop_size = xfs_bui_item_size,
+- .iop_format = xfs_bui_item_format,
+- .iop_unpin = xfs_bui_item_unpin,
+- .iop_release = xfs_bui_item_release,
+-};
+-
+-/*
+- * Allocate and initialize an bui item with the given number of extents.
+- */
+-struct xfs_bui_log_item *
+-xfs_bui_init(
+- struct xfs_mount *mp)
+-
+-{
+- struct xfs_bui_log_item *buip;
+-
+- buip = kmem_zone_zalloc(xfs_bui_zone, 0);
+-
+- xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
+- buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
+- buip->bui_format.bui_id = (uintptr_t)(void *)buip;
+- atomic_set(&buip->bui_next_extent, 0);
+- atomic_set(&buip->bui_refcount, 2);
+-
+- return buip;
+-}
+-
+ static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip)
+ {
+ return container_of(lip, struct xfs_bud_log_item, bud_item);
+@@ -548,3 +520,58 @@ err_rele:
+ xfs_irele(ip);
+ return error;
+ }
++
++/* Relog an intent item to push the log tail forward. */
++static struct xfs_log_item *
++xfs_bui_item_relog(
++ struct xfs_log_item *intent,
++ struct xfs_trans *tp)
++{
++ struct xfs_bud_log_item *budp;
++ struct xfs_bui_log_item *buip;
++ struct xfs_map_extent *extp;
++ unsigned int count;
++
++ count = BUI_ITEM(intent)->bui_format.bui_nextents;
++ extp = BUI_ITEM(intent)->bui_format.bui_extents;
++
++ tp->t_flags |= XFS_TRANS_DIRTY;
++ budp = xfs_trans_get_bud(tp, BUI_ITEM(intent));
++ set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
++
++ buip = xfs_bui_init(tp->t_mountp);
++ memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp));
++ atomic_set(&buip->bui_next_extent, count);
++ xfs_trans_add_item(tp, &buip->bui_item);
++ set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
++ return &buip->bui_item;
++}
++
++static const struct xfs_item_ops xfs_bui_item_ops = {
++ .iop_size = xfs_bui_item_size,
++ .iop_format = xfs_bui_item_format,
++ .iop_unpin = xfs_bui_item_unpin,
++ .iop_release = xfs_bui_item_release,
++ .iop_relog = xfs_bui_item_relog,
++};
++
++/*
++ * Allocate and initialize an bui item with the given number of extents.
++ */
++struct xfs_bui_log_item *
++xfs_bui_init(
++ struct xfs_mount *mp)
++
++{
++ struct xfs_bui_log_item *buip;
++
++ buip = kmem_zone_zalloc(xfs_bui_zone, 0);
++
++ xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
++ buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
++ buip->bui_format.bui_id = (uintptr_t)(void *)buip;
++ atomic_set(&buip->bui_next_extent, 0);
++ atomic_set(&buip->bui_refcount, 2);
++
++ return buip;
++}
+--- a/fs/xfs/xfs_extfree_item.c
++++ b/fs/xfs/xfs_extfree_item.c
+@@ -139,44 +139,6 @@ xfs_efi_item_release(
+ xfs_efi_release(EFI_ITEM(lip));
+ }
+
+-static const struct xfs_item_ops xfs_efi_item_ops = {
+- .iop_size = xfs_efi_item_size,
+- .iop_format = xfs_efi_item_format,
+- .iop_unpin = xfs_efi_item_unpin,
+- .iop_release = xfs_efi_item_release,
+-};
+-
+-
+-/*
+- * Allocate and initialize an efi item with the given number of extents.
+- */
+-struct xfs_efi_log_item *
+-xfs_efi_init(
+- struct xfs_mount *mp,
+- uint nextents)
+-
+-{
+- struct xfs_efi_log_item *efip;
+- uint size;
+-
+- ASSERT(nextents > 0);
+- if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
+- size = (uint)(sizeof(struct xfs_efi_log_item) +
+- ((nextents - 1) * sizeof(xfs_extent_t)));
+- efip = kmem_zalloc(size, 0);
+- } else {
+- efip = kmem_zone_zalloc(xfs_efi_zone, 0);
+- }
+-
+- xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
+- efip->efi_format.efi_nextents = nextents;
+- efip->efi_format.efi_id = (uintptr_t)(void *)efip;
+- atomic_set(&efip->efi_next_extent, 0);
+- atomic_set(&efip->efi_refcount, 2);
+-
+- return efip;
+-}
+-
+ /*
+ * Copy an EFI format buffer from the given buf, and into the destination
+ * EFI format structure.
+@@ -645,3 +607,69 @@ abort_error:
+ xfs_trans_cancel(tp);
+ return error;
+ }
++
++/* Relog an intent item to push the log tail forward. */
++static struct xfs_log_item *
++xfs_efi_item_relog(
++ struct xfs_log_item *intent,
++ struct xfs_trans *tp)
++{
++ struct xfs_efd_log_item *efdp;
++ struct xfs_efi_log_item *efip;
++ struct xfs_extent *extp;
++ unsigned int count;
++
++ count = EFI_ITEM(intent)->efi_format.efi_nextents;
++ extp = EFI_ITEM(intent)->efi_format.efi_extents;
++
++ tp->t_flags |= XFS_TRANS_DIRTY;
++ efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count);
++ efdp->efd_next_extent = count;
++ memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp));
++ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
++
++ efip = xfs_efi_init(tp->t_mountp, count);
++ memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp));
++ atomic_set(&efip->efi_next_extent, count);
++ xfs_trans_add_item(tp, &efip->efi_item);
++ set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
++ return &efip->efi_item;
++}
++
++static const struct xfs_item_ops xfs_efi_item_ops = {
++ .iop_size = xfs_efi_item_size,
++ .iop_format = xfs_efi_item_format,
++ .iop_unpin = xfs_efi_item_unpin,
++ .iop_release = xfs_efi_item_release,
++ .iop_relog = xfs_efi_item_relog,
++};
++
++/*
++ * Allocate and initialize an efi item with the given number of extents.
++ */
++struct xfs_efi_log_item *
++xfs_efi_init(
++ struct xfs_mount *mp,
++ uint nextents)
++
++{
++ struct xfs_efi_log_item *efip;
++ uint size;
++
++ ASSERT(nextents > 0);
++ if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
++ size = (uint)(sizeof(struct xfs_efi_log_item) +
++ ((nextents - 1) * sizeof(xfs_extent_t)));
++ efip = kmem_zalloc(size, 0);
++ } else {
++ efip = kmem_zone_zalloc(xfs_efi_zone, 0);
++ }
++
++ xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
++ efip->efi_format.efi_nextents = nextents;
++ efip->efi_format.efi_id = (uintptr_t)(void *)efip;
++ atomic_set(&efip->efi_next_extent, 0);
++ atomic_set(&efip->efi_refcount, 2);
++
++ return efip;
++}
+--- a/fs/xfs/xfs_refcount_item.c
++++ b/fs/xfs/xfs_refcount_item.c
+@@ -123,40 +123,6 @@ xfs_cui_item_release(
+ xfs_cui_release(CUI_ITEM(lip));
+ }
+
+-static const struct xfs_item_ops xfs_cui_item_ops = {
+- .iop_size = xfs_cui_item_size,
+- .iop_format = xfs_cui_item_format,
+- .iop_unpin = xfs_cui_item_unpin,
+- .iop_release = xfs_cui_item_release,
+-};
+-
+-/*
+- * Allocate and initialize an cui item with the given number of extents.
+- */
+-struct xfs_cui_log_item *
+-xfs_cui_init(
+- struct xfs_mount *mp,
+- uint nextents)
+-
+-{
+- struct xfs_cui_log_item *cuip;
+-
+- ASSERT(nextents > 0);
+- if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
+- cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
+- 0);
+- else
+- cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
+-
+- xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
+- cuip->cui_format.cui_nextents = nextents;
+- cuip->cui_format.cui_id = (uintptr_t)(void *)cuip;
+- atomic_set(&cuip->cui_next_extent, 0);
+- atomic_set(&cuip->cui_refcount, 2);
+-
+- return cuip;
+-}
+-
+ static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip)
+ {
+ return container_of(lip, struct xfs_cud_log_item, cud_item);
+@@ -576,3 +542,64 @@ abort_error:
+ xfs_trans_cancel(tp);
+ return error;
+ }
++
++/* Relog an intent item to push the log tail forward. */
++static struct xfs_log_item *
++xfs_cui_item_relog(
++ struct xfs_log_item *intent,
++ struct xfs_trans *tp)
++{
++ struct xfs_cud_log_item *cudp;
++ struct xfs_cui_log_item *cuip;
++ struct xfs_phys_extent *extp;
++ unsigned int count;
++
++ count = CUI_ITEM(intent)->cui_format.cui_nextents;
++ extp = CUI_ITEM(intent)->cui_format.cui_extents;
++
++ tp->t_flags |= XFS_TRANS_DIRTY;
++ cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent));
++ set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
++
++ cuip = xfs_cui_init(tp->t_mountp, count);
++ memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp));
++ atomic_set(&cuip->cui_next_extent, count);
++ xfs_trans_add_item(tp, &cuip->cui_item);
++ set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
++ return &cuip->cui_item;
++}
++
++static const struct xfs_item_ops xfs_cui_item_ops = {
++ .iop_size = xfs_cui_item_size,
++ .iop_format = xfs_cui_item_format,
++ .iop_unpin = xfs_cui_item_unpin,
++ .iop_release = xfs_cui_item_release,
++ .iop_relog = xfs_cui_item_relog,
++};
++
++/*
++ * Allocate and initialize an cui item with the given number of extents.
++ */
++struct xfs_cui_log_item *
++xfs_cui_init(
++ struct xfs_mount *mp,
++ uint nextents)
++
++{
++ struct xfs_cui_log_item *cuip;
++
++ ASSERT(nextents > 0);
++ if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
++ cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
++ 0);
++ else
++ cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
++
++ xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
++ cuip->cui_format.cui_nextents = nextents;
++ cuip->cui_format.cui_id = (uintptr_t)(void *)cuip;
++ atomic_set(&cuip->cui_next_extent, 0);
++ atomic_set(&cuip->cui_refcount, 2);
++
++ return cuip;
++}
+--- a/fs/xfs/xfs_rmap_item.c
++++ b/fs/xfs/xfs_rmap_item.c
+@@ -122,39 +122,6 @@ xfs_rui_item_release(
+ xfs_rui_release(RUI_ITEM(lip));
+ }
+
+-static const struct xfs_item_ops xfs_rui_item_ops = {
+- .iop_size = xfs_rui_item_size,
+- .iop_format = xfs_rui_item_format,
+- .iop_unpin = xfs_rui_item_unpin,
+- .iop_release = xfs_rui_item_release,
+-};
+-
+-/*
+- * Allocate and initialize an rui item with the given number of extents.
+- */
+-struct xfs_rui_log_item *
+-xfs_rui_init(
+- struct xfs_mount *mp,
+- uint nextents)
+-
+-{
+- struct xfs_rui_log_item *ruip;
+-
+- ASSERT(nextents > 0);
+- if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
+- ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
+- else
+- ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
+-
+- xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
+- ruip->rui_format.rui_nextents = nextents;
+- ruip->rui_format.rui_id = (uintptr_t)(void *)ruip;
+- atomic_set(&ruip->rui_next_extent, 0);
+- atomic_set(&ruip->rui_refcount, 2);
+-
+- return ruip;
+-}
+-
+ /*
+ * Copy an RUI format buffer from the given buf, and into the destination
+ * RUI format structure. The RUI/RUD items were designed not to need any
+@@ -600,3 +567,63 @@ abort_error:
+ xfs_trans_cancel(tp);
+ return error;
+ }
++
++/* Relog an intent item to push the log tail forward. */
++static struct xfs_log_item *
++xfs_rui_item_relog(
++ struct xfs_log_item *intent,
++ struct xfs_trans *tp)
++{
++ struct xfs_rud_log_item *rudp;
++ struct xfs_rui_log_item *ruip;
++ struct xfs_map_extent *extp;
++ unsigned int count;
++
++ count = RUI_ITEM(intent)->rui_format.rui_nextents;
++ extp = RUI_ITEM(intent)->rui_format.rui_extents;
++
++ tp->t_flags |= XFS_TRANS_DIRTY;
++ rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent));
++ set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
++
++ ruip = xfs_rui_init(tp->t_mountp, count);
++ memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp));
++ atomic_set(&ruip->rui_next_extent, count);
++ xfs_trans_add_item(tp, &ruip->rui_item);
++ set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
++ return &ruip->rui_item;
++}
++
++static const struct xfs_item_ops xfs_rui_item_ops = {
++ .iop_size = xfs_rui_item_size,
++ .iop_format = xfs_rui_item_format,
++ .iop_unpin = xfs_rui_item_unpin,
++ .iop_release = xfs_rui_item_release,
++ .iop_relog = xfs_rui_item_relog,
++};
++
++/*
++ * Allocate and initialize an rui item with the given number of extents.
++ */
++struct xfs_rui_log_item *
++xfs_rui_init(
++ struct xfs_mount *mp,
++ uint nextents)
++
++{
++ struct xfs_rui_log_item *ruip;
++
++ ASSERT(nextents > 0);
++ if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
++ ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
++ else
++ ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
++
++ xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
++ ruip->rui_format.rui_nextents = nextents;
++ ruip->rui_format.rui_id = (uintptr_t)(void *)ruip;
++ atomic_set(&ruip->rui_next_extent, 0);
++ atomic_set(&ruip->rui_refcount, 2);
++
++ return ruip;
++}
+--- a/fs/xfs/xfs_stats.c
++++ b/fs/xfs/xfs_stats.c
+@@ -23,6 +23,7 @@ int xfs_stats_format(struct xfsstats __p
+ uint64_t xs_xstrat_bytes = 0;
+ uint64_t xs_write_bytes = 0;
+ uint64_t xs_read_bytes = 0;
++ uint64_t defer_relog = 0;
+
+ static const struct xstats_entry {
+ char *desc;
+@@ -70,10 +71,13 @@ int xfs_stats_format(struct xfsstats __p
+ xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes;
+ xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes;
+ xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes;
++ defer_relog += per_cpu_ptr(stats, i)->s.defer_relog;
+ }
+
+ len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
+ xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
++ len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n",
++ defer_relog);
+ len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n",
+ #if defined(DEBUG)
+ 1);
+--- a/fs/xfs/xfs_stats.h
++++ b/fs/xfs/xfs_stats.h
+@@ -137,6 +137,7 @@ struct __xfsstats {
+ uint64_t xs_xstrat_bytes;
+ uint64_t xs_write_bytes;
+ uint64_t xs_read_bytes;
++ uint64_t defer_relog;
+ };
+
+ #define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t))
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -2418,6 +2418,7 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_cre
+ DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
+ DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
+ DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
++DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
+
+ #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
+ DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -77,6 +77,8 @@ struct xfs_item_ops {
+ void (*iop_release)(struct xfs_log_item *);
+ xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
+ void (*iop_error)(struct xfs_log_item *, xfs_buf_t *);
++ struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
++ struct xfs_trans *tp);
+ };
+
+ /*
+@@ -244,4 +246,12 @@ void xfs_trans_buf_copy_type(struct xfs
+
+ extern kmem_zone_t *xfs_trans_zone;
+
++static inline struct xfs_log_item *
++xfs_trans_item_relog(
++ struct xfs_log_item *lip,
++ struct xfs_trans *tp)
++{
++ return lip->li_ops->iop_relog(lip, tp);
++}
++
+ #endif /* __XFS_TRANS_H__ */
--- /dev/null
+From stable-owner@vger.kernel.org Thu Feb 16 08:02:00 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:18 +0530
+Subject: xfs: prevent UAF in xfs_log_item_in_current_chkpt
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-25-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit f8d92a66e810acbef6ddbc0bd0cbd9b117ce8acd upstream.
+
+[ Continue to interpret xfs_log_item->li_seq as an LSN rather than a CIL sequence
+ number. ]
+
+While I was running with KASAN and lockdep enabled, I stumbled upon a
+KASAN report about a UAF to a freed CIL checkpoint. Looking at the
+comment for xfs_log_item_in_current_chkpt, it seems pretty obvious to me
+that the original patch to xfs_defer_finish_noroll should have done
+something to lock the CIL to prevent it from switching the CIL contexts
+while the predicate runs.
+
+For upper level code that needs to know if a given log item is new
+enough not to need relogging, add a new wrapper that takes the CIL
+context lock long enough to sample the current CIL context. This is
+kind of racy in that the CIL can switch the contexts immediately after
+sampling, but that's ok because the consequence is that the defer ops
+code is a little slow to relog items.
+
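+The fixed predicate, condensed from the hunk at the end of this patch:
+rather than dereferencing the CIL context (which can be freed under us),
+sample the CIL's current sequence counter directly:
+
+	struct xfs_cil	*cil = lip->li_mountp->m_log->l_cilp;
+
+	if (list_empty(&lip->li_cil))
+		return false;
+	/* Racy read is fine; worst case we relog an item a bit late. */
+	return XFS_LSN_CMP(lip->li_seq,
+			   READ_ONCE(cil->xc_current_sequence)) == 0;
+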
+ ==================================================================
+ BUG: KASAN: use-after-free in xfs_log_item_in_current_chkpt+0x139/0x160 [xfs]
+ Read of size 8 at addr ffff88804ea5f608 by task fsstress/527999
+
+ CPU: 1 PID: 527999 Comm: fsstress Tainted: G D 5.16.0-rc4-xfsx #rc4
+ Call Trace:
+ <TASK>
+ dump_stack_lvl+0x45/0x59
+ print_address_description.constprop.0+0x1f/0x140
+ kasan_report.cold+0x83/0xdf
+ xfs_log_item_in_current_chkpt+0x139/0x160
+ xfs_defer_finish_noroll+0x3bb/0x1e30
+ __xfs_trans_commit+0x6c8/0xcf0
+ xfs_reflink_remap_extent+0x66f/0x10e0
+ xfs_reflink_remap_blocks+0x2dd/0xa90
+ xfs_file_remap_range+0x27b/0xc30
+ vfs_dedupe_file_range_one+0x368/0x420
+ vfs_dedupe_file_range+0x37c/0x5d0
+ do_vfs_ioctl+0x308/0x1260
+ __x64_sys_ioctl+0xa1/0x170
+ do_syscall_64+0x35/0x80
+ entry_SYSCALL_64_after_hwframe+0x44/0xae
+ RIP: 0033:0x7f2c71a2950b
+ Code: 0f 1e fa 48 8b 05 85 39 0d 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff
+ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01
+f0 ff ff 73 01 c3 48 8b 0d 55 39 0d 00 f7 d8 64 89 01 48
+ RSP: 002b:00007ffe8c0e03c8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
+ RAX: ffffffffffffffda RBX: 00005600862a8740 RCX: 00007f2c71a2950b
+ RDX: 00005600862a7be0 RSI: 00000000c0189436 RDI: 0000000000000004
+ RBP: 000000000000000b R08: 0000000000000027 R09: 0000000000000003
+ R10: 0000000000000000 R11: 0000000000000246 R12: 000000000000005a
+ R13: 00005600862804a8 R14: 0000000000016000 R15: 00005600862a8a20
+ </TASK>
+
+ Allocated by task 464064:
+ kasan_save_stack+0x1e/0x50
+ __kasan_kmalloc+0x81/0xa0
+ kmem_alloc+0xcd/0x2c0 [xfs]
+ xlog_cil_ctx_alloc+0x17/0x1e0 [xfs]
+ xlog_cil_push_work+0x141/0x13d0 [xfs]
+ process_one_work+0x7f6/0x1380
+ worker_thread+0x59d/0x1040
+ kthread+0x3b0/0x490
+ ret_from_fork+0x1f/0x30
+
+ Freed by task 51:
+ kasan_save_stack+0x1e/0x50
+ kasan_set_track+0x21/0x30
+ kasan_set_free_info+0x20/0x30
+ __kasan_slab_free+0xed/0x130
+ slab_free_freelist_hook+0x7f/0x160
+ kfree+0xde/0x340
+ xlog_cil_committed+0xbfd/0xfe0 [xfs]
+ xlog_cil_process_committed+0x103/0x1c0 [xfs]
+ xlog_state_do_callback+0x45d/0xbd0 [xfs]
+ xlog_ioend_work+0x116/0x1c0 [xfs]
+ process_one_work+0x7f6/0x1380
+ worker_thread+0x59d/0x1040
+ kthread+0x3b0/0x490
+ ret_from_fork+0x1f/0x30
+
+ Last potentially related work creation:
+ kasan_save_stack+0x1e/0x50
+ __kasan_record_aux_stack+0xb7/0xc0
+ insert_work+0x48/0x2e0
+ __queue_work+0x4e7/0xda0
+ queue_work_on+0x69/0x80
+ xlog_cil_push_now.isra.0+0x16b/0x210 [xfs]
+ xlog_cil_force_seq+0x1b7/0x850 [xfs]
+ xfs_log_force_seq+0x1c7/0x670 [xfs]
+ xfs_file_fsync+0x7c1/0xa60 [xfs]
+ __x64_sys_fsync+0x52/0x80
+ do_syscall_64+0x35/0x80
+ entry_SYSCALL_64_after_hwframe+0x44/0xae
+
+ The buggy address belongs to the object at ffff88804ea5f600
+ which belongs to the cache kmalloc-256 of size 256
+ The buggy address is located 8 bytes inside of
+ 256-byte region [ffff88804ea5f600, ffff88804ea5f700)
+ The buggy address belongs to the page:
+ page:ffffea00013a9780 refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88804ea5ea00 pfn:0x4ea5e
+ head:ffffea00013a9780 order:1 compound_mapcount:0
+ flags: 0x4fff80000010200(slab|head|node=1|zone=1|lastcpupid=0xfff)
+ raw: 04fff80000010200 ffffea0001245908 ffffea00011bd388 ffff888004c42b40
+ raw: ffff88804ea5ea00 0000000000100009 00000001ffffffff 0000000000000000
+ page dumped because: kasan: bad access detected
+
+ Memory state around the buggy address:
+ ffff88804ea5f500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff88804ea5f580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ >ffff88804ea5f600: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ^
+ ffff88804ea5f680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ffff88804ea5f700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ==================================================================
+
+Fixes: 4e919af7827a ("xfs: periodically relog deferred intent items")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_cil.c | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+--- a/fs/xfs/xfs_log_cil.c
++++ b/fs/xfs/xfs_log_cil.c
+@@ -1178,21 +1178,19 @@ out_shutdown:
+ */
+ bool
+ xfs_log_item_in_current_chkpt(
+- struct xfs_log_item *lip)
++ struct xfs_log_item *lip)
+ {
+- struct xfs_cil_ctx *ctx;
++ struct xfs_cil *cil = lip->li_mountp->m_log->l_cilp;
+
+ if (list_empty(&lip->li_cil))
+ return false;
+
+- ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
+-
+ /*
+ * li_seq is written on the first commit of a log item to record the
+ * first checkpoint it is written to. Hence if it is different to the
+ * current sequence, we're in a new checkpoint.
+ */
+- if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
++ if (XFS_LSN_CMP(lip->li_seq, READ_ONCE(cil->xc_current_sequence)) != 0)
+ return false;
+ return true;
+ }
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:21:53 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:05 +0530
+Subject: xfs: proper replay of deferred ops queued during log recovery
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-12-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit e6fff81e487089e47358a028526a9f63cdbcd503 upstream.
+
+When we replay unfinished intent items that have been recovered from the
+log, it's possible that the replay will cause the creation of more
+deferred work items. As outlined in commit 509955823cc9c ("xfs: log
+recovery should replay deferred ops in order"), later work items have an
+implicit ordering dependency on earlier work items. Therefore, recovery
+must replay the items (both recovered and created) in the same order
+that they would have been during normal operation.
+
+For log recovery, we enforce this ordering by using an empty transaction
+to collect deferred ops that get created in the process of recovering a
+log intent item to prevent them from being committed before the rest of
+the recovered intent items. After we finish committing all the
+recovered log items, we allocate a transaction with an enormous block
+reservation, splice our huge list of created deferred ops into that
+transaction, and commit it, thereby finishing all those ops.
+
+This is /really/ hokey -- it's the one place in XFS where we allow
+nested transactions; the splicing of the defer ops list is inelegant
+and has to be done twice per recovery function; and the broken way we
+handle inode pointers and block reservations cause subtle use-after-free
+and allocator problems that will be fixed by this patch and the two
+patches after it.
+
+Therefore, replace the hokey empty transaction with a structure designed
+to capture each chain of deferred ops that are created as part of
+recovering a single unfinished log intent. Finally, refactor the loop
+that replays those chains to do so using one transaction per chain.
+
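+In the new scheme, each intent recovery handler ends with a single call
+(assembled from the hunks below):
+
+	error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+
+which commits the handler's transaction and, if that transaction spawned
+any deferred ops, stashes them in a struct xfs_defer_capture appended to
+capture_list.  Once every recovered intent has been committed,
+xlog_finish_defer_ops() walks the list in order, allocating a fresh
+transaction per capture, reattaching the chain with
+xfs_defer_ops_continue(), and committing it.
+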
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_defer.c | 89 ++++++++++++++++++++++++--
+ fs/xfs/libxfs/xfs_defer.h | 19 +++++
+ fs/xfs/xfs_bmap_item.c | 18 +----
+ fs/xfs/xfs_bmap_item.h | 3
+ fs/xfs/xfs_extfree_item.c | 9 +-
+ fs/xfs/xfs_extfree_item.h | 4 -
+ fs/xfs/xfs_log_recover.c | 151 +++++++++++++++++++++++++--------------------
+ fs/xfs/xfs_refcount_item.c | 18 +----
+ fs/xfs/xfs_refcount_item.h | 3
+ fs/xfs/xfs_rmap_item.c | 8 +-
+ fs/xfs/xfs_rmap_item.h | 3
+ 11 files changed, 213 insertions(+), 112 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_defer.c
++++ b/fs/xfs/libxfs/xfs_defer.c
+@@ -563,14 +563,89 @@ xfs_defer_move(
+ *
+ * Create and log intent items for all the work that we're capturing so that we
+ * can be assured that the items will get replayed if the system goes down
+- * before log recovery gets a chance to finish the work it put off. Then we
+- * move the chain from stp to dtp.
++ * before log recovery gets a chance to finish the work it put off. The entire
++ * deferred ops state is transferred to the capture structure and the
++ * transaction is then ready for the caller to commit it. If there are no
++ * intent items to capture, this function returns NULL.
++ */
++static struct xfs_defer_capture *
++xfs_defer_ops_capture(
++ struct xfs_trans *tp)
++{
++ struct xfs_defer_capture *dfc;
++
++ if (list_empty(&tp->t_dfops))
++ return NULL;
++
++ /* Create an object to capture the defer ops. */
++ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
++ INIT_LIST_HEAD(&dfc->dfc_list);
++ INIT_LIST_HEAD(&dfc->dfc_dfops);
++
++ xfs_defer_create_intents(tp);
++
++ /* Move the dfops chain and transaction state to the capture struct. */
++ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
++ dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
++ tp->t_flags &= ~XFS_TRANS_LOWMODE;
++
++ return dfc;
++}
++
++/* Release all resources that we used to capture deferred ops. */
++void
++xfs_defer_ops_release(
++ struct xfs_mount *mp,
++ struct xfs_defer_capture *dfc)
++{
++ xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
++ kmem_free(dfc);
++}
++
++/*
++ * Capture any deferred ops and commit the transaction. This is the last step
++ * needed to finish a log intent item that we recovered from the log.
++ */
++int
++xfs_defer_ops_capture_and_commit(
++ struct xfs_trans *tp,
++ struct list_head *capture_list)
++{
++ struct xfs_mount *mp = tp->t_mountp;
++ struct xfs_defer_capture *dfc;
++ int error;
++
++ /* If we don't capture anything, commit transaction and exit. */
++ dfc = xfs_defer_ops_capture(tp);
++ if (!dfc)
++ return xfs_trans_commit(tp);
++
++ /* Commit the transaction and add the capture structure to the list. */
++ error = xfs_trans_commit(tp);
++ if (error) {
++ xfs_defer_ops_release(mp, dfc);
++ return error;
++ }
++
++ list_add_tail(&dfc->dfc_list, capture_list);
++ return 0;
++}
++
++/*
++ * Attach a chain of captured deferred ops to a new transaction and free the
++ * capture structure.
+ */
+ void
+-xfs_defer_capture(
+- struct xfs_trans *dtp,
+- struct xfs_trans *stp)
++xfs_defer_ops_continue(
++ struct xfs_defer_capture *dfc,
++ struct xfs_trans *tp)
+ {
+- xfs_defer_create_intents(stp);
+- xfs_defer_move(dtp, stp);
++ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
++ ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
++
++ /* Move captured dfops chain and state to the transaction. */
++ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
++ tp->t_flags |= dfc->dfc_tpflags;
++
++ kmem_free(dfc);
+ }
+--- a/fs/xfs/libxfs/xfs_defer.h
++++ b/fs/xfs/libxfs/xfs_defer.h
+@@ -7,6 +7,7 @@
+ #define __XFS_DEFER_H__
+
+ struct xfs_defer_op_type;
++struct xfs_defer_capture;
+
+ /*
+ * Header for deferred operation list.
+@@ -62,9 +63,25 @@ extern const struct xfs_defer_op_type xf
+ extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+
+ /*
++ * This structure enables a dfops user to detach the chain of deferred
++ * operations from a transaction so that they can be continued later.
++ */
++struct xfs_defer_capture {
++ /* List of other capture structures. */
++ struct list_head dfc_list;
++
++ /* Deferred ops state saved from the transaction. */
++ struct list_head dfc_dfops;
++ unsigned int dfc_tpflags;
++};
++
++/*
+ * Functions to capture a chain of deferred operations and continue them later.
+ * This doesn't normally happen except log recovery.
+ */
+-void xfs_defer_capture(struct xfs_trans *dtp, struct xfs_trans *stp);
++int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
++ struct list_head *capture_list);
++void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp);
++void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d);
+
+ #endif /* __XFS_DEFER_H__ */
+--- a/fs/xfs/xfs_bmap_item.c
++++ b/fs/xfs/xfs_bmap_item.c
+@@ -425,8 +425,8 @@ const struct xfs_defer_op_type xfs_bmap_
+ */
+ int
+ xfs_bui_recover(
+- struct xfs_trans *parent_tp,
+- struct xfs_bui_log_item *buip)
++ struct xfs_bui_log_item *buip,
++ struct list_head *capture_list)
+ {
+ int error = 0;
+ unsigned int bui_type;
+@@ -442,7 +442,7 @@ xfs_bui_recover(
+ struct xfs_trans *tp;
+ struct xfs_inode *ip = NULL;
+ struct xfs_bmbt_irec irec;
+- struct xfs_mount *mp = parent_tp->t_mountp;
++ struct xfs_mount *mp = buip->bui_item.li_mountp;
+
+ ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
+
+@@ -491,12 +491,7 @@ xfs_bui_recover(
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
+ if (error)
+ return error;
+- /*
+- * Recovery stashes all deferred ops during intent processing and
+- * finishes them on completion. Transfer current dfops state to this
+- * transaction and transfer the result back before we return.
+- */
+- xfs_defer_move(tp, parent_tp);
++
+ budp = xfs_trans_get_bud(tp, buip);
+
+ /* Grab the inode. */
+@@ -541,15 +536,12 @@ xfs_bui_recover(
+ }
+
+ set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
+- xfs_defer_capture(parent_tp, tp);
+- error = xfs_trans_commit(tp);
++ error = xfs_defer_ops_capture_and_commit(tp, capture_list);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+-
+ return error;
+
+ err_inode:
+- xfs_defer_move(parent_tp, tp);
+ xfs_trans_cancel(tp);
+ if (ip) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+--- a/fs/xfs/xfs_bmap_item.h
++++ b/fs/xfs/xfs_bmap_item.h
+@@ -77,6 +77,7 @@ extern struct kmem_zone *xfs_bud_zone;
+ struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *);
+ void xfs_bui_item_free(struct xfs_bui_log_item *);
+ void xfs_bui_release(struct xfs_bui_log_item *);
+-int xfs_bui_recover(struct xfs_trans *parent_tp, struct xfs_bui_log_item *buip);
++int xfs_bui_recover(struct xfs_bui_log_item *buip,
++ struct list_head *capture_list);
+
+ #endif /* __XFS_BMAP_ITEM_H__ */
+--- a/fs/xfs/xfs_extfree_item.c
++++ b/fs/xfs/xfs_extfree_item.c
+@@ -586,9 +586,10 @@ const struct xfs_defer_op_type xfs_agfl_
+ */
+ int
+ xfs_efi_recover(
+- struct xfs_mount *mp,
+- struct xfs_efi_log_item *efip)
++ struct xfs_efi_log_item *efip,
++ struct list_head *capture_list)
+ {
++ struct xfs_mount *mp = efip->efi_item.li_mountp;
+ struct xfs_efd_log_item *efdp;
+ struct xfs_trans *tp;
+ int i;
+@@ -637,8 +638,8 @@ xfs_efi_recover(
+ }
+
+ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
+- error = xfs_trans_commit(tp);
+- return error;
++
++ return xfs_defer_ops_capture_and_commit(tp, capture_list);
+
+ abort_error:
+ xfs_trans_cancel(tp);
+--- a/fs/xfs/xfs_extfree_item.h
++++ b/fs/xfs/xfs_extfree_item.h
+@@ -84,7 +84,7 @@ int xfs_efi_copy_format(xfs_log_iovec_
+ void xfs_efi_item_free(struct xfs_efi_log_item *);
+ void xfs_efi_release(struct xfs_efi_log_item *);
+
+-int xfs_efi_recover(struct xfs_mount *mp,
+- struct xfs_efi_log_item *efip);
++int xfs_efi_recover(struct xfs_efi_log_item *efip,
++ struct list_head *capture_list);
+
+ #endif /* __XFS_EXTFREE_ITEM_H__ */
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -4587,9 +4587,9 @@ xlog_recover_process_data(
+ /* Recover the EFI if necessary. */
+ STATIC int
+ xlog_recover_process_efi(
+- struct xfs_mount *mp,
+ struct xfs_ail *ailp,
+- struct xfs_log_item *lip)
++ struct xfs_log_item *lip,
++ struct list_head *capture_list)
+ {
+ struct xfs_efi_log_item *efip;
+ int error;
+@@ -4602,7 +4602,7 @@ xlog_recover_process_efi(
+ return 0;
+
+ spin_unlock(&ailp->ail_lock);
+- error = xfs_efi_recover(mp, efip);
++ error = xfs_efi_recover(efip, capture_list);
+ spin_lock(&ailp->ail_lock);
+
+ return error;
+@@ -4627,9 +4627,9 @@ xlog_recover_cancel_efi(
+ /* Recover the RUI if necessary. */
+ STATIC int
+ xlog_recover_process_rui(
+- struct xfs_mount *mp,
+ struct xfs_ail *ailp,
+- struct xfs_log_item *lip)
++ struct xfs_log_item *lip,
++ struct list_head *capture_list)
+ {
+ struct xfs_rui_log_item *ruip;
+ int error;
+@@ -4642,7 +4642,7 @@ xlog_recover_process_rui(
+ return 0;
+
+ spin_unlock(&ailp->ail_lock);
+- error = xfs_rui_recover(mp, ruip);
++ error = xfs_rui_recover(ruip, capture_list);
+ spin_lock(&ailp->ail_lock);
+
+ return error;
+@@ -4667,9 +4667,9 @@ xlog_recover_cancel_rui(
+ /* Recover the CUI if necessary. */
+ STATIC int
+ xlog_recover_process_cui(
+- struct xfs_trans *parent_tp,
+ struct xfs_ail *ailp,
+- struct xfs_log_item *lip)
++ struct xfs_log_item *lip,
++ struct list_head *capture_list)
+ {
+ struct xfs_cui_log_item *cuip;
+ int error;
+@@ -4682,7 +4682,7 @@ xlog_recover_process_cui(
+ return 0;
+
+ spin_unlock(&ailp->ail_lock);
+- error = xfs_cui_recover(parent_tp, cuip);
++ error = xfs_cui_recover(cuip, capture_list);
+ spin_lock(&ailp->ail_lock);
+
+ return error;
+@@ -4707,9 +4707,9 @@ xlog_recover_cancel_cui(
+ /* Recover the BUI if necessary. */
+ STATIC int
+ xlog_recover_process_bui(
+- struct xfs_trans *parent_tp,
+ struct xfs_ail *ailp,
+- struct xfs_log_item *lip)
++ struct xfs_log_item *lip,
++ struct list_head *capture_list)
+ {
+ struct xfs_bui_log_item *buip;
+ int error;
+@@ -4722,7 +4722,7 @@ xlog_recover_process_bui(
+ return 0;
+
+ spin_unlock(&ailp->ail_lock);
+- error = xfs_bui_recover(parent_tp, buip);
++ error = xfs_bui_recover(buip, capture_list);
+ spin_lock(&ailp->ail_lock);
+
+ return error;
+@@ -4761,37 +4761,65 @@ static inline bool xlog_item_is_intent(s
+ /* Take all the collected deferred ops and finish them in order. */
+ static int
+ xlog_finish_defer_ops(
+- struct xfs_trans *parent_tp)
++ struct xfs_mount *mp,
++ struct list_head *capture_list)
+ {
+- struct xfs_mount *mp = parent_tp->t_mountp;
++ struct xfs_defer_capture *dfc, *next;
+ struct xfs_trans *tp;
+ int64_t freeblks;
+- uint resblks;
+- int error;
++ uint64_t resblks;
++ int error = 0;
+
+- /*
+- * We're finishing the defer_ops that accumulated as a result of
+- * recovering unfinished intent items during log recovery. We
+- * reserve an itruncate transaction because it is the largest
+- * permanent transaction type. Since we're the only user of the fs
+- * right now, take 93% (15/16) of the available free blocks. Use
+- * weird math to avoid a 64-bit division.
+- */
+- freeblks = percpu_counter_sum(&mp->m_fdblocks);
+- if (freeblks <= 0)
+- return -ENOSPC;
+- resblks = min_t(int64_t, UINT_MAX, freeblks);
+- resblks = (resblks * 15) >> 4;
+- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
+- 0, XFS_TRANS_RESERVE, &tp);
+- if (error)
+- return error;
+- /* transfer all collected dfops to this transaction */
+- xfs_defer_move(tp, parent_tp);
++ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
++ /*
++ * We're finishing the defer_ops that accumulated as a result
++ * of recovering unfinished intent items during log recovery.
++ * We reserve an itruncate transaction because it is the
++ * largest permanent transaction type. Since we're the only
++ * user of the fs right now, take 93% (15/16) of the available
++ * free blocks. Use weird math to avoid a 64-bit division.
++ */
++ freeblks = percpu_counter_sum(&mp->m_fdblocks);
++ if (freeblks <= 0)
++ return -ENOSPC;
++
++ resblks = min_t(uint64_t, UINT_MAX, freeblks);
++ resblks = (resblks * 15) >> 4;
++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
++ 0, XFS_TRANS_RESERVE, &tp);
++ if (error)
++ return error;
++
++ /*
++ * Transfer to this new transaction all the dfops we captured
++ * from recovering a single intent item.
++ */
++ list_del_init(&dfc->dfc_list);
++ xfs_defer_ops_continue(dfc, tp);
++
++ error = xfs_trans_commit(tp);
++ if (error)
++ return error;
++ }
+
+- return xfs_trans_commit(tp);
++ ASSERT(list_empty(capture_list));
++ return 0;
+ }
+
++/* Release all the captured defer ops and capture structures in this list. */
++static void
++xlog_abort_defer_ops(
++ struct xfs_mount *mp,
++ struct list_head *capture_list)
++{
++ struct xfs_defer_capture *dfc;
++ struct xfs_defer_capture *next;
++
++ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
++ list_del_init(&dfc->dfc_list);
++ xfs_defer_ops_release(mp, dfc);
++ }
++}
+ /*
+ * When this is called, all of the log intent items which did not have
+ * corresponding log done items should be in the AIL. What we do now
+@@ -4812,35 +4840,23 @@ STATIC int
+ xlog_recover_process_intents(
+ struct xlog *log)
+ {
+- struct xfs_trans *parent_tp;
++ LIST_HEAD(capture_list);
+ struct xfs_ail_cursor cur;
+ struct xfs_log_item *lip;
+ struct xfs_ail *ailp;
+- int error;
++ int error = 0;
+ #if defined(DEBUG) || defined(XFS_WARN)
+ xfs_lsn_t last_lsn;
+ #endif
+
+- /*
+- * The intent recovery handlers commit transactions to complete recovery
+- * for individual intents, but any new deferred operations that are
+- * queued during that process are held off until the very end. The
+- * purpose of this transaction is to serve as a container for deferred
+- * operations. Each intent recovery handler must transfer dfops here
+- * before its local transaction commits, and we'll finish the entire
+- * list below.
+- */
+- error = xfs_trans_alloc_empty(log->l_mp, &parent_tp);
+- if (error)
+- return error;
+-
+ ailp = log->l_ailp;
+ spin_lock(&ailp->ail_lock);
+- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ #if defined(DEBUG) || defined(XFS_WARN)
+ last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
+ #endif
+- while (lip != NULL) {
++ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
++ lip != NULL;
++ lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
+ /*
+ * We're done when we see something other than an intent.
+ * There should be no intents left in the AIL now.
+@@ -4862,35 +4878,40 @@ xlog_recover_process_intents(
+
+ /*
+ * NOTE: If your intent processing routine can create more
+- * deferred ops, you /must/ attach them to the dfops in this
+- * routine or else those subsequent intents will get
++ * deferred ops, you /must/ attach them to the capture list in
++ * the recover routine or else those subsequent intents will be
+ * replayed in the wrong order!
+ */
+ switch (lip->li_type) {
+ case XFS_LI_EFI:
+- error = xlog_recover_process_efi(log->l_mp, ailp, lip);
++ error = xlog_recover_process_efi(ailp, lip, &capture_list);
+ break;
+ case XFS_LI_RUI:
+- error = xlog_recover_process_rui(log->l_mp, ailp, lip);
++ error = xlog_recover_process_rui(ailp, lip, &capture_list);
+ break;
+ case XFS_LI_CUI:
+- error = xlog_recover_process_cui(parent_tp, ailp, lip);
++ error = xlog_recover_process_cui(ailp, lip, &capture_list);
+ break;
+ case XFS_LI_BUI:
+- error = xlog_recover_process_bui(parent_tp, ailp, lip);
++ error = xlog_recover_process_bui(ailp, lip, &capture_list);
+ break;
+ }
+ if (error)
+- goto out;
+- lip = xfs_trans_ail_cursor_next(ailp, &cur);
++ break;
+ }
+-out:
++
+ xfs_trans_ail_cursor_done(&cur);
+ spin_unlock(&ailp->ail_lock);
+- if (!error)
+- error = xlog_finish_defer_ops(parent_tp);
+- xfs_trans_cancel(parent_tp);
++ if (error)
++ goto err;
++
++ error = xlog_finish_defer_ops(log->l_mp, &capture_list);
++ if (error)
++ goto err;
+
++ return 0;
++err:
++ xlog_abort_defer_ops(log->l_mp, &capture_list);
+ return error;
+ }
+
+--- a/fs/xfs/xfs_refcount_item.c
++++ b/fs/xfs/xfs_refcount_item.c
+@@ -439,8 +439,8 @@ const struct xfs_defer_op_type xfs_refco
+ */
+ int
+ xfs_cui_recover(
+- struct xfs_trans *parent_tp,
+- struct xfs_cui_log_item *cuip)
++ struct xfs_cui_log_item *cuip,
++ struct list_head *capture_list)
+ {
+ int i;
+ int error = 0;
+@@ -456,7 +456,7 @@ xfs_cui_recover(
+ xfs_extlen_t new_len;
+ struct xfs_bmbt_irec irec;
+ bool requeue_only = false;
+- struct xfs_mount *mp = parent_tp->t_mountp;
++ struct xfs_mount *mp = cuip->cui_item.li_mountp;
+
+ ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags));
+
+@@ -511,12 +511,7 @@ xfs_cui_recover(
+ mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+- /*
+- * Recovery stashes all deferred ops during intent processing and
+- * finishes them on completion. Transfer current dfops state to this
+- * transaction and transfer the result back before we return.
+- */
+- xfs_defer_move(tp, parent_tp);
++
+ cudp = xfs_trans_get_cud(tp, cuip);
+
+ for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
+@@ -574,13 +569,10 @@ xfs_cui_recover(
+
+ xfs_refcount_finish_one_cleanup(tp, rcur, error);
+ set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
+- xfs_defer_capture(parent_tp, tp);
+- error = xfs_trans_commit(tp);
+- return error;
++ return xfs_defer_ops_capture_and_commit(tp, capture_list);
+
+ abort_error:
+ xfs_refcount_finish_one_cleanup(tp, rcur, error);
+- xfs_defer_move(parent_tp, tp);
+ xfs_trans_cancel(tp);
+ return error;
+ }
+--- a/fs/xfs/xfs_refcount_item.h
++++ b/fs/xfs/xfs_refcount_item.h
+@@ -80,6 +80,7 @@ extern struct kmem_zone *xfs_cud_zone;
+ struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint);
+ void xfs_cui_item_free(struct xfs_cui_log_item *);
+ void xfs_cui_release(struct xfs_cui_log_item *);
+-int xfs_cui_recover(struct xfs_trans *parent_tp, struct xfs_cui_log_item *cuip);
++int xfs_cui_recover(struct xfs_cui_log_item *cuip,
++ struct list_head *capture_list);
+
+ #endif /* __XFS_REFCOUNT_ITEM_H__ */
+--- a/fs/xfs/xfs_rmap_item.c
++++ b/fs/xfs/xfs_rmap_item.c
+@@ -483,9 +483,10 @@ const struct xfs_defer_op_type xfs_rmap_
+ */
+ int
+ xfs_rui_recover(
+- struct xfs_mount *mp,
+- struct xfs_rui_log_item *ruip)
++ struct xfs_rui_log_item *ruip,
++ struct list_head *capture_list)
+ {
++ struct xfs_mount *mp = ruip->rui_item.li_mountp;
+ int i;
+ int error = 0;
+ struct xfs_map_extent *rmap;
+@@ -592,8 +593,7 @@ xfs_rui_recover(
+
+ xfs_rmap_finish_one_cleanup(tp, rcur, error);
+ set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags);
+- error = xfs_trans_commit(tp);
+- return error;
++ return xfs_defer_ops_capture_and_commit(tp, capture_list);
+
+ abort_error:
+ xfs_rmap_finish_one_cleanup(tp, rcur, error);
+--- a/fs/xfs/xfs_rmap_item.h
++++ b/fs/xfs/xfs_rmap_item.h
+@@ -82,6 +82,7 @@ int xfs_rui_copy_format(struct xfs_log_i
+ struct xfs_rui_log_format *dst_rui_fmt);
+ void xfs_rui_item_free(struct xfs_rui_log_item *);
+ void xfs_rui_release(struct xfs_rui_log_item *);
+-int xfs_rui_recover(struct xfs_mount *mp, struct xfs_rui_log_item *ruip);
++int xfs_rui_recover(struct xfs_rui_log_item *ruip,
++ struct list_head *capture_list);
+
+ #endif /* __XFS_RMAP_ITEM_H__ */
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:21:31 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:02 +0530
+Subject: xfs: refactor xfs_defer_finish_noroll
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-9-chandan.babu@oracle.com>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit bb47d79750f1a68a75d4c7defc2da934ba31de14 upstream.
+
+Split out a helper that operates on a single xfs_defer_pending structure
+to untangle the code.
+
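+After this change the main loop reduces, roughly, to the following
+sketch (derived from the patch below; locking, tracing and the
+shutdown path are elided):
+
+	while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
+		xfs_defer_create_intents(*tp);
+		list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);
+
+		error = xfs_defer_trans_roll(tp);
+		if (error)
+			goto out_shutdown;
+
+		dfp = list_first_entry(&dop_pending,
+				struct xfs_defer_pending, dfp_list);
+		error = xfs_defer_finish_one(*tp, dfp);
+		if (error && error != -EAGAIN)
+			goto out_shutdown;
+	}
+
+with all per-item processing now contained in xfs_defer_finish_one().
+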
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_defer.c | 128 +++++++++++++++++++++-------------------------
+ 1 file changed, 59 insertions(+), 69 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_defer.c
++++ b/fs/xfs/libxfs/xfs_defer.c
+@@ -360,6 +360,53 @@ xfs_defer_cancel_list(
+ }
+
+ /*
++ * Log an intent-done item for the first pending intent, and finish the work
++ * items.
++ */
++static int
++xfs_defer_finish_one(
++ struct xfs_trans *tp,
++ struct xfs_defer_pending *dfp)
++{
++ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
++ void *state = NULL;
++ struct list_head *li, *n;
++ int error;
++
++ trace_xfs_defer_pending_finish(tp->t_mountp, dfp);
++
++ dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
++ list_for_each_safe(li, n, &dfp->dfp_work) {
++ list_del(li);
++ dfp->dfp_count--;
++ error = ops->finish_item(tp, li, dfp->dfp_done, &state);
++ if (error == -EAGAIN) {
++ /*
++ * Caller wants a fresh transaction; put the work item
++ * back on the list and log a new log intent item to
++ * replace the old one. See "Requesting a Fresh
++ * Transaction while Finishing Deferred Work" above.
++ */
++ list_add(li, &dfp->dfp_work);
++ dfp->dfp_count++;
++ dfp->dfp_done = NULL;
++ xfs_defer_create_intent(tp, dfp, false);
++ }
++
++ if (error)
++ goto out;
++ }
++
++ /* Done with the dfp, free it. */
++ list_del(&dfp->dfp_list);
++ kmem_free(dfp);
++out:
++ if (ops->finish_cleanup)
++ ops->finish_cleanup(tp, state, error);
++ return error;
++}
++
++/*
+ * Finish all the pending work. This involves logging intent items for
+ * any work items that wandered in since the last transaction roll (if
+ * one has even happened), rolling the transaction, and finishing the
+@@ -372,11 +419,7 @@ xfs_defer_finish_noroll(
+ struct xfs_trans **tp)
+ {
+ struct xfs_defer_pending *dfp;
+- struct list_head *li;
+- struct list_head *n;
+- void *state;
+ int error = 0;
+- const struct xfs_defer_op_type *ops;
+ LIST_HEAD(dop_pending);
+
+ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+@@ -385,83 +428,30 @@ xfs_defer_finish_noroll(
+
+ /* Until we run out of pending work to finish... */
+ while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
+- /* log intents and pull in intake items */
+ xfs_defer_create_intents(*tp);
+ list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);
+
+- /*
+- * Roll the transaction.
+- */
+ error = xfs_defer_trans_roll(tp);
+ if (error)
+- goto out;
++ goto out_shutdown;
+
+- /* Log an intent-done item for the first pending item. */
+ dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
+ dfp_list);
+- ops = defer_op_types[dfp->dfp_type];
+- trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
+- dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent,
+- dfp->dfp_count);
+-
+- /* Finish the work items. */
+- state = NULL;
+- list_for_each_safe(li, n, &dfp->dfp_work) {
+- list_del(li);
+- dfp->dfp_count--;
+- error = ops->finish_item(*tp, li, dfp->dfp_done,
+- &state);
+- if (error == -EAGAIN) {
+- /*
+- * Caller wants a fresh transaction;
+- * put the work item back on the list
+- * and jump out.
+- */
+- list_add(li, &dfp->dfp_work);
+- dfp->dfp_count++;
+- break;
+- } else if (error) {
+- /*
+- * Clean up after ourselves and jump out.
+- * xfs_defer_cancel will take care of freeing
+- * all these lists and stuff.
+- */
+- if (ops->finish_cleanup)
+- ops->finish_cleanup(*tp, state, error);
+- goto out;
+- }
+- }
+- if (error == -EAGAIN) {
+- /*
+- * Caller wants a fresh transaction, so log a new log
+- * intent item to replace the old one and roll the
+- * transaction. See "Requesting a Fresh Transaction
+- * while Finishing Deferred Work" above.
+- */
+- dfp->dfp_done = NULL;
+- xfs_defer_create_intent(*tp, dfp, false);
+- } else {
+- /* Done with the dfp, free it. */
+- list_del(&dfp->dfp_list);
+- kmem_free(dfp);
+- }
+-
+- if (ops->finish_cleanup)
+- ops->finish_cleanup(*tp, state, error);
+- }
+-
+-out:
+- if (error) {
+- xfs_defer_trans_abort(*tp, &dop_pending);
+- xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+- trace_xfs_defer_finish_error(*tp, error);
+- xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
+- xfs_defer_cancel(*tp);
+- return error;
++ error = xfs_defer_finish_one(*tp, dfp);
++ if (error && error != -EAGAIN)
++ goto out_shutdown;
+ }
+
+ trace_xfs_defer_finish_done(*tp, _RET_IP_);
+ return 0;
++
++out_shutdown:
++ xfs_defer_trans_abort(*tp, &dop_pending);
++ xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
++ trace_xfs_defer_finish_error(*tp, error);
++ xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
++ xfs_defer_cancel(*tp);
++ return error;
+ }
+
+ int
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:20:47 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:49:56 +0530
+Subject: xfs: remove the xfs_efd_log_item_t typedef
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-3-chandan.babu@oracle.com>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit c84e819090f39e96e4d432c9047a50d2424f99e0 upstream.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_extfree_item.h | 4 ++--
+ fs/xfs/xfs_super.c | 2 +-
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_extfree_item.h
++++ b/fs/xfs/xfs_extfree_item.h
+@@ -63,12 +63,12 @@ struct xfs_efi_log_item {
+ * the fact that some extents earlier mentioned in an efi item
+ * have been freed.
+ */
+-typedef struct xfs_efd_log_item {
++struct xfs_efd_log_item {
+ struct xfs_log_item efd_item;
+ struct xfs_efi_log_item *efd_efip;
+ uint efd_next_extent;
+ xfs_efd_log_format_t efd_format;
+-} xfs_efd_log_item_t;
++};
+
+ /*
+ * Max number of extents in fast allocation path.
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1914,7 +1914,7 @@ xfs_init_zones(void)
+ if (!xfs_buf_item_zone)
+ goto out_destroy_trans_zone;
+
+- xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
++ xfs_efd_zone = kmem_zone_init((sizeof(struct xfs_efd_log_item) +
+ ((XFS_EFD_MAX_FAST_EXTENTS - 1) *
+ sizeof(xfs_extent_t))), "xfs_efd_item");
+ if (!xfs_efd_zone)
--- /dev/null
+From stable-owner@vger.kernel.org Thu Feb 16 08:04:21 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:49:55 +0530
+Subject: xfs: remove the xfs_efi_log_item_t typedef
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-2-chandan.babu@oracle.com>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 82ff450b2d936d778361a1de43eb078cc043c7fe upstream.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_extfree_item.c | 2 +-
+ fs/xfs/xfs_extfree_item.h | 10 +++++-----
+ fs/xfs/xfs_log_recover.c | 4 ++--
+ fs/xfs/xfs_super.c | 2 +-
+ 4 files changed, 9 insertions(+), 9 deletions(-)
+
+--- a/fs/xfs/xfs_extfree_item.c
++++ b/fs/xfs/xfs_extfree_item.c
+@@ -161,7 +161,7 @@ xfs_efi_init(
+
+ ASSERT(nextents > 0);
+ if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
+- size = (uint)(sizeof(xfs_efi_log_item_t) +
++ size = (uint)(sizeof(struct xfs_efi_log_item) +
+ ((nextents - 1) * sizeof(xfs_extent_t)));
+ efip = kmem_zalloc(size, 0);
+ } else {
+--- a/fs/xfs/xfs_extfree_item.h
++++ b/fs/xfs/xfs_extfree_item.h
+@@ -50,13 +50,13 @@ struct kmem_zone;
+ * of commit failure or log I/O errors. Note that the EFD is not inserted in the
+ * AIL, so at this point both the EFI and EFD are freed.
+ */
+-typedef struct xfs_efi_log_item {
++struct xfs_efi_log_item {
+ struct xfs_log_item efi_item;
+ atomic_t efi_refcount;
+ atomic_t efi_next_extent;
+ unsigned long efi_flags; /* misc flags */
+ xfs_efi_log_format_t efi_format;
+-} xfs_efi_log_item_t;
++};
+
+ /*
+ * This is the "extent free done" log item. It is used to log
+@@ -65,7 +65,7 @@ typedef struct xfs_efi_log_item {
+ */
+ typedef struct xfs_efd_log_item {
+ struct xfs_log_item efd_item;
+- xfs_efi_log_item_t *efd_efip;
++ struct xfs_efi_log_item *efd_efip;
+ uint efd_next_extent;
+ xfs_efd_log_format_t efd_format;
+ } xfs_efd_log_item_t;
+@@ -78,10 +78,10 @@ typedef struct xfs_efd_log_item {
+ extern struct kmem_zone *xfs_efi_zone;
+ extern struct kmem_zone *xfs_efd_zone;
+
+-xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint);
++struct xfs_efi_log_item *xfs_efi_init(struct xfs_mount *, uint);
+ int xfs_efi_copy_format(xfs_log_iovec_t *buf,
+ xfs_efi_log_format_t *dst_efi_fmt);
+-void xfs_efi_item_free(xfs_efi_log_item_t *);
++void xfs_efi_item_free(struct xfs_efi_log_item *);
+ void xfs_efi_release(struct xfs_efi_log_item *);
+
+ int xfs_efi_recover(struct xfs_mount *mp,
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -3384,7 +3384,7 @@ xlog_recover_efd_pass2(
+ struct xlog_recover_item *item)
+ {
+ xfs_efd_log_format_t *efd_formatp;
+- xfs_efi_log_item_t *efip = NULL;
++ struct xfs_efi_log_item *efip = NULL;
+ struct xfs_log_item *lip;
+ uint64_t efi_id;
+ struct xfs_ail_cursor cur;
+@@ -3405,7 +3405,7 @@ xlog_recover_efd_pass2(
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ while (lip != NULL) {
+ if (lip->li_type == XFS_LI_EFI) {
+- efip = (xfs_efi_log_item_t *)lip;
++ efip = (struct xfs_efi_log_item *)lip;
+ if (efip->efi_format.efi_id == efi_id) {
+ /*
+ * Drop the EFD reference to the EFI. This
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1920,7 +1920,7 @@ xfs_init_zones(void)
+ if (!xfs_efd_zone)
+ goto out_destroy_buf_item_zone;
+
+- xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
++ xfs_efi_zone = kmem_zone_init((sizeof(struct xfs_efi_log_item) +
+ ((XFS_EFI_MAX_FAST_EXTENTS - 1) *
+ sizeof(xfs_extent_t))), "xfs_efi_item");
+ if (!xfs_efi_zone)
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:20:54 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:49:57 +0530
+Subject: xfs: remove the xfs_inode_log_item_t typedef
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-4-chandan.babu@oracle.com>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit fd9cbe51215198ccffa64169c98eae35b0916088 upstream.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_inode_fork.c | 2 +-
+ fs/xfs/libxfs/xfs_trans_inode.c | 2 +-
+ fs/xfs/xfs_inode.c | 4 ++--
+ fs/xfs/xfs_inode_item.c | 2 +-
+ fs/xfs/xfs_inode_item.h | 4 ++--
+ fs/xfs/xfs_super.c | 4 ++--
+ 6 files changed, 9 insertions(+), 9 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_inode_fork.c
++++ b/fs/xfs/libxfs/xfs_inode_fork.c
+@@ -592,7 +592,7 @@ void
+ xfs_iflush_fork(
+ xfs_inode_t *ip,
+ xfs_dinode_t *dip,
+- xfs_inode_log_item_t *iip,
++ struct xfs_inode_log_item *iip,
+ int whichfork)
+ {
+ char *cp;
+--- a/fs/xfs/libxfs/xfs_trans_inode.c
++++ b/fs/xfs/libxfs/xfs_trans_inode.c
+@@ -27,7 +27,7 @@ xfs_trans_ijoin(
+ struct xfs_inode *ip,
+ uint lock_flags)
+ {
+- xfs_inode_log_item_t *iip;
++ struct xfs_inode_log_item *iip;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ if (ip->i_itemp == NULL)
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -2555,7 +2555,7 @@ xfs_ifree_cluster(
+ xfs_daddr_t blkno;
+ xfs_buf_t *bp;
+ xfs_inode_t *ip;
+- xfs_inode_log_item_t *iip;
++ struct xfs_inode_log_item *iip;
+ struct xfs_log_item *lip;
+ struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+@@ -2617,7 +2617,7 @@ xfs_ifree_cluster(
+ */
+ list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
+ if (lip->li_type == XFS_LI_INODE) {
+- iip = (xfs_inode_log_item_t *)lip;
++ iip = (struct xfs_inode_log_item *)lip;
+ ASSERT(iip->ili_logged == 1);
+ lip->li_cb = xfs_istale_done;
+ xfs_trans_ail_copy_lsn(mp->m_ail,
+--- a/fs/xfs/xfs_inode_item.c
++++ b/fs/xfs/xfs_inode_item.c
+@@ -781,7 +781,7 @@ xfs_iflush_abort(
+ xfs_inode_t *ip,
+ bool stale)
+ {
+- xfs_inode_log_item_t *iip = ip->i_itemp;
++ struct xfs_inode_log_item *iip = ip->i_itemp;
+
+ if (iip) {
+ if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) {
+--- a/fs/xfs/xfs_inode_item.h
++++ b/fs/xfs/xfs_inode_item.h
+@@ -13,7 +13,7 @@ struct xfs_bmbt_rec;
+ struct xfs_inode;
+ struct xfs_mount;
+
+-typedef struct xfs_inode_log_item {
++struct xfs_inode_log_item {
+ struct xfs_log_item ili_item; /* common portion */
+ struct xfs_inode *ili_inode; /* inode ptr */
+ xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
+@@ -23,7 +23,7 @@ typedef struct xfs_inode_log_item {
+ unsigned int ili_last_fields; /* fields when flushed */
+ unsigned int ili_fields; /* fields to be logged */
+ unsigned int ili_fsync_fields; /* logged since last fsync */
+-} xfs_inode_log_item_t;
++};
+
+ static inline int xfs_inode_clean(xfs_inode_t *ip)
+ {
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1934,8 +1934,8 @@ xfs_init_zones(void)
+ goto out_destroy_efi_zone;
+
+ xfs_ili_zone =
+- kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
+- KM_ZONE_SPREAD, NULL);
++ kmem_zone_init_flags(sizeof(struct xfs_inode_log_item),
++ "xfs_ili", KM_ZONE_SPREAD, NULL);
+ if (!xfs_ili_zone)
+ goto out_destroy_inode_zone;
+ xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:23:36 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:19 +0530
+Subject: xfs: sync lazy sb accounting on quiesce of read-only mounts
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-26-chandan.babu@oracle.com>
+
+From: Brian Foster <bfoster@redhat.com>
+
+commit 50d25484bebe94320c49dd1347d3330c7063bbdb upstream.
+
+[ Modify xfs_log_unmount_write() to return zero when the log is in a read-only
+state ]
+
+xfs_log_sbcount() syncs the superblock specifically to accumulate
+the in-core percpu superblock counters and commit them to disk. This
+is required to maintain filesystem consistency across quiesce
+(freeze, read-only mount/remount) or unmount when lazy superblock
+accounting is enabled because individual transactions do not update
+the superblock directly.
+
+This mechanism works as expected for writable mounts, but
+xfs_log_sbcount() skips the update for read-only mounts. Read-only
+mounts otherwise still allow log recovery and write out an unmount
+record during log quiesce. If a read-only mount performs log
+recovery, it can modify the in-core superblock counters and write an
+unmount record when the filesystem unmounts without ever syncing the
+in-core counters. This leaves the filesystem with a clean log but in
+an inconsistent state with regard to lazy sb counters.
+
+Update xfs_log_sbcount() to use the same logic
+xfs_log_unmount_write() uses to determine when to write an unmount
+record. This ensures that lazy accounting is always synced before
+the log is cleaned. Refactor this logic into a new helper to
+distinguish between a writable filesystem and a writable log.
+Specifically, the log is writable unless the filesystem is mounted
+with the norecovery mount option, the underlying log device is
+read-only, or the filesystem is shutdown. Drop the freeze state
+check because the update is already allowed during the freezing
+process and no context calls this function on an already frozen fs.
+Also, retain the shutdown check in xfs_log_unmount_write() to catch
+the case where the preceding log force might have triggered a
+shutdown.
+
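+Both call sites then reduce to the same early return (sketch, taken
+from the patch below):
+
+	/* xfs_log_sbcount() and xfs_log_unmount_write() */
+	if (!xfs_log_writable(mp))
+		return 0;
+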
+Signed-off-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Gao Xiang <hsiangkao@redhat.com>
+Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Bill O'Donnell <billodo@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c | 28 ++++++++++++++++++++--------
+ fs/xfs/xfs_log.h | 1 +
+ fs/xfs/xfs_mount.c | 3 +--
+ 3 files changed, 22 insertions(+), 10 deletions(-)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -369,6 +369,25 @@ xlog_tic_add_region(xlog_ticket_t *tic,
+ tic->t_res_num++;
+ }
+
++bool
++xfs_log_writable(
++ struct xfs_mount *mp)
++{
++ /*
++ * Never write to the log on norecovery mounts, if the block device is
++ * read-only, or if the filesystem is shutdown. Read-only mounts still
++ * allow internal writes for log recovery and unmount purposes, so don't
++ * restrict that case here.
++ */
++ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
++ return false;
++ if (xfs_readonly_buftarg(mp->m_log->l_targ))
++ return false;
++ if (XFS_FORCED_SHUTDOWN(mp))
++ return false;
++ return true;
++}
++
+ /*
+ * Replenish the byte reservation required by moving the grant write head.
+ */
+@@ -895,15 +914,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
+ #endif
+ int error;
+
+- /*
+- * Don't write out unmount record on norecovery mounts or ro devices.
+- * Or, if we are doing a forced umount (typically because of IO errors).
+- */
+- if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
+- xfs_readonly_buftarg(log->l_targ)) {
+- ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
++ if (!xfs_log_writable(mp))
+ return 0;
+- }
+
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
+ ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
+--- a/fs/xfs/xfs_log.h
++++ b/fs/xfs/xfs_log.h
+@@ -132,6 +132,7 @@ int xfs_log_reserve(struct xfs_mount *
+ int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
+ void xfs_log_unmount(struct xfs_mount *mp);
+ int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
++bool xfs_log_writable(struct xfs_mount *mp);
+
+ struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
+ void xfs_log_ticket_put(struct xlog_ticket *ticket);
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -1218,8 +1218,7 @@ xfs_fs_writable(
+ int
+ xfs_log_sbcount(xfs_mount_t *mp)
+ {
+- /* allow this to proceed during the freeze sequence... */
+- if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
++ if (!xfs_log_writable(mp))
+ return 0;
+
+ /*
--- /dev/null
+From stable-owner@vger.kernel.org Thu Feb 16 06:24:16 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:01 +0530
+Subject: xfs: turn dfp_intent into a xfs_log_item
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-8-chandan.babu@oracle.com>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit 13a8333339072b8654c1d2c75550ee9f41ee15de upstream.
+
+All defer op instances place their own extension of the log item into
+the dfp_intent field. Replace that with an xfs_log_item to improve type
+safety and make the code easier to follow.
+
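+Concretely, the ->create_intent callback in struct xfs_defer_op_type
+goes from returning an untyped pointer,
+
+	void *(*create_intent)(struct xfs_trans *tp, struct list_head *items,
+			unsigned int count, bool sort);
+
+to returning a typed log item (sketch taken from the patch below):
+
+	struct xfs_log_item *(*create_intent)(struct xfs_trans *tp,
+			struct list_head *items, unsigned int count, bool sort);
+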
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_defer.h | 11 ++++++-----
+ fs/xfs/xfs_bmap_item.c | 12 ++++++------
+ fs/xfs/xfs_extfree_item.c | 12 ++++++------
+ fs/xfs/xfs_refcount_item.c | 12 ++++++------
+ fs/xfs/xfs_rmap_item.c | 12 ++++++------
+ 5 files changed, 30 insertions(+), 29 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_defer.h
++++ b/fs/xfs/libxfs/xfs_defer.h
+@@ -28,7 +28,7 @@ enum xfs_defer_ops_type {
+ struct xfs_defer_pending {
+ struct list_head dfp_list; /* pending items */
+ struct list_head dfp_work; /* work items */
+- void *dfp_intent; /* log intent item */
++ struct xfs_log_item *dfp_intent; /* log intent item */
+ void *dfp_done; /* log done item */
+ unsigned int dfp_count; /* # extent items */
+ enum xfs_defer_ops_type dfp_type;
+@@ -43,14 +43,15 @@ void xfs_defer_move(struct xfs_trans *dt
+
+ /* Description of a deferred type. */
+ struct xfs_defer_op_type {
+- void (*abort_intent)(void *);
+- void *(*create_done)(struct xfs_trans *, void *, unsigned int);
++ struct xfs_log_item *(*create_intent)(struct xfs_trans *tp,
++ struct list_head *items, unsigned int count, bool sort);
++ void (*abort_intent)(struct xfs_log_item *intent);
++ void *(*create_done)(struct xfs_trans *tp, struct xfs_log_item *intent,
++ unsigned int count);
+ int (*finish_item)(struct xfs_trans *, struct list_head *, void *,
+ void **);
+ void (*finish_cleanup)(struct xfs_trans *, void *, int);
+ void (*cancel_item)(struct list_head *);
+- void *(*create_intent)(struct xfs_trans *tp, struct list_head *items,
+- unsigned int count, bool sort);
+ unsigned int max_items;
+ };
+
+--- a/fs/xfs/xfs_bmap_item.c
++++ b/fs/xfs/xfs_bmap_item.c
+@@ -330,7 +330,7 @@ xfs_bmap_update_log_item(
+ bmap->bi_bmap.br_state);
+ }
+
+-STATIC void *
++static struct xfs_log_item *
+ xfs_bmap_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+@@ -348,17 +348,17 @@ xfs_bmap_update_create_intent(
+ list_sort(mp, items, xfs_bmap_update_diff_items);
+ list_for_each_entry(bmap, items, bi_list)
+ xfs_bmap_update_log_item(tp, buip, bmap);
+- return buip;
++ return &buip->bui_item;
+ }
+
+ /* Get an BUD so we can process all the deferred rmap updates. */
+ STATIC void *
+ xfs_bmap_update_create_done(
+ struct xfs_trans *tp,
+- void *intent,
++ struct xfs_log_item *intent,
+ unsigned int count)
+ {
+- return xfs_trans_get_bud(tp, intent);
++ return xfs_trans_get_bud(tp, BUI_ITEM(intent));
+ }
+
+ /* Process a deferred rmap update. */
+@@ -394,9 +394,9 @@ xfs_bmap_update_finish_item(
+ /* Abort all pending BUIs. */
+ STATIC void
+ xfs_bmap_update_abort_intent(
+- void *intent)
++ struct xfs_log_item *intent)
+ {
+- xfs_bui_release(intent);
++ xfs_bui_release(BUI_ITEM(intent));
+ }
+
+ /* Cancel a deferred rmap update. */
+--- a/fs/xfs/xfs_extfree_item.c
++++ b/fs/xfs/xfs_extfree_item.c
+@@ -437,7 +437,7 @@ xfs_extent_free_log_item(
+ extp->ext_len = free->xefi_blockcount;
+ }
+
+-STATIC void *
++static struct xfs_log_item *
+ xfs_extent_free_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+@@ -455,17 +455,17 @@ xfs_extent_free_create_intent(
+ list_sort(mp, items, xfs_extent_free_diff_items);
+ list_for_each_entry(free, items, xefi_list)
+ xfs_extent_free_log_item(tp, efip, free);
+- return efip;
++ return &efip->efi_item;
+ }
+
+ /* Get an EFD so we can process all the free extents. */
+ STATIC void *
+ xfs_extent_free_create_done(
+ struct xfs_trans *tp,
+- void *intent,
++ struct xfs_log_item *intent,
+ unsigned int count)
+ {
+- return xfs_trans_get_efd(tp, intent, count);
++ return xfs_trans_get_efd(tp, EFI_ITEM(intent), count);
+ }
+
+ /* Process a free extent. */
+@@ -491,9 +491,9 @@ xfs_extent_free_finish_item(
+ /* Abort all pending EFIs. */
+ STATIC void
+ xfs_extent_free_abort_intent(
+- void *intent)
++ struct xfs_log_item *intent)
+ {
+- xfs_efi_release(intent);
++ xfs_efi_release(EFI_ITEM(intent));
+ }
+
+ /* Cancel a free extent. */
+--- a/fs/xfs/xfs_refcount_item.c
++++ b/fs/xfs/xfs_refcount_item.c
+@@ -329,7 +329,7 @@ xfs_refcount_update_log_item(
+ xfs_trans_set_refcount_flags(ext, refc->ri_type);
+ }
+
+-STATIC void *
++static struct xfs_log_item *
+ xfs_refcount_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+@@ -347,17 +347,17 @@ xfs_refcount_update_create_intent(
+ list_sort(mp, items, xfs_refcount_update_diff_items);
+ list_for_each_entry(refc, items, ri_list)
+ xfs_refcount_update_log_item(tp, cuip, refc);
+- return cuip;
++ return &cuip->cui_item;
+ }
+
+ /* Get an CUD so we can process all the deferred refcount updates. */
+ STATIC void *
+ xfs_refcount_update_create_done(
+ struct xfs_trans *tp,
+- void *intent,
++ struct xfs_log_item *intent,
+ unsigned int count)
+ {
+- return xfs_trans_get_cud(tp, intent);
++ return xfs_trans_get_cud(tp, CUI_ITEM(intent));
+ }
+
+ /* Process a deferred refcount update. */
+@@ -407,9 +407,9 @@ xfs_refcount_update_finish_cleanup(
+ /* Abort all pending CUIs. */
+ STATIC void
+ xfs_refcount_update_abort_intent(
+- void *intent)
++ struct xfs_log_item *intent)
+ {
+- xfs_cui_release(intent);
++ xfs_cui_release(CUI_ITEM(intent));
+ }
+
+ /* Cancel a deferred refcount update. */
+--- a/fs/xfs/xfs_rmap_item.c
++++ b/fs/xfs/xfs_rmap_item.c
+@@ -381,7 +381,7 @@ xfs_rmap_update_log_item(
+ rmap->ri_bmap.br_state);
+ }
+
+-STATIC void *
++static struct xfs_log_item *
+ xfs_rmap_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+@@ -399,17 +399,17 @@ xfs_rmap_update_create_intent(
+ list_sort(mp, items, xfs_rmap_update_diff_items);
+ list_for_each_entry(rmap, items, ri_list)
+ xfs_rmap_update_log_item(tp, ruip, rmap);
+- return ruip;
++ return &ruip->rui_item;
+ }
+
+ /* Get an RUD so we can process all the deferred rmap updates. */
+ STATIC void *
+ xfs_rmap_update_create_done(
+ struct xfs_trans *tp,
+- void *intent,
++ struct xfs_log_item *intent,
+ unsigned int count)
+ {
+- return xfs_trans_get_rud(tp, intent);
++ return xfs_trans_get_rud(tp, RUI_ITEM(intent));
+ }
+
+ /* Process a deferred rmap update. */
+@@ -451,9 +451,9 @@ xfs_rmap_update_finish_cleanup(
+ /* Abort all pending RUIs. */
+ STATIC void
+ xfs_rmap_update_abort_intent(
+- void *intent)
++ struct xfs_log_item *intent)
+ {
+- xfs_rui_release(intent);
++ xfs_rui_release(RUI_ITEM(intent));
+ }
+
+ /* Cancel a deferred rmap update. */
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:21:59 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:06 +0530
+Subject: xfs: xfs_defer_capture should absorb remaining block reservations
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-13-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 4f9a60c48078c0efa3459678fa8d6e050e8ada5d upstream.
+
+When xfs_defer_capture extracts the deferred ops and transaction state
+from a transaction, it should record the remaining block reservations so
+that when we continue the dfops chain, we can reserve the same number of
+blocks to use. We capture the reservations for both data and realtime
+volumes.
+
+This adds the requirement that every log intent item recovery function
+must be careful to reserve enough blocks to handle both itself and all
+defer ops that it can queue. On the other hand, this enables us to do
+away with the handwaving block estimation nonsense that was going on in
+xlog_finish_defer_ops.
+
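+Sketch of the capture side (taken from the patch below): the remaining
+reservation is computed from what the transaction has already used,
+
+	dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used;
+	dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;
+
+and xlog_finish_defer_ops() later passes those counts straight back to
+xfs_trans_alloc() when it continues the dfops chain.
+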
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_defer.c | 4 ++++
+ fs/xfs/libxfs/xfs_defer.h | 4 ++++
+ fs/xfs/xfs_log_recover.c | 21 +++------------------
+ 3 files changed, 11 insertions(+), 18 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_defer.c
++++ b/fs/xfs/libxfs/xfs_defer.c
+@@ -589,6 +589,10 @@ xfs_defer_ops_capture(
+ dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
+ tp->t_flags &= ~XFS_TRANS_LOWMODE;
+
++ /* Capture the remaining block reservations along with the dfops. */
++ dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used;
++ dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;
++
+ return dfc;
+ }
+
+--- a/fs/xfs/libxfs/xfs_defer.h
++++ b/fs/xfs/libxfs/xfs_defer.h
+@@ -73,6 +73,10 @@ struct xfs_defer_capture {
+ /* Deferred ops state saved from the transaction. */
+ struct list_head dfc_dfops;
+ unsigned int dfc_tpflags;
++
++ /* Block reservations for the data and rt devices. */
++ unsigned int dfc_blkres;
++ unsigned int dfc_rtxres;
+ };
+
+ /*
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -4766,27 +4766,12 @@ xlog_finish_defer_ops(
+ {
+ struct xfs_defer_capture *dfc, *next;
+ struct xfs_trans *tp;
+- int64_t freeblks;
+- uint64_t resblks;
+ int error = 0;
+
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+- /*
+- * We're finishing the defer_ops that accumulated as a result
+- * of recovering unfinished intent items during log recovery.
+- * We reserve an itruncate transaction because it is the
+- * largest permanent transaction type. Since we're the only
+- * user of the fs right now, take 93% (15/16) of the available
+- * free blocks. Use weird math to avoid a 64-bit division.
+- */
+- freeblks = percpu_counter_sum(&mp->m_fdblocks);
+- if (freeblks <= 0)
+- return -ENOSPC;
+-
+- resblks = min_t(uint64_t, UINT_MAX, freeblks);
+- resblks = (resblks * 15) >> 4;
+- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
+- 0, XFS_TRANS_RESERVE, &tp);
++ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
++ dfc->dfc_blkres, dfc->dfc_rtxres,
++ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
--- /dev/null
+From chandan.babu@oracle.com Thu Feb 16 06:22:08 2023
+From: Chandan Babu R <chandan.babu@oracle.com>
+Date: Thu, 16 Feb 2023 10:50:07 +0530
+Subject: xfs: xfs_defer_capture should absorb remaining transaction reservation
+To: gregkh@linuxfoundation.org
+Cc: sashal@kernel.org, mcgrof@kernel.org, linux-xfs@vger.kernel.org, stable@vger.kernel.org, djwong@kernel.org, chandan.babu@oracle.com, amir73il@gmail.com, leah.rumancik@gmail.com
+Message-ID: <20230216052019.368896-14-chandan.babu@oracle.com>
+
+From: "Darrick J. Wong" <darrick.wong@oracle.com>
+
+commit 929b92f64048d90d23e40a59c47adf59f5026903 upstream.
+
+When xfs_defer_capture extracts the deferred ops and transaction state
+from a transaction, it should record the transaction reservation type
+from the old transaction so that when we continue the dfops chain, we
+still use the same reservation parameters.
+
+Doing this means that the log item recovery functions get to determine
+the transaction reservation instead of abusing tr_itruncate in yet
+another part of xfs.
+
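+Sketch of the continuation side (taken from the patch below): the new
+transaction reservation is rebuilt from the captured log reservation,
+with tr_logcount forced to 1 so that every roll regrants and recovery
+can make forward progress even in a nearly full log:
+
+	resv.tr_logres = dfc->dfc_logres;
+	resv.tr_logcount = 1;
+	resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+
+	error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
+			dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
+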
+Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_defer.c | 3 +++
+ fs/xfs/libxfs/xfs_defer.h | 3 +++
+ fs/xfs/xfs_log_recover.c | 17 ++++++++++++++---
+ 3 files changed, 20 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_defer.c
++++ b/fs/xfs/libxfs/xfs_defer.c
+@@ -593,6 +593,9 @@ xfs_defer_ops_capture(
+ dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used;
+ dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;
+
++ /* Preserve the log reservation size. */
++ dfc->dfc_logres = tp->t_log_res;
++
+ return dfc;
+ }
+
+--- a/fs/xfs/libxfs/xfs_defer.h
++++ b/fs/xfs/libxfs/xfs_defer.h
+@@ -77,6 +77,9 @@ struct xfs_defer_capture {
+ /* Block reservations for the data and rt devices. */
+ unsigned int dfc_blkres;
+ unsigned int dfc_rtxres;
++
++ /* Log reservation saved from the transaction. */
++ unsigned int dfc_logres;
+ };
+
+ /*
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -4769,9 +4769,20 @@ xlog_finish_defer_ops(
+ int error = 0;
+
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+- dfc->dfc_blkres, dfc->dfc_rtxres,
+- XFS_TRANS_RESERVE, &tp);
++ struct xfs_trans_res resv;
++
++ /*
++ * Create a new transaction reservation from the captured
++ * information. Set logcount to 1 to force the new transaction
++ * to regrant every roll so that we can make forward progress
++ * in recovery no matter how full the log might be.
++ */
++ resv.tr_logres = dfc->dfc_logres;
++ resv.tr_logcount = 1;
++ resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
++
++ error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
++ dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+