]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/commitdiff
xfs: _{attr,data}_map_shared should take ILOCK_EXCL until iread_extents is completely...
authorDarrick J. Wong <djwong@kernel.org>
Thu, 1 Jun 2023 09:38:10 +0000 (11:38 +0200)
committerCarlos Maiolino <cem@kernel.org>
Fri, 9 Jun 2023 08:27:50 +0000 (10:27 +0200)
Source kernel commit: c95356ca884885db702670e24933ee7f2b9f1754

While fuzzing the data fork extent count on a btree-format directory
with xfs/375, I observed the following (excerpted) splat:

XFS: Assertion failed: xfs_isilocked(ip, XFS_ILOCK_EXCL), file: fs/xfs/libxfs/xfs_bmap.c, line: 1208
------------[ cut here ]------------
WARNING: CPU: 0 PID: 43192 at fs/xfs/xfs_message.c:104 assfail+0x46/0x4a [xfs]
Call Trace:
<TASK>
xfs_iread_extents+0x1af/0x210 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
xchk_dir_walk+0xb8/0x190 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
xchk_parent_count_parent_dentries+0x41/0x80 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
xchk_parent_validate+0x199/0x2e0 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
xchk_parent+0xdf/0x130 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
xfs_scrub_metadata+0x2b8/0x730 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
xfs_scrubv_metadata+0x38b/0x4d0 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
xfs_ioc_scrubv_metadata+0x111/0x160 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
xfs_file_ioctl+0x367/0xf50 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd]
__x64_sys_ioctl+0x82/0xa0
do_syscall_64+0x2b/0x80
entry_SYSCALL_64_after_hwframe+0x46/0xb0

The cause of this is a race condition in xfs_ilock_data_map_shared,
which performs an unlocked access to the data fork to guess which lock
mode it needs:

Thread 0                          Thread 1

xfs_need_iread_extents
<observe no iext tree>
xfs_ilock(..., ILOCK_EXCL)
xfs_iread_extents
<observe no iext tree>
<check ILOCK_EXCL>
<load bmbt extents into iext>
<notice iext size doesn't
match nextents>
xfs_need_iread_extents
<observe iext tree>
xfs_ilock(..., ILOCK_SHARED)
<tear down iext tree>
xfs_iunlock(..., ILOCK_EXCL)
xfs_iread_extents
<observe no iext tree>
<check ILOCK_EXCL>
*BOOM*

Fix this race by adding a flag to the xfs_ifork structure to indicate
that we have not yet read in the extent records and changing the
predicate to look at the flag state, not if_height.  The memory barrier
ensures that the flag will not be set until the very end of the
function.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
include/atomic.h
libxfs/libxfs_priv.h
libxfs/xfs_bmap.c
libxfs/xfs_inode_fork.c
libxfs/xfs_inode_fork.h

index 9c4aa5849ae4eedecd5774b8edd15cae75a25a39..3b7eabd0faa7fadcef88fbe597785b45c09fd69a 100644 (file)
@@ -118,4 +118,104 @@ atomic64_set(atomic64_t *a, int64_t v)
 
 #endif /* HAVE_URCU_ATOMIC64 */
 
+#define __smp_mb()             cmm_smp_mb()
+
+/* from compiler_types.h */
+/*
+ * __unqual_scalar_typeof(x) - Declare an unqualified scalar type, leaving
+ *                            non-scalar types unchanged.
+ */
+/*
+ * Prefer C11 _Generic for better compile-times and simpler code. Note 'char'
+ * is not type-compatible with 'signed char', and we define a separate case.
+ */
+#define __scalar_type_to_expr_cases(type)                              \
+               unsigned type:  (unsigned type)0,                       \
+               signed type:    (signed type)0
+
+#define __unqual_scalar_typeof(x) typeof(                              \
+               _Generic((x),                                           \
+                       char:   (char)0,                                \
+                       __scalar_type_to_expr_cases(char),              \
+                       __scalar_type_to_expr_cases(short),             \
+                       __scalar_type_to_expr_cases(int),               \
+                       __scalar_type_to_expr_cases(long),              \
+                       __scalar_type_to_expr_cases(long long),         \
+                       default: (x)))
+
+/* Is this type a native word size -- useful for atomic operations */
+#define __native_word(t) \
+       (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
+        sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
+
+#define compiletime_assert(a, s)       BUILD_BUG_ON(!(a))
+
+#define compiletime_assert_atomic_type(t)                              \
+               compiletime_assert(__native_word(t),                    \
+                       "Need native word sized stores/loads for atomicity.")
+
+/* from rwonce.h */
+/*
+ * Yes, this permits 64-bit accesses on 32-bit architectures. These will
+ * actually be atomic in some cases (namely Armv7 + LPAE), but for others we
+ * rely on the access being split into 2x32-bit accesses for a 32-bit quantity
+ * (e.g. a virtual address) and a strong prevailing wind.
+ */
+#define compiletime_assert_rwonce_type(t)                                      \
+       compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),  \
+               "Unsupported access size for {READ,WRITE}_ONCE().")
+
+/*
+ * Use __READ_ONCE() instead of READ_ONCE() if you do not require any
+ * atomicity. Note that this may result in tears!
+ */
+#ifndef __READ_ONCE
+#define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x))
+#endif
+
+#define READ_ONCE(x)                                                           \
+({                                                                             \
+       compiletime_assert_rwonce_type(x);                                      \
+       __READ_ONCE(x);                                                         \
+})
+
+#define __WRITE_ONCE(x, val)                                                   \
+do {                                                                           \
+       *(volatile typeof(x) *)&(x) = (val);                                    \
+} while (0)
+
+#define WRITE_ONCE(x, val)                                                     \
+do {                                                                           \
+       compiletime_assert_rwonce_type(x);                                      \
+       __WRITE_ONCE(x, val);                                                   \
+} while (0)
+
+/* from barrier.h */
+#ifndef __smp_store_release
+#define __smp_store_release(p, v)                                      \
+do {                                                                   \
+       compiletime_assert_atomic_type(*p);                             \
+       __smp_mb();                                                     \
+       WRITE_ONCE(*p, v);                                              \
+} while (0)
+#endif
+
+#ifndef __smp_load_acquire
+#define __smp_load_acquire(p)                                          \
+({                                                                     \
+       __unqual_scalar_typeof(*p) ___p1 = READ_ONCE(*p);               \
+       compiletime_assert_atomic_type(*p);                             \
+       __smp_mb();                                                     \
+       (typeof(*p))___p1;                                              \
+})
+#endif
+
+#ifndef smp_store_release
+#define smp_store_release(p, v) __smp_store_release((p), (v))
+#endif
+
+#ifndef smp_load_acquire
+#define smp_load_acquire(p) __smp_load_acquire(p)
+#endif
+
 #endif /* __ATOMIC_H__ */
index 386f5706f5b68143abd9c238d9c99d8d99ebcfe5..dce5abf298d5e4d5bd31931b349e74649599d317 100644 (file)
@@ -202,9 +202,6 @@ static inline bool WARN_ON(bool expr) {
 #define percpu_counter_read_positive(x)        ((*x) > 0 ? (*x) : 0)
 #define percpu_counter_sum(x)          (*x)
 
-#define READ_ONCE(x)                   (x)
-#define WRITE_ONCE(x, val)             ((x) = (val))
-
 /*
  * get_random_u32 is used for di_gen inode allocation, it must be zero for
  * libxfs or all sorts of badness can occur!
index 01d3f8a82de3d0a4469e8e5dc191087de05dff4b..ab28bedd048601eac6719a37fc4da711c1e918fc 100644 (file)
@@ -1193,6 +1193,12 @@ xfs_iread_extents(
                goto out;
        }
        ASSERT(ir.loaded == xfs_iext_count(ifp));
+       /*
+        * Use release semantics so that we can use acquire semantics in
+        * xfs_need_iread_extents and be guaranteed to see a valid mapping tree
+        * after that load.
+        */
+       smp_store_release(&ifp->if_needextents, 0);
        return 0;
 out:
        xfs_iext_destroy(ifp);
index 47be0d5720de396455c8dda820c86ad12a36cca1..5cc056ff7a1b161080041c65aa5e6bce4479d8f8 100644 (file)
@@ -225,10 +225,15 @@ xfs_iformat_data_fork(
 
        /*
         * Initialize the extent count early, as the per-format routines may
-        * depend on it.
+        * depend on it.  Use release semantics to set needextents /after/ we
+        * set the format. This ensures that we can use acquire semantics on
+        * needextents in xfs_need_iread_extents() and be guaranteed to see a
+        * valid format value after that load.
         */
        ip->i_df.if_format = dip->di_format;
        ip->i_df.if_nextents = xfs_dfork_data_extents(dip);
+       smp_store_release(&ip->i_df.if_needextents,
+                          ip->i_df.if_format == XFS_DINODE_FMT_BTREE ? 1 : 0);
 
        switch (inode->i_mode & S_IFMT) {
        case S_IFIFO:
@@ -281,8 +286,17 @@ xfs_ifork_init_attr(
        enum xfs_dinode_fmt     format,
        xfs_extnum_t            nextents)
 {
+       /*
+        * Initialize the extent count early, as the per-format routines may
+        * depend on it.  Use release semantics to set needextents /after/ we
+        * set the format. This ensures that we can use acquire semantics on
+        * needextents in xfs_need_iread_extents() and be guaranteed to see a
+        * valid format value after that load.
+        */
        ip->i_af.if_format = format;
        ip->i_af.if_nextents = nextents;
+       smp_store_release(&ip->i_af.if_needextents,
+                          ip->i_af.if_format == XFS_DINODE_FMT_BTREE ? 1 : 0);
 }
 
 void
index d3943d6ad0b94127a91a02b9cf4460de2b3b5ff2..96d307784c85b63a86670299cfd64892926b4259 100644 (file)
@@ -24,6 +24,7 @@ struct xfs_ifork {
        xfs_extnum_t            if_nextents;    /* # of extents in this fork */
        short                   if_broot_bytes; /* bytes allocated for root */
        int8_t                  if_format;      /* format of this fork */
+       uint8_t                 if_needextents; /* extents have not been read */
 };
 
 /*
@@ -260,9 +261,10 @@ int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
                uint nr_to_add);
 
 /* returns true if the fork has extents but they are not read in yet. */
-static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp)
+static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp)
 {
-       return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_height == 0;
+       /* see xfs_iformat_{data,attr}_fork() for needextents semantics */
+       return smp_load_acquire(&ifp->if_needextents) != 0;
 }
 
 #endif /* __XFS_INODE_FORK_H__ */