From: Darrick J. Wong Date: Thu, 1 Jun 2023 09:38:10 +0000 (+0200) Subject: xfs: _{attr,data}_map_shared should take ILOCK_EXCL until iread_extents is completely... X-Git-Tag: v6.4.0~33 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=0bf7f8c31f7d065819dabdf7eb9e14b158b85b75;p=thirdparty%2Fxfsprogs-dev.git xfs: _{attr,data}_map_shared should take ILOCK_EXCL until iread_extents is completely done Source kernel commit: c95356ca884885db702670e24933ee7f2b9f1754 While fuzzing the data fork extent count on a btree-format directory with xfs/375, I observed the following (excerpted) splat: XFS: Assertion failed: xfs_isilocked(ip, XFS_ILOCK_EXCL), file: fs/xfs/libxfs/xfs_bmap.c, line: 1208 ------------[ cut here ]------------ WARNING: CPU: 0 PID: 43192 at fs/xfs/xfs_message.c:104 assfail+0x46/0x4a [xfs] Call Trace: xfs_iread_extents+0x1af/0x210 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] xchk_dir_walk+0xb8/0x190 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] xchk_parent_count_parent_dentries+0x41/0x80 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] xchk_parent_validate+0x199/0x2e0 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] xchk_parent+0xdf/0x130 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] xfs_scrub_metadata+0x2b8/0x730 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] xfs_scrubv_metadata+0x38b/0x4d0 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] xfs_ioc_scrubv_metadata+0x111/0x160 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] xfs_file_ioctl+0x367/0xf50 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] __x64_sys_ioctl+0x82/0xa0 do_syscall_64+0x2b/0x80 entry_SYSCALL_64_after_hwframe+0x46/0xb0 The cause of this is a race condition in xfs_ilock_data_map_shared, which performs an unlocked access to the data fork to guess which lock mode it needs: Thread 0 Thread 1 xfs_need_iread_extents xfs_ilock(..., ILOCK_EXCL) xfs_iread_extents xfs_need_iread_extents xfs_ilock(..., ILOCK_SHARED) xfs_iunlock(..., ILOCK_EXCL) xfs_iread_extents *BOOM* Fix this race by adding a flag to the xfs_ifork structure to indicate that we have not yet read in the extent records and changing the predicate to look at the flag state, not if_height. The memory barrier ensures that the flag will not be set until the very end of the function. Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner Signed-off-by: Carlos Maiolino --- diff --git a/include/atomic.h b/include/atomic.h index 9c4aa5849..3b7eabd0f 100644 --- a/include/atomic.h +++ b/include/atomic.h @@ -118,4 +118,104 @@ atomic64_set(atomic64_t *a, int64_t v) #endif /* HAVE_URCU_ATOMIC64 */ +#define __smp_mb() cmm_smp_mb() + +/* from compiler_types.h */ +/* + * __unqual_scalar_typeof(x) - Declare an unqualified scalar type, leaving + * non-scalar types unchanged. + */ +/* + * Prefer C11 _Generic for better compile-times and simpler code. Note 'char' + * is not type-compatible with 'signed char', and we define a separate case. + */ +#define __scalar_type_to_expr_cases(type) \ + unsigned type: (unsigned type)0, \ + signed type: (signed type)0 + +#define __unqual_scalar_typeof(x) typeof( \ + _Generic((x), \ + char: (char)0, \ + __scalar_type_to_expr_cases(char), \ + __scalar_type_to_expr_cases(short), \ + __scalar_type_to_expr_cases(int), \ + __scalar_type_to_expr_cases(long), \ + __scalar_type_to_expr_cases(long long), \ + default: (x))) + +/* Is this type a native word size -- useful for atomic operations */ +#define __native_word(t) \ + (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \ + sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) + +#define compiletime_assert(a, s) BUILD_BUG_ON(!(a)) + +#define compiletime_assert_atomic_type(t) \ + compiletime_assert(__native_word(t), \ + "Need native word sized stores/loads for atomicity.") + +/* from rwonce.h */ +/* + * Yes, this permits 64-bit accesses on 32-bit architectures. These will + * actually be atomic in some cases (namely Armv7 + LPAE), but for others we + * rely on the access being split into 2x32-bit accesses for a 32-bit quantity + * (e.g. a virtual address) and a strong prevailing wind. + */ +#define compiletime_assert_rwonce_type(t) \ + compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ + "Unsupported access size for {READ,WRITE}_ONCE().") + +/* + * Use __READ_ONCE() instead of READ_ONCE() if you do not require any + * atomicity. Note that this may result in tears! + */ +#ifndef __READ_ONCE +#define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) +#endif + +#define READ_ONCE(x) \ +({ \ + compiletime_assert_rwonce_type(x); \ + __READ_ONCE(x); \ +}) + +#define __WRITE_ONCE(x, val) \ +do { \ + *(volatile typeof(x) *)&(x) = (val); \ +} while (0) + +#define WRITE_ONCE(x, val) \ +do { \ + compiletime_assert_rwonce_type(x); \ + __WRITE_ONCE(x, val); \ +} while (0) + +/* from barrier.h */ +#ifndef __smp_store_release +#define __smp_store_release(p, v) \ +do { \ + compiletime_assert_atomic_type(*p); \ + __smp_mb(); \ + WRITE_ONCE(*p, v); \ +} while (0) +#endif + +#ifndef __smp_load_acquire +#define __smp_load_acquire(p) \ +({ \ + __unqual_scalar_typeof(*p) ___p1 = READ_ONCE(*p); \ + compiletime_assert_atomic_type(*p); \ + __smp_mb(); \ + (typeof(*p))___p1; \ +}) +#endif + +#ifndef smp_store_release +#define smp_store_release(p, v) __smp_store_release((p), (v)) +#endif + +#ifndef smp_load_acquire +#define smp_load_acquire(p) __smp_load_acquire(p) +#endif + #endif /* __ATOMIC_H__ */ diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h index 386f5706f..dce5abf29 100644 --- a/libxfs/libxfs_priv.h +++ b/libxfs/libxfs_priv.h @@ -202,9 +202,6 @@ static inline bool WARN_ON(bool expr) { #define percpu_counter_read_positive(x) ((*x) > 0 ? (*x) : 0) #define percpu_counter_sum(x) (*x) -#define READ_ONCE(x) (x) -#define WRITE_ONCE(x, val) ((x) = (val)) - /* * get_random_u32 is used for di_gen inode allocation, it must be zero for * libxfs or all sorts of badness can occur! diff --git a/libxfs/xfs_bmap.c b/libxfs/xfs_bmap.c index 01d3f8a82..ab28bedd0 100644 --- a/libxfs/xfs_bmap.c +++ b/libxfs/xfs_bmap.c @@ -1193,6 +1193,12 @@ xfs_iread_extents( goto out; } ASSERT(ir.loaded == xfs_iext_count(ifp)); + /* + * Use release semantics so that we can use acquire semantics in + * xfs_need_iread_extents and be guaranteed to see a valid mapping tree + * after that load. + */ + smp_store_release(&ifp->if_needextents, 0); return 0; out: xfs_iext_destroy(ifp); diff --git a/libxfs/xfs_inode_fork.c b/libxfs/xfs_inode_fork.c index 47be0d572..5cc056ff7 100644 --- a/libxfs/xfs_inode_fork.c +++ b/libxfs/xfs_inode_fork.c @@ -225,10 +225,15 @@ xfs_iformat_data_fork( /* * Initialize the extent count early, as the per-format routines may - * depend on it. + * depend on it. Use release semantics to set needextents /after/ we + * set the format. This ensures that we can use acquire semantics on + * needextents in xfs_need_iread_extents() and be guaranteed to see a + * valid format value after that load. */ ip->i_df.if_format = dip->di_format; ip->i_df.if_nextents = xfs_dfork_data_extents(dip); + smp_store_release(&ip->i_df.if_needextents, + ip->i_df.if_format == XFS_DINODE_FMT_BTREE ? 1 : 0); switch (inode->i_mode & S_IFMT) { case S_IFIFO: @@ -281,8 +286,17 @@ xfs_ifork_init_attr( enum xfs_dinode_fmt format, xfs_extnum_t nextents) { + /* + * Initialize the extent count early, as the per-format routines may + * depend on it. Use release semantics to set needextents /after/ we + * set the format. This ensures that we can use acquire semantics on + * needextents in xfs_need_iread_extents() and be guaranteed to see a + * valid format value after that load. + */ ip->i_af.if_format = format; ip->i_af.if_nextents = nextents; + smp_store_release(&ip->i_af.if_needextents, + ip->i_af.if_format == XFS_DINODE_FMT_BTREE ? 1 : 0); } void diff --git a/libxfs/xfs_inode_fork.h b/libxfs/xfs_inode_fork.h index d3943d6ad..96d307784 100644 --- a/libxfs/xfs_inode_fork.h +++ b/libxfs/xfs_inode_fork.h @@ -24,6 +24,7 @@ struct xfs_ifork { xfs_extnum_t if_nextents; /* # of extents in this fork */ short if_broot_bytes; /* bytes allocated for root */ int8_t if_format; /* format of this fork */ + uint8_t if_needextents; /* extents have not been read */ }; /* @@ -260,9 +261,10 @@ int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip, uint nr_to_add); /* returns true if the fork has extents but they are not read in yet. */ -static inline bool xfs_need_iread_extents(struct xfs_ifork *ifp) +static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp) { - return ifp->if_format == XFS_DINODE_FMT_BTREE && ifp->if_height == 0; + /* see xfs_iformat_{data,attr}_fork() for needextents semantics */ + return smp_load_acquire(&ifp->if_needextents) != 0; } #endif /* __XFS_INODE_FORK_H__ */