From: Greg Kroah-Hartman Date: Mon, 5 Jun 2017 14:08:53 +0000 (+0200) Subject: 4.9-stable patches X-Git-Tag: v3.18.56~9 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b5e5defa077f1e319bb080a0ae2da062353a9e0b;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: xfs-fix-kernel-memory-exposure-problems.patch xfs-fix-missed-holes-in-seek_hole-implementation.patch xfs-fix-off-by-one-on-max-nr_pages-in-xfs_find_get_desired_pgoff.patch xfs-rework-the-inline-directory-verifiers.patch xfs-use-b_state-to-fix-buffer-i-o-accounting-release-race.patch xfs-use-dedicated-log-worker-wq-to-avoid-deadlock-with-cil-wq.patch xfs-verify-inline-directory-data-forks.patch --- diff --git a/queue-4.9/series b/queue-4.9/series index 9255228e0a4..02663fc2301 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -66,3 +66,10 @@ ksm-prevent-crash-after-write_protect_page-fails.patch slub-memcg-cure-the-brainless-abuse-of-sysfs-attributes.patch mm-slub.c-trace-free-objects-at-kern_info.patch drm-gma500-psb-actually-use-vbt-mode-when-it-is-found.patch +xfs-fix-missed-holes-in-seek_hole-implementation.patch +xfs-use-b_state-to-fix-buffer-i-o-accounting-release-race.patch +xfs-fix-off-by-one-on-max-nr_pages-in-xfs_find_get_desired_pgoff.patch +xfs-verify-inline-directory-data-forks.patch +xfs-rework-the-inline-directory-verifiers.patch +xfs-fix-kernel-memory-exposure-problems.patch +xfs-use-dedicated-log-worker-wq-to-avoid-deadlock-with-cil-wq.patch diff --git a/queue-4.9/xfs-fix-kernel-memory-exposure-problems.patch b/queue-4.9/xfs-fix-kernel-memory-exposure-problems.patch new file mode 100644 index 00000000000..ff07df35e54 --- /dev/null +++ b/queue-4.9/xfs-fix-kernel-memory-exposure-problems.patch @@ -0,0 +1,32 @@ +From bf9216f922612d2db7666aae01e65064da2ffb3a Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Mon, 3 Apr 2017 12:22:39 -0700 +Subject: xfs: fix kernel memory exposure problems + +From: Darrick J. 
Wong + +commit bf9216f922612d2db7666aae01e65064da2ffb3a upstream. + +Fix a memory exposure problems in inumbers where we allocate an array of +structures with holes, fail to zero the holes, then blindly copy the +kernel memory contents (junk and all) into userspace. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_itable.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/xfs_itable.c ++++ b/fs/xfs/xfs_itable.c +@@ -585,7 +585,7 @@ xfs_inumbers( + return error; + + bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer))); +- buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP); ++ buffer = kmem_zalloc(bcount * sizeof(*buffer), KM_SLEEP); + do { + struct xfs_inobt_rec_incore r; + int stat; diff --git a/queue-4.9/xfs-fix-missed-holes-in-seek_hole-implementation.patch b/queue-4.9/xfs-fix-missed-holes-in-seek_hole-implementation.patch new file mode 100644 index 00000000000..a8bbb397fd5 --- /dev/null +++ b/queue-4.9/xfs-fix-missed-holes-in-seek_hole-implementation.patch @@ -0,0 +1,87 @@ +From 5375023ae1266553a7baa0845e82917d8803f48c Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Thu, 18 May 2017 16:36:22 -0700 +Subject: xfs: Fix missed holes in SEEK_HOLE implementation + +From: Jan Kara + +commit 5375023ae1266553a7baa0845e82917d8803f48c upstream. + +XFS SEEK_HOLE implementation could miss a hole in an unwritten extent as +can be seen by the following command: + +xfs_io -c "falloc 0 256k" -c "pwrite 0 56k" -c "pwrite 128k 8k" + -c "seek -h 0" file +wrote 57344/57344 bytes at offset 0 +56 KiB, 14 ops; 0.0000 sec (49.312 MiB/sec and 12623.9856 ops/sec) +wrote 8192/8192 bytes at offset 131072 +8 KiB, 2 ops; 0.0000 sec (70.383 MiB/sec and 18018.0180 ops/sec) +Whence Result +HOLE 139264 + +Where we can see that hole at offset 56k was just ignored by SEEK_HOLE +implementation. 
The bug is in xfs_find_get_desired_pgoff() which does +not properly detect the case when pages are not contiguous. + +Fix the problem by properly detecting when found page has larger offset +than expected. + +Fixes: d126d43f631f996daeee5006714fed914be32368 +Signed-off-by: Jan Kara +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_file.c | 29 +++++++++-------------------- + 1 file changed, 9 insertions(+), 20 deletions(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -1163,17 +1163,6 @@ xfs_find_get_desired_pgoff( + break; + } + +- /* +- * At lease we found one page. If this is the first time we +- * step into the loop, and if the first page index offset is +- * greater than the given search offset, a hole was found. +- */ +- if (type == HOLE_OFF && lastoff == startoff && +- lastoff < page_offset(pvec.pages[0])) { +- found = true; +- break; +- } +- + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + loff_t b_offset; +@@ -1185,18 +1174,18 @@ xfs_find_get_desired_pgoff( + * file mapping. However, page->index will not change + * because we have a reference on the page. + * +- * Searching done if the page index is out of range. +- * If the current offset is not reaches the end of +- * the specified search range, there should be a hole +- * between them. ++ * If current page offset is beyond where we've ended, ++ * we've found a hole. + */ +- if (page->index > end) { +- if (type == HOLE_OFF && lastoff < endoff) { +- *offset = lastoff; +- found = true; +- } ++ if (type == HOLE_OFF && lastoff < endoff && ++ lastoff < page_offset(pvec.pages[i])) { ++ found = true; ++ *offset = lastoff; + goto out; + } ++ /* Searching done if the page index is out of range. 
*/ ++ if (page->index > end) ++ goto out; + + lock_page(page); + /* diff --git a/queue-4.9/xfs-fix-off-by-one-on-max-nr_pages-in-xfs_find_get_desired_pgoff.patch b/queue-4.9/xfs-fix-off-by-one-on-max-nr_pages-in-xfs_find_get_desired_pgoff.patch new file mode 100644 index 00000000000..80e1e58021d --- /dev/null +++ b/queue-4.9/xfs-fix-off-by-one-on-max-nr_pages-in-xfs_find_get_desired_pgoff.patch @@ -0,0 +1,54 @@ +From 8affebe16d79ebefb1d9d6d56a46dc89716f9453 Mon Sep 17 00:00:00 2001 +From: Eryu Guan +Date: Tue, 23 May 2017 08:30:46 -0700 +Subject: xfs: fix off-by-one on max nr_pages in xfs_find_get_desired_pgoff() + +From: Eryu Guan + +commit 8affebe16d79ebefb1d9d6d56a46dc89716f9453 upstream. + +xfs_find_get_desired_pgoff() is used to search for offset of hole or +data in page range [index, end] (both inclusive), and the max number +of pages to search should be at least one, if end == index. +Otherwise the only page is missed and no hole or data is found, +which is not correct. + +When block size is smaller than page size, this can be demonstrated +by preallocating a file with size smaller than page size and writing +data to the last block. E.g. run this xfs_io command on a 1k block +size XFS on x86_64 host. + + # xfs_io -fc "falloc 0 3k" -c "pwrite 2k 1k" \ + -c "seek -d 0" /mnt/xfs/testfile + wrote 1024/1024 bytes at offset 2048 + 1 KiB, 1 ops; 0.0000 sec (33.675 MiB/sec and 34482.7586 ops/sec) + Whence Result + DATA EOF + +Data at offset 2k was missed, and lseek(2) returned ENXIO. + +This is uncovered by generic/285 subtest 07 and 08 on ppc64 host, +where pagesize is 64k. Because a recent change to generic/285 +reduced the preallocated file size to smaller than 64k. + +Signed-off-by: Eryu Guan +Reviewed-by: Jan Kara +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_file.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -1136,7 +1136,7 @@ xfs_find_get_desired_pgoff( + unsigned nr_pages; + unsigned int i; + +- want = min_t(pgoff_t, end - index, PAGEVEC_SIZE); ++ want = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; + nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, + want); + /* diff --git a/queue-4.9/xfs-rework-the-inline-directory-verifiers.patch b/queue-4.9/xfs-rework-the-inline-directory-verifiers.patch new file mode 100644 index 00000000000..1369f2c250f --- /dev/null +++ b/queue-4.9/xfs-rework-the-inline-directory-verifiers.patch @@ -0,0 +1,332 @@ +From 78420281a9d74014af7616958806c3aba056319e Mon Sep 17 00:00:00 2001 +From: "Darrick J. Wong" +Date: Mon, 3 Apr 2017 12:22:20 -0700 +Subject: xfs: rework the inline directory verifiers + +From: Darrick J. Wong + +commit 78420281a9d74014af7616958806c3aba056319e upstream. + +The inline directory verifiers should be called on the inode fork data, +which means after iformat_local on the read side, and prior to +ifork_flush on the write side. This makes the fork verifier more +consistent with the way buffer verifiers work -- i.e. they will operate +on the memory buffer that the code will be reading and writing directly. + +Furthermore, revise the verifier function to return -EFSCORRUPTED so +that we don't flood the logs with corruption messages and assert +notices. This has been a particular problem with xfs/348, which +triggers the XFS_WANT_CORRUPTED_RETURN assertions, which halts the +kernel when CONFIG_XFS_DEBUG=y. Disk corruption isn't supposed to do +that, at least not in a verifier. + +Reviewed-by: Brian Foster +Reviewed-by: Christoph Hellwig +Signed-off-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_dir2_priv.h | 3 - + fs/xfs/libxfs/xfs_dir2_sf.c | 63 ++++++++++++++++++++++++++--------------- + fs/xfs/libxfs/xfs_inode_fork.c | 35 ++++++++-------------- + fs/xfs/libxfs/xfs_inode_fork.h | 2 - + fs/xfs/xfs_inode.c | 19 ++++++------ + 5 files changed, 66 insertions(+), 56 deletions(-) + +--- a/fs/xfs/libxfs/xfs_dir2_priv.h ++++ b/fs/xfs/libxfs/xfs_dir2_priv.h +@@ -126,8 +126,7 @@ extern int xfs_dir2_sf_create(struct xfs + extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); + extern int xfs_dir2_sf_removename(struct xfs_da_args *args); + extern int xfs_dir2_sf_replace(struct xfs_da_args *args); +-extern int xfs_dir2_sf_verify(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *sfp, +- int size); ++extern int xfs_dir2_sf_verify(struct xfs_inode *ip); + + /* xfs_dir2_readdir.c */ + extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, +--- a/fs/xfs/libxfs/xfs_dir2_sf.c ++++ b/fs/xfs/libxfs/xfs_dir2_sf.c +@@ -632,36 +632,49 @@ xfs_dir2_sf_check( + /* Verify the consistency of an inline directory. */ + int + xfs_dir2_sf_verify( +- struct xfs_mount *mp, +- struct xfs_dir2_sf_hdr *sfp, +- int size) ++ struct xfs_inode *ip) + { ++ struct xfs_mount *mp = ip->i_mount; ++ struct xfs_dir2_sf_hdr *sfp; + struct xfs_dir2_sf_entry *sfep; + struct xfs_dir2_sf_entry *next_sfep; + char *endp; + const struct xfs_dir_ops *dops; ++ struct xfs_ifork *ifp; + xfs_ino_t ino; + int i; + int i8count; + int offset; ++ int size; ++ int error; + __uint8_t filetype; + ++ ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL); ++ /* ++ * xfs_iread calls us before xfs_setup_inode sets up ip->d_ops, ++ * so we can only trust the mountpoint to have the right pointer. ++ */ + dops = xfs_dir_get_ops(mp, NULL); + ++ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ++ sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data; ++ size = ifp->if_bytes; ++ + /* + * Give up if the directory is way too short. 
+ */ +- XFS_WANT_CORRUPTED_RETURN(mp, size > +- offsetof(struct xfs_dir2_sf_hdr, parent)); +- XFS_WANT_CORRUPTED_RETURN(mp, size >= +- xfs_dir2_sf_hdr_size(sfp->i8count)); ++ if (size <= offsetof(struct xfs_dir2_sf_hdr, parent) || ++ size < xfs_dir2_sf_hdr_size(sfp->i8count)) ++ return -EFSCORRUPTED; + + endp = (char *)sfp + size; + + /* Check .. entry */ + ino = dops->sf_get_parent_ino(sfp); + i8count = ino > XFS_DIR2_MAX_SHORT_INUM; +- XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); ++ error = xfs_dir_ino_validate(mp, ino); ++ if (error) ++ return error; + offset = dops->data_first_offset; + + /* Check all reported entries */ +@@ -672,12 +685,12 @@ xfs_dir2_sf_verify( + * Check the fixed-offset parts of the structure are + * within the data buffer. + */ +- XFS_WANT_CORRUPTED_RETURN(mp, +- ((char *)sfep + sizeof(*sfep)) < endp); ++ if (((char *)sfep + sizeof(*sfep)) >= endp) ++ return -EFSCORRUPTED; + + /* Don't allow names with known bad length. */ +- XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen > 0); +- XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen < MAXNAMELEN); ++ if (sfep->namelen == 0) ++ return -EFSCORRUPTED; + + /* + * Check that the variable-length part of the structure is +@@ -685,33 +698,39 @@ xfs_dir2_sf_verify( + * name component, so nextentry is an acceptable test. + */ + next_sfep = dops->sf_nextentry(sfp, sfep); +- XFS_WANT_CORRUPTED_RETURN(mp, endp >= (char *)next_sfep); ++ if (endp < (char *)next_sfep) ++ return -EFSCORRUPTED; + + /* Check that the offsets always increase. */ +- XFS_WANT_CORRUPTED_RETURN(mp, +- xfs_dir2_sf_get_offset(sfep) >= offset); ++ if (xfs_dir2_sf_get_offset(sfep) < offset) ++ return -EFSCORRUPTED; + + /* Check the inode number. */ + ino = dops->sf_get_ino(sfp, sfep); + i8count += ino > XFS_DIR2_MAX_SHORT_INUM; +- XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); ++ error = xfs_dir_ino_validate(mp, ino); ++ if (error) ++ return error; + + /* Check the file type. 
*/ + filetype = dops->sf_get_ftype(sfep); +- XFS_WANT_CORRUPTED_RETURN(mp, filetype < XFS_DIR3_FT_MAX); ++ if (filetype >= XFS_DIR3_FT_MAX) ++ return -EFSCORRUPTED; + + offset = xfs_dir2_sf_get_offset(sfep) + + dops->data_entsize(sfep->namelen); + + sfep = next_sfep; + } +- XFS_WANT_CORRUPTED_RETURN(mp, i8count == sfp->i8count); +- XFS_WANT_CORRUPTED_RETURN(mp, (void *)sfep == (void *)endp); ++ if (i8count != sfp->i8count) ++ return -EFSCORRUPTED; ++ if ((void *)sfep != (void *)endp) ++ return -EFSCORRUPTED; + + /* Make sure this whole thing ought to be in local format. */ +- XFS_WANT_CORRUPTED_RETURN(mp, offset + +- (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + +- (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dir_geo->blksize); ++ if (offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + ++ (uint)sizeof(xfs_dir2_block_tail_t) > mp->m_dir_geo->blksize) ++ return -EFSCORRUPTED; + + return 0; + } +--- a/fs/xfs/libxfs/xfs_inode_fork.c ++++ b/fs/xfs/libxfs/xfs_inode_fork.c +@@ -212,6 +212,16 @@ xfs_iformat_fork( + if (error) + return error; + ++ /* Check inline dir contents. 
*/ ++ if (S_ISDIR(VFS_I(ip)->i_mode) && ++ dip->di_format == XFS_DINODE_FMT_LOCAL) { ++ error = xfs_dir2_sf_verify(ip); ++ if (error) { ++ xfs_idestroy_fork(ip, XFS_DATA_FORK); ++ return error; ++ } ++ } ++ + if (xfs_is_reflink_inode(ip)) { + ASSERT(ip->i_cowfp == NULL); + xfs_ifork_init_cow(ip); +@@ -322,8 +332,6 @@ xfs_iformat_local( + int whichfork, + int size) + { +- int error; +- + /* + * If the size is unreasonable, then something + * is wrong and we just bail out rather than crash in +@@ -339,14 +347,6 @@ xfs_iformat_local( + return -EFSCORRUPTED; + } + +- if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { +- error = xfs_dir2_sf_verify(ip->i_mount, +- (struct xfs_dir2_sf_hdr *)XFS_DFORK_DPTR(dip), +- size); +- if (error) +- return error; +- } +- + xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size); + return 0; + } +@@ -867,7 +867,7 @@ xfs_iextents_copy( + * In these cases, the format always takes precedence, because the + * format indicates the current state of the fork. 
+ */ +-int ++void + xfs_iflush_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip, +@@ -877,7 +877,6 @@ xfs_iflush_fork( + char *cp; + xfs_ifork_t *ifp; + xfs_mount_t *mp; +- int error; + static const short brootflag[2] = + { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; + static const short dataflag[2] = +@@ -886,7 +885,7 @@ xfs_iflush_fork( + { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; + + if (!iip) +- return 0; ++ return; + ifp = XFS_IFORK_PTR(ip, whichfork); + /* + * This can happen if we gave up in iformat in an error path, +@@ -894,19 +893,12 @@ xfs_iflush_fork( + */ + if (!ifp) { + ASSERT(whichfork == XFS_ATTR_FORK); +- return 0; ++ return; + } + cp = XFS_DFORK_PTR(dip, whichfork); + mp = ip->i_mount; + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: +- if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { +- error = xfs_dir2_sf_verify(mp, +- (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data, +- ifp->if_bytes); +- if (error) +- return error; +- } + if ((iip->ili_fields & dataflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(ifp->if_u1.if_data != NULL); +@@ -959,7 +951,6 @@ xfs_iflush_fork( + ASSERT(0); + break; + } +- return 0; + } + + /* +--- a/fs/xfs/libxfs/xfs_inode_fork.h ++++ b/fs/xfs/libxfs/xfs_inode_fork.h +@@ -140,7 +140,7 @@ typedef struct xfs_ifork { + struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); + + int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); +-int xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, ++void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, + struct xfs_inode_log_item *, int); + void xfs_idestroy_fork(struct xfs_inode *, int); + void xfs_idata_realloc(struct xfs_inode *, int, int); +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -50,6 +50,7 @@ + #include "xfs_log.h" + #include "xfs_bmap_btree.h" + #include "xfs_reflink.h" ++#include "xfs_dir2_priv.h" + + kmem_zone_t *xfs_inode_zone; + +@@ -3491,7 +3492,6 @@ xfs_iflush_int( + struct xfs_inode_log_item *iip = 
ip->i_itemp; + struct xfs_dinode *dip; + struct xfs_mount *mp = ip->i_mount; +- int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(xfs_isiflocked(ip)); +@@ -3563,6 +3563,12 @@ xfs_iflush_int( + if (ip->i_d.di_version < 3) + ip->i_d.di_flushiter++; + ++ /* Check the inline directory data. */ ++ if (S_ISDIR(VFS_I(ip)->i_mode) && ++ ip->i_d.di_format == XFS_DINODE_FMT_LOCAL && ++ xfs_dir2_sf_verify(ip)) ++ goto corrupt_out; ++ + /* + * Copy the dirty parts of the inode into the on-disk inode. We always + * copy out the core of the inode, because if the inode is dirty at all +@@ -3574,14 +3580,9 @@ xfs_iflush_int( + if (ip->i_d.di_flushiter == DI_MAX_FLUSH) + ip->i_d.di_flushiter = 0; + +- error = xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); +- if (error) +- return error; +- if (XFS_IFORK_Q(ip)) { +- error = xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); +- if (error) +- return error; +- } ++ xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); ++ if (XFS_IFORK_Q(ip)) ++ xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); + xfs_inobp_check(mp, bp); + + /* diff --git a/queue-4.9/xfs-use-b_state-to-fix-buffer-i-o-accounting-release-race.patch b/queue-4.9/xfs-use-b_state-to-fix-buffer-i-o-accounting-release-race.patch new file mode 100644 index 00000000000..074033ebde5 --- /dev/null +++ b/queue-4.9/xfs-use-b_state-to-fix-buffer-i-o-accounting-release-race.patch @@ -0,0 +1,155 @@ +From 63db7c815bc0997c29e484d2409684fdd9fcd93b Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Wed, 31 May 2017 08:22:52 -0700 +Subject: xfs: use ->b_state to fix buffer I/O accounting release race + +From: Brian Foster + +commit 63db7c815bc0997c29e484d2409684fdd9fcd93b upstream. + +We've had user reports of unmount hangs in xfs_wait_buftarg() that +analysis shows is due to btp->bt_io_count == -1. bt_io_count +represents the count of in-flight asynchronous buffers and thus +should always be >= 0. 
xfs_wait_buftarg() waits for this value to +stabilize to zero in order to ensure that all untracked (with +respect to the lru) buffers have completed I/O processing before +unmount proceeds to tear down in-core data structures. + +The value of -1 implies an I/O accounting decrement race. Indeed, +the fact that xfs_buf_ioacct_dec() is called from xfs_buf_rele() +(where the buffer lock is no longer held) means that bp->b_flags can +be updated from an unsafe context. While a user-level reproducer is +currently not available, some intrusive hacks to run racing buffer +lookups/ioacct/releases from multiple threads were used to +successfully manufacture this problem. + +Existing callers do not expect to acquire the buffer lock from +xfs_buf_rele(). Therefore, we can not safely update ->b_flags from +this context. It turns out that we already have separate buffer +state bits and associated serialization for dealing with buffer LRU +state in the form of ->b_state and ->b_lock. Therefore, replace the +_XBF_IN_FLIGHT flag with a ->b_state variant, update the I/O +accounting wrappers appropriately and make sure they are used with +the correct locking. This ensures that buffer in-flight state can be +modified at buffer release time without racing with modifications +from a buffer lock holder. + +Fixes: 9c7504aa72b6 ("xfs: track and serialize in-flight async buffers against unmount") +Signed-off-by: Brian Foster +Reviewed-by: Nikolay Borisov +Tested-by: Libor Pechacek +Reviewed-by: Darrick J.
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_buf.c | 38 ++++++++++++++++++++++++++------------ + fs/xfs/xfs_buf.h | 5 ++--- + 2 files changed, 28 insertions(+), 15 deletions(-) + +--- a/fs/xfs/xfs_buf.c ++++ b/fs/xfs/xfs_buf.c +@@ -96,12 +96,16 @@ static inline void + xfs_buf_ioacct_inc( + struct xfs_buf *bp) + { +- if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT)) ++ if (bp->b_flags & XBF_NO_IOACCT) + return; + + ASSERT(bp->b_flags & XBF_ASYNC); +- bp->b_flags |= _XBF_IN_FLIGHT; +- percpu_counter_inc(&bp->b_target->bt_io_count); ++ spin_lock(&bp->b_lock); ++ if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) { ++ bp->b_state |= XFS_BSTATE_IN_FLIGHT; ++ percpu_counter_inc(&bp->b_target->bt_io_count); ++ } ++ spin_unlock(&bp->b_lock); + } + + /* +@@ -109,14 +113,24 @@ xfs_buf_ioacct_inc( + * freed and unaccount from the buftarg. + */ + static inline void +-xfs_buf_ioacct_dec( ++__xfs_buf_ioacct_dec( + struct xfs_buf *bp) + { +- if (!(bp->b_flags & _XBF_IN_FLIGHT)) +- return; ++ ASSERT(spin_is_locked(&bp->b_lock)); + +- bp->b_flags &= ~_XBF_IN_FLIGHT; +- percpu_counter_dec(&bp->b_target->bt_io_count); ++ if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { ++ bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; ++ percpu_counter_dec(&bp->b_target->bt_io_count); ++ } ++} ++ ++static inline void ++xfs_buf_ioacct_dec( ++ struct xfs_buf *bp) ++{ ++ spin_lock(&bp->b_lock); ++ __xfs_buf_ioacct_dec(bp); ++ spin_unlock(&bp->b_lock); + } + + /* +@@ -148,9 +162,9 @@ xfs_buf_stale( + * unaccounted (released to LRU) before that occurs. Drop in-flight + * status now to preserve accounting consistency. + */ +- xfs_buf_ioacct_dec(bp); +- + spin_lock(&bp->b_lock); ++ __xfs_buf_ioacct_dec(bp); ++ + atomic_set(&bp->b_lru_ref, 0); + if (!(bp->b_state & XFS_BSTATE_DISPOSE) && + (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) +@@ -953,12 +967,12 @@ xfs_buf_rele( + * ensures the decrement occurs only once per-buf. 
+ */ + if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) +- xfs_buf_ioacct_dec(bp); ++ __xfs_buf_ioacct_dec(bp); + goto out_unlock; + } + + /* the last reference has been dropped ... */ +- xfs_buf_ioacct_dec(bp); ++ __xfs_buf_ioacct_dec(bp); + if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + /* + * If the buffer is added to the LRU take a new reference to the +--- a/fs/xfs/xfs_buf.h ++++ b/fs/xfs/xfs_buf.h +@@ -63,7 +63,6 @@ typedef enum { + #define _XBF_KMEM (1 << 21)/* backed by heap memory */ + #define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ + #define _XBF_COMPOUND (1 << 23)/* compound buffer */ +-#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */ + + typedef unsigned int xfs_buf_flags_t; + +@@ -83,14 +82,14 @@ typedef unsigned int xfs_buf_flags_t; + { _XBF_PAGES, "PAGES" }, \ + { _XBF_KMEM, "KMEM" }, \ + { _XBF_DELWRI_Q, "DELWRI_Q" }, \ +- { _XBF_COMPOUND, "COMPOUND" }, \ +- { _XBF_IN_FLIGHT, "IN_FLIGHT" } ++ { _XBF_COMPOUND, "COMPOUND" } + + + /* + * Internal state flags. + */ + #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ ++#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ + + /* + * The xfs_buftarg contains 2 notions of "sector size" - diff --git a/queue-4.9/xfs-use-dedicated-log-worker-wq-to-avoid-deadlock-with-cil-wq.patch b/queue-4.9/xfs-use-dedicated-log-worker-wq-to-avoid-deadlock-with-cil-wq.patch new file mode 100644 index 00000000000..ff6af5f6756 --- /dev/null +++ b/queue-4.9/xfs-use-dedicated-log-worker-wq-to-avoid-deadlock-with-cil-wq.patch @@ -0,0 +1,102 @@ +From 696a562072e3c14bcd13ae5acc19cdf27679e865 Mon Sep 17 00:00:00 2001 +From: Brian Foster +Date: Tue, 28 Mar 2017 14:51:44 -0700 +Subject: xfs: use dedicated log worker wq to avoid deadlock with cil wq + +From: Brian Foster + +commit 696a562072e3c14bcd13ae5acc19cdf27679e865 upstream. + +The log covering background task used to be part of the xfssyncd +workqueue. 
That workqueue was removed as of commit 5889608df ("xfs: +syncd workqueue is no more") and the associated work item scheduled +to the xfs-log wq. The latter is used for log buffer I/O completion. + +Since xfs_log_worker() can invoke a log flush, a deadlock is +possible between the xfs-log and xfs-cil workqueues. Consider the +following codepath from xfs_log_worker(): + +xfs_log_worker() + xfs_log_force() + _xfs_log_force() + xlog_cil_force() + xlog_cil_force_lsn() + xlog_cil_push_now() + flush_work() + +The above is in xfs-log wq context and blocked waiting on the +completion of an xfs-cil work item. Concurrently, the cil push in +progress can end up blocked here: + +xlog_cil_push_work() + xlog_cil_push() + xlog_write() + xlog_state_get_iclog_space() + xlog_wait(&log->l_flush_wait, ...) + +The above is in xfs-cil context waiting on log buffer I/O +completion, which executes in xfs-log wq context. In this scenario +both workqueues are deadlocked waiting on each other. + +Add a new workqueue specifically for the high level log covering and +ail pushing worker, as was the case prior to commit 5889608df. + +Diagnosed-by: David Jeffery +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J.
Wong +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/xfs_log.c | 2 +- + fs/xfs/xfs_mount.h | 1 + + fs/xfs/xfs_super.c | 8 ++++++++ + 3 files changed, 10 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -1293,7 +1293,7 @@ void + xfs_log_work_queue( + struct xfs_mount *mp) + { +- queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, ++ queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); + } + +--- a/fs/xfs/xfs_mount.h ++++ b/fs/xfs/xfs_mount.h +@@ -183,6 +183,7 @@ typedef struct xfs_mount { + struct workqueue_struct *m_reclaim_workqueue; + struct workqueue_struct *m_log_workqueue; + struct workqueue_struct *m_eofblocks_workqueue; ++ struct workqueue_struct *m_sync_workqueue; + + /* + * Generation of the filesysyem layout. This is incremented by each +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -872,8 +872,15 @@ xfs_init_mount_workqueues( + if (!mp->m_eofblocks_workqueue) + goto out_destroy_log; + ++ mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0, ++ mp->m_fsname); ++ if (!mp->m_sync_workqueue) ++ goto out_destroy_eofb; ++ + return 0; + ++out_destroy_eofb: ++ destroy_workqueue(mp->m_eofblocks_workqueue); + out_destroy_log: + destroy_workqueue(mp->m_log_workqueue); + out_destroy_reclaim: +@@ -894,6 +901,7 @@ STATIC void + xfs_destroy_mount_workqueues( + struct xfs_mount *mp) + { ++ destroy_workqueue(mp->m_sync_workqueue); + destroy_workqueue(mp->m_eofblocks_workqueue); + destroy_workqueue(mp->m_log_workqueue); + destroy_workqueue(mp->m_reclaim_workqueue); diff --git a/queue-4.9/xfs-verify-inline-directory-data-forks.patch b/queue-4.9/xfs-verify-inline-directory-data-forks.patch new file mode 100644 index 00000000000..dafbe3ad8a3 --- /dev/null +++ b/queue-4.9/xfs-verify-inline-directory-data-forks.patch @@ -0,0 +1,290 @@ +From 630a04e79dd41ff746b545d4fc052e0abb836120 Mon Sep 17 00:00:00 2001 +From: "Darrick J. 
Wong" +Date: Wed, 15 Mar 2017 00:24:25 -0700 +Subject: xfs: verify inline directory data forks + +From: Darrick J. Wong + +commit 630a04e79dd41ff746b545d4fc052e0abb836120 upstream. + +When we're reading or writing the data fork of an inline directory, +check the contents to make sure we're not overflowing buffers or eating +garbage data. xfs/348 corrupts an inline symlink into an inline +directory, triggering a buffer overflow bug. + +v2: add more checks consistent with _dir2_sf_check and make the verifier +usable from anywhere. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Signed-off-by: Greg Kroah-Hartman + +--- + fs/xfs/libxfs/xfs_dir2_priv.h | 2 + fs/xfs/libxfs/xfs_dir2_sf.c | 87 +++++++++++++++++++++++++++++++++++++++++ + fs/xfs/libxfs/xfs_inode_fork.c | 26 ++++++++++-- + fs/xfs/libxfs/xfs_inode_fork.h | 2 + fs/xfs/xfs_dir2_readdir.c | 11 ----- + fs/xfs/xfs_inode.c | 12 ++++- + 6 files changed, 122 insertions(+), 18 deletions(-) + +--- a/fs/xfs/libxfs/xfs_dir2_priv.h ++++ b/fs/xfs/libxfs/xfs_dir2_priv.h +@@ -126,6 +126,8 @@ extern int xfs_dir2_sf_create(struct xfs + extern int xfs_dir2_sf_lookup(struct xfs_da_args *args); + extern int xfs_dir2_sf_removename(struct xfs_da_args *args); + extern int xfs_dir2_sf_replace(struct xfs_da_args *args); ++extern int xfs_dir2_sf_verify(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *sfp, ++ int size); + + /* xfs_dir2_readdir.c */ + extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx, +--- a/fs/xfs/libxfs/xfs_dir2_sf.c ++++ b/fs/xfs/libxfs/xfs_dir2_sf.c +@@ -629,6 +629,93 @@ xfs_dir2_sf_check( + } + #endif /* DEBUG */ + ++/* Verify the consistency of an inline directory. 
*/ ++int ++xfs_dir2_sf_verify( ++ struct xfs_mount *mp, ++ struct xfs_dir2_sf_hdr *sfp, ++ int size) ++{ ++ struct xfs_dir2_sf_entry *sfep; ++ struct xfs_dir2_sf_entry *next_sfep; ++ char *endp; ++ const struct xfs_dir_ops *dops; ++ xfs_ino_t ino; ++ int i; ++ int i8count; ++ int offset; ++ __uint8_t filetype; ++ ++ dops = xfs_dir_get_ops(mp, NULL); ++ ++ /* ++ * Give up if the directory is way too short. ++ */ ++ XFS_WANT_CORRUPTED_RETURN(mp, size > ++ offsetof(struct xfs_dir2_sf_hdr, parent)); ++ XFS_WANT_CORRUPTED_RETURN(mp, size >= ++ xfs_dir2_sf_hdr_size(sfp->i8count)); ++ ++ endp = (char *)sfp + size; ++ ++ /* Check .. entry */ ++ ino = dops->sf_get_parent_ino(sfp); ++ i8count = ino > XFS_DIR2_MAX_SHORT_INUM; ++ XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); ++ offset = dops->data_first_offset; ++ ++ /* Check all reported entries */ ++ sfep = xfs_dir2_sf_firstentry(sfp); ++ for (i = 0; i < sfp->count; i++) { ++ /* ++ * struct xfs_dir2_sf_entry has a variable length. ++ * Check the fixed-offset parts of the structure are ++ * within the data buffer. ++ */ ++ XFS_WANT_CORRUPTED_RETURN(mp, ++ ((char *)sfep + sizeof(*sfep)) < endp); ++ ++ /* Don't allow names with known bad length. */ ++ XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen > 0); ++ XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen < MAXNAMELEN); ++ ++ /* ++ * Check that the variable-length part of the structure is ++ * within the data buffer. The next entry starts after the ++ * name component, so nextentry is an acceptable test. ++ */ ++ next_sfep = dops->sf_nextentry(sfp, sfep); ++ XFS_WANT_CORRUPTED_RETURN(mp, endp >= (char *)next_sfep); ++ ++ /* Check that the offsets always increase. */ ++ XFS_WANT_CORRUPTED_RETURN(mp, ++ xfs_dir2_sf_get_offset(sfep) >= offset); ++ ++ /* Check the inode number. */ ++ ino = dops->sf_get_ino(sfp, sfep); ++ i8count += ino > XFS_DIR2_MAX_SHORT_INUM; ++ XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino)); ++ ++ /* Check the file type. 
*/ ++ filetype = dops->sf_get_ftype(sfep); ++ XFS_WANT_CORRUPTED_RETURN(mp, filetype < XFS_DIR3_FT_MAX); ++ ++ offset = xfs_dir2_sf_get_offset(sfep) + ++ dops->data_entsize(sfep->namelen); ++ ++ sfep = next_sfep; ++ } ++ XFS_WANT_CORRUPTED_RETURN(mp, i8count == sfp->i8count); ++ XFS_WANT_CORRUPTED_RETURN(mp, (void *)sfep == (void *)endp); ++ ++ /* Make sure this whole thing ought to be in local format. */ ++ XFS_WANT_CORRUPTED_RETURN(mp, offset + ++ (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + ++ (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dir_geo->blksize); ++ ++ return 0; ++} ++ + /* + * Create a new (shortform) directory. + */ +--- a/fs/xfs/libxfs/xfs_inode_fork.c ++++ b/fs/xfs/libxfs/xfs_inode_fork.c +@@ -33,6 +33,8 @@ + #include "xfs_trace.h" + #include "xfs_attr_sf.h" + #include "xfs_da_format.h" ++#include "xfs_da_btree.h" ++#include "xfs_dir2_priv.h" + + kmem_zone_t *xfs_ifork_zone; + +@@ -320,6 +322,7 @@ xfs_iformat_local( + int whichfork, + int size) + { ++ int error; + + /* + * If the size is unreasonable, then something +@@ -336,6 +339,14 @@ xfs_iformat_local( + return -EFSCORRUPTED; + } + ++ if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { ++ error = xfs_dir2_sf_verify(ip->i_mount, ++ (struct xfs_dir2_sf_hdr *)XFS_DFORK_DPTR(dip), ++ size); ++ if (error) ++ return error; ++ } ++ + xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size); + return 0; + } +@@ -856,7 +867,7 @@ xfs_iextents_copy( + * In these cases, the format always takes precedence, because the + * format indicates the current state of the fork. 
+ */ +-void ++int + xfs_iflush_fork( + xfs_inode_t *ip, + xfs_dinode_t *dip, +@@ -866,6 +877,7 @@ xfs_iflush_fork( + char *cp; + xfs_ifork_t *ifp; + xfs_mount_t *mp; ++ int error; + static const short brootflag[2] = + { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; + static const short dataflag[2] = +@@ -874,7 +886,7 @@ xfs_iflush_fork( + { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; + + if (!iip) +- return; ++ return 0; + ifp = XFS_IFORK_PTR(ip, whichfork); + /* + * This can happen if we gave up in iformat in an error path, +@@ -882,12 +894,19 @@ xfs_iflush_fork( + */ + if (!ifp) { + ASSERT(whichfork == XFS_ATTR_FORK); +- return; ++ return 0; + } + cp = XFS_DFORK_PTR(dip, whichfork); + mp = ip->i_mount; + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: ++ if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) { ++ error = xfs_dir2_sf_verify(mp, ++ (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data, ++ ifp->if_bytes); ++ if (error) ++ return error; ++ } + if ((iip->ili_fields & dataflag[whichfork]) && + (ifp->if_bytes > 0)) { + ASSERT(ifp->if_u1.if_data != NULL); +@@ -940,6 +959,7 @@ xfs_iflush_fork( + ASSERT(0); + break; + } ++ return 0; + } + + /* +--- a/fs/xfs/libxfs/xfs_inode_fork.h ++++ b/fs/xfs/libxfs/xfs_inode_fork.h +@@ -140,7 +140,7 @@ typedef struct xfs_ifork { + struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state); + + int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *); +-void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, ++int xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *, + struct xfs_inode_log_item *, int); + void xfs_idestroy_fork(struct xfs_inode *, int); + void xfs_idata_realloc(struct xfs_inode *, int, int); +--- a/fs/xfs/xfs_dir2_readdir.c ++++ b/fs/xfs/xfs_dir2_readdir.c +@@ -71,22 +71,11 @@ xfs_dir2_sf_getdents( + struct xfs_da_geometry *geo = args->geo; + + ASSERT(dp->i_df.if_flags & XFS_IFINLINE); +- /* +- * Give up if the directory is way too short. 
+- */ +- if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { +- ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); +- return -EIO; +- } +- + ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); + ASSERT(dp->i_df.if_u1.if_data != NULL); + + sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; + +- if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count)) +- return -EFSCORRUPTED; +- + /* + * If the block number in the offset is out of range, we're done. + */ +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -3491,6 +3491,7 @@ xfs_iflush_int( + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_dinode *dip; + struct xfs_mount *mp = ip->i_mount; ++ int error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); + ASSERT(xfs_isiflocked(ip)); +@@ -3573,9 +3574,14 @@ xfs_iflush_int( + if (ip->i_d.di_flushiter == DI_MAX_FLUSH) + ip->i_d.di_flushiter = 0; + +- xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); +- if (XFS_IFORK_Q(ip)) +- xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); ++ error = xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); ++ if (error) ++ return error; ++ if (XFS_IFORK_Q(ip)) { ++ error = xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); ++ if (error) ++ return error; ++ } + xfs_inobp_check(mp, bp); + + /*