erofs: handle overlapped pclusters out of crafted images properly
author:    Gao Xiang <hsiangkao@linux.alibaba.com>
           Tue, 10 Sep 2024 07:08:47 +0000 (15:08 +0800)
committer: Gao Xiang <hsiangkao@linux.alibaba.com>
           Tue, 10 Sep 2024 07:26:15 +0000 (15:26 +0800)
syzbot reported a task hang caused by a deadlock: a task ends up
waiting for the folio lock of a cached folio that will be used for
cache I/Os.

After looking into the crafted image, I found it's formed from
several overlapped big pclusters as below:

 Ext:   logical offset   |  length :     physical offset    |  length
   0:        0..   16384 |   16384 :     151552..    167936 |   16384
   1:    16384..   32768 |   16384 :     155648..    172032 |   16384
   2:    32768..   49152 |   16384 :  537223168.. 537239552 |   16384
...

Here, extents 0 and 1 physically overlap, which is entirely
_impossible_ in normal filesystem images generated by mkfs.
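
To make this concrete, here is a minimal userspace sketch (not erofs
code; the struct and check are purely illustrative) that flags this
kind of physical overlap between extents:

  #include <stdbool.h>
  #include <stdio.h>

  struct extent {
          unsigned long long p_off;       /* physical offset */
          unsigned long long p_len;       /* physical length */
  };

  /* two pclusters overlap iff their physical ranges intersect */
  static bool extents_overlap(const struct extent *a, const struct extent *b)
  {
          return a->p_off < b->p_off + b->p_len &&
                 b->p_off < a->p_off + a->p_len;
  }

  int main(void)
  {
          /* extents 0 and 1 from the fuzzed image above */
          struct extent e0 = { 151552, 16384 };
          struct extent e1 = { 155648, 16384 };

          printf("overlap: %d\n", extents_overlap(&e0, &e1)); /* prints 1 */
          return 0;
  }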

First, managed folios containing compressed data are marked up-to-date
and then unlocked immediately (unlike in-place folios) once compressed
I/Os complete.  If physical blocks are not submitted in incremental
order, separate BIOs are needed to avoid dependency issues.  However,
the current code mis-arranges z_erofs_fill_bio_vec() relative to BIO
submission, which causes unexpected BIO waits.
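
As a rough userspace model of that rule (illustrative names only, not
the kernel API), a pending batch must be flushed as soon as the next
physical block is not the direct successor of the previous one:

  #include <stdio.h>

  static unsigned long long batch_start = -1ULL, batch_end;

  static void flush_batch(void)           /* stands in for submit_bio() */
  {
          if (batch_start != -1ULL)
                  printf("submit BIO: [%llu, %llu)\n", batch_start, batch_end);
          batch_start = -1ULL;
  }

  /* queue one physical range; start a new batch on any discontiguity */
  static void queue_range(unsigned long long pa, unsigned long long len)
  {
          if (batch_start != -1ULL && pa != batch_end)
                  flush_batch();          /* not incremental: drain first */
          if (batch_start == -1ULL)
                  batch_start = pa;
          batch_end = pa + len;
  }

  int main(void)
  {
          queue_range(151552, 16384);     /* extent 0 */
          queue_range(155648, 16384);     /* extent 1: 155648 != 167936 */
          flush_batch();
          return 0;
  }

Since the fill helper may lock a folio that only an earlier BIO's
completion would unlock, draining before filling is what breaks the
reported deadlock.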

Second, managed folios are connected to their own pclusters for
efficient inter-queries.  However, this is hard to implement correctly
if overlapped big pclusters exist.  Again, these only appear in fuzzed
images, so let's simply fall back to temporary short-lived pages for
correctness.
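
A minimal sketch of that ownership rule (illustrative types, not the
kernel API): a cached folio may only serve cache I/O for the pcluster
it is already attached to; anything else drops to a short-lived page:

  struct pcluster;                        /* opaque here */

  struct folio_model {
          struct pcluster *private;       /* owning pcluster, if any */
  };

  enum source { REUSE_CACHED, USE_SHORTLIVED };

  /* only the folio's own pcluster may reuse it for cache I/O */
  static enum source pick_source(const struct folio_model *folio,
                                 const struct pcluster *pcl)
  {
          if (folio->private == pcl)
                  return REUSE_CACHED;
          /* attached to an overlapped pcluster: crafted images only */
          return USE_SHORTLIVED;
  }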

Additionally, it justifies that referenced managed folios cannot be
truncated for now, and it reverts part of commit 2080ca1ed3e4 ("erofs:
tidy up `struct z_erofs_bvec`") for simplicity, although it shouldn't
make any difference.

Reported-by: syzbot+4fc98ed414ae63d1ada2@syzkaller.appspotmail.com
Reported-by: syzbot+de04e06b28cfecf2281c@syzkaller.appspotmail.com
Reported-by: syzbot+c8c8238b394be4a1087d@syzkaller.appspotmail.com
Tested-by: syzbot+4fc98ed414ae63d1ada2@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/r/0000000000002fda01061e334873@google.com
Fixes: 8e6c8fa9f2e9 ("erofs: enable big pcluster feature")
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Link: https://lore.kernel.org/r/20240910070847.3356592-1-hsiangkao@linux.alibaba.com
fs/erofs/zdata.c

index 424f656cd765e24f3bd9c8bdf98bc8b09f33c706..a0bae499c5ff65e03e38c93135bbe610b178cf4e 100644
@@ -1428,6 +1428,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
        struct z_erofs_bvec zbv;
        struct address_space *mapping;
        struct folio *folio;
+       struct page *page;
        int bs = i_blocksize(f->inode);
 
        /* Except for inplace folios, the entire folio can be used for I/Os */
@@ -1450,7 +1451,6 @@ repeat:
         * file-backed folios will be used instead.
         */
        if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) {
-               folio->private = 0;
                tocache = true;
                goto out_tocache;
        }
@@ -1468,7 +1468,7 @@ repeat:
        }
 
        folio_lock(folio);
-       if (folio->mapping == mc) {
+       if (likely(folio->mapping == mc)) {
                /*
                 * The cached folio is still in managed cache but without
                 * a valid `->private` pcluster hint.  Let's reconnect them.
@@ -1478,41 +1478,44 @@ repeat:
                        /* compressed_bvecs[] already takes a ref before */
                        folio_put(folio);
                }
-
-               /* no need to submit if it is already up-to-date */
-               if (folio_test_uptodate(folio)) {
-                       folio_unlock(folio);
-                       bvec->bv_page = NULL;
+               if (likely(folio->private == pcl))  {
+                       /* don't submit cache I/Os again if already uptodate */
+                       if (folio_test_uptodate(folio)) {
+                               folio_unlock(folio);
+                               bvec->bv_page = NULL;
+                       }
+                       return;
                }
-               return;
+               /*
+                * Already linked with another pcluster, which only appears in
+                * crafted images by fuzzers for now.  But handle this anyway.
+                */
+               tocache = false;        /* use temporary short-lived pages */
+       } else {
+               DBG_BUGON(1); /* referenced managed folios can't be truncated */
+               tocache = true;
        }
-
-       /*
-        * It has been truncated, so it's unsafe to reuse this one. Let's
-        * allocate a new page for compressed data.
-        */
-       DBG_BUGON(folio->mapping);
-       tocache = true;
        folio_unlock(folio);
        folio_put(folio);
 out_allocfolio:
-       zbv.page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
+       page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
        spin_lock(&pcl->obj.lockref.lock);
-       if (pcl->compressed_bvecs[nr].page) {
-               erofs_pagepool_add(&f->pagepool, zbv.page);
+       if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
+               erofs_pagepool_add(&f->pagepool, page);
                spin_unlock(&pcl->obj.lockref.lock);
                cond_resched();
                goto repeat;
        }
-       bvec->bv_page = pcl->compressed_bvecs[nr].page = zbv.page;
-       folio = page_folio(zbv.page);
-       /* first mark it as a temporary shortlived folio (now 1 ref) */
-       folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
+       bvec->bv_page = pcl->compressed_bvecs[nr].page = page;
+       folio = page_folio(page);
        spin_unlock(&pcl->obj.lockref.lock);
 out_tocache:
        if (!tocache || bs != PAGE_SIZE ||
-           filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp))
+           filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) {
+               /* turn into a temporary shortlived folio (1 ref) */
+               folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
                return;
+       }
        folio_attach_private(folio, pcl);
        /* drop a refcount added by allocpage (then 2 refs in total here) */
        folio_put(folio);
@@ -1647,13 +1650,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
                cur = mdev.m_pa;
                end = cur + pcl->pclustersize;
                do {
-                       z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
-                       if (!bvec.bv_page)
-                               continue;
-
+                       bvec.bv_page = NULL;
                        if (bio && (cur != last_pa ||
                                    bio->bi_bdev != mdev.m_bdev)) {
-io_retry:
+drain_io:
                                if (!erofs_is_fscache_mode(sb))
                                        submit_bio(bio);
                                else
@@ -1666,6 +1666,15 @@ io_retry:
                                bio = NULL;
                        }
 
+                       if (!bvec.bv_page) {
+                               z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
+                               if (!bvec.bv_page)
+                                       continue;
+                               if (cur + bvec.bv_len > end)
+                                       bvec.bv_len = end - cur;
+                               DBG_BUGON(bvec.bv_len < sb->s_blocksize);
+                       }
+
                        if (unlikely(PageWorkingset(bvec.bv_page)) &&
                            !memstall) {
                                psi_memstall_enter(&pflags);
@@ -1685,13 +1694,9 @@ io_retry:
                                ++nr_bios;
                        }
 
-                       if (cur + bvec.bv_len > end)
-                               bvec.bv_len = end - cur;
-                       DBG_BUGON(bvec.bv_len < sb->s_blocksize);
                        if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
                                          bvec.bv_offset))
-                               goto io_retry;
-
+                               goto drain_io;
                        last_pa = cur + bvec.bv_len;
                        bypass = false;
                } while ((cur += bvec.bv_len) < end);
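
Condensed, the reordered submission loop after this patch behaves as
below (simplified from the hunks above; BIO allocation, error handling
and the fscache path are omitted):

  do {
          bvec.bv_page = NULL;
          if (bio && (cur != last_pa || bio->bi_bdev != mdev.m_bdev)) {
  drain_io:
                  submit_bio(bio);        /* drain before locking folios */
                  bio = NULL;
          }
          if (!bvec.bv_page) {
                  /* fill only once no conflicting BIO is pending */
                  z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
                  if (!bvec.bv_page)
                          continue;       /* bypassed (already uptodate) */
          }
          /* ... allocate a new bio here if needed ... */
          if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, bvec.bv_offset))
                  goto drain_io;          /* bvec stays filled for retry */
          last_pa = cur + bvec.bv_len;
  } while ((cur += bvec.bv_len) < end);

The key point is that after `goto drain_io`, bvec.bv_page is still
set, so the already-filled page is simply retried on a fresh BIO
instead of re-running z_erofs_fill_bio_vec() against a folio that a
pending BIO may still hold locked.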