Commit 9e2f9d34 authored by Gao Xiang's avatar Gao Xiang

erofs: handle overlapped pclusters out of crafted images properly

syzbot reported a task hang issue due to a deadlock case where it is
waiting for the folio lock of a cached folio that will be used for
cache I/Os.

After looking into the crafted fuzzed image, I found it's formed with
several overlapped big pclusters as below:

 Ext:   logical offset   |  length :     physical offset    |  length
   0:        0..   16384 |   16384 :     151552..    167936 |   16384
   1:    16384..   32768 |   16384 :     155648..    172032 |   16384
   2:    32768..   49152 |   16384 :  537223168.. 537239552 |   16384
...

Here, extent 0/1 are physically overlapped although it's entirely
_impossible_ for normal filesystem images generated by mkfs.

First, managed folios containing compressed data will be marked as
up-to-date and then unlocked immediately (unlike in-place folios) when
compressed I/Os are complete.  If physical blocks are not submitted in
the incremental order, there should be separate BIOs to avoid dependency
issues.  However, the current code mis-arranges z_erofs_fill_bio_vec()
and BIO submission which causes unexpected BIO waits.

Second, managed folios will be connected to their own pclusters for
efficient inter-queries.  However, this is somewhat hard to implement
easily if overlapped big pclusters exist.  Again, these only appear in
fuzzed images so let's simply fall back to temporary short-lived pages
for correctness.

Additionally, it justifies that referenced managed folios cannot be
truncated for now and reverts part of commit 2080ca1e ("erofs: tidy
up `struct z_erofs_bvec`") for simplicity although it shouldn't be any
difference.

Reported-by: syzbot+4fc98ed414ae63d1ada2@syzkaller.appspotmail.com
Reported-by: syzbot+de04e06b28cfecf2281c@syzkaller.appspotmail.com
Reported-by: syzbot+c8c8238b394be4a1087d@syzkaller.appspotmail.com
Tested-by: syzbot+4fc98ed414ae63d1ada2@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/r/0000000000002fda01061e334873@google.com
Fixes: 8e6c8fa9 ("erofs: enable big pcluster feature")
Signed-off-by: default avatarGao Xiang <hsiangkao@linux.alibaba.com>
Link: https://lore.kernel.org/r/20240910070847.3356592-1-hsiangkao@linux.alibaba.com
parent 3fc3e45f
...@@ -1428,6 +1428,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, ...@@ -1428,6 +1428,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
struct z_erofs_bvec zbv; struct z_erofs_bvec zbv;
struct address_space *mapping; struct address_space *mapping;
struct folio *folio; struct folio *folio;
struct page *page;
int bs = i_blocksize(f->inode); int bs = i_blocksize(f->inode);
/* Except for inplace folios, the entire folio can be used for I/Os */ /* Except for inplace folios, the entire folio can be used for I/Os */
...@@ -1450,7 +1451,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, ...@@ -1450,7 +1451,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
* file-backed folios will be used instead. * file-backed folios will be used instead.
*/ */
if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) { if (folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) {
folio->private = 0;
tocache = true; tocache = true;
goto out_tocache; goto out_tocache;
} }
...@@ -1468,7 +1468,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, ...@@ -1468,7 +1468,7 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
} }
folio_lock(folio); folio_lock(folio);
if (folio->mapping == mc) { if (likely(folio->mapping == mc)) {
/* /*
* The cached folio is still in managed cache but without * The cached folio is still in managed cache but without
* a valid `->private` pcluster hint. Let's reconnect them. * a valid `->private` pcluster hint. Let's reconnect them.
...@@ -1478,41 +1478,44 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec, ...@@ -1478,41 +1478,44 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
/* compressed_bvecs[] already takes a ref before */ /* compressed_bvecs[] already takes a ref before */
folio_put(folio); folio_put(folio);
} }
if (likely(folio->private == pcl)) {
/* no need to submit if it is already up-to-date */ /* don't submit cache I/Os again if already uptodate */
if (folio_test_uptodate(folio)) { if (folio_test_uptodate(folio)) {
folio_unlock(folio); folio_unlock(folio);
bvec->bv_page = NULL; bvec->bv_page = NULL;
} }
return; return;
} }
/* /*
* It has been truncated, so it's unsafe to reuse this one. Let's * Already linked with another pcluster, which only appears in
* allocate a new page for compressed data. * crafted images by fuzzers for now. But handle this anyway.
*/ */
DBG_BUGON(folio->mapping); tocache = false; /* use temporary short-lived pages */
} else {
DBG_BUGON(1); /* referenced managed folios can't be truncated */
tocache = true; tocache = true;
}
folio_unlock(folio); folio_unlock(folio);
folio_put(folio); folio_put(folio);
out_allocfolio: out_allocfolio:
zbv.page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL); page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
spin_lock(&pcl->obj.lockref.lock); spin_lock(&pcl->obj.lockref.lock);
if (pcl->compressed_bvecs[nr].page) { if (unlikely(pcl->compressed_bvecs[nr].page != zbv.page)) {
erofs_pagepool_add(&f->pagepool, zbv.page); erofs_pagepool_add(&f->pagepool, page);
spin_unlock(&pcl->obj.lockref.lock); spin_unlock(&pcl->obj.lockref.lock);
cond_resched(); cond_resched();
goto repeat; goto repeat;
} }
bvec->bv_page = pcl->compressed_bvecs[nr].page = zbv.page; bvec->bv_page = pcl->compressed_bvecs[nr].page = page;
folio = page_folio(zbv.page); folio = page_folio(page);
/* first mark it as a temporary shortlived folio (now 1 ref) */
folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
spin_unlock(&pcl->obj.lockref.lock); spin_unlock(&pcl->obj.lockref.lock);
out_tocache: out_tocache:
if (!tocache || bs != PAGE_SIZE || if (!tocache || bs != PAGE_SIZE ||
filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) filemap_add_folio(mc, folio, pcl->obj.index + nr, gfp)) {
/* turn into a temporary shortlived folio (1 ref) */
folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
return; return;
}
folio_attach_private(folio, pcl); folio_attach_private(folio, pcl);
/* drop a refcount added by allocpage (then 2 refs in total here) */ /* drop a refcount added by allocpage (then 2 refs in total here) */
folio_put(folio); folio_put(folio);
...@@ -1647,13 +1650,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, ...@@ -1647,13 +1650,10 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
cur = mdev.m_pa; cur = mdev.m_pa;
end = cur + pcl->pclustersize; end = cur + pcl->pclustersize;
do { do {
z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc); bvec.bv_page = NULL;
if (!bvec.bv_page)
continue;
if (bio && (cur != last_pa || if (bio && (cur != last_pa ||
bio->bi_bdev != mdev.m_bdev)) { bio->bi_bdev != mdev.m_bdev)) {
io_retry: drain_io:
if (!erofs_is_fscache_mode(sb)) if (!erofs_is_fscache_mode(sb))
submit_bio(bio); submit_bio(bio);
else else
...@@ -1666,6 +1666,15 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, ...@@ -1666,6 +1666,15 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
bio = NULL; bio = NULL;
} }
if (!bvec.bv_page) {
z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
if (!bvec.bv_page)
continue;
if (cur + bvec.bv_len > end)
bvec.bv_len = end - cur;
DBG_BUGON(bvec.bv_len < sb->s_blocksize);
}
if (unlikely(PageWorkingset(bvec.bv_page)) && if (unlikely(PageWorkingset(bvec.bv_page)) &&
!memstall) { !memstall) {
psi_memstall_enter(&pflags); psi_memstall_enter(&pflags);
...@@ -1685,13 +1694,9 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, ...@@ -1685,13 +1694,9 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
++nr_bios; ++nr_bios;
} }
if (cur + bvec.bv_len > end)
bvec.bv_len = end - cur;
DBG_BUGON(bvec.bv_len < sb->s_blocksize);
if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len, if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
bvec.bv_offset)) bvec.bv_offset))
goto io_retry; goto drain_io;
last_pa = cur + bvec.bv_len; last_pa = cur + bvec.bv_len;
bypass = false; bypass = false;
} while ((cur += bvec.bv_len) < end); } while ((cur += bvec.bv_len) < end);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment