Commit 4a6bff11 authored by Linus Torvalds

Merge tag 'erofs-for-6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs

Pull erofs updates from Gao Xiang:
 "In this cycle, large folios are now enabled in the iomap/fscache mode
  for uncompressed files first. In order to do that, we've also cleaned
  up better interfaces between erofs and fscache, which are acked by
  fscache/netfs folks and included in this pull request.

  Other than that, there are random fixes around erofs over fscache and
  crafted images by syzbot, minor cleanups and documentation updates.

  Summary:

   - Enable large folios for iomap/fscache mode

   - Avoid sysfs warning due to mounting twice with the same fsid and
     domain_id in fscache mode

   - Refine fscache interface among erofs, fscache, and cachefiles

   - Use kmap_local_page() only for metabuf

   - Fixes around crafted images found by syzbot

   - Minor cleanups and documentation updates"

* tag 'erofs-for-6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs:
  erofs: validate the extent length for uncompressed pclusters
  erofs: fix missing unmap if z_erofs_get_extent_compressedlen() fails
  erofs: Fix pcluster memleak when its block address is zero
  erofs: use kmap_local_page() only for erofs_bread()
  erofs: enable large folios for fscache mode
  erofs: support large folios for fscache mode
  erofs: switch to prepare_ondemand_read() in fscache mode
  fscache,cachefiles: add prepare_ondemand_read() callback
  erofs: clean up cached I/O strategies
  erofs: update documentation
  erofs: check the uniqueness of fsid in shared domain in advance
  erofs: enable large folios for iomap mode
parents ad0d9da1 c505feba
......@@ -30,12 +30,18 @@ It is implemented to be a better choice for the following scenarios:
especially for those embedded devices with limited memory and high-density
hosts with numerous containers.
Here is the main features of EROFS:
Here are the main features of EROFS:
- Little endian on-disk design;
- 4KiB block size and 32-bit block addresses, therefore 16TiB address space
at most for now;
- Block-based distribution and file-based distribution over fscache are
supported;
- Support multiple devices to refer to external blobs, which can be used
for container images;
- 4KiB block size and 32-bit block addresses for each device, therefore
16TiB address space at most for now;
- Two inode layouts for different requirements:
......@@ -50,28 +56,31 @@ Here is the main features of EROFS:
Metadata reserved 8 bytes 18 bytes
===================== ============ ======================================
- Metadata and data could be mixed as an option;
- Support extended attributes (xattrs) as an option;
- Support extended attributes as an option;
- Support tailpacking data and xattr inline compared to byte-addressed
unaligned metadata or smaller block size alternatives;
- Support POSIX.1e ACLs by using xattrs;
- Support POSIX.1e ACLs by using extended attributes;
- Support transparent data compression as an option:
LZ4 and MicroLZMA algorithms can be used on a per-file basis; In addition,
inplace decompression is also supported to avoid bounce compressed buffers
and page cache thrashing.
- Support chunk-based data deduplication and rolling-hash compressed data
deduplication;
- Support tailpacking inline compared to byte-addressed unaligned metadata
or smaller block size alternatives;
- Support merging tail-end data into a special inode as fragments.
- Support large folios for uncompressed files.
- Support direct I/O on uncompressed files to avoid double caching for loop
devices;
- Support FSDAX on uncompressed images for secure containers and ramdisks in
order to get rid of unnecessary page cache.
- Support multiple devices for multi blob container images;
- Support file-based on-demand loading with the Fscache infrastructure.
The following git tree provides the file system user-space tools under
......@@ -259,7 +268,7 @@ By the way, chunk-based files are all uncompressed for now.
Data compression
----------------
EROFS implements LZ4 fixed-sized output compression which generates fixed-sized
EROFS implements fixed-sized output compression which generates fixed-sized
compressed data blocks from variable-sized input in contrast to other existing
fixed-sized input solutions. Relatively higher compression ratios can be gotten
by using fixed-sized output compression since nowadays popular data compression
......@@ -314,3 +323,6 @@ to understand its delta0 is constantly 1, as illustrated below::
If another HEAD follows a HEAD lcluster, there is no room to record CBLKCNT,
but it's easy to know the size of such pcluster is 1 lcluster as well.
Since Linux v6.1, each pcluster can be used for multiple variable-sized extents,
therefore it can be used for compressed data deduplication.
......@@ -385,38 +385,35 @@ static int cachefiles_write(struct netfs_cache_resources *cres,
term_func, term_func_priv);
}
/*
* Prepare a read operation, shortening it to a cached/uncached
* boundary as appropriate.
*/
static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
loff_t i_size)
static inline enum netfs_io_source
cachefiles_do_prepare_read(struct netfs_cache_resources *cres,
loff_t start, size_t *_len, loff_t i_size,
unsigned long *_flags, ino_t netfs_ino)
{
enum cachefiles_prepare_read_trace why;
struct netfs_io_request *rreq = subreq->rreq;
struct netfs_cache_resources *cres = &rreq->cache_resources;
struct cachefiles_object *object;
struct cachefiles_object *object = NULL;
struct cachefiles_cache *cache;
struct fscache_cookie *cookie = fscache_cres_cookie(cres);
const struct cred *saved_cred;
struct file *file = cachefiles_cres_file(cres);
enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER;
size_t len = *_len;
loff_t off, to;
ino_t ino = file ? file_inode(file)->i_ino : 0;
int rc;
_enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size);
_enter("%zx @%llx/%llx", len, start, i_size);
if (subreq->start >= i_size) {
if (start >= i_size) {
ret = NETFS_FILL_WITH_ZEROES;
why = cachefiles_trace_read_after_eof;
goto out_no_object;
}
if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) {
__set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
__set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags);
why = cachefiles_trace_read_no_data;
if (!test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags))
if (!test_bit(NETFS_SREQ_ONDEMAND, _flags))
goto out_no_object;
}
......@@ -437,7 +434,7 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
retry:
off = cachefiles_inject_read_error();
if (off == 0)
off = vfs_llseek(file, subreq->start, SEEK_DATA);
off = vfs_llseek(file, start, SEEK_DATA);
if (off < 0 && off >= (loff_t)-MAX_ERRNO) {
if (off == (loff_t)-ENXIO) {
why = cachefiles_trace_read_seek_nxio;
......@@ -449,21 +446,22 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
goto out;
}
if (off >= subreq->start + subreq->len) {
if (off >= start + len) {
why = cachefiles_trace_read_found_hole;
goto download_and_store;
}
if (off > subreq->start) {
if (off > start) {
off = round_up(off, cache->bsize);
subreq->len = off - subreq->start;
len = off - start;
*_len = len;
why = cachefiles_trace_read_found_part;
goto download_and_store;
}
to = cachefiles_inject_read_error();
if (to == 0)
to = vfs_llseek(file, subreq->start, SEEK_HOLE);
to = vfs_llseek(file, start, SEEK_HOLE);
if (to < 0 && to >= (loff_t)-MAX_ERRNO) {
trace_cachefiles_io_error(object, file_inode(file), to,
cachefiles_trace_seek_error);
......@@ -471,12 +469,13 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
goto out;
}
if (to < subreq->start + subreq->len) {
if (subreq->start + subreq->len >= i_size)
if (to < start + len) {
if (start + len >= i_size)
to = round_up(to, cache->bsize);
else
to = round_down(to, cache->bsize);
subreq->len = to - subreq->start;
len = to - start;
*_len = len;
}
why = cachefiles_trace_read_have_data;
......@@ -484,12 +483,11 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
goto out;
download_and_store:
__set_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
if (test_bit(NETFS_SREQ_ONDEMAND, &subreq->flags)) {
rc = cachefiles_ondemand_read(object, subreq->start,
subreq->len);
__set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags);
if (test_bit(NETFS_SREQ_ONDEMAND, _flags)) {
rc = cachefiles_ondemand_read(object, start, len);
if (!rc) {
__clear_bit(NETFS_SREQ_ONDEMAND, &subreq->flags);
__clear_bit(NETFS_SREQ_ONDEMAND, _flags);
goto retry;
}
ret = NETFS_INVALID_READ;
......@@ -497,10 +495,34 @@ static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *
out:
cachefiles_end_secure(cache, saved_cred);
out_no_object:
trace_cachefiles_prep_read(subreq, ret, why, ino);
trace_cachefiles_prep_read(object, start, len, *_flags, ret, why, ino, netfs_ino);
return ret;
}
/*
* Prepare a read operation, shortening it to a cached/uncached
* boundary as appropriate.
*/
static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
loff_t i_size)
{
return cachefiles_do_prepare_read(&subreq->rreq->cache_resources,
subreq->start, &subreq->len, i_size,
&subreq->flags, subreq->rreq->inode->i_ino);
}
/*
* Prepare an on-demand read operation, shortening it to a cached/uncached
* boundary as appropriate.
*/
static enum netfs_io_source
cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres,
loff_t start, size_t *_len, loff_t i_size,
unsigned long *_flags, ino_t ino)
{
return cachefiles_do_prepare_read(cres, start, _len, i_size, _flags, ino);
}
/*
* Prepare for a write to occur.
*/
......@@ -621,6 +643,7 @@ static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {
.write = cachefiles_write,
.prepare_read = cachefiles_prepare_read,
.prepare_write = cachefiles_prepare_write,
.prepare_ondemand_read = cachefiles_prepare_ondemand_read,
.query_occupancy = cachefiles_query_occupancy,
};
......
......@@ -13,9 +13,7 @@
void erofs_unmap_metabuf(struct erofs_buf *buf)
{
if (buf->kmap_type == EROFS_KMAP)
kunmap(buf->page);
else if (buf->kmap_type == EROFS_KMAP_ATOMIC)
kunmap_atomic(buf->base);
kunmap_local(buf->base);
buf->base = NULL;
buf->kmap_type = EROFS_NO_KMAP;
}
......@@ -54,9 +52,7 @@ void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
}
if (buf->kmap_type == EROFS_NO_KMAP) {
if (type == EROFS_KMAP)
buf->base = kmap(page);
else if (type == EROFS_KMAP_ATOMIC)
buf->base = kmap_atomic(page);
buf->base = kmap_local_page(page);
buf->kmap_type = type;
} else if (buf->kmap_type != type) {
DBG_BUGON(1);
......@@ -403,6 +399,8 @@ const struct address_space_operations erofs_raw_access_aops = {
.readahead = erofs_readahead,
.bmap = erofs_bmap,
.direct_IO = noop_direct_IO,
.release_folio = iomap_release_folio,
.invalidate_folio = iomap_invalidate_folio,
};
#ifdef CONFIG_FS_DAX
......
......@@ -11,265 +11,201 @@ static DEFINE_MUTEX(erofs_domain_cookies_lock);
static LIST_HEAD(erofs_domain_list);
static struct vfsmount *erofs_pseudo_mnt;
static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping,
struct erofs_fscache_request {
struct erofs_fscache_request *primary;
struct netfs_cache_resources cache_resources;
struct address_space *mapping; /* The mapping being accessed */
loff_t start; /* Start position */
size_t len; /* Length of the request */
size_t submitted; /* Length of submitted */
short error; /* 0 or error that occurred */
refcount_t ref;
};
static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping,
loff_t start, size_t len)
{
struct netfs_io_request *rreq;
struct erofs_fscache_request *req;
rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
if (!rreq)
req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL);
if (!req)
return ERR_PTR(-ENOMEM);
rreq->start = start;
rreq->len = len;
rreq->mapping = mapping;
rreq->inode = mapping->host;
INIT_LIST_HEAD(&rreq->subrequests);
refcount_set(&rreq->ref, 1);
return rreq;
}
req->mapping = mapping;
req->start = start;
req->len = len;
refcount_set(&req->ref, 1);
static void erofs_fscache_put_request(struct netfs_io_request *rreq)
{
if (!refcount_dec_and_test(&rreq->ref))
return;
if (rreq->cache_resources.ops)
rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
kfree(rreq);
return req;
}
static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq)
static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary,
size_t len)
{
if (!refcount_dec_and_test(&subreq->ref))
return;
erofs_fscache_put_request(subreq->rreq);
kfree(subreq);
}
struct erofs_fscache_request *req;
static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
/* use primary request for the first submission */
if (!primary->submitted) {
refcount_inc(&primary->ref);
return primary;
}
while (!list_empty(&rreq->subrequests)) {
subreq = list_first_entry(&rreq->subrequests,
struct netfs_io_subrequest, rreq_link);
list_del(&subreq->rreq_link);
erofs_fscache_put_subrequest(subreq);
req = erofs_fscache_req_alloc(primary->mapping,
primary->start + primary->submitted, len);
if (!IS_ERR(req)) {
req->primary = primary;
refcount_inc(&primary->ref);
}
return req;
}
static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq)
static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
{
struct netfs_io_subrequest *subreq;
struct folio *folio;
unsigned int iopos = 0;
pgoff_t start_page = rreq->start / PAGE_SIZE;
pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
bool subreq_failed = false;
bool failed = req->error;
pgoff_t start_page = req->start / PAGE_SIZE;
pgoff_t last_page = ((req->start + req->len) / PAGE_SIZE) - 1;
XA_STATE(xas, &rreq->mapping->i_pages, start_page);
subreq = list_first_entry(&rreq->subrequests,
struct netfs_io_subrequest, rreq_link);
subreq_failed = (subreq->error < 0);
XA_STATE(xas, &req->mapping->i_pages, start_page);
rcu_read_lock();
xas_for_each(&xas, folio, last_page) {
unsigned int pgpos, pgend;
bool pg_failed = false;
if (xas_retry(&xas, folio))
continue;
pgpos = (folio_index(folio) - start_page) * PAGE_SIZE;
pgend = pgpos + folio_size(folio);
for (;;) {
if (!subreq) {
pg_failed = true;
break;
}
pg_failed |= subreq_failed;
if (pgend < iopos + subreq->len)
break;
iopos += subreq->len;
if (!list_is_last(&subreq->rreq_link,
&rreq->subrequests)) {
subreq = list_next_entry(subreq, rreq_link);
subreq_failed = (subreq->error < 0);
} else {
subreq = NULL;
subreq_failed = false;
}
if (pgend == iopos)
break;
}
if (!pg_failed)
if (!failed)
folio_mark_uptodate(folio);
folio_unlock(folio);
}
rcu_read_unlock();
}
static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq)
static void erofs_fscache_req_put(struct erofs_fscache_request *req)
{
erofs_fscache_rreq_unlock_folios(rreq);
erofs_fscache_clear_subrequests(rreq);
erofs_fscache_put_request(rreq);
if (refcount_dec_and_test(&req->ref)) {
if (req->cache_resources.ops)
req->cache_resources.ops->end_operation(&req->cache_resources);
if (!req->primary)
erofs_fscache_req_complete(req);
else
erofs_fscache_req_put(req->primary);
kfree(req);
}
}
static void erofc_fscache_subreq_complete(void *priv,
static void erofs_fscache_subreq_complete(void *priv,
ssize_t transferred_or_error, bool was_async)
{
struct netfs_io_subrequest *subreq = priv;
struct netfs_io_request *rreq = subreq->rreq;
struct erofs_fscache_request *req = priv;
if (IS_ERR_VALUE(transferred_or_error))
subreq->error = transferred_or_error;
if (atomic_dec_and_test(&rreq->nr_outstanding))
erofs_fscache_rreq_complete(rreq);
erofs_fscache_put_subrequest(subreq);
if (IS_ERR_VALUE(transferred_or_error)) {
if (req->primary)
req->primary->error = transferred_or_error;
else
req->error = transferred_or_error;
}
erofs_fscache_req_put(req);
}
/*
* Read data from fscache and fill the read data into page cache described by
* @rreq, which shall be both aligned with PAGE_SIZE. @pstart describes
* the start physical address in the cache file.
* Read data from fscache (cookie, pstart, len), and fill the read data into
 * page cache described by (req->mapping, lstart, len). @pstart describes the
* start physical address in the cache file.
*/
static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
struct netfs_io_request *rreq, loff_t pstart)
struct erofs_fscache_request *req, loff_t pstart, size_t len)
{
enum netfs_io_source source;
struct super_block *sb = rreq->mapping->host->i_sb;
struct netfs_io_subrequest *subreq;
struct netfs_cache_resources *cres = &rreq->cache_resources;
struct super_block *sb = req->mapping->host->i_sb;
struct netfs_cache_resources *cres = &req->cache_resources;
struct iov_iter iter;
loff_t start = rreq->start;
size_t len = rreq->len;
loff_t lstart = req->start + req->submitted;
size_t done = 0;
int ret;
atomic_set(&rreq->nr_outstanding, 1);
DBG_BUGON(len > req->len - req->submitted);
ret = fscache_begin_read_operation(cres, cookie);
if (ret)
goto out;
return ret;
while (done < len) {
subreq = kzalloc(sizeof(struct netfs_io_subrequest),
GFP_KERNEL);
if (subreq) {
INIT_LIST_HEAD(&subreq->rreq_link);
refcount_set(&subreq->ref, 2);
subreq->rreq = rreq;
refcount_inc(&rreq->ref);
} else {
ret = -ENOMEM;
goto out;
}
subreq->start = pstart + done;
subreq->len = len - done;
subreq->flags = 1 << NETFS_SREQ_ONDEMAND;
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
loff_t sstart = pstart + done;
size_t slen = len - done;
unsigned long flags = 1 << NETFS_SREQ_ONDEMAND;
source = cres->ops->prepare_read(subreq, LLONG_MAX);
if (WARN_ON(subreq->len == 0))
source = cres->ops->prepare_ondemand_read(cres,
sstart, &slen, LLONG_MAX, &flags, 0);
if (WARN_ON(slen == 0))
source = NETFS_INVALID_READ;
if (source != NETFS_READ_FROM_CACHE) {
erofs_err(sb, "failed to fscache prepare_read (source %d)",
source);
ret = -EIO;
subreq->error = ret;
erofs_fscache_put_subrequest(subreq);
goto out;
erofs_err(sb, "failed to fscache prepare_read (source %d)", source);
return -EIO;
}
atomic_inc(&rreq->nr_outstanding);
iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages,
start + done, subreq->len);
refcount_inc(&req->ref);
iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages,
lstart + done, slen);
ret = fscache_read(cres, subreq->start, &iter,
NETFS_READ_HOLE_FAIL,
erofc_fscache_subreq_complete, subreq);
ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL,
erofs_fscache_subreq_complete, req);
if (ret == -EIOCBQUEUED)
ret = 0;
if (ret) {
erofs_err(sb, "failed to fscache_read (ret %d)", ret);
goto out;
return ret;
}
done += subreq->len;
done += slen;
}
out:
if (atomic_dec_and_test(&rreq->nr_outstanding))
erofs_fscache_rreq_complete(rreq);
return ret;
DBG_BUGON(done != len);
return 0;
}
static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
{
int ret;
struct super_block *sb = folio_mapping(folio)->host->i_sb;
struct netfs_io_request *rreq;
struct erofs_fscache_request *req;
struct erofs_map_dev mdev = {
.m_deviceid = 0,
.m_pa = folio_pos(folio),
};
ret = erofs_map_dev(sb, &mdev);
if (ret)
goto out;
if (ret) {
folio_unlock(folio);
return ret;
}
rreq = erofs_fscache_alloc_request(folio_mapping(folio),
req = erofs_fscache_req_alloc(folio_mapping(folio),
folio_pos(folio), folio_size(folio));
if (IS_ERR(rreq)) {
ret = PTR_ERR(rreq);
goto out;
if (IS_ERR(req)) {
folio_unlock(folio);
return PTR_ERR(req);
}
return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
rreq, mdev.m_pa);
out:
folio_unlock(folio);
ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
req, mdev.m_pa, folio_size(folio));
if (ret)
req->error = ret;
erofs_fscache_req_put(req);
return ret;
}
/*
* Read into page cache in the range described by (@pos, @len).
*
* On return, the caller is responsible for page unlocking if the output @unlock
* is true, or the callee will take this responsibility through netfs_io_request
* interface.
*
* The return value is the number of bytes successfully handled, or negative
* error code on failure. The only exception is that, the length of the range
* instead of the error code is returned on failure after netfs_io_request is
* allocated, so that .readahead() could advance rac accordingly.
*/
static int erofs_fscache_data_read(struct address_space *mapping,
loff_t pos, size_t len, bool *unlock)
static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
{
struct address_space *mapping = primary->mapping;
struct inode *inode = mapping->host;
struct super_block *sb = inode->i_sb;
struct netfs_io_request *rreq;
struct erofs_fscache_request *req;
struct erofs_map_blocks map;
struct erofs_map_dev mdev;
struct iov_iter iter;
loff_t pos = primary->start + primary->submitted;
size_t count;
int ret;
*unlock = true;
map.m_la = pos;
ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
if (ret)
......@@ -297,17 +233,19 @@ static int erofs_fscache_data_read(struct address_space *mapping,
}
iov_iter_zero(PAGE_SIZE - size, &iter);
erofs_put_metabuf(&buf);
return PAGE_SIZE;
primary->submitted += PAGE_SIZE;
return 0;
}
count = primary->len - primary->submitted;
if (!(map.m_flags & EROFS_MAP_MAPPED)) {
count = len;
iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count);
iov_iter_zero(count, &iter);
return count;
primary->submitted += count;
return 0;
}
count = min_t(size_t, map.m_llen - (pos - map.m_la), len);
count = min_t(size_t, map.m_llen - (pos - map.m_la), count);
DBG_BUGON(!count || count % PAGE_SIZE);
mdev = (struct erofs_map_dev) {
......@@ -318,64 +256,65 @@ static int erofs_fscache_data_read(struct address_space *mapping,
if (ret)
return ret;
rreq = erofs_fscache_alloc_request(mapping, pos, count);
if (IS_ERR(rreq))
return PTR_ERR(rreq);
req = erofs_fscache_req_chain(primary, count);
if (IS_ERR(req))
return PTR_ERR(req);
*unlock = false;
erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
rreq, mdev.m_pa + (pos - map.m_la));
return count;
ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
req, mdev.m_pa + (pos - map.m_la), count);
erofs_fscache_req_put(req);
primary->submitted += count;
return ret;
}
static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
static int erofs_fscache_data_read(struct erofs_fscache_request *req)
{
bool unlock;
int ret;
DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ);
do {
ret = erofs_fscache_data_read_slice(req);
if (ret)
req->error = ret;
} while (!ret && req->submitted < req->len);
ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio),
folio_size(folio), &unlock);
if (unlock) {
if (ret > 0)
folio_mark_uptodate(folio);
return ret;
}
static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
{
struct erofs_fscache_request *req;
int ret;
req = erofs_fscache_req_alloc(folio_mapping(folio),
folio_pos(folio), folio_size(folio));
if (IS_ERR(req)) {
folio_unlock(folio);
return PTR_ERR(req);
}
return ret < 0 ? ret : 0;
ret = erofs_fscache_data_read(req);
erofs_fscache_req_put(req);
return ret;
}
static void erofs_fscache_readahead(struct readahead_control *rac)
{
struct folio *folio;
size_t len, done = 0;
loff_t start, pos;
bool unlock;
int ret, size;
struct erofs_fscache_request *req;
if (!readahead_count(rac))
return;
start = readahead_pos(rac);
len = readahead_length(rac);
do {
pos = start + done;
ret = erofs_fscache_data_read(rac->mapping, pos,
len - done, &unlock);
if (ret <= 0)
req = erofs_fscache_req_alloc(rac->mapping,
readahead_pos(rac), readahead_length(rac));
if (IS_ERR(req))
return;
size = ret;
while (size) {
folio = readahead_folio(rac);
size -= folio_size(folio);
if (unlock) {
folio_mark_uptodate(folio);
folio_unlock(folio);
}
}
} while ((done += ret) < len);
/* The request completion will drop refs on the folios. */
while (readahead_folio(rac))
;
erofs_fscache_data_read(req);
erofs_fscache_req_put(req);
}
static const struct address_space_operations erofs_fscache_meta_aops = {
......@@ -494,7 +433,8 @@ static int erofs_fscache_register_domain(struct super_block *sb)
static
struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
char *name, bool need_inode)
char *name,
unsigned int flags)
{
struct fscache_volume *volume = EROFS_SB(sb)->volume;
struct erofs_fscache *ctx;
......@@ -516,7 +456,7 @@ struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
fscache_use_cookie(cookie, false);
ctx->cookie = cookie;
if (need_inode) {
if (flags & EROFS_REG_COOKIE_NEED_INODE) {
struct inode *const inode = new_inode(sb);
if (!inode) {
......@@ -554,14 +494,15 @@ static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx)
static
struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb,
char *name, bool need_inode)
char *name,
unsigned int flags)
{
int err;
struct inode *inode;
struct erofs_fscache *ctx;
struct erofs_domain *domain = EROFS_SB(sb)->domain;
ctx = erofs_fscache_acquire_cookie(sb, name, need_inode);
ctx = erofs_fscache_acquire_cookie(sb, name, flags);
if (IS_ERR(ctx))
return ctx;
......@@ -589,7 +530,8 @@ struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb,
static
struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
char *name, bool need_inode)
char *name,
unsigned int flags)
{
struct inode *inode;
struct erofs_fscache *ctx;
......@@ -602,23 +544,30 @@ struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
ctx = inode->i_private;
if (!ctx || ctx->domain != domain || strcmp(ctx->name, name))
continue;
if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) {
igrab(inode);
} else {
erofs_err(sb, "%s already exists in domain %s", name,
domain->domain_id);
ctx = ERR_PTR(-EEXIST);
}
spin_unlock(&psb->s_inode_list_lock);
mutex_unlock(&erofs_domain_cookies_lock);
return ctx;
}
spin_unlock(&psb->s_inode_list_lock);
ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode);
ctx = erofs_fscache_domain_init_cookie(sb, name, flags);
mutex_unlock(&erofs_domain_cookies_lock);
return ctx;
}
struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
char *name, bool need_inode)
char *name,
unsigned int flags)
{
if (EROFS_SB(sb)->domain_id)
return erofs_domain_register_cookie(sb, name, need_inode);
return erofs_fscache_acquire_cookie(sb, name, need_inode);
return erofs_domain_register_cookie(sb, name, flags);
return erofs_fscache_acquire_cookie(sb, name, flags);
}
void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx)
......@@ -647,6 +596,7 @@ int erofs_fscache_register_fs(struct super_block *sb)
int ret;
struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_fscache *fscache;
unsigned int flags;
if (sbi->domain_id)
ret = erofs_fscache_register_domain(sb);
......@@ -655,8 +605,20 @@ int erofs_fscache_register_fs(struct super_block *sb)
if (ret)
return ret;
/* acquired domain/volume will be relinquished in kill_sb() on error */
fscache = erofs_fscache_register_cookie(sb, sbi->fsid, true);
/*
* When shared domain is enabled, using NEED_NOEXIST to guarantee
* the primary data blob (aka fsid) is unique in the shared domain.
*
* For non-shared-domain case, fscache_acquire_volume() invoked by
* erofs_fscache_register_volume() has already guaranteed
* the uniqueness of primary data blob.
*
* Acquired domain/volume will be relinquished in kill_sb() on error.
*/
flags = EROFS_REG_COOKIE_NEED_INODE;
if (sbi->domain_id)
flags |= EROFS_REG_COOKIE_NEED_NOEXIST;
fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags);
if (IS_ERR(fscache))
return PTR_ERR(fscache);
......
......@@ -268,6 +268,7 @@ static int erofs_fill_inode(struct inode *inode)
case S_IFDIR:
inode->i_op = &erofs_dir_iops;
inode->i_fop = &erofs_dir_fops;
inode_nohighmem(inode);
break;
case S_IFLNK:
err = erofs_fill_symlink(inode, kaddr, ofs);
......@@ -295,6 +296,7 @@ static int erofs_fill_inode(struct inode *inode)
goto out_unlock;
}
inode->i_mapping->a_ops = &erofs_raw_access_aops;
mapping_set_large_folios(inode->i_mapping);
#ifdef CONFIG_EROFS_FS_ONDEMAND
if (erofs_is_fscache_mode(inode->i_sb))
inode->i_mapping->a_ops = &erofs_fscache_access_aops;
......
......@@ -255,8 +255,7 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
enum erofs_kmap_type {
EROFS_NO_KMAP, /* don't map the buffer */
EROFS_KMAP, /* use kmap() to map the buffer */
EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */
EROFS_KMAP, /* use kmap_local_page() to map the buffer */
};
struct erofs_buf {
......@@ -604,13 +603,18 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
/* flags for erofs_fscache_register_cookie() */
#define EROFS_REG_COOKIE_NEED_INODE 1
#define EROFS_REG_COOKIE_NEED_NOEXIST 2
/* fscache.c */
#ifdef CONFIG_EROFS_FS_ONDEMAND
int erofs_fscache_register_fs(struct super_block *sb);
void erofs_fscache_unregister_fs(struct super_block *sb);
struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
char *name, bool need_inode);
char *name,
unsigned int flags);
void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache);
extern const struct address_space_operations erofs_fscache_access_aops;
......@@ -623,7 +627,8 @@ static inline void erofs_fscache_unregister_fs(struct super_block *sb) {}
static inline
struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
char *name, bool need_inode)
char *name,
unsigned int flags)
{
return ERR_PTR(-EOPNOTSUPP);
}
......
......@@ -245,7 +245,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
}
if (erofs_is_fscache_mode(sb)) {
fscache = erofs_fscache_register_cookie(sb, dif->path, false);
fscache = erofs_fscache_register_cookie(sb, dif->path, 0);
if (IS_ERR(fscache))
return PTR_ERR(fscache);
dif->fscache = fscache;
......
......@@ -148,7 +148,7 @@ static inline int xattr_iter_fixup(struct xattr_iter *it)
it->blkaddr += erofs_blknr(it->ofs);
it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr,
EROFS_KMAP_ATOMIC);
EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
it->ofs = erofs_blkoff(it->ofs);
......@@ -174,7 +174,7 @@ static int inline_xattr_iter_begin(struct xattr_iter *it,
it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs);
it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr,
EROFS_KMAP_ATOMIC);
EROFS_KMAP);
if (IS_ERR(it->kaddr))
return PTR_ERR(it->kaddr);
return vi->xattr_isize - xattr_header_sz;
......@@ -368,7 +368,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
EROFS_KMAP_ATOMIC);
EROFS_KMAP);
if (IS_ERR(it->it.kaddr))
return PTR_ERR(it->it.kaddr);
it->it.blkaddr = blkaddr;
......@@ -580,7 +580,7 @@ static int shared_listxattr(struct listxattr_iter *it)
it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
EROFS_KMAP_ATOMIC);
EROFS_KMAP);
if (IS_ERR(it->it.kaddr))
return PTR_ERR(it->it.kaddr);
it->it.blkaddr = blkaddr;
......
......@@ -175,16 +175,6 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
DBG_BUGON(1);
}
/* how to allocate cached pages for a pcluster */
enum z_erofs_cache_alloctype {
DONTALLOC, /* don't allocate any cached pages */
/*
* try to use cached I/O if page allocation succeeds or fallback
* to in-place I/O instead to avoid any direct reclaim.
*/
TRYALLOC,
};
/*
* tagged pointer with 1-bit tag for all compressed pages
* tag 0 - the page is just found with an extra page reference
......@@ -292,12 +282,29 @@ struct z_erofs_decompress_frontend {
.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
.mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true }
/*
 * Decide whether cached pages should be allocated for this pcluster,
 * based on the mount-time cache strategy and the read position.
 */
static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
{
	unsigned int strategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;

	/* caching disabled entirely: never allocate */
	if (strategy <= EROFS_ZIP_CACHE_DISABLED)
		return false;

	/* the backmost extent is always cached; readaround also caches
	 * extents that lie before the request's head offset */
	return fe->backmost ||
	       (strategy >= EROFS_ZIP_CACHE_READAROUND &&
		fe->map.m_la < fe->headoffset);
}
static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
enum z_erofs_cache_alloctype type,
struct page **pagepool)
{
struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
struct z_erofs_pcluster *pcl = fe->pcl;
bool shouldalloc = z_erofs_should_alloc_cache(fe);
bool standalone = true;
/*
* optimistic allocation without direct reclaim since inplace I/O
......@@ -326,18 +333,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
} else {
/* I/O is needed, no possible to decompress directly */
standalone = false;
switch (type) {
case TRYALLOC:
if (!shouldalloc)
continue;
/*
* try to use cached I/O if page allocation
* succeeds or fallback to in-place I/O instead
* to avoid any direct reclaim.
*/
newpage = erofs_allocpage(pagepool, gfp);
if (!newpage)
continue;
set_page_private(newpage,
Z_EROFS_PREALLOCATED_PAGE);
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
t = tag_compressed_page_justfound(newpage);
break;
default: /* DONTALLOC */
continue;
}
}
if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL,
......@@ -488,7 +496,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
struct erofs_workgroup *grp;
int err;
if (!(map->m_flags & EROFS_MAP_ENCODED)) {
if (!(map->m_flags & EROFS_MAP_ENCODED) ||
(!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
......@@ -637,20 +646,6 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
return true;
}
/*
 * Decide whether managed (cached) pages should be allocated, given the
 * configured cache strategy and the logical address being read.
 */
static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
				       unsigned int cachestrategy,
				       erofs_off_t la)
{
	/* caching switched off: nothing to allocate */
	if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
		return false;

	/* always cache the backmost extent of the request */
	if (fe->backmost)
		return true;

	/* readaround strategy also caches extents before the head offset */
	if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
	    la < fe->headoffset)
		return true;

	return false;
}
static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
struct page *page, unsigned int pageofs,
unsigned int len)
......@@ -687,12 +682,9 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
struct page *page, struct page **pagepool)
{
struct inode *const inode = fe->inode;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct erofs_map_blocks *const map = &fe->map;
const loff_t offset = page_offset(page);
bool tight = true, exclusive;
enum z_erofs_cache_alloctype cache_strategy;
unsigned int cur, end, spiltted;
int err = 0;
......@@ -746,13 +738,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
} else {
/* bind cache first when cached decompression is preferred */
if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
map->m_la))
cache_strategy = TRYALLOC;
else
cache_strategy = DONTALLOC;
z_erofs_bind_cache(fe, cache_strategy, pagepool);
z_erofs_bind_cache(fe, pagepool);
}
hitted:
/*
......
......@@ -178,7 +178,7 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int advise, type;
m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
erofs_blknr(pos), EROFS_KMAP_ATOMIC);
erofs_blknr(pos), EROFS_KMAP);
if (IS_ERR(m->kaddr))
return PTR_ERR(m->kaddr);
......@@ -416,7 +416,7 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
out:
pos += lcn * (1 << amortizedshift);
m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
erofs_blknr(pos), EROFS_KMAP_ATOMIC);
erofs_blknr(pos), EROFS_KMAP);
if (IS_ERR(m->kaddr))
return PTR_ERR(m->kaddr);
return unpack_compacted_index(m, amortizedshift, pos, lookahead);
......@@ -694,10 +694,15 @@ static int z_erofs_do_map_blocks(struct inode *inode,
map->m_pa = blknr_to_addr(m.pblk);
err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
if (err)
goto out;
goto unmap_out;
}
if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) {
if (map->m_llen > map->m_plen) {
DBG_BUGON(1);
err = -EFSCORRUPTED;
goto unmap_out;
}
if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
map->m_algorithmformat =
Z_EROFS_COMPRESSION_INTERLACED;
......@@ -718,14 +723,12 @@ static int z_erofs_do_map_blocks(struct inode *inode,
if (!err)
map->m_flags |= EROFS_MAP_FULL_MAPPED;
}
unmap_out:
erofs_unmap_metabuf(&m.map->buf);
out:
erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
__func__, map->m_la, map->m_pa,
map->m_llen, map->m_plen, map->m_flags);
return err;
}
......
......@@ -267,6 +267,14 @@ struct netfs_cache_ops {
loff_t *_start, size_t *_len, loff_t i_size,
bool no_space_allocated_yet);
/* Prepare an on-demand read operation, shortening it to a cached/uncached
* boundary as appropriate.
*/
enum netfs_io_source (*prepare_ondemand_read)(struct netfs_cache_resources *cres,
loff_t start, size_t *_len,
loff_t i_size,
unsigned long *_flags, ino_t ino);
/* Query the occupancy of the cache in a region, returning where the
* next chunk of data starts and how long it is.
*/
......
......@@ -428,16 +428,18 @@ TRACE_EVENT(cachefiles_vol_coherency,
);
TRACE_EVENT(cachefiles_prep_read,
TP_PROTO(struct netfs_io_subrequest *sreq,
TP_PROTO(struct cachefiles_object *obj,
loff_t start,
size_t len,
unsigned short flags,
enum netfs_io_source source,
enum cachefiles_prepare_read_trace why,
ino_t cache_inode),
ino_t cache_inode, ino_t netfs_inode),
TP_ARGS(sreq, source, why, cache_inode),
TP_ARGS(obj, start, len, flags, source, why, cache_inode, netfs_inode),
TP_STRUCT__entry(
__field(unsigned int, rreq )
__field(unsigned short, index )
__field(unsigned int, obj )
__field(unsigned short, flags )
__field(enum netfs_io_source, source )
__field(enum cachefiles_prepare_read_trace, why )
......@@ -448,19 +450,18 @@ TRACE_EVENT(cachefiles_prep_read,
),
TP_fast_assign(
__entry->rreq = sreq->rreq->debug_id;
__entry->index = sreq->debug_index;
__entry->flags = sreq->flags;
__entry->obj = obj ? obj->debug_id : 0;
__entry->flags = flags;
__entry->source = source;
__entry->why = why;
__entry->len = sreq->len;
__entry->start = sreq->start;
__entry->netfs_inode = sreq->rreq->inode->i_ino;
__entry->len = len;
__entry->start = start;
__entry->netfs_inode = netfs_inode;
__entry->cache_inode = cache_inode;
),
TP_printk("R=%08x[%u] %s %s f=%02x s=%llx %zx ni=%x B=%x",
__entry->rreq, __entry->index,
TP_printk("o=%08x %s %s f=%02x s=%llx %zx ni=%x B=%x",
__entry->obj,
__print_symbolic(__entry->source, netfs_sreq_sources),
__print_symbolic(__entry->why, cachefiles_prepare_read_traces),
__entry->flags,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment