Commit 9ccce092 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus-5.13-ofs-1' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux

Pull orangefs updates from Mike Marshall:
 "orangefs: implement orangefs_readahead

  mm/readahead.c/read_pages was quite a bit different back when I put my
  open-coded readahead logic into orangefs_readpage. That logic seemed
  to work as designed back then, but it is a trainwreck now.

  This implements orangefs_readahead using the new xarray and
  readahead_expand features and removes all my open-coded readahead
  logic.

  This results in an extreme read performance improvement. These sample
  numbers are from my test VM:

  Here's an example of what's upstream in
  5.11.8-200.fc33.x86_64:

     30+0 records in
     30+0 records out
     125829120 bytes (126 MB, 120 MiB) copied, 5.77943 s, 21.8 MB/s

  And here's this version of orangefs_readahead on top of 5.12.0-rc4:

     30+0 records in
     30+0 records out
     125829120 bytes (126 MB, 120 MiB) copied, 0.325919 s, 386 MB/s

  There are four xfstest regressions with this patch. David Howells and
  Matthew Wilcox have been helping me work with this code"

* tag 'for-linus-5.13-ofs-1' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux:
  orangefs: leave files in the page cache for a few microseconds at least
  orangefs: implement orangefs_readahead.
parents 27787ba3 211f9f2e
......@@ -248,20 +248,6 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
* or it can be pointers to struct page's
*/
/*
* When reading, readahead_size will only be zero when
* we're doing O_DIRECT, otherwise we got here from
* orangefs_readpage.
*
* If we got here from orangefs_readpage we want to
* copy either a page or the whole file into the io
* vector, whichever is smaller.
*/
if (readahead_size)
copy_amount =
min(new_op->downcall.resp.io.amt_complete,
(__s64)PAGE_SIZE);
else
copy_amount = new_op->downcall.resp.io.amt_complete;
ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index,
......@@ -283,19 +269,11 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
out:
if (buffer_index >= 0) {
if ((readahead_size) && (type == ORANGEFS_IO_READ)) {
/* readpage */
*index_return = buffer_index;
gossip_debug(GOSSIP_FILE_DEBUG,
"%s: hold on to buffer_index :%d:\n",
__func__, buffer_index);
} else {
/* O_DIRECT */
orangefs_bufmap_put(buffer_index);
gossip_debug(GOSSIP_FILE_DEBUG,
"%s(%pU): PUT buffer_index %d\n",
__func__, handle, buffer_index);
}
buffer_index = -1;
}
op_release(new_op);
return ret;
......
......@@ -245,6 +245,50 @@ static int orangefs_writepages(struct address_space *mapping,
static int orangefs_launder_page(struct page *);
static void orangefs_readahead(struct readahead_control *rac)
{
loff_t offset;
struct iov_iter iter;
struct file *file = rac->file;
struct inode *inode = file->f_mapping->host;
struct xarray *i_pages;
struct page *page;
loff_t new_start = readahead_pos(rac);
int ret;
size_t new_len = 0;
loff_t bytes_remaining = inode->i_size - readahead_pos(rac);
loff_t pages_remaining = bytes_remaining / PAGE_SIZE;
if (pages_remaining >= 1024)
new_len = 4194304;
else if (pages_remaining > readahead_count(rac))
new_len = bytes_remaining;
if (new_len)
readahead_expand(rac, new_start, new_len);
offset = readahead_pos(rac);
i_pages = &file->f_mapping->i_pages;
iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac));
/* read in the pages. */
if ((ret = wait_for_direct_io(ORANGEFS_IO_READ, inode,
&offset, &iter, readahead_length(rac),
inode->i_size, NULL, NULL, file)) < 0)
gossip_debug(GOSSIP_FILE_DEBUG,
"%s: wait_for_direct_io failed. \n", __func__);
else
ret = 0;
/* clean up. */
while ((page = readahead_page(rac))) {
page_endio(page, false, ret);
put_page(page);
}
}
static int orangefs_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
......@@ -252,44 +296,24 @@ static int orangefs_readpage(struct file *file, struct page *page)
struct bio_vec bv;
ssize_t ret;
loff_t off; /* offset into this page */
pgoff_t index; /* which page */
struct page *next_page;
char *kaddr;
loff_t read_size;
int buffer_index = -1; /* orangefs shared memory slot */
int slot_index; /* index into slot */
int remaining;
/*
* Get up to this many bytes from Orangefs at a time and try
* to fill them into the page cache at once. Tests with dd made
* this seem like a reasonable static number, if there was
* interest perhaps this number could be made setable through
* sysfs...
*/
read_size = 524288;
if (PageDirty(page))
orangefs_launder_page(page);
off = page_offset(page);
index = off >> PAGE_SHIFT;
bv.bv_page = page;
bv.bv_len = PAGE_SIZE;
bv.bv_offset = 0;
iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);
ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
read_size, inode->i_size, NULL, &buffer_index, file);
remaining = ret;
PAGE_SIZE, inode->i_size, NULL, NULL, file);
/* this will only zero remaining unread portions of the page data */
iov_iter_zero(~0U, &iter);
/* takes care of potential aliasing */
flush_dcache_page(page);
if (ret < 0) {
SetPageError(page);
unlock_page(page);
goto out;
} else {
SetPageUptodate(page);
if (PageError(page))
......@@ -298,59 +322,6 @@ static int orangefs_readpage(struct file *file, struct page *page)
}
/* unlock the page after the ->readpage() routine completes */
unlock_page(page);
if (remaining > PAGE_SIZE) {
slot_index = 0;
while ((remaining - PAGE_SIZE) >= PAGE_SIZE) {
remaining -= PAGE_SIZE;
/*
* It is an optimization to try and fill more than one
* page... by now we've already gotten the single
* page we were after, if stuff doesn't seem to
* be going our way at this point just return
* and hope for the best.
*
* If we look for pages and they're already there is
* one reason to give up, and if they're not there
* and we can't create them is another reason.
*/
index++;
slot_index++;
next_page = find_get_page(inode->i_mapping, index);
if (next_page) {
gossip_debug(GOSSIP_FILE_DEBUG,
"%s: found next page, quitting\n",
__func__);
put_page(next_page);
goto out;
}
next_page = find_or_create_page(inode->i_mapping,
index,
GFP_KERNEL);
/*
* I've never hit this, leave it as a printk for
* now so it will be obvious.
*/
if (!next_page) {
printk("%s: can't create next page, quitting\n",
__func__);
goto out;
}
kaddr = kmap_atomic(next_page);
orangefs_bufmap_page_fill(kaddr,
buffer_index,
slot_index);
kunmap_atomic(kaddr);
SetPageUptodate(next_page);
unlock_page(next_page);
put_page(next_page);
}
}
out:
if (buffer_index != -1)
orangefs_bufmap_put(buffer_index);
return ret;
}
......@@ -660,6 +631,7 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
/** ORANGEFS2 implementation of address space operations */
static const struct address_space_operations orangefs_address_operations = {
.writepage = orangefs_writepage,
.readahead = orangefs_readahead,
.readpage = orangefs_readpage,
.writepages = orangefs_writepages,
.set_page_dirty = __set_page_dirty_nobuffers,
......
......@@ -31,7 +31,7 @@ static ulong module_parm_debug_mask;
__u64 orangefs_gossip_debug_mask;
int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
int orangefs_cache_timeout_msecs = 50;
int orangefs_cache_timeout_msecs = 500;
int orangefs_dcache_timeout_msecs = 50;
int orangefs_getattr_timeout_msecs = 50;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment