Commit f57a6a5d authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Track dirtyness at sector level, not page

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent adfcfaf0
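In short: page->private no longer holds a single packed bch_page_state with page-wide counters (sectors, dirty_sectors, quota_reserved); it now points at a kzalloc'd bch_page_state containing one bch_page_sector per 512-byte sector, each tracking its own replica counts plus a small allocation state machine. The transitions, as a reader's summary of this patch (not text from the original commit message):

        /*
         * SECTOR_UNALLOCATED     -> SECTOR_QUOTA_RESERVED   bch2_get_page_quota_reservation()
         * UNALLOCATED/RESERVED   -> SECTOR_DIRTY            __bch2_set_page_dirty()
         * SECTOR_DIRTY           -> SECTOR_ALLOCATED        __bch2_writepage()
         * SECTOR_DIRTY           -> SECTOR_UNALLOCATED      bch2_clear_page_bits()
         * SECTOR_QUOTA_RESERVED  -> SECTOR_UNALLOCATED      bch2_put_page_reservation()
         * any (read completion)  -> SECTOR_ALLOCATED        bch2_add_page_sectors()
         */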
@@ -500,52 +500,113 @@ static inline struct bch_io_opts io_opts(struct bch_fs *c, struct bch_inode_info
 /* stored in page->private: */
-struct bch_page_state {
-	/* existing data: */
-	unsigned	sectors:PAGE_SECTOR_SHIFT + 1;
+struct bch_page_sector {
 	/* Uncompressed, fully allocated replicas: */
-	unsigned	nr_replicas:4;
+	unsigned	nr_replicas:3;
 	/* Owns PAGE_SECTORS * replicas_reserved sized reservation: */
-	unsigned	replicas_reserved:4;
-	/* Owns PAGE_SECTORS sized quota reservation: */
-	unsigned	quota_reserved:1;
-
-	/*
-	 * Number of sectors on disk - for i_blocks
-	 * Uncompressed size, not compressed size:
-	 */
-	unsigned	dirty_sectors:PAGE_SECTOR_SHIFT + 1;
+	unsigned	replicas_reserved:3;
+	/* i_sectors: */
+	enum {
+		SECTOR_UNALLOCATED,
+		SECTOR_QUOTA_RESERVED,
+		SECTOR_DIRTY,
+		SECTOR_ALLOCATED,
+	}		state:2;
+};
+
+struct bch_page_state {
+	struct bch_page_sector	s[PAGE_SECTORS];
 };
 
-static inline struct bch_page_state *page_state(struct page *page)
+static inline struct bch_page_state *__bch2_page_state(struct page *page)
 {
-	struct bch_page_state *s = (void *) &page->private;
-
-	EBUG_ON(!PageLocked(page));
-	BUILD_BUG_ON(sizeof(*s) > sizeof(page->private));
-
-	if (!PagePrivate(page))
-		SetPagePrivate(page);
+	return page_has_private(page)
+		? (struct bch_page_state *) page_private(page)
+		: NULL;
+}
+
+static inline struct bch_page_state *bch2_page_state(struct page *page)
+{
+	EBUG_ON(!PageLocked(page));
+
+	return __bch2_page_state(page);
+}
+
+/* for newly allocated pages: */
+static void __bch2_page_state_release(struct page *page)
+{
+	struct bch_page_state *s = __bch2_page_state(page);
+
+	if (!s)
+		return;
+
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	put_page(page);
+	kfree(s);
+}
+
+static void bch2_page_state_release(struct page *page)
+{
+	struct bch_page_state *s = bch2_page_state(page);
+
+	if (!s)
+		return;
+
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	put_page(page);
+	kfree(s);
+}
+
+/* for newly allocated pages: */
+static struct bch_page_state *__bch2_page_state_create(struct page *page,
+						       gfp_t gfp)
+{
+	struct bch_page_state *s;
+
+	s = kzalloc(sizeof(*s), GFP_NOFS|gfp);
+	if (!s)
+		return NULL;
+
+	/*
+	 * migrate_page_move_mapping() assumes that pages with private data
+	 * have their count elevated by 1.
+	 */
+	get_page(page);
+	set_page_private(page, (unsigned long) s);
+	SetPagePrivate(page);
 	return s;
 }
 
+static struct bch_page_state *bch2_page_state_create(struct page *page,
+						     gfp_t gfp)
+{
+	return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp);
+}
+
 static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
 				      struct page *page)
 {
-	struct bch_page_state *s = page_state(page);
-	struct disk_reservation disk_res = {
-		.sectors = s->replicas_reserved * PAGE_SECTORS
-	};
-	struct quota_res quota_res = {
-		.sectors = s->quota_reserved ? PAGE_SECTORS : 0
-	};
-
-	s->replicas_reserved = 0;
-	s->quota_reserved = 0;
+	struct bch_page_state *s = bch2_page_state(page);
+	struct disk_reservation disk_res = { 0 };
+	struct quota_res quota_res = { 0 };
+	unsigned i;
+
+	if (!s)
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(s->s); i++) {
+		disk_res.sectors += s->s[i].replicas_reserved;
+		s->s[i].replicas_reserved = 0;
+
+		if (s->s[i].state == SECTOR_QUOTA_RESERVED) {
+			quota_res.sectors++;
+			s->s[i].state = SECTOR_UNALLOCATED;
+		}
+	}
 
 	bch2_quota_reservation_put(c, inode, &quota_res);
 	bch2_disk_reservation_put(c, &disk_res);
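A note on the new lifecycle: __bch2_page_state_create() takes an extra page reference because migrate_page_move_mapping() assumes pages with private data have their count elevated by 1 (per the comment in the hunk above), and the release helpers drop that reference and kfree the state. A minimal caller sketch, illustrative only:

        struct bch_page_state *s = bch2_page_state_create(page, 0);
        if (!s)
                return -ENOMEM;         /* kzalloc failed; no __GFP_NOFAIL here */
        /* ... update s->s[i] under the page lock ... */
        bch2_page_state_release(page);  /* ClearPagePrivate + put_page + kfree */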
@@ -559,77 +620,133 @@ static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info
 		: c->opts.data_replicas;
 }
 
-static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
-				     struct page *page, bool check_enospc)
+static inline unsigned sectors_to_reserve(struct bch_page_sector *s,
+					  unsigned nr_replicas)
 {
-	struct bch_page_state *s = page_state(page);
+	return max(0, (int) nr_replicas -
+		   s->nr_replicas -
+		   s->replicas_reserved);
+}
+
+static int bch2_get_page_disk_reservation(struct bch_fs *c,
+					  struct bch_inode_info *inode,
+					  struct page *page, bool check_enospc)
+{
+	struct bch_page_state *s = bch2_page_state_create(page, 0);
 	unsigned nr_replicas = inode_nr_replicas(c, inode);
-	struct disk_reservation disk_res;
+	struct disk_reservation disk_res = { 0 };
+	unsigned i, disk_res_sectors = 0;
+	int ret;
+
+	if (!s)
+		return -ENOMEM;
+
+	for (i = 0; i < ARRAY_SIZE(s->s); i++)
+		disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
+
+	if (!disk_res_sectors)
+		return 0;
+
+	ret = bch2_disk_reservation_get(c, &disk_res,
+					disk_res_sectors, 1,
+					!check_enospc
+					? BCH_DISK_RESERVATION_NOFAIL
+					: 0);
+	if (unlikely(ret))
+		return ret;
+
+	for (i = 0; i < ARRAY_SIZE(s->s); i++)
+		s->s[i].replicas_reserved +=
+			sectors_to_reserve(&s->s[i], nr_replicas);
+
+	return 0;
+}
+
+static int bch2_get_page_quota_reservation(struct bch_fs *c,
+					   struct bch_inode_info *inode,
+					   struct page *page, bool check_enospc)
+{
+	struct bch_page_state *s = bch2_page_state_create(page, 0);
 	struct quota_res quota_res = { 0 };
+	unsigned i, quota_res_sectors = 0;
 	int ret;
 
-	EBUG_ON(!PageLocked(page));
+	if (!s)
+		return -ENOMEM;
 
-	if (s->replicas_reserved < nr_replicas) {
-		ret = bch2_disk_reservation_get(c, &disk_res, PAGE_SECTORS,
-				nr_replicas - s->replicas_reserved,
-				!check_enospc ? BCH_DISK_RESERVATION_NOFAIL : 0);
-		if (unlikely(ret))
-			return ret;
+	for (i = 0; i < ARRAY_SIZE(s->s); i++)
+		quota_res_sectors += s->s[i].state == SECTOR_UNALLOCATED;
 
-		s->replicas_reserved += disk_res.nr_replicas;
-	}
+	if (!quota_res_sectors)
+		return 0;
 
-	if (!s->quota_reserved &&
-	    s->sectors + s->dirty_sectors < PAGE_SECTORS) {
-		ret = bch2_quota_reservation_add(c, inode, &quota_res,
-						 PAGE_SECTORS,
-						 check_enospc);
-		if (unlikely(ret))
-			return ret;
+	ret = bch2_quota_reservation_add(c, inode, &quota_res,
+					 quota_res_sectors,
+					 check_enospc);
+	if (unlikely(ret))
+		return ret;
 
-		s->quota_reserved = 1;
-	}
+	for (i = 0; i < ARRAY_SIZE(s->s); i++)
+		if (s->s[i].state == SECTOR_UNALLOCATED)
+			s->s[i].state = SECTOR_QUOTA_RESERVED;
 
 	return 0;
 }
 
+static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
+				     struct page *page, bool check_enospc)
+{
+	return bch2_get_page_disk_reservation(c, inode, page, check_enospc) ?:
+		bch2_get_page_quota_reservation(c, inode, page, check_enospc);
+}
+
 static void bch2_clear_page_bits(struct page *page)
 {
 	struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch_page_state *s;
+	struct bch_page_state *s = bch2_page_state(page);
+	int i, dirty_sectors = 0;
 
-	EBUG_ON(!PageLocked(page));
-
-	if (!PagePrivate(page))
+	if (!s)
 		return;
 
-	s = page_state(page);
+	for (i = 0; i < ARRAY_SIZE(s->s); i++) {
+		if (s->s[i].state == SECTOR_DIRTY) {
+			dirty_sectors++;
+			s->s[i].state = SECTOR_UNALLOCATED;
+		}
+	}
 
-	if (s->dirty_sectors)
-		i_sectors_acct(c, inode, NULL, -((int) s->dirty_sectors));
+	if (dirty_sectors)
+		i_sectors_acct(c, inode, NULL, -dirty_sectors);
 
 	bch2_put_page_reservation(c, inode, page);
-
-	ClearPagePrivate(page);
-	set_page_private(page, 0);
+	bch2_page_state_release(page);
 }
 
 static void __bch2_set_page_dirty(struct address_space *mapping, struct folio *folio)
 {
 	struct bch_inode_info *inode = to_bch_ei(mapping->host);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
-	struct bch_page_state *s = page_state(&folio->page);
-	struct quota_res quota_res = { s->quota_reserved * PAGE_SECTORS };
-	unsigned dirty_sectors = PAGE_SECTORS - s->sectors;
+	struct bch_page_state *s = bch2_page_state(&folio->page);
+	struct quota_res quota_res = { 0 };
+	unsigned i, dirty_sectors = 0;
 
-	s->quota_reserved = 0;
+	BUG_ON(!s);
 
-	if (s->dirty_sectors != dirty_sectors)
-		i_sectors_acct(c, inode, &quota_res,
-			       dirty_sectors - s->dirty_sectors);
-	s->dirty_sectors = dirty_sectors;
+	for (i = 0; i < ARRAY_SIZE(s->s); i++) {
+		if (s->s[i].state == SECTOR_QUOTA_RESERVED)
+			quota_res.sectors++;
+
+		if (s->s[i].state == SECTOR_UNALLOCATED ||
+		    s->s[i].state == SECTOR_QUOTA_RESERVED) {
+			s->s[i].state = SECTOR_DIRTY;
+			dirty_sectors++;
+		}
+	}
+
+	if (dirty_sectors)
+		i_sectors_acct(c, inode, &quota_res, dirty_sectors);
 
 	bch2_quota_reservation_put(c, inode, &quota_res);
 }
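sectors_to_reserve() computes each sector's shortfall against the inode's replication goal, clamped at zero since a sector may already have more replicas allocated or reserved than requested. Worked examples with illustrative values, assuming inode_nr_replicas() == 2:

        /*
         * nr_replicas 1, replicas_reserved 0  ->  max(0, 2 - 1 - 0) == 1
         * nr_replicas 0, replicas_reserved 2  ->  max(0, 2 - 0 - 2) == 0
         * nr_replicas 3, replicas_reserved 0  ->  max(0, 2 - 3 - 0) == 0
         */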
@@ -796,6 +913,7 @@ static int readpages_iter_init(struct readpages_iter *iter,
 	__readahead_batch(ractl, iter->pages, nr_pages);
 
 	for (i = 0; i < nr_pages; i++) {
+		__bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
 		put_page(iter->pages[i]);
 	}
@@ -809,7 +927,6 @@ static inline struct page *readpage_iter_next(struct readpages_iter *iter)
 	EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx);
 
-	page_state_init_for_read(iter->pages[iter->idx]);
 	return iter->pages[iter->idx];
 }
@@ -819,21 +936,20 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
 	struct bio_vec bv;
 	unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k);
 
-	bio_for_each_segment(bv, bio, iter) {
-		/* brand new pages, don't need to be locked: */
-		struct bch_page_state *s = page_state(bv.bv_page);
+	BUG_ON(bio->bi_iter.bi_sector < bkey_start_offset(k.k));
+	BUG_ON(bio_end_sector(bio) > k.k->p.offset);
 
-		/* sectors in @k from the start of this page: */
-		unsigned k_sectors = k.k->size - (iter.bi_sector - k.k->p.offset);
-		unsigned page_sectors = min(bv.bv_len >> 9, k_sectors);
+	bio_for_each_segment(bv, bio, iter) {
+		struct bch_page_state *s = bch2_page_state(bv.bv_page);
+		unsigned i;
 
-		s->nr_replicas = page_sectors == PAGE_SECTORS
-			? nr_ptrs : 0;
-
-		BUG_ON(s->sectors + page_sectors > PAGE_SECTORS);
-		s->sectors += page_sectors;
+		for (i = bv.bv_offset >> 9;
+		     i < (bv.bv_offset + bv.bv_len) >> 9;
+		     i++) {
+			s->s[i].nr_replicas = nr_ptrs;
+			s->s[i].state = SECTOR_ALLOCATED;
+		}
 	}
 }
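The loop bounds index sectors within the page directly: with 512-byte sectors, bv.bv_offset >> 9 is the first sector the bvec touches and (bv.bv_offset + bv.bv_len) >> 9 is one past the last. For example (assuming 4K pages, so PAGE_SECTORS == 8), a bvec with bv_offset == 1024 and bv_len == 3072 marks s->s[2] through s->s[7] allocated.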
@@ -864,12 +980,15 @@ static void readpage_bio_extend(struct readpages_iter *iter,
 		if (!page)
 			break;
 
-		page_state_init_for_read(page);
+		if (!__bch2_page_state_create(page, 0)) {
+			put_page(page);
+			break;
+		}
 
 		ret = add_to_page_cache_lru(page, iter->mapping,
 					    page_offset, GFP_NOFS);
 		if (ret) {
-			ClearPagePrivate(page);
+			__bch2_page_state_release(page);
 			put_page(page);
 			break;
 		}
@@ -1007,7 +1126,7 @@ static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
 	struct btree_trans trans;
 	struct btree_iter *iter;
 
-	page_state_init_for_read(page);
+	bch2_page_state_create(page, __GFP_NOFAIL);
 
 	rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
 	bio_add_page_contig(&rbio->bio, page);
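Note that the mandatory read paths pass __GFP_NOFAIL (here and in readpages_iter_init() above), so by the time bch2_add_page_sectors() calls bch2_page_state() the state is guaranteed to exist; only the opportunistic readahead extension in readpage_bio_extend() tolerates allocation failure and simply stops extending.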
@@ -1175,10 +1294,11 @@ static int __bch2_writepage(struct folio *folio,
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct bch_writepage_state *w = data;
 	struct bch_page_state *s;
-	unsigned offset, nr_replicas_this_write;
-	unsigned dirty_sectors, replicas_reserved;
+	unsigned offset, nr_replicas_this_write = U32_MAX;
+	unsigned dirty_sectors = 0, reserved_sectors = 0;
 	loff_t i_size = i_size_read(&inode->v);
 	pgoff_t end_index = i_size >> PAGE_SHIFT;
+	unsigned i;
 	int ret;
 
 	EBUG_ON(!PageUptodate(page));
@@ -1203,9 +1323,9 @@ static int __bch2_writepage(struct folio *folio,
 	 */
 	zero_user_segment(page, offset, PAGE_SIZE);
 do_io:
-	s = page_state(page);
+	s = bch2_page_state_create(page, __GFP_NOFAIL);
 
-	ret = bch2_get_page_reservation(c, inode, page, true);
+	ret = bch2_get_page_disk_reservation(c, inode, page, true);
 	if (ret) {
 		SetPageError(page);
 		mapping_set_error(page->mapping, ret);
@@ -1213,27 +1333,24 @@ static int __bch2_writepage(struct folio *folio,
 		return 0;
 	}
 
-	__bch2_set_page_dirty(page->mapping, page_folio(page));
-
-	nr_replicas_this_write =
-		max_t(unsigned,
-		      s->replicas_reserved,
-		      (s->sectors == PAGE_SECTORS
-		       ? s->nr_replicas : 0));
-
-	s->nr_replicas = w->opts.compression
-		? 0
-		: nr_replicas_this_write;
+	for (i = 0; i < PAGE_SECTORS; i++)
+		nr_replicas_this_write =
+			min_t(unsigned, nr_replicas_this_write,
+			      s->s[i].nr_replicas +
+			      s->s[i].replicas_reserved);
 
 	/* Before unlocking the page, transfer reservation to w->io: */
-	replicas_reserved = s->replicas_reserved;
-	s->replicas_reserved = 0;
-
-	dirty_sectors = s->dirty_sectors;
-	s->dirty_sectors = 0;
-
-	s->sectors += dirty_sectors;
-	BUG_ON(s->sectors != PAGE_SECTORS);
+	for (i = 0; i < PAGE_SECTORS; i++) {
+		s->s[i].nr_replicas = w->opts.compression
+			? 0 : nr_replicas_this_write;
+
+		reserved_sectors += s->s[i].replicas_reserved;
+		s->s[i].replicas_reserved = 0;
+
+		dirty_sectors += s->s[i].state == SECTOR_DIRTY;
+		s->s[i].state = SECTOR_ALLOCATED;
+	}
 
 	BUG_ON(PageWriteback(page));
 	set_page_writeback(page);
@@ -1253,7 +1370,7 @@ static int __bch2_writepage(struct folio *folio,
 	BUG_ON(inode != w->io->op.inode);
 	BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page));
 
-	w->io->op.op.res.sectors += replicas_reserved * PAGE_SECTORS;
+	w->io->op.op.res.sectors += reserved_sectors;
 	w->io->op.new_i_size = i_size;
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
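Because reservations are now tracked per sector, the hand-off to the write I/O is exact: a page where, say, only 3 of 8 sectors carry a 2-replica reservation contributes reserved_sectors == 6 to op.res.sectors, where the old code would have transferred replicas_reserved * PAGE_SECTORS == 16 (illustrative numbers, assuming 4K pages).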
@@ -2637,12 +2754,17 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
 static bool folio_is_data(struct folio *folio)
 {
-	EBUG_ON(!PageLocked(&folio->page));
+	struct bch_page_state *s = bch2_page_state(&folio->page);
+	unsigned i;
 
-	/* XXX: should only have to check PageDirty */
-	return folio_test_private(folio) &&
-		(page_state(&folio->page)->sectors ||
-		 page_state(&folio->page)->dirty_sectors);
+	if (!s)
+		return false;
+
+	for (i = 0; i < PAGE_SECTORS; i++)
+		if (s->s[i].state >= SECTOR_DIRTY)
+			return true;
+
+	return false;
 }
 
 static loff_t bch2_next_pagecache_data(struct inode *vinode,
...