Commit 27ea1d87, authored by Lars Ellenberg, committed by Jens Axboe

drbd: al_write_transaction: skip re-scanning of bitmap page pointer array

For larger devices, the array of bitmap page pointers can grow very
large (8000 pointers per TB of storage).

For each activity log transaction, we need to flush the associated
bitmap pages to stable storage. Currently, we just "mark" the respective
pages while setting up the transaction, then tell the bitmap code to
write out all marked pages, but skip unchanged pages.

But one such transaction can affect only a small number of bitmap pages,
so there is no need to scan the full array of several (ten-)thousand
page pointers to find the few marked ones.

Instead, remember the index numbers of the few affected pages,
and later only re-check those to skip duplicates and unchanged ones.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
parent 13c2088d
...@@ -341,6 +341,8 @@ static int __al_write_transaction(struct drbd_device *device, struct al_transact ...@@ -341,6 +341,8 @@ static int __al_write_transaction(struct drbd_device *device, struct al_transact
i = 0; i = 0;
drbd_bm_reset_al_hints(device);
/* Even though no one can start to change this list /* Even though no one can start to change this list
* once we set the LC_LOCKED -- from drbd_al_begin_io(), * once we set the LC_LOCKED -- from drbd_al_begin_io(),
* lc_try_lock_for_transaction() --, someone may still * lc_try_lock_for_transaction() --, someone may still
......
...@@ -96,6 +96,13 @@ struct drbd_bitmap { ...@@ -96,6 +96,13 @@ struct drbd_bitmap {
struct page **bm_pages; struct page **bm_pages;
spinlock_t bm_lock; spinlock_t bm_lock;
/* exclusively to be used by __al_write_transaction(),
* drbd_bm_mark_for_writeout() and
* drbd_bm_write_hinted() -> bm_rw() called from there.
*/
unsigned int n_bitmap_hints;
unsigned int al_bitmap_hints[AL_UPDATES_PER_TRANSACTION];
/* see LIMITATIONS: above */ /* see LIMITATIONS: above */
unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */ unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
...@@ -242,6 +249,11 @@ static void bm_set_page_need_writeout(struct page *page) ...@@ -242,6 +249,11 @@ static void bm_set_page_need_writeout(struct page *page)
set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
} }
/**
 * drbd_bm_reset_al_hints() - forget all bitmap page hints remembered so far
 * @device:	DRBD device.
 *
 * Only the hint counter is reset to zero; the entries already stored in
 * the hint array are left untouched and get overwritten as new hints are
 * recorded.
 */
void drbd_bm_reset_al_hints(struct drbd_device *device)
{
	device->bitmap->n_bitmap_hints = 0;
}
/** /**
* drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
* @device: DRBD device. * @device: DRBD device.
...@@ -253,6 +265,7 @@ static void bm_set_page_need_writeout(struct page *page) ...@@ -253,6 +265,7 @@ static void bm_set_page_need_writeout(struct page *page)
*/ */
void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr) void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
{ {
struct drbd_bitmap *b = device->bitmap;
struct page *page; struct page *page;
if (page_nr >= device->bitmap->bm_number_of_pages) { if (page_nr >= device->bitmap->bm_number_of_pages) {
drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n", drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n",
...@@ -260,7 +273,9 @@ void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr) ...@@ -260,7 +273,9 @@ void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
return; return;
} }
page = device->bitmap->bm_pages[page_nr]; page = device->bitmap->bm_pages[page_nr];
set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)); BUG_ON(b->n_bitmap_hints >= ARRAY_SIZE(b->al_bitmap_hints));
if (!test_and_set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)))
b->al_bitmap_hints[b->n_bitmap_hints++] = page_nr;
} }
static int bm_test_page_unchanged(struct page *page) static int bm_test_page_unchanged(struct page *page)
...@@ -1030,7 +1045,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned ...@@ -1030,7 +1045,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
{ {
struct drbd_bm_aio_ctx *ctx; struct drbd_bm_aio_ctx *ctx;
struct drbd_bitmap *b = device->bitmap; struct drbd_bitmap *b = device->bitmap;
int num_pages, i, count = 0; unsigned int num_pages, i, count = 0;
unsigned long now; unsigned long now;
char ppb[10]; char ppb[10];
int err = 0; int err = 0;
...@@ -1078,16 +1093,37 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned ...@@ -1078,16 +1093,37 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
now = jiffies; now = jiffies;
/* let the layers below us try to merge these bios... */ /* let the layers below us try to merge these bios... */
for (i = 0; i < num_pages; i++) {
/* ignore completely unchanged pages */
if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
break;
if (!(flags & BM_AIO_READ)) {
if ((flags & BM_AIO_WRITE_HINTED) &&
!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
&page_private(b->bm_pages[i])))
continue;
if (flags & BM_AIO_READ) {
for (i = 0; i < num_pages; i++) {
atomic_inc(&ctx->in_flight);
bm_page_io_async(ctx, i);
++count;
cond_resched();
}
} else if (flags & BM_AIO_WRITE_HINTED) {
/* ASSERT: BM_AIO_WRITE_ALL_PAGES is not set. */
unsigned int hint;
for (hint = 0; hint < b->n_bitmap_hints; hint++) {
i = b->al_bitmap_hints[hint];
if (i >= num_pages) /* == -1U: no hint here. */
continue;
/* Several AL-extents may point to the same page. */
if (!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
&page_private(b->bm_pages[i])))
continue;
/* Has it even changed? */
if (bm_test_page_unchanged(b->bm_pages[i]))
continue;
atomic_inc(&ctx->in_flight);
bm_page_io_async(ctx, i);
++count;
}
} else {
for (i = 0; i < num_pages; i++) {
/* ignore completely unchanged pages */
if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
break;
if (!(flags & BM_AIO_WRITE_ALL_PAGES) && if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
bm_test_page_unchanged(b->bm_pages[i])) { bm_test_page_unchanged(b->bm_pages[i])) {
dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i); dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
...@@ -1100,11 +1136,11 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned ...@@ -1100,11 +1136,11 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned
dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i); dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i);
continue; continue;
} }
atomic_inc(&ctx->in_flight);
bm_page_io_async(ctx, i);
++count;
cond_resched();
} }
atomic_inc(&ctx->in_flight);
bm_page_io_async(ctx, i);
++count;
cond_resched();
} }
/* /*
......
...@@ -1378,6 +1378,7 @@ extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr); ...@@ -1378,6 +1378,7 @@ extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
extern int drbd_bm_read(struct drbd_device *device) __must_hold(local); extern int drbd_bm_read(struct drbd_device *device) __must_hold(local);
extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr); extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
extern int drbd_bm_write(struct drbd_device *device) __must_hold(local); extern int drbd_bm_write(struct drbd_device *device) __must_hold(local);
extern void drbd_bm_reset_al_hints(struct drbd_device *device) __must_hold(local);
extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local); extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local); extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local); extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment