Commit 9ae30597, authored by Andrew Morton, committed by Linus Torvalds

[PATCH] VM dirty page balancing

- The balance_dirty_pages() logic is simply wrong.  It goes:

	if (value > threshold)
		go_and_write(value - threshold);

  which is just fine for a single process writing data.  But
  for many processes, they *all* go and bring things back into
  balance, and too much data gets written out.

- The

	go_and_write(this much)

  logic is inoperative, because I turned off the ->writeback_mapping()
  function in ext2.  So a call to writeback_unlocked_inodes(this_much)
  doesn't actually decrement and test *this_much.  It will walk every
  inode, all the time.  Silly.

With both of the above quickly fixed, the amount of dirty+writeback
memory in the machine stabilises nicely at 500 megabytes across
the run.
parent afae6f7c
...@@ -591,6 +591,8 @@ struct address_space_operations ext2_aops = { ...@@ -591,6 +591,8 @@ struct address_space_operations ext2_aops = {
commit_write: generic_commit_write, commit_write: generic_commit_write,
bmap: ext2_bmap, bmap: ext2_bmap,
direct_IO: ext2_direct_IO, direct_IO: ext2_direct_IO,
writeback_mapping: generic_writeback_mapping,
vm_writeback: generic_vm_writeback,
}; };
/* /*
......
...@@ -159,7 +159,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, ...@@ -159,7 +159,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
"SwapTotal: %8lu kB\n" "SwapTotal: %8lu kB\n"
"SwapFree: %8lu kB\n" "SwapFree: %8lu kB\n"
"Dirty: %8lu kB\n" "Dirty: %8lu kB\n"
"Locked: %8lu kB\n", "Writeback: %8lu kB\n",
K(i.totalram), K(i.totalram),
K(i.freeram), K(i.freeram),
K(i.sharedram), K(i.sharedram),
...@@ -175,7 +175,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, ...@@ -175,7 +175,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
K(i.totalswap), K(i.totalswap),
K(i.freeswap), K(i.freeswap),
K(ps.nr_dirty), K(ps.nr_dirty),
K(ps.nr_locked) K(ps.nr_writeback)
); );
return proc_calc_metrics(page, start, off, count, eof, len); return proc_calc_metrics(page, start, off, count, eof, len);
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
* *
* The PG_private bitflag is set if page->private contains a valid value. * The PG_private bitflag is set if page->private contains a valid value.
* *
* During disk I/O, PG_locked_dontuse is used. This bit is set before I/O and * During disk I/O, PG_locked is used. This bit is set before I/O and
* reset when I/O completes. page_waitqueue(page) is a wait queue of all tasks * reset when I/O completes. page_waitqueue(page) is a wait queue of all tasks
* waiting for the I/O on this page to complete. * waiting for the I/O on this page to complete.
* *
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
* *
* Note that the referenced bit, the page->lru list_head and the active, * Note that the referenced bit, the page->lru list_head and the active,
* inactive_dirty and inactive_clean lists are protected by the * inactive_dirty and inactive_clean lists are protected by the
* pagemap_lru_lock, and *NOT* by the usual PG_locked_dontuse bit! * pagemap_lru_lock, and *NOT* by the usual PG_locked bit!
* *
* PG_error is set to indicate that an I/O error occurred on this page. * PG_error is set to indicate that an I/O error occurred on this page.
* *
...@@ -47,7 +47,7 @@ ...@@ -47,7 +47,7 @@
* locked- and dirty-page accounting. The top eight bits of page->flags are * locked- and dirty-page accounting. The top eight bits of page->flags are
* used for page->zone, so putting flag bits there doesn't work. * used for page->zone, so putting flag bits there doesn't work.
*/ */
#define PG_locked_dontuse 0 /* Page is locked. Don't touch. */ #define PG_locked 0 /* Page is locked. Don't touch. */
#define PG_error 1 #define PG_error 1
#define PG_referenced 2 #define PG_referenced 2
#define PG_uptodate 3 #define PG_uptodate 3
...@@ -71,7 +71,7 @@ ...@@ -71,7 +71,7 @@
*/ */
extern struct page_state { extern struct page_state {
unsigned long nr_dirty; unsigned long nr_dirty;
unsigned long nr_locked; unsigned long nr_writeback;
unsigned long nr_pagecache; unsigned long nr_pagecache;
} ____cacheline_aligned_in_smp page_states[NR_CPUS]; } ____cacheline_aligned_in_smp page_states[NR_CPUS];
...@@ -91,37 +91,16 @@ extern void get_page_state(struct page_state *ret); ...@@ -91,37 +91,16 @@ extern void get_page_state(struct page_state *ret);
/* /*
* Manipulation of page state flags * Manipulation of page state flags
*/ */
#define PageLocked(page) test_bit(PG_locked_dontuse, &(page)->flags) #define PageLocked(page) \
test_bit(PG_locked, &(page)->flags)
#define SetPageLocked(page) \ #define SetPageLocked(page) \
do { \ set_bit(PG_locked, &(page)->flags)
if (!test_and_set_bit(PG_locked_dontuse, \
&(page)->flags)) \
inc_page_state(nr_locked); \
} while (0)
#define TestSetPageLocked(page) \ #define TestSetPageLocked(page) \
({ \ test_and_set_bit(PG_locked, &(page)->flags)
int ret; \
ret = test_and_set_bit(PG_locked_dontuse, \
&(page)->flags); \
if (!ret) \
inc_page_state(nr_locked); \
ret; \
})
#define ClearPageLocked(page) \ #define ClearPageLocked(page) \
do { \ clear_bit(PG_locked, &(page)->flags)
if (test_and_clear_bit(PG_locked_dontuse, \
&(page)->flags)) \
dec_page_state(nr_locked); \
} while (0)
#define TestClearPageLocked(page) \ #define TestClearPageLocked(page) \
({ \ test_and_clear_bit(PG_locked, &(page)->flags)
int ret; \
ret = test_and_clear_bit(PG_locked_dontuse, \
&(page)->flags); \
if (ret) \
dec_page_state(nr_locked); \
ret; \
})
#define PageError(page) test_bit(PG_error, &(page)->flags) #define PageError(page) test_bit(PG_error, &(page)->flags)
#define SetPageError(page) set_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags)
...@@ -201,12 +180,36 @@ extern void get_page_state(struct page_state *ret); ...@@ -201,12 +180,36 @@ extern void get_page_state(struct page_state *ret);
#define PagePrivate(page) test_bit(PG_private, &(page)->flags) #define PagePrivate(page) test_bit(PG_private, &(page)->flags)
#define PageWriteback(page) test_bit(PG_writeback, &(page)->flags) #define PageWriteback(page) test_bit(PG_writeback, &(page)->flags)
#define SetPageWriteback(page) set_bit(PG_writeback, &(page)->flags) #define SetPageWriteback(page) \
#define ClearPageWriteback(page) clear_bit(PG_writeback, &(page)->flags) do { \
if (!test_and_set_bit(PG_writeback, \
&(page)->flags)) \
inc_page_state(nr_writeback); \
} while (0)
#define TestSetPageWriteback(page) \ #define TestSetPageWriteback(page) \
test_and_set_bit(PG_writeback, &(page)->flags) ({ \
int ret; \
ret = test_and_set_bit(PG_writeback, \
&(page)->flags); \
if (!ret) \
inc_page_state(nr_writeback); \
ret; \
})
#define ClearPageWriteback(page) \
do { \
if (test_and_clear_bit(PG_writeback, \
&(page)->flags)) \
dec_page_state(nr_writeback); \
} while (0)
#define TestClearPageWriteback(page) \ #define TestClearPageWriteback(page) \
test_and_clear_bit(PG_writeback, &(page)->flags) ({ \
int ret; \
ret = test_and_clear_bit(PG_writeback, \
&(page)->flags); \
if (ret) \
dec_page_state(nr_writeback); \
ret; \
})
/* /*
* The PageSwapCache predicate doesn't use a PG_flag at this time, * The PageSwapCache predicate doesn't use a PG_flag at this time,
......
...@@ -628,7 +628,7 @@ static void wait_on_page_bit(struct page *page, int bit_nr) ...@@ -628,7 +628,7 @@ static void wait_on_page_bit(struct page *page, int bit_nr)
*/ */
void ___wait_on_page_locked(struct page *page) void ___wait_on_page_locked(struct page *page)
{ {
wait_on_page_bit(page, PG_locked_dontuse); wait_on_page_bit(page, PG_locked);
} }
EXPORT_SYMBOL(___wait_on_page_locked); EXPORT_SYMBOL(___wait_on_page_locked);
......
...@@ -29,12 +29,12 @@ ...@@ -29,12 +29,12 @@
/* /*
* Start background writeback (via pdflush) at this level * Start background writeback (via pdflush) at this level
*/ */
static int dirty_background_ratio = 30; static int dirty_background_ratio = 40;
/* /*
* The generator of dirty data starts async writeback at this level * The generator of dirty data starts async writeback at this level
*/ */
static int dirty_async_ratio = 45; static int dirty_async_ratio = 50;
/* /*
* The generator of dirty data performs sync writeout at this level * The generator of dirty data performs sync writeout at this level
...@@ -62,25 +62,28 @@ void balance_dirty_pages(struct address_space *mapping) ...@@ -62,25 +62,28 @@ void balance_dirty_pages(struct address_space *mapping)
int async_thresh; int async_thresh;
int sync_thresh; int sync_thresh;
int wake_pdflush = 0; int wake_pdflush = 0;
unsigned long dirty_and_locked; unsigned long dirty_and_writeback;
get_page_state(&ps); get_page_state(&ps);
dirty_and_locked = ps.nr_dirty + ps.nr_locked; dirty_and_writeback = ps.nr_dirty + ps.nr_writeback;
background_thresh = (dirty_background_ratio * tot) / 100; background_thresh = (dirty_background_ratio * tot) / 100;
async_thresh = (dirty_async_ratio * tot) / 100; async_thresh = (dirty_async_ratio * tot) / 100;
sync_thresh = (dirty_sync_ratio * tot) / 100; sync_thresh = (dirty_sync_ratio * tot) / 100;
if (dirty_and_locked > sync_thresh) { if (dirty_and_writeback > sync_thresh) {
int nr_to_write = dirty_and_locked - async_thresh; int nr_to_write = 1500;
printk("sync thresh\n");
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_LAST, NULL); writeback_unlocked_inodes(&nr_to_write, WB_SYNC_LAST, NULL);
get_page_state(&ps);
dirty_and_writeback = ps.nr_dirty + ps.nr_writeback;
wake_pdflush = 1; wake_pdflush = 1;
} else if (dirty_and_locked > async_thresh) { } else if (dirty_and_writeback > async_thresh) {
int nr_to_write = dirty_and_locked - async_thresh; int nr_to_write = 1500;
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL); writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
} else if (dirty_and_locked > background_thresh) { } else if (dirty_and_writeback > background_thresh) {
wake_pdflush = 1; wake_pdflush = 1;
} }
...@@ -88,9 +91,8 @@ void balance_dirty_pages(struct address_space *mapping) ...@@ -88,9 +91,8 @@ void balance_dirty_pages(struct address_space *mapping)
/* /*
* There is no flush thread against this device. Start one now. * There is no flush thread against this device. Start one now.
*/ */
get_page_state(&ps); if (dirty_and_writeback > async_thresh) {
if (ps.nr_dirty > 0) { pdflush_flush(dirty_and_writeback - async_thresh);
pdflush_flush(ps.nr_dirty);
yield(); yield();
} }
} }
...@@ -109,7 +111,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) ...@@ -109,7 +111,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
preempt_disable(); preempt_disable();
cpu = smp_processor_id(); cpu = smp_processor_id();
if (ratelimits[cpu].count++ >= 32) { if (ratelimits[cpu].count++ >= 1000) {
ratelimits[cpu].count = 0; ratelimits[cpu].count = 0;
preempt_enable(); preempt_enable();
balance_dirty_pages(mapping); balance_dirty_pages(mapping);
......
...@@ -584,7 +584,7 @@ void get_page_state(struct page_state *ret) ...@@ -584,7 +584,7 @@ void get_page_state(struct page_state *ret)
int pcpu; int pcpu;
ret->nr_dirty = 0; ret->nr_dirty = 0;
ret->nr_locked = 0; ret->nr_writeback = 0;
ret->nr_pagecache = 0; ret->nr_pagecache = 0;
for (pcpu = 0; pcpu < smp_num_cpus; pcpu++) { for (pcpu = 0; pcpu < smp_num_cpus; pcpu++) {
...@@ -592,7 +592,7 @@ void get_page_state(struct page_state *ret) ...@@ -592,7 +592,7 @@ void get_page_state(struct page_state *ret)
ps = &page_states[cpu_logical_map(pcpu)]; ps = &page_states[cpu_logical_map(pcpu)];
ret->nr_dirty += ps->nr_dirty; ret->nr_dirty += ps->nr_dirty;
ret->nr_locked += ps->nr_locked; ret->nr_writeback += ps->nr_writeback;
ret->nr_pagecache += ps->nr_pagecache; ret->nr_pagecache += ps->nr_pagecache;
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment