Commit 2db938be authored by Jan Kara

jbd: Refine commit writeout logic

Currently we write out all journal buffers in WRITE_SYNC mode. This improves
performance for fsync-heavy workloads but hinders performance when writes
are mostly asynchronous; most noticeably, it slows down readers, and users
complain about slow desktop response etc.

So submit writes as asynchronous in the normal case and only submit writes as
WRITE_SYNC if we detect someone is waiting for current transaction commit.

I've gathered some numbers to back this change. The first is the read latency
test. It measures the time to read 1 MB after several seconds of sleeping in
the presence of streaming writes.

Top 10 times (out of 90) in us:
Before		After
2131586		697473
1709932		557487
1564598		535642
1480462		347573
1478579		323153
1408496		222181
1388960		181273
1329565		181070
1252486		172832
1223265		172278

Average:
619377		82180

So the improvement in both maximum and average latency is massive.

I've measured fsync throughput by:
fs_mark -n 100 -t 1 -s 16384 -d /mnt/fsync/ -S 1 -L 4

in the presence of a streaming reader. The numbers (fsyncs/s) are:
Before		After
9.9		6.3
6.8		6.0
6.3		6.2
5.8		6.1

So fsync performance seems unharmed by this change.
Signed-off-by: Jan Kara <jack@suse.cz>
parent 923e9a13
...@@ -298,6 +298,7 @@ void journal_commit_transaction(journal_t *journal) ...@@ -298,6 +298,7 @@ void journal_commit_transaction(journal_t *journal)
int tag_flag; int tag_flag;
int i; int i;
struct blk_plug plug; struct blk_plug plug;
int write_op = WRITE;
/* /*
* First job: lock down the current transaction and wait for * First job: lock down the current transaction and wait for
...@@ -413,13 +414,16 @@ void journal_commit_transaction(journal_t *journal) ...@@ -413,13 +414,16 @@ void journal_commit_transaction(journal_t *journal)
jbd_debug (3, "JBD: commit phase 2\n"); jbd_debug (3, "JBD: commit phase 2\n");
if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
write_op = WRITE_SYNC;
/* /*
* Now start flushing things to disk, in the order they appear * Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first. * on the transaction lists. Data blocks go first.
*/ */
blk_start_plug(&plug); blk_start_plug(&plug);
err = journal_submit_data_buffers(journal, commit_transaction, err = journal_submit_data_buffers(journal, commit_transaction,
WRITE_SYNC); write_op);
blk_finish_plug(&plug); blk_finish_plug(&plug);
/* /*
...@@ -478,7 +482,7 @@ void journal_commit_transaction(journal_t *journal) ...@@ -478,7 +482,7 @@ void journal_commit_transaction(journal_t *journal)
blk_start_plug(&plug); blk_start_plug(&plug);
journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC); journal_write_revoke_records(journal, commit_transaction, write_op);
/* /*
* If we found any dirty or locked buffers, then we should have * If we found any dirty or locked buffers, then we should have
...@@ -649,7 +653,7 @@ void journal_commit_transaction(journal_t *journal) ...@@ -649,7 +653,7 @@ void journal_commit_transaction(journal_t *journal)
clear_buffer_dirty(bh); clear_buffer_dirty(bh);
set_buffer_uptodate(bh); set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync; bh->b_end_io = journal_end_buffer_io_sync;
submit_bh(WRITE_SYNC, bh); submit_bh(write_op, bh);
} }
cond_resched(); cond_resched();
......
...@@ -563,6 +563,8 @@ int log_wait_commit(journal_t *journal, tid_t tid) ...@@ -563,6 +563,8 @@ int log_wait_commit(journal_t *journal, tid_t tid)
spin_unlock(&journal->j_state_lock); spin_unlock(&journal->j_state_lock);
#endif #endif
spin_lock(&journal->j_state_lock); spin_lock(&journal->j_state_lock);
if (!tid_geq(journal->j_commit_waited, tid))
journal->j_commit_waited = tid;
while (tid_gt(tid, journal->j_commit_sequence)) { while (tid_gt(tid, journal->j_commit_sequence)) {
jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
tid, journal->j_commit_sequence); tid, journal->j_commit_sequence);
......
...@@ -1433,8 +1433,6 @@ int journal_stop(handle_t *handle) ...@@ -1433,8 +1433,6 @@ int journal_stop(handle_t *handle)
} }
} }
if (handle->h_sync)
transaction->t_synchronous_commit = 1;
current->journal_info = NULL; current->journal_info = NULL;
spin_lock(&journal->j_state_lock); spin_lock(&journal->j_state_lock);
spin_lock(&transaction->t_handle_lock); spin_lock(&transaction->t_handle_lock);
......
...@@ -479,12 +479,6 @@ struct transaction_s ...@@ -479,12 +479,6 @@ struct transaction_s
* How many handles used this transaction? [t_handle_lock] * How many handles used this transaction? [t_handle_lock]
*/ */
int t_handle_count; int t_handle_count;
/*
* This transaction is being forced and some process is
* waiting for it to finish.
*/
unsigned int t_synchronous_commit:1;
}; };
/** /**
...@@ -531,6 +525,8 @@ struct transaction_s ...@@ -531,6 +525,8 @@ struct transaction_s
* transaction * transaction
* @j_commit_request: Sequence number of the most recent transaction wanting * @j_commit_request: Sequence number of the most recent transaction wanting
* commit * commit
* @j_commit_waited: Sequence number of the most recent transaction someone
* is waiting for to commit.
* @j_uuid: Uuid of client object. * @j_uuid: Uuid of client object.
* @j_task: Pointer to the current commit thread for this journal * @j_task: Pointer to the current commit thread for this journal
* @j_max_transaction_buffers: Maximum number of metadata buffers to allow in a * @j_max_transaction_buffers: Maximum number of metadata buffers to allow in a
...@@ -695,6 +691,13 @@ struct journal_s ...@@ -695,6 +691,13 @@ struct journal_s
*/ */
tid_t j_commit_request; tid_t j_commit_request;
/*
* Sequence number of the most recent transaction someone is waiting
* for to commit.
* [j_state_lock]
*/
tid_t j_commit_waited;
/* /*
* Journal uuid: identifies the object (filesystem, LVM volume etc) * Journal uuid: identifies the object (filesystem, LVM volume etc)
* backed by this journal. This will eventually be replaced by an array * backed by this journal. This will eventually be replaced by an array
......
...@@ -36,19 +36,17 @@ DECLARE_EVENT_CLASS(jbd_commit, ...@@ -36,19 +36,17 @@ DECLARE_EVENT_CLASS(jbd_commit,
TP_STRUCT__entry( TP_STRUCT__entry(
__field( dev_t, dev ) __field( dev_t, dev )
__field( char, sync_commit )
__field( int, transaction ) __field( int, transaction )
), ),
TP_fast_assign( TP_fast_assign(
__entry->dev = journal->j_fs_dev->bd_dev; __entry->dev = journal->j_fs_dev->bd_dev;
__entry->sync_commit = commit_transaction->t_synchronous_commit;
__entry->transaction = commit_transaction->t_tid; __entry->transaction = commit_transaction->t_tid;
), ),
TP_printk("dev %d,%d transaction %d sync %d", TP_printk("dev %d,%d transaction %d",
MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->transaction, __entry->sync_commit) __entry->transaction)
); );
DEFINE_EVENT(jbd_commit, jbd_start_commit, DEFINE_EVENT(jbd_commit, jbd_start_commit,
...@@ -87,19 +85,17 @@ TRACE_EVENT(jbd_drop_transaction, ...@@ -87,19 +85,17 @@ TRACE_EVENT(jbd_drop_transaction,
TP_STRUCT__entry( TP_STRUCT__entry(
__field( dev_t, dev ) __field( dev_t, dev )
__field( char, sync_commit )
__field( int, transaction ) __field( int, transaction )
), ),
TP_fast_assign( TP_fast_assign(
__entry->dev = journal->j_fs_dev->bd_dev; __entry->dev = journal->j_fs_dev->bd_dev;
__entry->sync_commit = commit_transaction->t_synchronous_commit;
__entry->transaction = commit_transaction->t_tid; __entry->transaction = commit_transaction->t_tid;
), ),
TP_printk("dev %d,%d transaction %d sync %d", TP_printk("dev %d,%d transaction %d",
MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->transaction, __entry->sync_commit) __entry->transaction)
); );
TRACE_EVENT(jbd_end_commit, TRACE_EVENT(jbd_end_commit,
...@@ -109,21 +105,19 @@ TRACE_EVENT(jbd_end_commit, ...@@ -109,21 +105,19 @@ TRACE_EVENT(jbd_end_commit,
TP_STRUCT__entry( TP_STRUCT__entry(
__field( dev_t, dev ) __field( dev_t, dev )
__field( char, sync_commit )
__field( int, transaction ) __field( int, transaction )
__field( int, head ) __field( int, head )
), ),
TP_fast_assign( TP_fast_assign(
__entry->dev = journal->j_fs_dev->bd_dev; __entry->dev = journal->j_fs_dev->bd_dev;
__entry->sync_commit = commit_transaction->t_synchronous_commit;
__entry->transaction = commit_transaction->t_tid; __entry->transaction = commit_transaction->t_tid;
__entry->head = journal->j_tail_sequence; __entry->head = journal->j_tail_sequence;
), ),
TP_printk("dev %d,%d transaction %d sync %d head %d", TP_printk("dev %d,%d transaction %d head %d",
MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->transaction, __entry->sync_commit, __entry->head) __entry->transaction, __entry->head)
); );
TRACE_EVENT(jbd_do_submit_data, TRACE_EVENT(jbd_do_submit_data,
...@@ -133,19 +127,17 @@ TRACE_EVENT(jbd_do_submit_data, ...@@ -133,19 +127,17 @@ TRACE_EVENT(jbd_do_submit_data,
TP_STRUCT__entry( TP_STRUCT__entry(
__field( dev_t, dev ) __field( dev_t, dev )
__field( char, sync_commit )
__field( int, transaction ) __field( int, transaction )
), ),
TP_fast_assign( TP_fast_assign(
__entry->dev = journal->j_fs_dev->bd_dev; __entry->dev = journal->j_fs_dev->bd_dev;
__entry->sync_commit = commit_transaction->t_synchronous_commit;
__entry->transaction = commit_transaction->t_tid; __entry->transaction = commit_transaction->t_tid;
), ),
TP_printk("dev %d,%d transaction %d sync %d", TP_printk("dev %d,%d transaction %d",
MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->transaction, __entry->sync_commit) __entry->transaction)
); );
TRACE_EVENT(jbd_cleanup_journal_tail, TRACE_EVENT(jbd_cleanup_journal_tail,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment