Commit bbfeb614 authored by Dave Chinner's avatar Dave Chinner

Merge branch 'xfs-4.8-buf-fixes' into for-next

parents f6371617 9c7504aa
......@@ -79,6 +79,47 @@ xfs_buf_vmap_len(
return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}
/*
* Bump the I/O in flight count on the buftarg if we haven't yet done so for
* this buffer. The count is incremented once per buffer (per hold cycle)
* because the corresponding decrement is deferred to buffer release. Buffers
* can undergo I/O multiple times in a hold-release cycle and per buffer I/O
* tracking adds unnecessary overhead. This is used for sychronization purposes
* with unmount (see xfs_wait_buftarg()), so all we really need is a count of
* in-flight buffers.
*
* Buffers that are never released (e.g., superblock, iclog buffers) must set
* the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
* never reaches zero and unmount hangs indefinitely.
*/
static inline void
xfs_buf_ioacct_inc(
struct xfs_buf *bp)
{
if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT))
return;
ASSERT(bp->b_flags & XBF_ASYNC);
bp->b_flags |= _XBF_IN_FLIGHT;
percpu_counter_inc(&bp->b_target->bt_io_count);
}
/*
* Clear the in-flight state on a buffer about to be released to the LRU or
* freed and unaccount from the buftarg.
*/
static inline void
xfs_buf_ioacct_dec(
struct xfs_buf *bp)
{
if (!(bp->b_flags & _XBF_IN_FLIGHT))
return;
ASSERT(bp->b_flags & XBF_ASYNC);
bp->b_flags &= ~_XBF_IN_FLIGHT;
percpu_counter_dec(&bp->b_target->bt_io_count);
}
/*
* When we mark a buffer stale, we remove the buffer from the LRU and clear the
* b_lru_ref count so that the buffer is freed immediately when the buffer
......@@ -102,6 +143,14 @@ xfs_buf_stale(
*/
bp->b_flags &= ~_XBF_DELWRI_Q;
/*
* Once the buffer is marked stale and unlocked, a subsequent lookup
* could reset b_flags. There is no guarantee that the buffer is
* unaccounted (released to LRU) before that occurs. Drop in-flight
* status now to preserve accounting consistency.
*/
xfs_buf_ioacct_dec(bp);
spin_lock(&bp->b_lock);
atomic_set(&bp->b_lru_ref, 0);
if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
......@@ -815,7 +864,8 @@ xfs_buf_get_uncached(
struct xfs_buf *bp;
DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
bp = _xfs_buf_alloc(target, &map, 1, 0);
/* flags might contain irrelevant bits, pass only what we care about */
bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
if (unlikely(bp == NULL))
goto fail;
......@@ -866,63 +916,85 @@ xfs_buf_hold(
}
/*
* Releases a hold on the specified buffer. If the
* the hold count is 1, calls xfs_buf_free.
* Release a hold on the specified buffer. If the hold count is 1, the buffer is
* placed on LRU or freed (depending on b_lru_ref).
*/
void
xfs_buf_rele(
xfs_buf_t *bp)
{
struct xfs_perag *pag = bp->b_pag;
bool release;
bool freebuf = false;
trace_xfs_buf_rele(bp, _RET_IP_);
if (!pag) {
ASSERT(list_empty(&bp->b_lru));
ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
if (atomic_dec_and_test(&bp->b_hold))
if (atomic_dec_and_test(&bp->b_hold)) {
xfs_buf_ioacct_dec(bp);
xfs_buf_free(bp);
}
return;
}
ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
ASSERT(atomic_read(&bp->b_hold) > 0);
if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
spin_lock(&bp->b_lock);
if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
/*
* If the buffer is added to the LRU take a new
* reference to the buffer for the LRU and clear the
* (now stale) dispose list state flag
*/
if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
bp->b_state &= ~XFS_BSTATE_DISPOSE;
atomic_inc(&bp->b_hold);
}
spin_unlock(&bp->b_lock);
spin_unlock(&pag->pag_buf_lock);
} else {
/*
* most of the time buffers will already be removed from
* the LRU, so optimise that case by checking for the
* XFS_BSTATE_DISPOSE flag indicating the last list the
* buffer was on was the disposal list
*/
if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
} else {
ASSERT(list_empty(&bp->b_lru));
}
spin_unlock(&bp->b_lock);
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
xfs_buf_free(bp);
release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
spin_lock(&bp->b_lock);
if (!release) {
/*
* Drop the in-flight state if the buffer is already on the LRU
* and it holds the only reference. This is racy because we
* haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
* ensures the decrement occurs only once per-buf.
*/
if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
xfs_buf_ioacct_dec(bp);
goto out_unlock;
}
/* the last reference has been dropped ... */
xfs_buf_ioacct_dec(bp);
if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
/*
* If the buffer is added to the LRU take a new reference to the
* buffer for the LRU and clear the (now stale) dispose list
* state flag
*/
if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
bp->b_state &= ~XFS_BSTATE_DISPOSE;
atomic_inc(&bp->b_hold);
}
spin_unlock(&pag->pag_buf_lock);
} else {
/*
* most of the time buffers will already be removed from the
* LRU, so optimise that case by checking for the
* XFS_BSTATE_DISPOSE flag indicating the last list the buffer
* was on was the disposal list
*/
if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
} else {
ASSERT(list_empty(&bp->b_lru));
}
ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
freebuf = true;
}
out_unlock:
spin_unlock(&bp->b_lock);
if (freebuf)
xfs_buf_free(bp);
}
......@@ -1341,6 +1413,7 @@ xfs_buf_submit(
* xfs_buf_ioend too early.
*/
atomic_set(&bp->b_io_remaining, 1);
xfs_buf_ioacct_inc(bp);
_xfs_buf_ioapply(bp);
/*
......@@ -1526,13 +1599,19 @@ xfs_wait_buftarg(
int loop = 0;
/*
* We need to flush the buffer workqueue to ensure that all IO
* completion processing is 100% done. Just waiting on buffer locks is
* not sufficient for async IO as the reference count held over IO is
* not released until after the buffer lock is dropped. Hence we need to
* ensure here that all reference counts have been dropped before we
* start walking the LRU list.
* First wait on the buftarg I/O count for all in-flight buffers to be
* released. This is critical as new buffers do not make the LRU until
* they are released.
*
* Next, flush the buffer workqueue to ensure all completion processing
* has finished. Just waiting on buffer locks is not sufficient for
* async IO as the reference count held over IO is not released until
* after the buffer lock is dropped. Hence we need to ensure here that
* all reference counts have been dropped before we start walking the
* LRU list.
*/
while (percpu_counter_sum(&btp->bt_io_count))
delay(100);
drain_workqueue(btp->bt_mount->m_buf_workqueue);
/* loop until there is nothing left on the lru list. */
......@@ -1629,6 +1708,8 @@ xfs_free_buftarg(
struct xfs_buftarg *btp)
{
unregister_shrinker(&btp->bt_shrinker);
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
if (mp->m_flags & XFS_MOUNT_BARRIER)
......@@ -1693,6 +1774,9 @@ xfs_alloc_buftarg(
if (list_lru_init(&btp->bt_lru))
goto error;
if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
goto error;
btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
......@@ -1834,7 +1918,7 @@ xfs_buf_delwri_submit_buffers(
* side. We need to move the buffer onto the io_list
* at this point so the caller can still access it.
*/
bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
bp->b_flags |= XBF_WRITE | XBF_ASYNC;
if (wait_list) {
xfs_buf_hold(bp);
......
......@@ -43,6 +43,7 @@ typedef enum {
#define XBF_READ (1 << 0) /* buffer intended for reading from device */
#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
......@@ -62,6 +63,7 @@ typedef enum {
#define _XBF_KMEM (1 << 21)/* backed by heap memory */
#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
#define _XBF_COMPOUND (1 << 23)/* compound buffer */
#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */
typedef unsigned int xfs_buf_flags_t;
......@@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_KMEM, "KMEM" }, \
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
{ _XBF_COMPOUND, "COMPOUND" }
{ _XBF_COMPOUND, "COMPOUND" }, \
{ _XBF_IN_FLIGHT, "IN_FLIGHT" }
/*
......@@ -115,6 +118,8 @@ typedef struct xfs_buftarg {
/* LRU control structures */
struct shrinker bt_shrinker;
struct list_lru bt_lru;
struct percpu_counter bt_io_count;
} xfs_buftarg_t;
struct xfs_buf;
......
......@@ -1081,6 +1081,8 @@ xfs_buf_iodone_callback_error(
trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
ASSERT(bp->b_iodone != NULL);
cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
/*
* If the write was asynchronous then no one will be looking for the
* error. If this is the first failure of this type, clear the error
......@@ -1088,13 +1090,12 @@ xfs_buf_iodone_callback_error(
* async write failure at least once, but we also need to set the buffer
* up to behave correctly now for repeated failures.
*/
if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) ||
if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
bp->b_last_error != bp->b_error) {
bp->b_flags |= (XBF_WRITE | XBF_ASYNC |
XBF_DONE | XBF_WRITE_FAIL);
bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
bp->b_last_error = bp->b_error;
bp->b_retries = 0;
bp->b_first_retry_time = jiffies;
if (cfg->retry_timeout && !bp->b_first_retry_time)
bp->b_first_retry_time = jiffies;
xfs_buf_ioerror(bp, 0);
xfs_buf_submit(bp);
......@@ -1105,7 +1106,6 @@ xfs_buf_iodone_callback_error(
* Repeated failure on an async write. Take action according to the
* error configuration we have been set up to use.
*/
cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
++bp->b_retries > cfg->max_retries)
......
......@@ -1415,7 +1415,7 @@ xlog_alloc_log(
*/
error = -ENOMEM;
bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
BTOBB(log->l_iclog_size), 0);
BTOBB(log->l_iclog_size), XBF_NO_IOACCT);
if (!bp)
goto out_free_log;
......@@ -1454,7 +1454,8 @@ xlog_alloc_log(
prev_iclog = iclog;
bp = xfs_buf_get_uncached(mp->m_logdev_targp,
BTOBB(log->l_iclog_size), 0);
BTOBB(log->l_iclog_size),
XBF_NO_IOACCT);
if (!bp)
goto out_free_iclog;
......
......@@ -272,13 +272,15 @@ xfs_readsb(
buf_ops = NULL;
/*
* Allocate a (locked) buffer to hold the superblock.
* This will be kept around at all times to optimize
* access to the superblock.
* Allocate a (locked) buffer to hold the superblock. This will be kept
* around at all times to optimize access to the superblock. Therefore,
* set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
* elevated.
*/
reread:
error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
BTOBB(sector_size), 0, &bp, buf_ops);
BTOBB(sector_size), XBF_NO_IOACCT, &bp,
buf_ops);
if (error) {
if (loud)
xfs_warn(mp, "SB validate failed with error %d.", error);
......
......@@ -634,6 +634,9 @@ xfs_error_get_cfg(
{
struct xfs_error_cfg *cfg;
if (error < 0)
error = -error;
switch (error) {
case EIO:
cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment