Commit 7bc4a4ce authored by Linus Torvalds

Merge branch 'for-linus-merged' of git://oss.sgi.com/xfs/xfs

* 'for-linus-merged' of git://oss.sgi.com/xfs/xfs: (47 commits)
  xfs: convert grant head manipulations to lockless algorithm
  xfs: introduce new locks for the log grant ticket wait queues
  xfs: convert log grant heads to atomic variables
  xfs: convert l_tail_lsn to an atomic variable.
  xfs: convert l_last_sync_lsn to an atomic variable
  xfs: make AIL tail pushing independent of the grant lock
  xfs: use wait queues directly for the log wait queues
  xfs: combine grant heads into a single 64 bit integer
  xfs: rework log grant space calculations
  xfs: factor out common grant head/log tail verification code
  xfs: convert log grant ticket queues to list heads
  xfs: use AIL bulk delete function to implement single delete
  xfs: use AIL bulk update function to implement single updates
  xfs: remove all the inodes on a buffer from the AIL in bulk
  xfs: consume iodone callback items on buffers as they are processed
  xfs: reduce the number of AIL push wakeups
  xfs: bulk AIL insertion during transaction commit
  xfs: clean up xfs_ail_delete()
  xfs: Pull EFI/EFD handling out from under the AIL lock
  xfs: fix EFI transaction cancellation.
  ...
parents 498f7f50 92f1c008
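The headline of this merge is the conversion of the log grant heads from separate cycle/bytes fields protected by the grant lock into single 64-bit atomic values (see xlog_crack_grant_head() in the trace changes further down). As a rough, hedged illustration of that packing idea only, not the actual XFS helpers, a sketch with assumed field widths:

	/*
	 * Illustrative sketch: pack a (cycle, bytes) grant head into one
	 * 64-bit word so it can be read and updated atomically instead of
	 * under a grant lock. Field widths and names are assumptions for
	 * the example, not the XFS implementation.
	 */
	#include <stdint.h>

	static inline int64_t grant_head_pack(int cycle, int bytes)
	{
		return ((int64_t)cycle << 32) | (uint32_t)bytes;
	}

	static inline void grant_head_crack(int64_t head, int *cycle, int *bytes)
	{
		*cycle = (int)(head >> 32);
		*bytes = (int)(head & 0xffffffff);
	}

A reader or writer then works on a consistent cycle/bytes snapshot from a single 64-bit load, which is the property the lockless grant head commits rely on.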
/*
* Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef __XFS_SUPPORT_SV_H__
#define __XFS_SUPPORT_SV_H__
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
/*
* Synchronisation variables.
*
* (Parameters "pri", "svf" and "rts" are not implemented)
*/
typedef struct sv_s {
wait_queue_head_t waiters;
} sv_t;
static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
{
DECLARE_WAITQUEUE(wait, current);
add_wait_queue_exclusive(&sv->waiters, &wait);
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock(lock);
schedule();
remove_wait_queue(&sv->waiters, &wait);
}
#define sv_init(sv,flag,name) \
init_waitqueue_head(&(sv)->waiters)
#define sv_destroy(sv) \
/*NOTHING*/
#define sv_wait(sv, pri, lock, s) \
_sv_wait(sv, lock)
#define sv_signal(sv) \
wake_up(&(sv)->waiters)
#define sv_broadcast(sv) \
wake_up_all(&(sv)->waiters)
#endif /* __XFS_SUPPORT_SV_H__ */
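The sv.h header above (removed by this series in favour of using wait queues directly) wraps a bare wait_queue_head_t in a condition-variable-like API. A minimal, hypothetical usage sketch of how a caller pairs sv_wait() with a spinlock-protected predicate; the names here are illustrative and not taken from the XFS code:

	/*
	 * Hypothetical caller of the sv_t API above: sleep until "done" is
	 * set. sv_wait() queues the task and drops "lock" before sleeping;
	 * the waker sets the predicate under the same lock and then calls
	 * sv_broadcast(). Assumes sv_init(&cond, 0, "cond") ran at setup.
	 */
	static DEFINE_SPINLOCK(lock);
	static sv_t cond;
	static int done;

	static void wait_for_done(void)
	{
		spin_lock(&lock);
		while (!done) {
			sv_wait(&cond, 0, &lock, 0);	/* drops lock, sleeps */
			spin_lock(&lock);		/* re-take before re-checking */
		}
		spin_unlock(&lock);
	}

	static void signal_done(void)
	{
		spin_lock(&lock);
		done = 1;
		spin_unlock(&lock);
		sv_broadcast(&cond);
	}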
...@@ -22,6 +22,22 @@ extern struct workqueue_struct *xfsdatad_workqueue;
extern struct workqueue_struct *xfsconvertd_workqueue;
extern mempool_t *xfs_ioend_pool;
/*
* Types of I/O for bmap clustering and I/O completion tracking.
*/
enum {
IO_DIRECT = 0, /* special case for direct I/O ioends */
IO_DELALLOC, /* mapping covers delalloc region */
IO_UNWRITTEN, /* mapping covers allocated but uninitialized data */
IO_OVERWRITE, /* mapping covers already allocated extent */
};
#define XFS_IO_TYPES \
{ 0, "" }, \
{ IO_DELALLOC, "delalloc" }, \
{ IO_UNWRITTEN, "unwritten" }, \
{ IO_OVERWRITE, "overwrite" }
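The XFS_IO_TYPES table pairs each I/O type with a printable name; the tracing changes later in this merge feed it to __print_symbolic() (see the xfs_imap_class hunk below, e.g. __print_symbolic(__entry->type, XFS_IO_TYPES)). Purely as an illustration of the same enum-plus-table idea outside of a tracepoint, a hedged helper that is not part of the XFS code:

	/* Illustrative only: map an I/O type to its name using the table shape above. */
	struct io_type_name {
		int		type;
		const char	*name;
	};

	static const struct io_type_name xfs_io_type_names[] = {
		{ IO_DIRECT,	"" },
		{ IO_DELALLOC,	"delalloc" },
		{ IO_UNWRITTEN,	"unwritten" },
		{ IO_OVERWRITE,	"overwrite" },
	};

	static const char *xfs_io_type_name(int type)
	{
		int i;

		for (i = 0; i < sizeof(xfs_io_type_names) / sizeof(xfs_io_type_names[0]); i++)
			if (xfs_io_type_names[i].type == type)
				return xfs_io_type_names[i].name;
		return "unknown";
	}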
/*
* xfs_ioend struct manages large extent writes for XFS.
* It can manage several multi-page bio's at once.
......
...@@ -44,12 +44,7 @@
static kmem_zone_t *xfs_buf_zone;
STATIC int xfsbufd(void *);
STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
static struct shrinker xfs_buf_shake = {
.shrink = xfsbufd_wakeup,
.seeks = DEFAULT_SEEKS,
};
static struct workqueue_struct *xfslogd_workqueue;
struct workqueue_struct *xfsdatad_workqueue;
...@@ -168,8 +163,79 @@ test_page_region(
}

/*
* Internal xfs_buf_t object manipulation
*/
/*
* xfs_buf_lru_add - add a buffer to the LRU.
*
* The LRU takes a new reference to the buffer so that it will only be freed
* once the shrinker takes the buffer off the LRU.
*/
STATIC void
xfs_buf_lru_add(
struct xfs_buf *bp)
{
struct xfs_buftarg *btp = bp->b_target;
spin_lock(&btp->bt_lru_lock);
if (list_empty(&bp->b_lru)) {
atomic_inc(&bp->b_hold);
list_add_tail(&bp->b_lru, &btp->bt_lru);
btp->bt_lru_nr++;
}
spin_unlock(&btp->bt_lru_lock);
}
/*
* xfs_buf_lru_del - remove a buffer from the LRU
*
* The unlocked check is safe here because it only occurs when there are no
* b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
* to optimise the shrinker removing the buffer from the LRU and calling
* xfs_buf_free(), i.e. it removes an unnecessary round trip on the
* bt_lru_lock.
*/
STATIC void
xfs_buf_lru_del(
struct xfs_buf *bp)
{
struct xfs_buftarg *btp = bp->b_target;
if (list_empty(&bp->b_lru))
return;
spin_lock(&btp->bt_lru_lock);
if (!list_empty(&bp->b_lru)) {
list_del_init(&bp->b_lru);
btp->bt_lru_nr--;
}
spin_unlock(&btp->bt_lru_lock);
}
/*
* When we mark a buffer stale, we remove the buffer from the LRU and clear the
* b_lru_ref count so that the buffer is freed immediately when the buffer
* reference count falls to zero. If the buffer is already on the LRU, we need
* to remove the reference that LRU holds on the buffer.
*
* This prevents build-up of stale buffers on the LRU.
*/
void
xfs_buf_stale(
struct xfs_buf *bp)
{
bp->b_flags |= XBF_STALE;
atomic_set(&(bp)->b_lru_ref, 0);
if (!list_empty(&bp->b_lru)) {
struct xfs_buftarg *btp = bp->b_target;
spin_lock(&btp->bt_lru_lock);
if (!list_empty(&bp->b_lru)) {
list_del_init(&bp->b_lru);
btp->bt_lru_nr--;
atomic_dec(&bp->b_hold);
}
spin_unlock(&btp->bt_lru_lock);
}
ASSERT(atomic_read(&bp->b_hold) >= 1);
}
STATIC void
_xfs_buf_initialize(
...@@ -186,7 +252,9 @@ _xfs_buf_initialize(
memset(bp, 0, sizeof(xfs_buf_t));
atomic_set(&bp->b_hold, 1);
atomic_set(&bp->b_lru_ref, 1);
init_completion(&bp->b_iowait);
INIT_LIST_HEAD(&bp->b_lru);
INIT_LIST_HEAD(&bp->b_list);
RB_CLEAR_NODE(&bp->b_rbnode);
sema_init(&bp->b_sema, 0); /* held, no waiters */
...@@ -262,6 +330,8 @@ xfs_buf_free( ...@@ -262,6 +330,8 @@ xfs_buf_free(
{ {
trace_xfs_buf_free(bp, _RET_IP_); trace_xfs_buf_free(bp, _RET_IP_);
ASSERT(list_empty(&bp->b_lru));
if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
uint i; uint i;
...@@ -337,7 +407,6 @@ _xfs_buf_lookup_pages( ...@@ -337,7 +407,6 @@ _xfs_buf_lookup_pages(
__func__, gfp_mask); __func__, gfp_mask);
XFS_STATS_INC(xb_page_retries); XFS_STATS_INC(xb_page_retries);
xfsbufd_wakeup(NULL, 0, gfp_mask);
congestion_wait(BLK_RW_ASYNC, HZ/50); congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry; goto retry;
} }
...@@ -828,6 +897,7 @@ xfs_buf_rele( ...@@ -828,6 +897,7 @@ xfs_buf_rele(
if (!pag) { if (!pag) {
ASSERT(!bp->b_relse); ASSERT(!bp->b_relse);
ASSERT(list_empty(&bp->b_lru));
ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
if (atomic_dec_and_test(&bp->b_hold)) if (atomic_dec_and_test(&bp->b_hold))
xfs_buf_free(bp); xfs_buf_free(bp);
...@@ -835,13 +905,19 @@ xfs_buf_rele( ...@@ -835,13 +905,19 @@ xfs_buf_rele(
} }
ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
ASSERT(atomic_read(&bp->b_hold) > 0); ASSERT(atomic_read(&bp->b_hold) > 0);
if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
if (bp->b_relse) { if (bp->b_relse) {
atomic_inc(&bp->b_hold); atomic_inc(&bp->b_hold);
spin_unlock(&pag->pag_buf_lock); spin_unlock(&pag->pag_buf_lock);
bp->b_relse(bp); bp->b_relse(bp);
} else if (!(bp->b_flags & XBF_STALE) &&
atomic_read(&bp->b_lru_ref)) {
xfs_buf_lru_add(bp);
spin_unlock(&pag->pag_buf_lock);
} else { } else {
xfs_buf_lru_del(bp);
ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
spin_unlock(&pag->pag_buf_lock); spin_unlock(&pag->pag_buf_lock);
...@@ -1438,51 +1514,84 @@ xfs_buf_iomove(
*/

/*
* Wait for any bufs with callbacks that have been submitted but
* have not yet returned... walk the hash list for the target.
*/
void
xfs_wait_buftarg(
struct xfs_buftarg *btp)
{
struct xfs_perag *pag;
uint i;

for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) {
pag = xfs_perag_get(btp->bt_mount, i);
spin_lock(&pag->pag_buf_lock);
while (rb_first(&pag->pag_buf_tree)) {
spin_unlock(&pag->pag_buf_lock);
delay(100);
spin_lock(&pag->pag_buf_lock);
}
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
}
}

/*
* Wait for any bufs with callbacks that have been submitted but have not yet
* returned. These buffers will have an elevated hold count, so wait on those
* while freeing all the buffers only held by the LRU.
*/
void
xfs_wait_buftarg(
struct xfs_buftarg *btp)
{
struct xfs_buf *bp;

restart:
spin_lock(&btp->bt_lru_lock);
while (!list_empty(&btp->bt_lru)) {
bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
if (atomic_read(&bp->b_hold) > 1) {
spin_unlock(&btp->bt_lru_lock);
delay(100);
goto restart;
}
/*
* clear the LRU reference count so the buffer doesn't get
* ignored in xfs_buf_rele().
*/
atomic_set(&bp->b_lru_ref, 0);
spin_unlock(&btp->bt_lru_lock);
xfs_buf_rele(bp);
spin_lock(&btp->bt_lru_lock);
}
spin_unlock(&btp->bt_lru_lock);
}
/*
* buftarg list for delwrite queue processing
*/
static LIST_HEAD(xfs_buftarg_list);
static DEFINE_SPINLOCK(xfs_buftarg_lock);

STATIC void
xfs_register_buftarg(
xfs_buftarg_t *btp)
{
spin_lock(&xfs_buftarg_lock);
list_add(&btp->bt_list, &xfs_buftarg_list);
spin_unlock(&xfs_buftarg_lock);
}

STATIC void
xfs_unregister_buftarg(
xfs_buftarg_t *btp)
{
spin_lock(&xfs_buftarg_lock);
list_del(&btp->bt_list);
spin_unlock(&xfs_buftarg_lock);
}

int
xfs_buftarg_shrink(
struct shrinker *shrink,
int nr_to_scan,
gfp_t mask)
{
struct xfs_buftarg *btp = container_of(shrink,
struct xfs_buftarg, bt_shrinker);
struct xfs_buf *bp;
LIST_HEAD(dispose);

if (!nr_to_scan)
return btp->bt_lru_nr;

spin_lock(&btp->bt_lru_lock);
while (!list_empty(&btp->bt_lru)) {
if (nr_to_scan-- <= 0)
break;

bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);

/*
* Decrement the b_lru_ref count unless the value is already
* zero. If the value is already zero, we need to reclaim the
* buffer, otherwise it gets another trip through the LRU.
*/
if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
list_move_tail(&bp->b_lru, &btp->bt_lru);
continue;
}

/*
* remove the buffer from the LRU now to avoid needing another
* lock round trip inside xfs_buf_rele().
*/
list_move(&bp->b_lru, &dispose);
btp->bt_lru_nr--;
}
spin_unlock(&btp->bt_lru_lock);

while (!list_empty(&dispose)) {
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
xfs_buf_rele(bp);
}

return btp->bt_lru_nr;
}
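xfs_buftarg_shrink() above follows the shrinker contract of this kernel generation: a call with nr_to_scan == 0 only reports the object count, otherwise the callback scans up to nr_to_scan LRU entries and returns what remains. A minimal skeleton of that contract, as a hedged sketch with hypothetical helpers rather than the XFS code:

	/*
	 * Sketch of the old-style shrinker contract assumed by this series.
	 * my_cache_count() and my_cache_reclaim_one() are hypothetical
	 * helpers standing in for a real cache.
	 */
	static int my_cache_count(void);
	static int my_cache_reclaim_one(void);

	static int my_cache_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t mask)
	{
		if (!nr_to_scan)
			return my_cache_count();	/* "just count" query */

		while (nr_to_scan-- > 0 && my_cache_reclaim_one())
			;				/* reclaim up to nr_to_scan objects */

		return my_cache_count();
	}

	static struct shrinker my_cache_shrinker = {
		.shrink	= my_cache_shrink,
		.seeks	= DEFAULT_SEEKS,
	};

	/* register_shrinker(&my_cache_shrinker) at init,
	   unregister_shrinker(&my_cache_shrinker) at teardown. */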
void void
...@@ -1490,17 +1599,14 @@ xfs_free_buftarg( ...@@ -1490,17 +1599,14 @@ xfs_free_buftarg(
struct xfs_mount *mp, struct xfs_mount *mp,
struct xfs_buftarg *btp) struct xfs_buftarg *btp)
{ {
unregister_shrinker(&btp->bt_shrinker);
xfs_flush_buftarg(btp, 1); xfs_flush_buftarg(btp, 1);
if (mp->m_flags & XFS_MOUNT_BARRIER) if (mp->m_flags & XFS_MOUNT_BARRIER)
xfs_blkdev_issue_flush(btp); xfs_blkdev_issue_flush(btp);
iput(btp->bt_mapping->host); iput(btp->bt_mapping->host);
/* Unregister the buftarg first so that we don't get a
* wakeup finding a non-existent task
*/
xfs_unregister_buftarg(btp);
kthread_stop(btp->bt_task); kthread_stop(btp->bt_task);
kmem_free(btp); kmem_free(btp);
} }
...@@ -1597,20 +1703,13 @@ xfs_alloc_delwrite_queue( ...@@ -1597,20 +1703,13 @@ xfs_alloc_delwrite_queue(
xfs_buftarg_t *btp, xfs_buftarg_t *btp,
const char *fsname) const char *fsname)
{ {
int error = 0;
INIT_LIST_HEAD(&btp->bt_list);
INIT_LIST_HEAD(&btp->bt_delwrite_queue); INIT_LIST_HEAD(&btp->bt_delwrite_queue);
spin_lock_init(&btp->bt_delwrite_lock); spin_lock_init(&btp->bt_delwrite_lock);
btp->bt_flags = 0; btp->bt_flags = 0;
btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname); btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
if (IS_ERR(btp->bt_task)) { if (IS_ERR(btp->bt_task))
error = PTR_ERR(btp->bt_task); return PTR_ERR(btp->bt_task);
goto out_error; return 0;
}
xfs_register_buftarg(btp);
out_error:
return error;
} }
xfs_buftarg_t * xfs_buftarg_t *
...@@ -1627,12 +1726,17 @@ xfs_alloc_buftarg( ...@@ -1627,12 +1726,17 @@ xfs_alloc_buftarg(
btp->bt_mount = mp; btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev; btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev; btp->bt_bdev = bdev;
INIT_LIST_HEAD(&btp->bt_lru);
spin_lock_init(&btp->bt_lru_lock);
if (xfs_setsize_buftarg_early(btp, bdev)) if (xfs_setsize_buftarg_early(btp, bdev))
goto error; goto error;
if (xfs_mapping_buftarg(btp, bdev)) if (xfs_mapping_buftarg(btp, bdev))
goto error; goto error;
if (xfs_alloc_delwrite_queue(btp, fsname)) if (xfs_alloc_delwrite_queue(btp, fsname))
goto error; goto error;
btp->bt_shrinker.shrink = xfs_buftarg_shrink;
btp->bt_shrinker.seeks = DEFAULT_SEEKS;
register_shrinker(&btp->bt_shrinker);
return btp; return btp;
error: error:
...@@ -1737,27 +1841,6 @@ xfs_buf_runall_queues( ...@@ -1737,27 +1841,6 @@ xfs_buf_runall_queues(
flush_workqueue(queue); flush_workqueue(queue);
} }
STATIC int
xfsbufd_wakeup(
struct shrinker *shrink,
int priority,
gfp_t mask)
{
xfs_buftarg_t *btp;
spin_lock(&xfs_buftarg_lock);
list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
continue;
if (list_empty(&btp->bt_delwrite_queue))
continue;
set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
wake_up_process(btp->bt_task);
}
spin_unlock(&xfs_buftarg_lock);
return 0;
}
/*
* Move as many buffers as specified to the supplied list
* indicating if we skipped any buffers to prevent deadlocks.
...@@ -1952,7 +2035,6 @@ xfs_buf_init(void) ...@@ -1952,7 +2035,6 @@ xfs_buf_init(void)
if (!xfsconvertd_workqueue) if (!xfsconvertd_workqueue)
goto out_destroy_xfsdatad_workqueue; goto out_destroy_xfsdatad_workqueue;
register_shrinker(&xfs_buf_shake);
return 0; return 0;
out_destroy_xfsdatad_workqueue: out_destroy_xfsdatad_workqueue:
...@@ -1968,7 +2050,6 @@ xfs_buf_init(void) ...@@ -1968,7 +2050,6 @@ xfs_buf_init(void)
void void
xfs_buf_terminate(void) xfs_buf_terminate(void)
{ {
unregister_shrinker(&xfs_buf_shake);
destroy_workqueue(xfsconvertd_workqueue); destroy_workqueue(xfsconvertd_workqueue);
destroy_workqueue(xfsdatad_workqueue); destroy_workqueue(xfsdatad_workqueue);
destroy_workqueue(xfslogd_workqueue); destroy_workqueue(xfslogd_workqueue);
......
...@@ -128,10 +128,15 @@ typedef struct xfs_buftarg { ...@@ -128,10 +128,15 @@ typedef struct xfs_buftarg {
/* per device delwri queue */ /* per device delwri queue */
struct task_struct *bt_task; struct task_struct *bt_task;
struct list_head bt_list;
struct list_head bt_delwrite_queue; struct list_head bt_delwrite_queue;
spinlock_t bt_delwrite_lock; spinlock_t bt_delwrite_lock;
unsigned long bt_flags; unsigned long bt_flags;
/* LRU control structures */
struct shrinker bt_shrinker;
struct list_head bt_lru;
spinlock_t bt_lru_lock;
unsigned int bt_lru_nr;
} xfs_buftarg_t; } xfs_buftarg_t;
/* /*
...@@ -164,9 +169,11 @@ typedef struct xfs_buf { ...@@ -164,9 +169,11 @@ typedef struct xfs_buf {
xfs_off_t b_file_offset; /* offset in file */ xfs_off_t b_file_offset; /* offset in file */
size_t b_buffer_length;/* size of buffer in bytes */ size_t b_buffer_length;/* size of buffer in bytes */
atomic_t b_hold; /* reference count */ atomic_t b_hold; /* reference count */
atomic_t b_lru_ref; /* lru reclaim ref count */
xfs_buf_flags_t b_flags; /* status flags */ xfs_buf_flags_t b_flags; /* status flags */
struct semaphore b_sema; /* semaphore for lockables */ struct semaphore b_sema; /* semaphore for lockables */
struct list_head b_lru; /* lru list */
wait_queue_head_t b_waiters; /* unpin waiters */ wait_queue_head_t b_waiters; /* unpin waiters */
struct list_head b_list; struct list_head b_list;
struct xfs_perag *b_pag; /* contains rbtree root */ struct xfs_perag *b_pag; /* contains rbtree root */
...@@ -264,7 +271,8 @@ extern void xfs_buf_terminate(void); ...@@ -264,7 +271,8 @@ extern void xfs_buf_terminate(void);
#define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ #define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
#define XFS_BUF_STALE(bp) ((bp)->b_flags |= XBF_STALE) void xfs_buf_stale(struct xfs_buf *bp);
#define XFS_BUF_STALE(bp) xfs_buf_stale(bp);
#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
#define XFS_BUF_SUPER_STALE(bp) do { \ #define XFS_BUF_SUPER_STALE(bp) do { \
...@@ -328,9 +336,15 @@ extern void xfs_buf_terminate(void); ...@@ -328,9 +336,15 @@ extern void xfs_buf_terminate(void);
#define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
#define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) static inline void
xfs_buf_set_ref(
struct xfs_buf *bp,
int lru_ref)
{
atomic_set(&bp->b_lru_ref, lru_ref);
}
#define XFS_BUF_SET_VTYPE_REF(bp, type, ref) xfs_buf_set_ref(bp, ref)
#define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
#define XFS_BUF_SET_REF(bp, ref) do { } while (0)
#define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count)) #define XFS_BUF_ISPINNED(bp) atomic_read(&((bp)->b_pin_count))
......
...@@ -70,8 +70,16 @@ xfs_fs_encode_fh(
else
fileid_type = FILEID_INO32_GEN_PARENT;

/* filesystem may contain 64bit inode numbers */
if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS))
/*
* If the filesystem may contain 64bit inode numbers, we need
* to use larger file handles that can represent them.
*
* While we only allocate inodes that do not fit into 32 bits, any
* large enough filesystem may contain them, thus the slightly
* confusing looking conditional below.
*/
if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
fileid_type |= XFS_FILEID_TYPE_64FLAG;

/*
......
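The new conditional above sets the 64-bit flag whenever large inode numbers are possible: either inode32-style allocation was never requested, or it was but the filesystem is big enough that 64-bit inodes may already exist. A hedged sketch of that predicate pulled out into a helper purely for clarity (the helper name is an illustration; the real code open-codes the test in xfs_fs_encode_fh()):

	static bool xfs_need_64bit_fileid(struct xfs_mount *mp)
	{
		/* inode32 allocation was never requested: 64-bit inums possible */
		if (!(mp->m_flags & XFS_MOUNT_SMALL_INUMS))
			return true;
		/* large filesystem: 64-bit inodes may already be present */
		if (mp->m_flags & XFS_MOUNT_32BITINODES)
			return true;
		return false;
	}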
...@@ -37,7 +37,6 @@ ...@@ -37,7 +37,6 @@
#include <kmem.h> #include <kmem.h>
#include <mrlock.h> #include <mrlock.h>
#include <sv.h>
#include <time.h> #include <time.h>
#include <support/debug.h> #include <support/debug.h>
......
...@@ -834,8 +834,11 @@ xfsaild_wakeup(
struct xfs_ail *ailp,
xfs_lsn_t threshold_lsn)
{
ailp->xa_target = threshold_lsn;
wake_up_process(ailp->xa_task);

/* only ever move the target forwards */
if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
ailp->xa_target = threshold_lsn;
wake_up_process(ailp->xa_task);
}
}
STATIC int STATIC int
...@@ -847,8 +850,17 @@ xfsaild(
long tout = 0; /* milliseconds */

while (!kthread_should_stop()) {
schedule_timeout_interruptible(tout ?
msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
/*
* for short sleeps indicating congestion, don't allow us to
* get woken early. Otherwise all we do is bang on the AIL lock
* without making progress.
*/
if (tout && tout <= 20)
__set_current_state(TASK_KILLABLE);
else
__set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(tout ?
msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);

/* swsusp */
try_to_freeze();
...@@ -1118,6 +1130,8 @@ xfs_fs_evict_inode( ...@@ -1118,6 +1130,8 @@ xfs_fs_evict_inode(
*/ */
ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
&xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
xfs_inactive(ip); xfs_inactive(ip);
} }
......
...@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab( ...@@ -53,14 +53,30 @@ xfs_inode_ag_walk_grab(
{ {
struct inode *inode = VFS_I(ip); struct inode *inode = VFS_I(ip);
ASSERT(rcu_read_lock_held());
/*
* check for stale RCU freed inode
*
* If the inode has been reallocated, it doesn't matter if it's not in
* the AG we are walking - we are walking for writeback, so if it
* passes all the "valid inode" checks and is dirty, then we'll write
* it back anyway. If it has been reallocated and still being
* initialised, the XFS_INEW check below will catch it.
*/
spin_lock(&ip->i_flags_lock);
if (!ip->i_ino)
goto out_unlock_noent;
/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
goto out_unlock_noent;
spin_unlock(&ip->i_flags_lock);
/* nothing to sync during shutdown */ /* nothing to sync during shutdown */
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return EFSCORRUPTED; return EFSCORRUPTED;
/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
return ENOENT;
/* If we can't grab the inode, it must be on its way to reclaim. */
if (!igrab(inode)) if (!igrab(inode))
return ENOENT; return ENOENT;
...@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab( ...@@ -72,6 +88,10 @@ xfs_inode_ag_walk_grab(
/* inode is valid */ /* inode is valid */
return 0; return 0;
out_unlock_noent:
spin_unlock(&ip->i_flags_lock);
return ENOENT;
} }
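The new xfs_inode_ag_walk_grab() logic above is an instance of a general RCU-lookup pattern: find the object under rcu_read_lock(), then revalidate its identity under the object's own lock before taking a reference, because an RCU-protected lookup can return freed or reallocated objects. A generic, hypothetical sketch of that pattern (the types and lookup_radix() stand-in are assumptions, not the XFS code):

	/* Generic "RCU lookup + revalidate" sketch. */
	struct obj {
		spinlock_t	lock;
		unsigned long	id;
		int		dying;
		int		refcount;
		struct rcu_head	rcu;
	};

	static struct obj *obj_lookup(struct radix_tree_root *tree, unsigned long id)
	{
		struct obj *o;

		rcu_read_lock();
		o = lookup_radix(tree, id);		/* may be stale or reused */
		if (!o) {
			rcu_read_unlock();
			return NULL;
		}

		spin_lock(&o->lock);
		if (o->id != id || o->dying) {		/* revalidate identity */
			spin_unlock(&o->lock);
			rcu_read_unlock();
			return NULL;
		}
		o->refcount++;				/* safe: still the right object */
		spin_unlock(&o->lock);
		rcu_read_unlock();
		return o;
	}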
STATIC int STATIC int
...@@ -98,12 +118,12 @@ xfs_inode_ag_walk( ...@@ -98,12 +118,12 @@ xfs_inode_ag_walk(
int error = 0; int error = 0;
int i; int i;
read_lock(&pag->pag_ici_lock); rcu_read_lock();
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)batch, first_index, (void **)batch, first_index,
XFS_LOOKUP_BATCH); XFS_LOOKUP_BATCH);
if (!nr_found) { if (!nr_found) {
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
break; break;
} }
...@@ -118,18 +138,26 @@ xfs_inode_ag_walk( ...@@ -118,18 +138,26 @@ xfs_inode_ag_walk(
batch[i] = NULL; batch[i] = NULL;
/* /*
* Update the index for the next lookup. Catch overflows * Update the index for the next lookup. Catch
* into the next AG range which can occur if we have inodes * overflows into the next AG range which can occur if
* in the last block of the AG and we are currently * we have inodes in the last block of the AG and we
* pointing to the last inode. * are currently pointing to the last inode.
*
* Because we may see inodes that are from the wrong AG
* due to RCU freeing and reallocation, only update the
* index if it lies in this AG. It was a race that led
* us to see this inode, so another lookup from the
* same index will not find it again.
*/ */
if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
continue;
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
done = 1; done = 1;
} }
/* unlock now we've grabbed the inodes. */ /* unlock now we've grabbed the inodes. */
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
for (i = 0; i < nr_found; i++) { for (i = 0; i < nr_found; i++) {
if (!batch[i]) if (!batch[i])
...@@ -592,12 +620,12 @@ xfs_inode_set_reclaim_tag( ...@@ -592,12 +620,12 @@ xfs_inode_set_reclaim_tag(
struct xfs_perag *pag; struct xfs_perag *pag;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
write_lock(&pag->pag_ici_lock); spin_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock); spin_lock(&ip->i_flags_lock);
__xfs_inode_set_reclaim_tag(pag, ip); __xfs_inode_set_reclaim_tag(pag, ip);
__xfs_iflags_set(ip, XFS_IRECLAIMABLE); __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
spin_unlock(&ip->i_flags_lock); spin_unlock(&ip->i_flags_lock);
write_unlock(&pag->pag_ici_lock); spin_unlock(&pag->pag_ici_lock);
xfs_perag_put(pag); xfs_perag_put(pag);
} }
...@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab( ...@@ -639,9 +667,14 @@ xfs_reclaim_inode_grab(
struct xfs_inode *ip, struct xfs_inode *ip,
int flags) int flags)
{ {
ASSERT(rcu_read_lock_held());
/* quick check for stale RCU freed inode */
if (!ip->i_ino)
return 1;
/*
* do some unlocked checks first to avoid unnecessary lock traffic.
* The first is a flush lock check, the second is an already-in-reclaim
* check. Only do these checks if we are not going to block on locks.
*/
...@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab( ...@@ -654,11 +687,16 @@ xfs_reclaim_inode_grab(
* The radix tree lock here protects a thread in xfs_iget from racing * The radix tree lock here protects a thread in xfs_iget from racing
* with us starting reclaim on the inode. Once we have the * with us starting reclaim on the inode. Once we have the
* XFS_IRECLAIM flag set it will not touch us. * XFS_IRECLAIM flag set it will not touch us.
*
* Due to RCU lookup, we may find inodes that have been freed and only
* have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
* aren't candidates for reclaim at all, so we must check that
* XFS_IRECLAIMABLE is set first before proceeding to reclaim.
*/ */
spin_lock(&ip->i_flags_lock); spin_lock(&ip->i_flags_lock);
ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { __xfs_iflags_test(ip, XFS_IRECLAIM)) {
/* ignore as it is already under reclaim */ /* not a reclaim candidate. */
spin_unlock(&ip->i_flags_lock); spin_unlock(&ip->i_flags_lock);
return 1; return 1;
} }
...@@ -795,12 +833,12 @@ xfs_reclaim_inode( ...@@ -795,12 +833,12 @@ xfs_reclaim_inode(
* added to the tree assert that it's been there before to catch * added to the tree assert that it's been there before to catch
* problems with the inode life time early on. * problems with the inode life time early on.
*/ */
write_lock(&pag->pag_ici_lock); spin_lock(&pag->pag_ici_lock);
if (!radix_tree_delete(&pag->pag_ici_root, if (!radix_tree_delete(&pag->pag_ici_root,
XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
ASSERT(0); ASSERT(0);
__xfs_inode_clear_reclaim(pag, ip); __xfs_inode_clear_reclaim(pag, ip);
write_unlock(&pag->pag_ici_lock); spin_unlock(&pag->pag_ici_lock);
/* /*
* Here we do an (almost) spurious inode lock in order to coordinate * Here we do an (almost) spurious inode lock in order to coordinate
...@@ -864,14 +902,14 @@ xfs_reclaim_inodes_ag( ...@@ -864,14 +902,14 @@ xfs_reclaim_inodes_ag(
struct xfs_inode *batch[XFS_LOOKUP_BATCH]; struct xfs_inode *batch[XFS_LOOKUP_BATCH];
int i; int i;
write_lock(&pag->pag_ici_lock); rcu_read_lock();
nr_found = radix_tree_gang_lookup_tag( nr_found = radix_tree_gang_lookup_tag(
&pag->pag_ici_root, &pag->pag_ici_root,
(void **)batch, first_index, (void **)batch, first_index,
XFS_LOOKUP_BATCH, XFS_LOOKUP_BATCH,
XFS_ICI_RECLAIM_TAG); XFS_ICI_RECLAIM_TAG);
if (!nr_found) { if (!nr_found) {
write_unlock(&pag->pag_ici_lock); rcu_read_unlock();
break; break;
} }
...@@ -891,14 +929,24 @@ xfs_reclaim_inodes_ag( ...@@ -891,14 +929,24 @@ xfs_reclaim_inodes_ag(
* occur if we have inodes in the last block of * occur if we have inodes in the last block of
* the AG and we are currently pointing to the * the AG and we are currently pointing to the
* last inode. * last inode.
*
* Because we may see inodes that are from the
* wrong AG due to RCU freeing and
* reallocation, only update the index if it
* lies in this AG. It was a race that led us
* to see this inode, so another lookup from
* the same index will not find it again.
*/ */
if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
pag->pag_agno)
continue;
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
done = 1; done = 1;
} }
/* unlock now we've grabbed the inodes. */ /* unlock now we've grabbed the inodes. */
write_unlock(&pag->pag_ici_lock); rcu_read_unlock();
for (i = 0; i < nr_found; i++) { for (i = 0; i < nr_found; i++) {
if (!batch[i]) if (!batch[i])
......
...@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, ...@@ -766,8 +766,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
__field(int, curr_res) __field(int, curr_res)
__field(int, unit_res) __field(int, unit_res)
__field(unsigned int, flags) __field(unsigned int, flags)
__field(void *, reserve_headq) __field(int, reserveq)
__field(void *, write_headq) __field(int, writeq)
__field(int, grant_reserve_cycle) __field(int, grant_reserve_cycle)
__field(int, grant_reserve_bytes) __field(int, grant_reserve_bytes)
__field(int, grant_write_cycle) __field(int, grant_write_cycle)
...@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, ...@@ -784,19 +784,21 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
__entry->curr_res = tic->t_curr_res; __entry->curr_res = tic->t_curr_res;
__entry->unit_res = tic->t_unit_res; __entry->unit_res = tic->t_unit_res;
__entry->flags = tic->t_flags; __entry->flags = tic->t_flags;
__entry->reserve_headq = log->l_reserve_headq; __entry->reserveq = list_empty(&log->l_reserveq);
__entry->write_headq = log->l_write_headq; __entry->writeq = list_empty(&log->l_writeq);
__entry->grant_reserve_cycle = log->l_grant_reserve_cycle; xlog_crack_grant_head(&log->l_grant_reserve_head,
__entry->grant_reserve_bytes = log->l_grant_reserve_bytes; &__entry->grant_reserve_cycle,
__entry->grant_write_cycle = log->l_grant_write_cycle; &__entry->grant_reserve_bytes);
__entry->grant_write_bytes = log->l_grant_write_bytes; xlog_crack_grant_head(&log->l_grant_write_head,
&__entry->grant_write_cycle,
&__entry->grant_write_bytes);
__entry->curr_cycle = log->l_curr_cycle; __entry->curr_cycle = log->l_curr_cycle;
__entry->curr_block = log->l_curr_block; __entry->curr_block = log->l_curr_block;
__entry->tail_lsn = log->l_tail_lsn; __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
), ),
TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
"t_unit_res %u t_flags %s reserve_headq 0x%p " "t_unit_res %u t_flags %s reserveq %s "
"write_headq 0x%p grant_reserve_cycle %d " "writeq %s grant_reserve_cycle %d "
"grant_reserve_bytes %d grant_write_cycle %d " "grant_reserve_bytes %d grant_write_cycle %d "
"grant_write_bytes %d curr_cycle %d curr_block %d " "grant_write_bytes %d curr_cycle %d curr_block %d "
"tail_cycle %d tail_block %d", "tail_cycle %d tail_block %d",
...@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class, ...@@ -807,8 +809,8 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
__entry->curr_res, __entry->curr_res,
__entry->unit_res, __entry->unit_res,
__print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
__entry->reserve_headq, __entry->reserveq ? "empty" : "active",
__entry->write_headq, __entry->writeq ? "empty" : "active",
__entry->grant_reserve_cycle, __entry->grant_reserve_cycle,
__entry->grant_reserve_bytes, __entry->grant_reserve_bytes,
__entry->grant_write_cycle, __entry->grant_write_cycle,
...@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1); ...@@ -835,6 +837,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2); DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2); DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
...@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1); ...@@ -842,6 +845,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub); DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
...@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage); ...@@ -935,10 +939,10 @@ DEFINE_PAGE_EVENT(xfs_writepage);
DEFINE_PAGE_EVENT(xfs_releasepage); DEFINE_PAGE_EVENT(xfs_releasepage);
DEFINE_PAGE_EVENT(xfs_invalidatepage); DEFINE_PAGE_EVENT(xfs_invalidatepage);
DECLARE_EVENT_CLASS(xfs_iomap_class, DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
int flags, struct xfs_bmbt_irec *irec), int type, struct xfs_bmbt_irec *irec),
TP_ARGS(ip, offset, count, flags, irec), TP_ARGS(ip, offset, count, type, irec),
TP_STRUCT__entry( TP_STRUCT__entry(
__field(dev_t, dev) __field(dev_t, dev)
__field(xfs_ino_t, ino) __field(xfs_ino_t, ino)
...@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, ...@@ -946,7 +950,7 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
__field(loff_t, new_size) __field(loff_t, new_size)
__field(loff_t, offset) __field(loff_t, offset)
__field(size_t, count) __field(size_t, count)
__field(int, flags) __field(int, type)
__field(xfs_fileoff_t, startoff) __field(xfs_fileoff_t, startoff)
__field(xfs_fsblock_t, startblock) __field(xfs_fsblock_t, startblock)
__field(xfs_filblks_t, blockcount) __field(xfs_filblks_t, blockcount)
...@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, ...@@ -958,13 +962,13 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
__entry->new_size = ip->i_new_size; __entry->new_size = ip->i_new_size;
__entry->offset = offset; __entry->offset = offset;
__entry->count = count; __entry->count = count;
__entry->flags = flags; __entry->type = type;
__entry->startoff = irec ? irec->br_startoff : 0; __entry->startoff = irec ? irec->br_startoff : 0;
__entry->startblock = irec ? irec->br_startblock : 0; __entry->startblock = irec ? irec->br_startblock : 0;
__entry->blockcount = irec ? irec->br_blockcount : 0; __entry->blockcount = irec ? irec->br_blockcount : 0;
), ),
TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
"offset 0x%llx count %zd flags %s " "offset 0x%llx count %zd type %s "
"startoff 0x%llx startblock %lld blockcount 0x%llx", "startoff 0x%llx startblock %lld blockcount 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino, __entry->ino,
...@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class, ...@@ -972,20 +976,21 @@ DECLARE_EVENT_CLASS(xfs_iomap_class,
__entry->new_size, __entry->new_size,
__entry->offset, __entry->offset,
__entry->count, __entry->count,
__print_flags(__entry->flags, "|", BMAPI_FLAGS), __print_symbolic(__entry->type, XFS_IO_TYPES),
__entry->startoff, __entry->startoff,
(__int64_t)__entry->startblock, (__int64_t)__entry->startblock,
__entry->blockcount) __entry->blockcount)
) )
#define DEFINE_IOMAP_EVENT(name) \ #define DEFINE_IOMAP_EVENT(name) \
DEFINE_EVENT(xfs_iomap_class, name, \ DEFINE_EVENT(xfs_imap_class, name, \
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
int flags, struct xfs_bmbt_irec *irec), \ int type, struct xfs_bmbt_irec *irec), \
TP_ARGS(ip, offset, count, flags, irec)) TP_ARGS(ip, offset, count, type, irec))
DEFINE_IOMAP_EVENT(xfs_iomap_enter); DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
DEFINE_IOMAP_EVENT(xfs_iomap_found); DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_iomap_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
DECLARE_EVENT_CLASS(xfs_simple_io_class, DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
...@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \ ...@@ -1022,6 +1027,7 @@ DEFINE_EVENT(xfs_simple_io_class, name, \
TP_ARGS(ip, offset, count)) TP_ARGS(ip, offset, count))
DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
TRACE_EVENT(xfs_itruncate_start, TRACE_EVENT(xfs_itruncate_start,
...@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \ ...@@ -1420,6 +1426,7 @@ DEFINE_EVENT(xfs_alloc_class, name, \
TP_PROTO(struct xfs_alloc_arg *args), \ TP_PROTO(struct xfs_alloc_arg *args), \
TP_ARGS(args)) TP_ARGS(args))
DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
DEFINE_ALLOC_EVENT(xfs_alloc_near_first); DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
......
...@@ -149,7 +149,6 @@ xfs_qm_dqdestroy( ...@@ -149,7 +149,6 @@ xfs_qm_dqdestroy(
ASSERT(list_empty(&dqp->q_freelist)); ASSERT(list_empty(&dqp->q_freelist));
mutex_destroy(&dqp->q_qlock); mutex_destroy(&dqp->q_qlock);
sv_destroy(&dqp->q_pinwait);
kmem_zone_free(xfs_Gqm->qm_dqzone, dqp); kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
atomic_dec(&xfs_Gqm->qm_totaldquots); atomic_dec(&xfs_Gqm->qm_totaldquots);
......
...@@ -227,7 +227,7 @@ typedef struct xfs_perag { ...@@ -227,7 +227,7 @@ typedef struct xfs_perag {
atomic_t pagf_fstrms; /* # of filestreams active in this AG */ atomic_t pagf_fstrms; /* # of filestreams active in this AG */
rwlock_t pag_ici_lock; /* incore inode lock */ spinlock_t pag_ici_lock; /* incore inode cache lock */
struct radix_tree_root pag_ici_root; /* incore inode cache root */ struct radix_tree_root pag_ici_root; /* incore inode cache root */
int pag_ici_reclaimable; /* reclaimable inodes */ int pag_ici_reclaimable; /* reclaimable inodes */
struct mutex pag_ici_reclaim_lock; /* serialisation point */ struct mutex pag_ici_reclaim_lock; /* serialisation point */
......
...@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) ...@@ -637,7 +637,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
* It didn't all fit, so we have to sort everything on hashval. * It didn't all fit, so we have to sort everything on hashval.
*/ */
sbsize = sf->hdr.count * sizeof(*sbuf); sbsize = sf->hdr.count * sizeof(*sbuf);
sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP); sbp = sbuf = kmem_alloc(sbsize, KM_SLEEP | KM_NOFS);
/* /*
* Scan the attribute list for the rest of the entries, storing * Scan the attribute list for the rest of the entries, storing
...@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context) ...@@ -2386,7 +2386,7 @@ xfs_attr_leaf_list_int(xfs_dabuf_t *bp, xfs_attr_list_context_t *context)
args.dp = context->dp; args.dp = context->dp;
args.whichfork = XFS_ATTR_FORK; args.whichfork = XFS_ATTR_FORK;
args.valuelen = valuelen; args.valuelen = valuelen;
args.value = kmem_alloc(valuelen, KM_SLEEP); args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
args.rmtblkno = be32_to_cpu(name_rmt->valueblk); args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen);
retval = xfs_attr_rmtval_get(&args); retval = xfs_attr_rmtval_get(&args);
......
...@@ -634,9 +634,8 @@ xfs_btree_read_bufl( ...@@ -634,9 +634,8 @@ xfs_btree_read_bufl(
return error; return error;
} }
ASSERT(!bp || !XFS_BUF_GETERROR(bp)); ASSERT(!bp || !XFS_BUF_GETERROR(bp));
if (bp != NULL) { if (bp)
XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval);
}
*bpp = bp; *bpp = bp;
return 0; return 0;
} }
...@@ -944,13 +943,13 @@ xfs_btree_set_refs( ...@@ -944,13 +943,13 @@ xfs_btree_set_refs(
switch (cur->bc_btnum) { switch (cur->bc_btnum) {
case XFS_BTNUM_BNO: case XFS_BTNUM_BNO:
case XFS_BTNUM_CNT: case XFS_BTNUM_CNT:
XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF); XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
break; break;
case XFS_BTNUM_INO: case XFS_BTNUM_INO:
XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF); XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, XFS_INO_BTREE_REF);
break; break;
case XFS_BTNUM_BMAP: case XFS_BTNUM_BMAP:
XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF); XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, XFS_BMAP_BTREE_REF);
break; break;
default: default:
ASSERT(0); ASSERT(0);
......
...@@ -142,7 +142,7 @@ xfs_buf_item_log_check( ...@@ -142,7 +142,7 @@ xfs_buf_item_log_check(
#endif #endif
STATIC void xfs_buf_error_relse(xfs_buf_t *bp); STATIC void xfs_buf_error_relse(xfs_buf_t *bp);
STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp);
/* /*
* This returns the number of log iovecs needed to log the * This returns the number of log iovecs needed to log the
...@@ -450,7 +450,7 @@ xfs_buf_item_unpin( ...@@ -450,7 +450,7 @@ xfs_buf_item_unpin(
* xfs_trans_ail_delete() drops the AIL lock. * xfs_trans_ail_delete() drops the AIL lock.
*/ */
if (bip->bli_flags & XFS_BLI_STALE_INODE) { if (bip->bli_flags & XFS_BLI_STALE_INODE) {
xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); xfs_buf_do_callbacks(bp);
XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_SET_FSPRIVATE(bp, NULL);
XFS_BUF_CLR_IODONE_FUNC(bp); XFS_BUF_CLR_IODONE_FUNC(bp);
} else { } else {
...@@ -918,15 +918,26 @@ xfs_buf_attach_iodone( ...@@ -918,15 +918,26 @@ xfs_buf_attach_iodone(
XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
} }
/*
* We can have many callbacks on a buffer. Running the callbacks individually
* can cause a lot of contention on the AIL lock, so we allow for a single
* callback to be able to scan the remaining lip->li_bio_list for other items
* of the same type and callback to be processed in the first call.
*
* As a result, the loop walking the callback list below will also modify the
* list. It removes the first item from the list and then runs the callback.
* The loop then restarts from the new head of the list. This allows the
* callback to scan and modify the list attached to the buffer and we don't
* have to care about maintaining a next item pointer.
*/
STATIC void STATIC void
xfs_buf_do_callbacks( xfs_buf_do_callbacks(
xfs_buf_t *bp, struct xfs_buf *bp)
xfs_log_item_t *lip)
{ {
xfs_log_item_t *nlip; struct xfs_log_item *lip;
while (lip != NULL) { while ((lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *)) != NULL) {
nlip = lip->li_bio_list; XFS_BUF_SET_FSPRIVATE(bp, lip->li_bio_list);
ASSERT(lip->li_cb != NULL); ASSERT(lip->li_cb != NULL);
/* /*
* Clear the next pointer so we don't have any * Clear the next pointer so we don't have any
...@@ -936,7 +947,6 @@ xfs_buf_do_callbacks( ...@@ -936,7 +947,6 @@ xfs_buf_do_callbacks(
*/ */
lip->li_bio_list = NULL; lip->li_bio_list = NULL;
lip->li_cb(bp, lip); lip->li_cb(bp, lip);
lip = nlip;
} }
} }
...@@ -970,7 +980,7 @@ xfs_buf_iodone_callbacks( ...@@ -970,7 +980,7 @@ xfs_buf_iodone_callbacks(
ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
XFS_BUF_SUPER_STALE(bp); XFS_BUF_SUPER_STALE(bp);
trace_xfs_buf_item_iodone(bp, _RET_IP_); trace_xfs_buf_item_iodone(bp, _RET_IP_);
xfs_buf_do_callbacks(bp, lip); xfs_buf_do_callbacks(bp);
XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_SET_FSPRIVATE(bp, NULL);
XFS_BUF_CLR_IODONE_FUNC(bp); XFS_BUF_CLR_IODONE_FUNC(bp);
xfs_buf_ioend(bp, 0); xfs_buf_ioend(bp, 0);
...@@ -1029,7 +1039,7 @@ xfs_buf_iodone_callbacks( ...@@ -1029,7 +1039,7 @@ xfs_buf_iodone_callbacks(
return; return;
} }
xfs_buf_do_callbacks(bp, lip); xfs_buf_do_callbacks(bp);
XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_SET_FSPRIVATE(bp, NULL);
XFS_BUF_CLR_IODONE_FUNC(bp); XFS_BUF_CLR_IODONE_FUNC(bp);
xfs_buf_ioend(bp, 0); xfs_buf_ioend(bp, 0);
...@@ -1063,7 +1073,7 @@ xfs_buf_error_relse( ...@@ -1063,7 +1073,7 @@ xfs_buf_error_relse(
* We have to unpin the pinned buffers so do the * We have to unpin the pinned buffers so do the
* callbacks. * callbacks.
*/ */
xfs_buf_do_callbacks(bp, lip); xfs_buf_do_callbacks(bp);
XFS_BUF_SET_FSPRIVATE(bp, NULL); XFS_BUF_SET_FSPRIVATE(bp, NULL);
XFS_BUF_CLR_IODONE_FUNC(bp); XFS_BUF_CLR_IODONE_FUNC(bp);
XFS_BUF_SET_BRELSE_FUNC(bp,NULL); XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
......
...@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item { ...@@ -105,17 +105,6 @@ typedef struct xfs_buf_log_item {
xfs_buf_log_format_t bli_format; /* in-log header */ xfs_buf_log_format_t bli_format; /* in-log header */
} xfs_buf_log_item_t; } xfs_buf_log_item_t;
/*
* This structure is used during recovery to record the buf log
* items which have been canceled and should not be replayed.
*/
typedef struct xfs_buf_cancel {
xfs_daddr_t bc_blkno;
uint bc_len;
int bc_refcount;
struct xfs_buf_cancel *bc_next;
} xfs_buf_cancel_t;
void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_relse(struct xfs_buf *);
void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
......
...@@ -47,6 +47,28 @@ xfs_efi_item_free( ...@@ -47,6 +47,28 @@ xfs_efi_item_free(
kmem_zone_free(xfs_efi_zone, efip); kmem_zone_free(xfs_efi_zone, efip);
} }
/*
* Freeing the efi requires that we remove it from the AIL if it has already
* been placed there. However, the EFI may not yet have been placed in the AIL
* when called by xfs_efi_release() from EFD processing due to the ordering of
* committed vs unpin operations in bulk insert operations. Hence the
* test_and_clear_bit(XFS_EFI_COMMITTED) to ensure only the last caller frees
* the EFI.
*/
STATIC void
__xfs_efi_release(
struct xfs_efi_log_item *efip)
{
struct xfs_ail *ailp = efip->efi_item.li_ailp;
if (!test_and_clear_bit(XFS_EFI_COMMITTED, &efip->efi_flags)) {
spin_lock(&ailp->xa_lock);
/* xfs_trans_ail_delete() drops the AIL lock. */
xfs_trans_ail_delete(ailp, &efip->efi_item);
xfs_efi_item_free(efip);
}
}
/* /*
* This returns the number of iovecs needed to log the given efi item. * This returns the number of iovecs needed to log the given efi item.
* We only need 1 iovec for an efi item. It just logs the efi_log_format * We only need 1 iovec for an efi item. It just logs the efi_log_format
...@@ -74,7 +96,8 @@ xfs_efi_item_format( ...@@ -74,7 +96,8 @@ xfs_efi_item_format(
struct xfs_efi_log_item *efip = EFI_ITEM(lip); struct xfs_efi_log_item *efip = EFI_ITEM(lip);
uint size; uint size;
ASSERT(efip->efi_next_extent == efip->efi_format.efi_nextents); ASSERT(atomic_read(&efip->efi_next_extent) ==
efip->efi_format.efi_nextents);
efip->efi_format.efi_type = XFS_LI_EFI; efip->efi_format.efi_type = XFS_LI_EFI;
...@@ -99,10 +122,12 @@ xfs_efi_item_pin( ...@@ -99,10 +122,12 @@ xfs_efi_item_pin(
} }
/* /*
* While EFIs cannot really be pinned, the unpin operation is the * While EFIs cannot really be pinned, the unpin operation is the last place at
* last place at which the EFI is manipulated during a transaction. * which the EFI is manipulated during a transaction. If we are being asked to
* Here we coordinate with xfs_efi_cancel() to determine who gets to * remove the EFI it's because the transaction has been cancelled and by
* free the EFI. * definition that means the EFI cannot be in the AIL so remove it from the
* transaction and free it. Otherwise coordinate with xfs_efi_release() (via
* XFS_EFI_COMMITTED) to determine who gets to free the EFI.
*/ */
STATIC void STATIC void
xfs_efi_item_unpin( xfs_efi_item_unpin(
...@@ -110,20 +135,14 @@ xfs_efi_item_unpin(
int remove)
{
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
struct xfs_ail *ailp = lip->li_ailp;

spin_lock(&ailp->xa_lock);
if (efip->efi_flags & XFS_EFI_CANCELED) {
if (remove)
xfs_trans_del_item(lip);

/* xfs_trans_ail_delete() drops the AIL lock. */
xfs_trans_ail_delete(ailp, lip);
xfs_efi_item_free(efip);
} else {
efip->efi_flags |= XFS_EFI_COMMITTED;
spin_unlock(&ailp->xa_lock);
}
}

{
struct xfs_efi_log_item *efip = EFI_ITEM(lip);

if (remove) {
ASSERT(!(lip->li_flags & XFS_LI_IN_AIL));
xfs_trans_del_item(lip);
xfs_efi_item_free(efip);
return;
}

__xfs_efi_release(efip);
}
/* /*
...@@ -152,16 +171,20 @@ xfs_efi_item_unlock( ...@@ -152,16 +171,20 @@ xfs_efi_item_unlock(
} }
/* /*
* The EFI is logged only once and cannot be moved in the log, so * The EFI is logged only once and cannot be moved in the log, so simply return
* simply return the lsn at which it's been logged. The canceled * the lsn at which it's been logged. For bulk transaction committed
* flag is not paid any attention here. Checking for that is delayed * processing, the EFI may be processed but not yet unpinned prior to the EFD
* until the EFI is unpinned. * being processed. Set the XFS_EFI_COMMITTED flag so this case can be detected
* when processing the EFD.
*/ */
STATIC xfs_lsn_t STATIC xfs_lsn_t
xfs_efi_item_committed( xfs_efi_item_committed(
struct xfs_log_item *lip, struct xfs_log_item *lip,
xfs_lsn_t lsn) xfs_lsn_t lsn)
{ {
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
set_bit(XFS_EFI_COMMITTED, &efip->efi_flags);
return lsn; return lsn;
} }
...@@ -230,6 +253,7 @@ xfs_efi_init( ...@@ -230,6 +253,7 @@ xfs_efi_init(
xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
efip->efi_format.efi_nextents = nextents; efip->efi_format.efi_nextents = nextents;
efip->efi_format.efi_id = (__psint_t)(void*)efip; efip->efi_format.efi_id = (__psint_t)(void*)efip;
atomic_set(&efip->efi_next_extent, 0);
return efip; return efip;
} }
...@@ -289,37 +313,18 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}

/*
* This is called by the efd item code below to release references to
* the given efi item. Each efd calls this with the number of
* extents that it has logged, and when the sum of these reaches
* the total number of extents logged by this efi item we can free
* the efi item.
*
* Freeing the efi item requires that we remove it from the AIL.
* We'll use the AIL lock to protect our counters as well as
* the removal from the AIL.
*/
void
xfs_efi_release(xfs_efi_log_item_t *efip,
uint nextents)
{
struct xfs_ail *ailp = efip->efi_item.li_ailp;
int extents_left;

ASSERT(efip->efi_next_extent > 0);
ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);

spin_lock(&ailp->xa_lock);
ASSERT(efip->efi_next_extent >= nextents);
efip->efi_next_extent -= nextents;
extents_left = efip->efi_next_extent;
if (extents_left == 0) {
/* xfs_trans_ail_delete() drops the AIL lock. */
xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
xfs_efi_item_free(efip);
} else {
spin_unlock(&ailp->xa_lock);
}
}

/*
* This is called by the efd item code below to release references to the given
* efi item. Each efd calls this with the number of extents that it has
* logged, and when the sum of these reaches the total number of extents logged
* by this efi item we can free the efi item.
*/
void
xfs_efi_release(xfs_efi_log_item_t *efip,
uint nextents)
{
ASSERT(atomic_read(&efip->efi_next_extent) >= nextents);
if (atomic_sub_and_test(nextents, &efip->efi_next_extent))
__xfs_efi_release(efip);
}
static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip) static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
......
...@@ -111,11 +111,10 @@ typedef struct xfs_efd_log_format_64 {
#define XFS_EFI_MAX_FAST_EXTENTS 16

/*
* Define EFI flags.
*/
#define XFS_EFI_RECOVERED 0x1
#define XFS_EFI_COMMITTED 0x2
#define XFS_EFI_CANCELED 0x4
/*
* Define EFI flag bits. Manipulated by set/clear/test_bit operators.
*/
#define XFS_EFI_RECOVERED 1
#define XFS_EFI_COMMITTED 2

/*
* This is the "extent free intention" log item. It is used
...@@ -125,8 +124,8 @@ typedef struct xfs_efd_log_format_64 {
*/
typedef struct xfs_efi_log_item {
xfs_log_item_t efi_item;
uint efi_flags; /* misc flags */
uint efi_next_extent;
atomic_t efi_next_extent;
unsigned long efi_flags; /* misc flags */
xfs_efi_log_format_t efi_format;
} xfs_efi_log_item_t;
......
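The flag change above matters because the atomic bitops take a bit number, not a mask: XFS_EFI_COMMITTED moves from the mask value 0x2 to bit index 2 so it can be used with set_bit()/test_and_clear_bit() on the new unsigned long efi_flags word. A small hedged sketch of that usage (the wrapper function is illustrative only):

	static void example_efi_flag_usage(void)
	{
		unsigned long flags = 0;

		/* set_bit() takes a bit number: bit 2 corresponds to mask 0x4 */
		set_bit(XFS_EFI_COMMITTED, &flags);

		/* exactly one caller atomically observes and clears the bit */
		if (test_and_clear_bit(XFS_EFI_COMMITTED, &flags))
			;	/* this caller owns the final free of the EFI */
	}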
...@@ -374,6 +374,7 @@ xfs_growfs_data_private( ...@@ -374,6 +374,7 @@ xfs_growfs_data_private(
mp->m_maxicount = icount << mp->m_sb.sb_inopblog; mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
} else } else
mp->m_maxicount = 0; mp->m_maxicount = 0;
xfs_set_low_space_thresholds(mp);
/* update secondary superblocks. */ /* update secondary superblocks. */
for (agno = 1; agno < nagcount; agno++) { for (agno = 1; agno < nagcount; agno++) {
......
...@@ -42,6 +42,17 @@ ...@@ -42,6 +42,17 @@
#include "xfs_trace.h" #include "xfs_trace.h"
/*
* Define xfs inode iolock lockdep classes. We need to ensure that all active
* inodes are considered the same for lockdep purposes, including inodes that
* are recycled through the XFS_IRECLAIMABLE state. This is the only way to
* guarantee the locks are considered the same when there are multiple lock
* initialisation sites. Also, define a reclaimable inode class so it is
* obvious in lockdep reports which class the report is against.
*/
static struct lock_class_key xfs_iolock_active;
struct lock_class_key xfs_iolock_reclaimable;
/* /*
* Allocate and initialise an xfs_inode. * Allocate and initialise an xfs_inode.
*/ */
...@@ -69,8 +80,11 @@ xfs_inode_alloc( ...@@ -69,8 +80,11 @@ xfs_inode_alloc(
ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush)); ASSERT(completion_done(&ip->i_flush));
ASSERT(ip->i_ino == 0);
mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
&xfs_iolock_active, "xfs_iolock_active");
/* initialise the xfs inode */ /* initialise the xfs inode */
ip->i_ino = ino; ip->i_ino = ino;
...@@ -85,9 +99,6 @@ xfs_inode_alloc( ...@@ -85,9 +99,6 @@ xfs_inode_alloc(
ip->i_size = 0; ip->i_size = 0;
ip->i_new_size = 0; ip->i_new_size = 0;
/* prevent anyone from using this yet */
VFS_I(ip)->i_state = I_NEW;
return ip; return ip;
} }
...@@ -145,7 +156,18 @@ xfs_inode_free( ...@@ -145,7 +156,18 @@ xfs_inode_free(
ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(completion_done(&ip->i_flush)); ASSERT(completion_done(&ip->i_flush));
call_rcu(&ip->i_vnode.i_rcu, xfs_inode_free_callback); /*
* Because we use RCU freeing we need to ensure the inode always
* appears to be reclaimed with an invalid inode number when in the
* free state. The ip->i_flags_lock provides the barrier against lookup
* races.
*/
spin_lock(&ip->i_flags_lock);
ip->i_flags = XFS_IRECLAIM;
ip->i_ino = 0;
spin_unlock(&ip->i_flags_lock);
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
} }
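Because the inode is now freed via call_rcu(), a lockless lookup can still find the stale structure; zeroing i_ino under i_flags_lock before the RCU free gives lookups something to revalidate against. The shape of that protocol, sketched in portable C with a mutex standing in for i_flags_lock and the RCU grace period omitted (illustrative analogy only, not the kernel API):

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct inode_like {
	pthread_mutex_t flags_lock;	/* stands in for ip->i_flags_lock */
	uint64_t ino;			/* 0 means "being freed/recycled" */
};

/* Freeing side: invalidate the identity before the object is recycled. */
static void inode_like_mark_freed(struct inode_like *ip)
{
	pthread_mutex_lock(&ip->flags_lock);
	ip->ino = 0;
	pthread_mutex_unlock(&ip->flags_lock);
	/* the kernel then uses call_rcu() to free after a grace period */
}

/* Lookup side: the cache may hand back a stale or recycled object. */
static bool inode_like_revalidate(struct inode_like *ip, uint64_t want_ino)
{
	bool ok;

	pthread_mutex_lock(&ip->flags_lock);
	ok = (ip->ino == want_ino);	/* mismatch => freed or reallocated */
	pthread_mutex_unlock(&ip->flags_lock);
	return ok;			/* caller retries (EAGAIN) on false */
}

int main(void)
{
	struct inode_like ip = { PTHREAD_MUTEX_INITIALIZER, 42 };

	if (!inode_like_revalidate(&ip, 42))
		return 1;
	inode_like_mark_freed(&ip);
	return inode_like_revalidate(&ip, 42) ? 1 : 0;
}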
/* /*
...@@ -155,14 +177,29 @@ static int ...@@ -155,14 +177,29 @@ static int
xfs_iget_cache_hit( xfs_iget_cache_hit(
struct xfs_perag *pag, struct xfs_perag *pag,
struct xfs_inode *ip, struct xfs_inode *ip,
xfs_ino_t ino,
int flags, int flags,
int lock_flags) __releases(pag->pag_ici_lock) int lock_flags) __releases(RCU)
{ {
struct inode *inode = VFS_I(ip); struct inode *inode = VFS_I(ip);
struct xfs_mount *mp = ip->i_mount; struct xfs_mount *mp = ip->i_mount;
int error; int error;
/*
* check for re-use of an inode within an RCU grace period due to the
* radix tree nodes not being updated yet. We monitor for this by
* setting the inode number to zero before freeing the inode structure.
* If the inode has been reallocated and set up, then the inode number
* will not match, so check for that, too.
*/
spin_lock(&ip->i_flags_lock); spin_lock(&ip->i_flags_lock);
if (ip->i_ino != ino) {
trace_xfs_iget_skip(ip);
XFS_STATS_INC(xs_ig_frecycle);
error = EAGAIN;
goto out_error;
}
/* /*
* If we are racing with another cache hit that is currently * If we are racing with another cache hit that is currently
...@@ -205,7 +242,7 @@ xfs_iget_cache_hit( ...@@ -205,7 +242,7 @@ xfs_iget_cache_hit(
ip->i_flags |= XFS_IRECLAIM; ip->i_flags |= XFS_IRECLAIM;
spin_unlock(&ip->i_flags_lock); spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
error = -inode_init_always(mp->m_super, inode); error = -inode_init_always(mp->m_super, inode);
if (error) { if (error) {
...@@ -213,7 +250,7 @@ xfs_iget_cache_hit( ...@@ -213,7 +250,7 @@ xfs_iget_cache_hit(
* Re-initializing the inode failed, and we are in deep * Re-initializing the inode failed, and we are in deep
* trouble. Try to re-add it to the reclaim list. * trouble. Try to re-add it to the reclaim list.
*/ */
read_lock(&pag->pag_ici_lock); rcu_read_lock();
spin_lock(&ip->i_flags_lock); spin_lock(&ip->i_flags_lock);
ip->i_flags &= ~XFS_INEW; ip->i_flags &= ~XFS_INEW;
...@@ -223,14 +260,20 @@ xfs_iget_cache_hit( ...@@ -223,14 +260,20 @@ xfs_iget_cache_hit(
goto out_error; goto out_error;
} }
write_lock(&pag->pag_ici_lock); spin_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock); spin_lock(&ip->i_flags_lock);
ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM); ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
ip->i_flags |= XFS_INEW; ip->i_flags |= XFS_INEW;
__xfs_inode_clear_reclaim_tag(mp, pag, ip); __xfs_inode_clear_reclaim_tag(mp, pag, ip);
inode->i_state = I_NEW; inode->i_state = I_NEW;
ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
&xfs_iolock_active, "xfs_iolock_active");
spin_unlock(&ip->i_flags_lock); spin_unlock(&ip->i_flags_lock);
write_unlock(&pag->pag_ici_lock); spin_unlock(&pag->pag_ici_lock);
} else { } else {
/* If the VFS inode is being torn down, pause and try again. */ /* If the VFS inode is being torn down, pause and try again. */
if (!igrab(inode)) { if (!igrab(inode)) {
...@@ -241,7 +284,7 @@ xfs_iget_cache_hit( ...@@ -241,7 +284,7 @@ xfs_iget_cache_hit(
/* We've got a live one. */ /* We've got a live one. */
spin_unlock(&ip->i_flags_lock); spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
trace_xfs_iget_hit(ip); trace_xfs_iget_hit(ip);
} }
...@@ -255,7 +298,7 @@ xfs_iget_cache_hit( ...@@ -255,7 +298,7 @@ xfs_iget_cache_hit(
out_error: out_error:
spin_unlock(&ip->i_flags_lock); spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
return error; return error;
} }
...@@ -308,7 +351,7 @@ xfs_iget_cache_miss( ...@@ -308,7 +351,7 @@ xfs_iget_cache_miss(
BUG(); BUG();
} }
write_lock(&pag->pag_ici_lock); spin_lock(&pag->pag_ici_lock);
/* insert the new inode */ /* insert the new inode */
error = radix_tree_insert(&pag->pag_ici_root, agino, ip); error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
...@@ -323,14 +366,14 @@ xfs_iget_cache_miss( ...@@ -323,14 +366,14 @@ xfs_iget_cache_miss(
ip->i_udquot = ip->i_gdquot = NULL; ip->i_udquot = ip->i_gdquot = NULL;
xfs_iflags_set(ip, XFS_INEW); xfs_iflags_set(ip, XFS_INEW);
write_unlock(&pag->pag_ici_lock); spin_unlock(&pag->pag_ici_lock);
radix_tree_preload_end(); radix_tree_preload_end();
*ipp = ip; *ipp = ip;
return 0; return 0;
out_preload_end: out_preload_end:
write_unlock(&pag->pag_ici_lock); spin_unlock(&pag->pag_ici_lock);
radix_tree_preload_end(); radix_tree_preload_end();
if (lock_flags) if (lock_flags)
xfs_iunlock(ip, lock_flags); xfs_iunlock(ip, lock_flags);
...@@ -377,7 +420,7 @@ xfs_iget( ...@@ -377,7 +420,7 @@ xfs_iget(
xfs_agino_t agino; xfs_agino_t agino;
/* reject inode numbers outside existing AGs */ /* reject inode numbers outside existing AGs */
if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
return EINVAL; return EINVAL;
/* get the perag structure and ensure that it's inode capable */ /* get the perag structure and ensure that it's inode capable */
...@@ -386,15 +429,15 @@ xfs_iget( ...@@ -386,15 +429,15 @@ xfs_iget(
again: again:
error = 0; error = 0;
read_lock(&pag->pag_ici_lock); rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, agino); ip = radix_tree_lookup(&pag->pag_ici_root, agino);
if (ip) { if (ip) {
error = xfs_iget_cache_hit(pag, ip, flags, lock_flags); error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
if (error) if (error)
goto out_error_or_again; goto out_error_or_again;
} else { } else {
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
XFS_STATS_INC(xs_ig_missed); XFS_STATS_INC(xs_ig_missed);
error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
......
...@@ -887,7 +887,7 @@ xfs_iread( ...@@ -887,7 +887,7 @@ xfs_iread(
* around for a while. This helps to keep recently accessed * around for a while. This helps to keep recently accessed
* meta-data in-core longer. * meta-data in-core longer.
*/ */
XFS_BUF_SET_REF(bp, XFS_INO_REF); xfs_buf_set_ref(bp, XFS_INO_REF);
/* /*
* Use xfs_trans_brelse() to release the buffer containing the * Use xfs_trans_brelse() to release the buffer containing the
...@@ -2000,16 +2000,32 @@ xfs_ifree_cluster( ...@@ -2000,16 +2000,32 @@ xfs_ifree_cluster(
*/ */
for (i = 0; i < ninodes; i++) { for (i = 0; i < ninodes; i++) {
retry: retry:
read_lock(&pag->pag_ici_lock); rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, ip = radix_tree_lookup(&pag->pag_ici_root,
XFS_INO_TO_AGINO(mp, (inum + i))); XFS_INO_TO_AGINO(mp, (inum + i)));
/* Inode not in memory or stale, nothing to do */ /* Inode not in memory, nothing to do */
if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { if (!ip) {
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
continue; continue;
} }
/*
* because this is an RCU protected lookup, we could
* find a recently freed or even reallocated inode
* during the lookup. We need to check under the
* i_flags_lock for a valid inode here. Skip it if it
* is not valid, the wrong inode or stale.
*/
spin_lock(&ip->i_flags_lock);
if (ip->i_ino != inum + i ||
__xfs_iflags_test(ip, XFS_ISTALE)) {
spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
continue;
}
spin_unlock(&ip->i_flags_lock);
/* /*
* Don't try to lock/unlock the current inode, but we * Don't try to lock/unlock the current inode, but we
* _cannot_ skip the other inodes that we did not find * _cannot_ skip the other inodes that we did not find
...@@ -2019,11 +2035,11 @@ xfs_ifree_cluster( ...@@ -2019,11 +2035,11 @@ xfs_ifree_cluster(
*/ */
if (ip != free_ip && if (ip != free_ip &&
!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
delay(1); delay(1);
goto retry; goto retry;
} }
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
xfs_iflock(ip); xfs_iflock(ip);
xfs_iflags_set(ip, XFS_ISTALE); xfs_iflags_set(ip, XFS_ISTALE);
...@@ -2629,7 +2645,7 @@ xfs_iflush_cluster( ...@@ -2629,7 +2645,7 @@ xfs_iflush_cluster(
mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask; first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
read_lock(&pag->pag_ici_lock); rcu_read_lock();
/* really need a gang lookup range call here */ /* really need a gang lookup range call here */
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist, nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
first_index, inodes_per_cluster); first_index, inodes_per_cluster);
...@@ -2640,9 +2656,21 @@ xfs_iflush_cluster( ...@@ -2640,9 +2656,21 @@ xfs_iflush_cluster(
iq = ilist[i]; iq = ilist[i];
if (iq == ip) if (iq == ip)
continue; continue;
/* if the inode lies outside this cluster, we're done. */
if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) /*
break; * because this is an RCU protected lookup, we could find a
* recently freed or even reallocated inode during the lookup.
* We need to check under the i_flags_lock for a valid inode
* here. Skip it if it is not valid or the wrong inode.
*/
spin_lock(&ip->i_flags_lock);
if (!ip->i_ino ||
(XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
spin_unlock(&ip->i_flags_lock);
continue;
}
spin_unlock(&ip->i_flags_lock);
/* /*
* Do an un-protected check to see if the inode is dirty and * Do an un-protected check to see if the inode is dirty and
* is a candidate for flushing. These checks will be repeated * is a candidate for flushing. These checks will be repeated
...@@ -2692,7 +2720,7 @@ xfs_iflush_cluster( ...@@ -2692,7 +2720,7 @@ xfs_iflush_cluster(
} }
out_free: out_free:
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
kmem_free(ilist); kmem_free(ilist);
out_put: out_put:
xfs_perag_put(pag); xfs_perag_put(pag);
...@@ -2704,7 +2732,7 @@ xfs_iflush_cluster( ...@@ -2704,7 +2732,7 @@ xfs_iflush_cluster(
* Corruption detected in the clustering loop. Invalidate the * Corruption detected in the clustering loop. Invalidate the
* inode buffer and shut down the filesystem. * inode buffer and shut down the filesystem.
*/ */
read_unlock(&pag->pag_ici_lock); rcu_read_unlock();
/* /*
* Clean up the buffer. If it was B_DELWRI, just release it -- * Clean up the buffer. If it was B_DELWRI, just release it --
* brelse can handle it with no problems. If not, shut down the * brelse can handle it with no problems. If not, shut down the
......
...@@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) ...@@ -376,12 +376,13 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
/* /*
* In-core inode flags. * In-core inode flags.
*/ */
#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */ #define XFS_IRECLAIM 0x0001 /* started reclaiming this inode */
#define XFS_ISTALE 0x0002 /* inode has been staled */ #define XFS_ISTALE 0x0002 /* inode has been staled */
#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */ #define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
#define XFS_INEW 0x0008 /* inode has just been allocated */ #define XFS_INEW 0x0008 /* inode has just been allocated */
#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */ #define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */ #define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
#define XFS_IDIRTY_RELEASE 0x0040 /* dirty release already seen */
/* /*
* Flags for inode locking. * Flags for inode locking.
...@@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) ...@@ -438,6 +439,8 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) #define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT)
#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) #define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT)
extern struct lock_class_key xfs_iolock_reclaimable;
/* /*
* Flags for xfs_itruncate_start(). * Flags for xfs_itruncate_start().
*/ */
......
...@@ -842,15 +842,64 @@ xfs_inode_item_destroy( ...@@ -842,15 +842,64 @@ xfs_inode_item_destroy(
* flushed to disk. It is responsible for removing the inode item * flushed to disk. It is responsible for removing the inode item
* from the AIL if it has not been re-logged, and unlocking the inode's * from the AIL if it has not been re-logged, and unlocking the inode's
* flush lock. * flush lock.
*
* To reduce AIL lock traffic as much as possible, we scan the buffer log item
* list for other inodes that will run this function. We remove them from the
* buffer list so we can process all the inode IO completions in one AIL lock
* traversal.
*/ */
void void
xfs_iflush_done( xfs_iflush_done(
struct xfs_buf *bp, struct xfs_buf *bp,
struct xfs_log_item *lip) struct xfs_log_item *lip)
{ {
struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode_log_item *iip;
xfs_inode_t *ip = iip->ili_inode; struct xfs_log_item *blip;
struct xfs_log_item *next;
struct xfs_log_item *prev;
struct xfs_ail *ailp = lip->li_ailp; struct xfs_ail *ailp = lip->li_ailp;
int need_ail = 0;
/*
* Scan the buffer IO completions for other inodes being completed and
* attach them to the current inode log item.
*/
blip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
prev = NULL;
while (blip != NULL) {
if (lip->li_cb != xfs_iflush_done) {
prev = blip;
blip = blip->li_bio_list;
continue;
}
/* remove from list */
next = blip->li_bio_list;
if (!prev) {
XFS_BUF_SET_FSPRIVATE(bp, next);
} else {
prev->li_bio_list = next;
}
/* add to current list */
blip->li_bio_list = lip->li_bio_list;
lip->li_bio_list = blip;
/*
* while we have the item, do the unlocked check for needing
* the AIL lock.
*/
iip = INODE_ITEM(blip);
if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
need_ail++;
blip = next;
}
/* make sure we capture the state of the initial inode. */
iip = INODE_ITEM(lip);
if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
need_ail++;
/* /*
* We only want to pull the item from the AIL if it is * We only want to pull the item from the AIL if it is
...@@ -861,28 +910,37 @@ xfs_iflush_done( ...@@ -861,28 +910,37 @@ xfs_iflush_done(
* the lock since it's cheaper, and then we recheck while * the lock since it's cheaper, and then we recheck while
* holding the lock before removing the inode from the AIL. * holding the lock before removing the inode from the AIL.
*/ */
if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) { if (need_ail) {
struct xfs_log_item *log_items[need_ail];
int i = 0;
spin_lock(&ailp->xa_lock); spin_lock(&ailp->xa_lock);
if (lip->li_lsn == iip->ili_flush_lsn) { for (blip = lip; blip; blip = blip->li_bio_list) {
/* xfs_trans_ail_delete() drops the AIL lock. */ iip = INODE_ITEM(blip);
xfs_trans_ail_delete(ailp, lip); if (iip->ili_logged &&
} else { blip->li_lsn == iip->ili_flush_lsn) {
spin_unlock(&ailp->xa_lock); log_items[i++] = blip;
}
ASSERT(i <= need_ail);
} }
/* xfs_trans_ail_delete_bulk() drops the AIL lock. */
xfs_trans_ail_delete_bulk(ailp, log_items, i);
} }
iip->ili_logged = 0;
/* /*
* Clear the ili_last_fields bits now that we know that the * clean up and unlock the flush lock now we are done. We can clear the
* data corresponding to them is safely on disk. * ili_last_fields bits now that we know that the data corresponding to
* them is safely on disk.
*/ */
iip->ili_last_fields = 0; for (blip = lip; blip; blip = next) {
next = blip->li_bio_list;
blip->li_bio_list = NULL;
/* iip = INODE_ITEM(blip);
* Release the inode's flush lock since we're done with it. iip->ili_logged = 0;
*/ iip->ili_last_fields = 0;
xfs_ifunlock(ip); xfs_ifunlock(iip->ili_inode);
}
} }
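As the new comment above explains, xfs_iflush_done() now walks the buffer's callback list once, pulls every item destined for the same completion handler onto a private list, and pays the AIL lock cost a single time for the whole batch. The list-gathering idea in isolation, as a small user-space C sketch (a boolean predicate stands in for the li_cb comparison):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct item {
	struct item *next;
	bool is_inode_item;	/* stands in for the li_cb check */
	int id;
};

/*
 * Remove every matching item from *headp and return them as a private
 * list, so the expensive per-batch work runs only once.
 */
static struct item *gather_matching(struct item **headp)
{
	struct item *batch = NULL, *prev = NULL, *ip = *headp;

	while (ip) {
		struct item *next = ip->next;

		if (!ip->is_inode_item) {
			prev = ip;
			ip = next;
			continue;
		}
		/* unlink from the source list */
		if (prev)
			prev->next = next;
		else
			*headp = next;
		/* push onto the private batch */
		ip->next = batch;
		batch = ip;
		ip = next;
	}
	return batch;
}

int main(void)
{
	struct item c = { NULL, true, 3 }, b = { &c, false, 2 }, a = { &b, true, 1 };
	struct item *head = &a, *batch = gather_matching(&head);

	for (struct item *ip = batch; ip; ip = ip->next)
		printf("batched item %d\n", ip->id);	/* prints 3 then 1 */
	return 0;
}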
/* /*
......
...@@ -47,127 +47,8 @@ ...@@ -47,127 +47,8 @@
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
<< mp->m_writeio_log) << mp->m_writeio_log)
#define XFS_STRAT_WRITE_IMAPS 2
#define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP #define XFS_WRITE_IMAPS XFS_BMAP_MAX_NMAP
STATIC int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
int, struct xfs_bmbt_irec *, int *);
STATIC int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t, int,
struct xfs_bmbt_irec *, int *);
STATIC int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int *);
int
xfs_iomap(
struct xfs_inode *ip,
xfs_off_t offset,
ssize_t count,
int flags,
struct xfs_bmbt_irec *imap,
int *nimaps,
int *new)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb, end_fsb;
int error = 0;
int lockmode = 0;
int bmapi_flags = 0;
ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG);
*new = 0;
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
trace_xfs_iomap_enter(ip, offset, count, flags, NULL);
switch (flags & (BMAPI_READ | BMAPI_WRITE | BMAPI_ALLOCATE)) {
case BMAPI_READ:
lockmode = xfs_ilock_map_shared(ip);
bmapi_flags = XFS_BMAPI_ENTIRE;
break;
case BMAPI_WRITE:
lockmode = XFS_ILOCK_EXCL;
if (flags & BMAPI_IGNSTATE)
bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
xfs_ilock(ip, lockmode);
break;
case BMAPI_ALLOCATE:
lockmode = XFS_ILOCK_SHARED;
bmapi_flags = XFS_BMAPI_ENTIRE;
/* Attempt non-blocking lock */
if (flags & BMAPI_TRYLOCK) {
if (!xfs_ilock_nowait(ip, lockmode))
return XFS_ERROR(EAGAIN);
} else {
xfs_ilock(ip, lockmode);
}
break;
default:
BUG();
}
ASSERT(offset <= mp->m_maxioffset);
if ((xfs_fsize_t)offset + count > mp->m_maxioffset)
count = mp->m_maxioffset - offset;
end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
error = xfs_bmapi(NULL, ip, offset_fsb,
(xfs_filblks_t)(end_fsb - offset_fsb),
bmapi_flags, NULL, 0, imap,
nimaps, NULL);
if (error)
goto out;
switch (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)) {
case BMAPI_WRITE:
/* If we found an extent, return it */
if (*nimaps &&
(imap->br_startblock != HOLESTARTBLOCK) &&
(imap->br_startblock != DELAYSTARTBLOCK)) {
trace_xfs_iomap_found(ip, offset, count, flags, imap);
break;
}
if (flags & BMAPI_DIRECT) {
error = xfs_iomap_write_direct(ip, offset, count, flags,
imap, nimaps);
} else {
error = xfs_iomap_write_delay(ip, offset, count, flags,
imap, nimaps);
}
if (!error) {
trace_xfs_iomap_alloc(ip, offset, count, flags, imap);
}
*new = 1;
break;
case BMAPI_ALLOCATE:
/* If we found an extent, return it */
xfs_iunlock(ip, lockmode);
lockmode = 0;
if (*nimaps && !isnullstartblock(imap->br_startblock)) {
trace_xfs_iomap_found(ip, offset, count, flags, imap);
break;
}
error = xfs_iomap_write_allocate(ip, offset, count,
imap, nimaps);
break;
}
ASSERT(*nimaps <= 1);
out:
if (lockmode)
xfs_iunlock(ip, lockmode);
return XFS_ERROR(error);
}
STATIC int STATIC int
xfs_iomap_eof_align_last_fsb( xfs_iomap_eof_align_last_fsb(
xfs_mount_t *mp, xfs_mount_t *mp,
...@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero( ...@@ -236,14 +117,13 @@ xfs_cmn_err_fsblock_zero(
return EFSCORRUPTED; return EFSCORRUPTED;
} }
STATIC int int
xfs_iomap_write_direct( xfs_iomap_write_direct(
xfs_inode_t *ip, xfs_inode_t *ip,
xfs_off_t offset, xfs_off_t offset,
size_t count, size_t count,
int flags,
xfs_bmbt_irec_t *imap, xfs_bmbt_irec_t *imap,
int *nmaps) int nmaps)
{ {
xfs_mount_t *mp = ip->i_mount; xfs_mount_t *mp = ip->i_mount;
xfs_fileoff_t offset_fsb; xfs_fileoff_t offset_fsb;
...@@ -279,7 +159,7 @@ xfs_iomap_write_direct( ...@@ -279,7 +159,7 @@ xfs_iomap_write_direct(
if (error) if (error)
goto error_out; goto error_out;
} else { } else {
if (*nmaps && (imap->br_startblock == HOLESTARTBLOCK)) if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
last_fsb = MIN(last_fsb, (xfs_fileoff_t) last_fsb = MIN(last_fsb, (xfs_fileoff_t)
imap->br_blockcount + imap->br_blockcount +
imap->br_startoff); imap->br_startoff);
...@@ -331,7 +211,7 @@ xfs_iomap_write_direct( ...@@ -331,7 +211,7 @@ xfs_iomap_write_direct(
xfs_trans_ijoin(tp, ip); xfs_trans_ijoin(tp, ip);
bmapi_flag = XFS_BMAPI_WRITE; bmapi_flag = XFS_BMAPI_WRITE;
if ((flags & BMAPI_DIRECT) && (offset < ip->i_size || extsz)) if (offset < ip->i_size || extsz)
bmapi_flag |= XFS_BMAPI_PREALLOC; bmapi_flag |= XFS_BMAPI_PREALLOC;
/* /*
...@@ -370,7 +250,6 @@ xfs_iomap_write_direct( ...@@ -370,7 +250,6 @@ xfs_iomap_write_direct(
goto error_out; goto error_out;
} }
*nmaps = 1;
return 0; return 0;
error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
...@@ -379,7 +258,6 @@ xfs_iomap_write_direct( ...@@ -379,7 +258,6 @@ xfs_iomap_write_direct(
error1: /* Just cancel transaction */ error1: /* Just cancel transaction */
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
*nmaps = 0; /* nothing set-up here */
error_out: error_out:
return XFS_ERROR(error); return XFS_ERROR(error);
...@@ -389,6 +267,9 @@ xfs_iomap_write_direct( ...@@ -389,6 +267,9 @@ xfs_iomap_write_direct(
* If the caller is doing a write at the end of the file, then extend the * If the caller is doing a write at the end of the file, then extend the
* allocation out to the file system's write iosize. We clean up any extra * allocation out to the file system's write iosize. We clean up any extra
* space left over when the file is closed in xfs_inactive(). * space left over when the file is closed in xfs_inactive().
*
* If we find we already have delalloc preallocation beyond EOF, don't do more
* preallocation as it is not needed.
*/ */
STATIC int STATIC int
xfs_iomap_eof_want_preallocate( xfs_iomap_eof_want_preallocate(
...@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate( ...@@ -396,7 +277,6 @@ xfs_iomap_eof_want_preallocate(
xfs_inode_t *ip, xfs_inode_t *ip,
xfs_off_t offset, xfs_off_t offset,
size_t count, size_t count,
int ioflag,
xfs_bmbt_irec_t *imap, xfs_bmbt_irec_t *imap,
int nimaps, int nimaps,
int *prealloc) int *prealloc)
...@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate( ...@@ -405,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
xfs_filblks_t count_fsb; xfs_filblks_t count_fsb;
xfs_fsblock_t firstblock; xfs_fsblock_t firstblock;
int n, error, imaps; int n, error, imaps;
int found_delalloc = 0;
*prealloc = 0; *prealloc = 0;
if ((offset + count) <= ip->i_size) if ((offset + count) <= ip->i_size)
...@@ -429,20 +310,66 @@ xfs_iomap_eof_want_preallocate( ...@@ -429,20 +310,66 @@ xfs_iomap_eof_want_preallocate(
return 0; return 0;
start_fsb += imap[n].br_blockcount; start_fsb += imap[n].br_blockcount;
count_fsb -= imap[n].br_blockcount; count_fsb -= imap[n].br_blockcount;
if (imap[n].br_startblock == DELAYSTARTBLOCK)
found_delalloc = 1;
} }
} }
*prealloc = 1; if (!found_delalloc)
*prealloc = 1;
return 0; return 0;
} }
STATIC int /*
* If we don't have a user specified preallocation size, dynamically increase
* the preallocation size as the size of the file grows. Cap the maximum size
* at a single extent or less if the filesystem is near full. The closer the
* filesystem is to full, the smaller the maximum preallocation.
*/
STATIC xfs_fsblock_t
xfs_iomap_prealloc_size(
struct xfs_mount *mp,
struct xfs_inode *ip)
{
xfs_fsblock_t alloc_blocks = 0;
if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
int shift = 0;
int64_t freesp;
alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);
alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
rounddown_pow_of_two(alloc_blocks));
xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
freesp = mp->m_sb.sb_fdblocks;
if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
shift = 2;
if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
shift++;
if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
shift++;
if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
shift++;
if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
shift++;
}
if (shift)
alloc_blocks >>= shift;
}
if (alloc_blocks < mp->m_writeio_blocks)
alloc_blocks = mp->m_writeio_blocks;
return alloc_blocks;
}
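The new xfs_iomap_prealloc_size() above starts from the current file size, rounds down to a power of two capped at MAXEXTLEN, then halves the result repeatedly as free space falls through the 5%..1% thresholds. A stand-alone C rendering of that arithmetic (the threshold values and block counts in main() are made up for the example):

#include <stdint.h>
#include <stdio.h>

#define MAXEXTLEN	((1u << 21) - 1)	/* XFS max extent length, in blocks */

static uint64_t rounddown_pow_of_two(uint64_t x)
{
	uint64_t r = 1;

	while (r * 2 <= x)
		r *= 2;
	return r;
}

/*
 * low_space[0..4] are the 1%..5% free-space thresholds (in blocks);
 * the closer the fs is to full, the further the prealloc is shifted down.
 */
static uint64_t prealloc_blocks(uint64_t isize_blocks, uint64_t free_blocks,
				const uint64_t low_space[5],
				uint64_t min_writeio_blocks)
{
	uint64_t alloc_blocks;
	int shift = 0;

	alloc_blocks = rounddown_pow_of_two(isize_blocks ? isize_blocks : 1);
	if (alloc_blocks > MAXEXTLEN)
		alloc_blocks = MAXEXTLEN;

	if (free_blocks < low_space[4]) {		/* below 5% free */
		shift = 2;
		if (free_blocks < low_space[3]) shift++;
		if (free_blocks < low_space[2]) shift++;
		if (free_blocks < low_space[1]) shift++;
		if (free_blocks < low_space[0]) shift++;
	}
	alloc_blocks >>= shift;

	return alloc_blocks < min_writeio_blocks ? min_writeio_blocks
						 : alloc_blocks;
}

int main(void)
{
	/* hypothetical 1M-block fs: thresholds at 1%..5% */
	const uint64_t low[5] = { 10000, 20000, 30000, 40000, 50000 };

	printf("plenty of space: %llu blocks\n",
	       (unsigned long long)prealloc_blocks(300000, 600000, low, 16));
	printf("4%% free:         %llu blocks\n",
	       (unsigned long long)prealloc_blocks(300000, 35000, low, 16));
	return 0;
}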
int
xfs_iomap_write_delay( xfs_iomap_write_delay(
xfs_inode_t *ip, xfs_inode_t *ip,
xfs_off_t offset, xfs_off_t offset,
size_t count, size_t count,
int ioflag, xfs_bmbt_irec_t *ret_imap)
xfs_bmbt_irec_t *ret_imap,
int *nmaps)
{ {
xfs_mount_t *mp = ip->i_mount; xfs_mount_t *mp = ip->i_mount;
xfs_fileoff_t offset_fsb; xfs_fileoff_t offset_fsb;
...@@ -469,16 +396,19 @@ xfs_iomap_write_delay( ...@@ -469,16 +396,19 @@ xfs_iomap_write_delay(
extsz = xfs_get_extsz_hint(ip); extsz = xfs_get_extsz_hint(ip);
offset_fsb = XFS_B_TO_FSBT(mp, offset); offset_fsb = XFS_B_TO_FSBT(mp, offset);
error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
ioflag, imap, XFS_WRITE_IMAPS, &prealloc); imap, XFS_WRITE_IMAPS, &prealloc);
if (error) if (error)
return error; return error;
retry: retry:
if (prealloc) { if (prealloc) {
xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
ioalign = XFS_B_TO_FSBT(mp, aligned_offset); ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
last_fsb = ioalign + mp->m_writeio_blocks; last_fsb = ioalign + alloc_blocks;
} else { } else {
last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
} }
...@@ -496,22 +426,31 @@ xfs_iomap_write_delay( ...@@ -496,22 +426,31 @@ xfs_iomap_write_delay(
XFS_BMAPI_DELAY | XFS_BMAPI_WRITE | XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
XFS_BMAPI_ENTIRE, &firstblock, 1, imap, XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
&nimaps, NULL); &nimaps, NULL);
if (error && (error != ENOSPC)) switch (error) {
case 0:
case ENOSPC:
case EDQUOT:
break;
default:
return XFS_ERROR(error); return XFS_ERROR(error);
}
/* /*
* If bmapi returned us nothing, and if we didn't get back EDQUOT, * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
* then we must have run out of space - flush all other inodes with * ENOSPC, flush all other inodes with delalloc blocks to free up
* delalloc blocks and retry without EOF preallocation. * some of the excess reserved metadata space. For both cases, retry
* without EOF preallocation.
*/ */
if (nimaps == 0) { if (nimaps == 0) {
trace_xfs_delalloc_enospc(ip, offset, count); trace_xfs_delalloc_enospc(ip, offset, count);
if (flushed) if (flushed)
return XFS_ERROR(ENOSPC); return XFS_ERROR(error ? error : ENOSPC);
xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error == ENOSPC) {
xfs_flush_inodes(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_flush_inodes(ip);
xfs_ilock(ip, XFS_ILOCK_EXCL);
}
flushed = 1; flushed = 1;
error = 0; error = 0;
...@@ -523,8 +462,6 @@ xfs_iomap_write_delay( ...@@ -523,8 +462,6 @@ xfs_iomap_write_delay(
return xfs_cmn_err_fsblock_zero(ip, &imap[0]); return xfs_cmn_err_fsblock_zero(ip, &imap[0]);
*ret_imap = imap[0]; *ret_imap = imap[0];
*nmaps = 1;
return 0; return 0;
} }
...@@ -538,13 +475,12 @@ xfs_iomap_write_delay( ...@@ -538,13 +475,12 @@ xfs_iomap_write_delay(
* We no longer bother to look at the incoming map - all we have to * We no longer bother to look at the incoming map - all we have to
* guarantee is that whatever we allocate fills the required range. * guarantee is that whatever we allocate fills the required range.
*/ */
STATIC int int
xfs_iomap_write_allocate( xfs_iomap_write_allocate(
xfs_inode_t *ip, xfs_inode_t *ip,
xfs_off_t offset, xfs_off_t offset,
size_t count, size_t count,
xfs_bmbt_irec_t *imap, xfs_bmbt_irec_t *imap)
int *retmap)
{ {
xfs_mount_t *mp = ip->i_mount; xfs_mount_t *mp = ip->i_mount;
xfs_fileoff_t offset_fsb, last_block; xfs_fileoff_t offset_fsb, last_block;
...@@ -557,8 +493,6 @@ xfs_iomap_write_allocate( ...@@ -557,8 +493,6 @@ xfs_iomap_write_allocate(
int error = 0; int error = 0;
int nres; int nres;
*retmap = 0;
/* /*
* Make sure that the dquots are there. * Make sure that the dquots are there.
*/ */
...@@ -680,7 +614,6 @@ xfs_iomap_write_allocate( ...@@ -680,7 +614,6 @@ xfs_iomap_write_allocate(
if ((offset_fsb >= imap->br_startoff) && if ((offset_fsb >= imap->br_startoff) &&
(offset_fsb < (imap->br_startoff + (offset_fsb < (imap->br_startoff +
imap->br_blockcount))) { imap->br_blockcount))) {
*retmap = 1;
XFS_STATS_INC(xs_xstrat_quick); XFS_STATS_INC(xs_xstrat_quick);
return 0; return 0;
} }
......
...@@ -18,30 +18,15 @@ ...@@ -18,30 +18,15 @@
#ifndef __XFS_IOMAP_H__ #ifndef __XFS_IOMAP_H__
#define __XFS_IOMAP_H__ #define __XFS_IOMAP_H__
/* base extent manipulation calls */
#define BMAPI_READ (1 << 0) /* read extents */
#define BMAPI_WRITE (1 << 1) /* create extents */
#define BMAPI_ALLOCATE (1 << 2) /* delayed allocate to real extents */
/* modifiers */
#define BMAPI_IGNSTATE (1 << 4) /* ignore unwritten state on read */
#define BMAPI_DIRECT (1 << 5) /* direct instead of buffered write */
#define BMAPI_MMA (1 << 6) /* allocate for mmap write */
#define BMAPI_TRYLOCK (1 << 7) /* non-blocking request */
#define BMAPI_FLAGS \
{ BMAPI_READ, "READ" }, \
{ BMAPI_WRITE, "WRITE" }, \
{ BMAPI_ALLOCATE, "ALLOCATE" }, \
{ BMAPI_IGNSTATE, "IGNSTATE" }, \
{ BMAPI_DIRECT, "DIRECT" }, \
{ BMAPI_TRYLOCK, "TRYLOCK" }
struct xfs_inode; struct xfs_inode;
struct xfs_bmbt_irec; struct xfs_bmbt_irec;
extern int xfs_iomap(struct xfs_inode *, xfs_off_t, ssize_t, int, extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int *, int *); struct xfs_bmbt_irec *, int);
extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *);
extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *);
extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t); extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
#endif /* __XFS_IOMAP_H__*/ #endif /* __XFS_IOMAP_H__*/
...@@ -61,7 +61,7 @@ xlog_cil_init( ...@@ -61,7 +61,7 @@ xlog_cil_init(
INIT_LIST_HEAD(&cil->xc_committing); INIT_LIST_HEAD(&cil->xc_committing);
spin_lock_init(&cil->xc_cil_lock); spin_lock_init(&cil->xc_cil_lock);
init_rwsem(&cil->xc_ctx_lock); init_rwsem(&cil->xc_ctx_lock);
sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait"); init_waitqueue_head(&cil->xc_commit_wait);
INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents); INIT_LIST_HEAD(&ctx->busy_extents);
...@@ -361,15 +361,10 @@ xlog_cil_committed( ...@@ -361,15 +361,10 @@ xlog_cil_committed(
int abort) int abort)
{ {
struct xfs_cil_ctx *ctx = args; struct xfs_cil_ctx *ctx = args;
struct xfs_log_vec *lv;
int abortflag = abort ? XFS_LI_ABORTED : 0;
struct xfs_busy_extent *busyp, *n; struct xfs_busy_extent *busyp, *n;
/* unpin all the log items */ xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) { ctx->start_lsn, abort);
xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
abortflag);
}
list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
...@@ -568,7 +563,7 @@ xlog_cil_push( ...@@ -568,7 +563,7 @@ xlog_cil_push(
* It is still being pushed! Wait for the push to * It is still being pushed! Wait for the push to
* complete, then start again from the beginning. * complete, then start again from the beginning.
*/ */
sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
goto restart; goto restart;
} }
} }
...@@ -592,7 +587,7 @@ xlog_cil_push( ...@@ -592,7 +587,7 @@ xlog_cil_push(
*/ */
spin_lock(&cil->xc_cil_lock); spin_lock(&cil->xc_cil_lock);
ctx->commit_lsn = commit_lsn; ctx->commit_lsn = commit_lsn;
sv_broadcast(&cil->xc_commit_wait); wake_up_all(&cil->xc_commit_wait);
spin_unlock(&cil->xc_cil_lock); spin_unlock(&cil->xc_cil_lock);
/* release the hounds! */ /* release the hounds! */
...@@ -757,7 +752,7 @@ xlog_cil_force_lsn( ...@@ -757,7 +752,7 @@ xlog_cil_force_lsn(
* It is still being pushed! Wait for the push to * It is still being pushed! Wait for the push to
* complete, then start again from the beginning. * complete, then start again from the beginning.
*/ */
sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
goto restart; goto restart;
} }
if (ctx->sequence != sequence) if (ctx->sequence != sequence)
......
...@@ -21,7 +21,6 @@ ...@@ -21,7 +21,6 @@
struct xfs_buf; struct xfs_buf;
struct log; struct log;
struct xlog_ticket; struct xlog_ticket;
struct xfs_buf_cancel;
struct xfs_mount; struct xfs_mount;
/* /*
...@@ -54,7 +53,6 @@ struct xfs_mount; ...@@ -54,7 +53,6 @@ struct xfs_mount;
BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \
XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT))
static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block) static inline xfs_lsn_t xlog_assign_lsn(uint cycle, uint block)
{ {
return ((xfs_lsn_t)cycle << 32) | block; return ((xfs_lsn_t)cycle << 32) | block;
...@@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i) ...@@ -133,12 +131,10 @@ static inline uint xlog_get_client_id(__be32 i)
*/ */
#define XLOG_TIC_INITED 0x1 /* has been initialized */ #define XLOG_TIC_INITED 0x1 /* has been initialized */
#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ #define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
#define XLOG_TIC_IN_Q 0x4
#define XLOG_TIC_FLAGS \ #define XLOG_TIC_FLAGS \
{ XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
{ XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }, \ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
{ XLOG_TIC_IN_Q, "XLOG_TIC_IN_Q" }
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
...@@ -244,9 +240,8 @@ typedef struct xlog_res { ...@@ -244,9 +240,8 @@ typedef struct xlog_res {
} xlog_res_t; } xlog_res_t;
typedef struct xlog_ticket { typedef struct xlog_ticket {
sv_t t_wait; /* ticket wait queue : 20 */ wait_queue_head_t t_wait; /* ticket wait queue */
struct xlog_ticket *t_next; /* :4|8 */ struct list_head t_queue; /* reserve/write queue */
struct xlog_ticket *t_prev; /* :4|8 */
xlog_tid_t t_tid; /* transaction identifier : 4 */ xlog_tid_t t_tid; /* transaction identifier : 4 */
atomic_t t_ref; /* ticket reference count : 4 */ atomic_t t_ref; /* ticket reference count : 4 */
int t_curr_res; /* current reservation in bytes : 4 */ int t_curr_res; /* current reservation in bytes : 4 */
...@@ -353,8 +348,8 @@ typedef union xlog_in_core2 { ...@@ -353,8 +348,8 @@ typedef union xlog_in_core2 {
* and move everything else out to subsequent cachelines. * and move everything else out to subsequent cachelines.
*/ */
typedef struct xlog_in_core { typedef struct xlog_in_core {
sv_t ic_force_wait; wait_queue_head_t ic_force_wait;
sv_t ic_write_wait; wait_queue_head_t ic_write_wait;
struct xlog_in_core *ic_next; struct xlog_in_core *ic_next;
struct xlog_in_core *ic_prev; struct xlog_in_core *ic_prev;
struct xfs_buf *ic_bp; struct xfs_buf *ic_bp;
...@@ -421,7 +416,7 @@ struct xfs_cil { ...@@ -421,7 +416,7 @@ struct xfs_cil {
struct xfs_cil_ctx *xc_ctx; struct xfs_cil_ctx *xc_ctx;
struct rw_semaphore xc_ctx_lock; struct rw_semaphore xc_ctx_lock;
struct list_head xc_committing; struct list_head xc_committing;
sv_t xc_commit_wait; wait_queue_head_t xc_commit_wait;
xfs_lsn_t xc_current_sequence; xfs_lsn_t xc_current_sequence;
}; };
...@@ -491,7 +486,7 @@ typedef struct log { ...@@ -491,7 +486,7 @@ typedef struct log {
struct xfs_buftarg *l_targ; /* buftarg of log */ struct xfs_buftarg *l_targ; /* buftarg of log */
uint l_flags; uint l_flags;
uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
struct xfs_buf_cancel **l_buf_cancel_table; struct list_head *l_buf_cancel_table;
int l_iclog_hsize; /* size of iclog header */ int l_iclog_hsize; /* size of iclog header */
int l_iclog_heads; /* # of iclog header sectors */ int l_iclog_heads; /* # of iclog header sectors */
uint l_sectBBsize; /* sector size in BBs (2^n) */ uint l_sectBBsize; /* sector size in BBs (2^n) */
...@@ -503,29 +498,40 @@ typedef struct log { ...@@ -503,29 +498,40 @@ typedef struct log {
int l_logBBsize; /* size of log in BB chunks */ int l_logBBsize; /* size of log in BB chunks */
/* The following block of fields are changed while holding icloglock */ /* The following block of fields are changed while holding icloglock */
sv_t l_flush_wait ____cacheline_aligned_in_smp; wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp;
/* waiting for iclog flush */ /* waiting for iclog flush */
int l_covered_state;/* state of "covering disk int l_covered_state;/* state of "covering disk
* log entries" */ * log entries" */
xlog_in_core_t *l_iclog; /* head log queue */ xlog_in_core_t *l_iclog; /* head log queue */
spinlock_t l_icloglock; /* grab to change iclog state */ spinlock_t l_icloglock; /* grab to change iclog state */
xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
* buffers */
xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
int l_curr_cycle; /* Cycle number of log writes */ int l_curr_cycle; /* Cycle number of log writes */
int l_prev_cycle; /* Cycle number before last int l_prev_cycle; /* Cycle number before last
* block increment */ * block increment */
int l_curr_block; /* current logical log block */ int l_curr_block; /* current logical log block */
int l_prev_block; /* previous logical log block */ int l_prev_block; /* previous logical log block */
/* The following block of fields are changed while holding grant_lock */ /*
spinlock_t l_grant_lock ____cacheline_aligned_in_smp; * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and
xlog_ticket_t *l_reserve_headq; * read without needing to hold specific locks. To avoid operations
xlog_ticket_t *l_write_headq; * contending with other hot objects, place each of them on a separate
int l_grant_reserve_cycle; * cacheline.
int l_grant_reserve_bytes; */
int l_grant_write_cycle; /* lsn of last LR on disk */
int l_grant_write_bytes; atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp;
/* lsn of 1st LR with unflushed * buffers */
atomic64_t l_tail_lsn ____cacheline_aligned_in_smp;
/*
* ticket grant locks, queues and accounting have their own cachelines
* as these are quite hot and can be operated on concurrently.
*/
spinlock_t l_grant_reserve_lock ____cacheline_aligned_in_smp;
struct list_head l_reserveq;
atomic64_t l_grant_reserve_head;
spinlock_t l_grant_write_lock ____cacheline_aligned_in_smp;
struct list_head l_writeq;
atomic64_t l_grant_write_head;
/* The following field are used for debugging; need to hold icloglock */ /* The following field are used for debugging; need to hold icloglock */
#ifdef DEBUG #ifdef DEBUG
...@@ -534,6 +540,9 @@ typedef struct log { ...@@ -534,6 +540,9 @@ typedef struct log {
} xlog_t; } xlog_t;
#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
((log)->l_buf_cancel_table + ((__uint64_t)blkno % XLOG_BC_TABLE_SIZE))
#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
/* common routines */ /* common routines */
...@@ -561,6 +570,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector, ...@@ -561,6 +570,61 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
struct xlog_ticket *tic, xfs_lsn_t *start_lsn, struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
xlog_in_core_t **commit_iclog, uint flags); xlog_in_core_t **commit_iclog, uint flags);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
* change while we are cracking it into the component values. This means we
* will always get consistent component values to work from. This should always
* be used to sample and crack LSNs that are stored and updated in atomic
* variables.
*/
static inline void
xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block)
{
xfs_lsn_t val = atomic64_read(lsn);
*cycle = CYCLE_LSN(val);
*block = BLOCK_LSN(val);
}
/*
* Calculate and assign a value to an atomic LSN variable from component pieces.
*/
static inline void
xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block)
{
atomic64_set(lsn, xlog_assign_lsn(cycle, block));
}
/*
* When we crack the grant head, we sample it first so that the value will not
* change while we are cracking it into the component values. This means we
* will always get consistent component values to work from.
*/
static inline void
xlog_crack_grant_head_val(int64_t val, int *cycle, int *space)
{
*cycle = val >> 32;
*space = val & 0xffffffff;
}
static inline void
xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space)
{
xlog_crack_grant_head_val(atomic64_read(head), cycle, space);
}
static inline int64_t
xlog_assign_grant_head_val(int cycle, int space)
{
return ((int64_t)cycle << 32) | space;
}
static inline void
xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
{
atomic64_set(head, xlog_assign_grant_head_val(cycle, space));
}
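Both the LSNs and the grant heads are now kept as a single 64-bit value, cycle in the top 32 bits and block/space in the bottom 32, so they can be sampled with one atomic read and then cracked into consistent components. The packing arithmetic on its own, as plain C:

#include <assert.h>
#include <stdint.h>

/* pack: cycle in bits 63..32, block/space in bits 31..0 */
static inline int64_t assign_head(int cycle, int space)
{
	return ((int64_t)cycle << 32) | (uint32_t)space;
}

/* crack: sample once, then split, so both halves stay consistent */
static inline void crack_head(int64_t val, int *cycle, int *space)
{
	*cycle = (int)(val >> 32);
	*space = (int)(val & 0xffffffff);
}

int main(void)
{
	int cycle, space;

	crack_head(assign_head(7, 123456), &cycle, &space);
	assert(cycle == 7 && space == 123456);
	return 0;
}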
/* /*
* Committed Item List interfaces * Committed Item List interfaces
*/ */
...@@ -585,6 +649,21 @@ xlog_cil_force(struct log *log) ...@@ -585,6 +649,21 @@ xlog_cil_force(struct log *log)
*/ */
#define XLOG_UNMOUNT_REC_TYPE (-1U) #define XLOG_UNMOUNT_REC_TYPE (-1U)
/*
* Wrapper function for waiting on a wait queue serialised against wakeups
* by a spinlock. This matches the semantics of all the wait queues used in the
* log code.
*/
static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
{
DECLARE_WAITQUEUE(wait, current);
add_wait_queue_exclusive(wq, &wait);
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock(lock);
schedule();
remove_wait_queue(wq, &wait);
}
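xlog_wait() queues the task exclusively, marks it uninterruptible, and only then drops the spinlock that the waker holds while calling wake_up()/wake_up_all(), which closes the lost-wakeup window. The same sleep/wake discipline expressed with POSIX primitives, where a condition variable plays the role of the wait queue (an analogy, not the kernel API):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;	/* the "spinlock" */
static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;		/* the wait queue */
static bool event_posted;

/* waiter: going to sleep is atomic with dropping the lock */
static void *waiter(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!event_posted)
		pthread_cond_wait(&waitq, &lock);	/* releases lock while asleep */
	pthread_mutex_unlock(&lock);
	puts("waiter: woken");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);

	/* waker: take the lock, post the event, broadcast */
	pthread_mutex_lock(&lock);
	event_posted = true;
	pthread_cond_broadcast(&waitq);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return 0;
}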
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
#endif /* __XFS_LOG_PRIV_H__ */ #endif /* __XFS_LOG_PRIV_H__ */
...@@ -472,7 +472,7 @@ xfs_initialize_perag( ...@@ -472,7 +472,7 @@ xfs_initialize_perag(
goto out_unwind; goto out_unwind;
pag->pag_agno = index; pag->pag_agno = index;
pag->pag_mount = mp; pag->pag_mount = mp;
rwlock_init(&pag->pag_ici_lock); spin_lock_init(&pag->pag_ici_lock);
mutex_init(&pag->pag_ici_reclaim_lock); mutex_init(&pag->pag_ici_reclaim_lock);
INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
spin_lock_init(&pag->pag_buf_lock); spin_lock_init(&pag->pag_buf_lock);
...@@ -974,6 +974,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp) ...@@ -974,6 +974,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog); mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
} }
/*
* precalculate the low space thresholds for dynamic speculative preallocation.
*/
void
xfs_set_low_space_thresholds(
struct xfs_mount *mp)
{
int i;
for (i = 0; i < XFS_LOWSP_MAX; i++) {
__uint64_t space = mp->m_sb.sb_dblocks;
do_div(space, 100);
mp->m_low_space[i] = space * (i + 1);
}
}
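The thresholds are simply 1%..5% of the filesystem's data blocks, precomputed at mount time (and again on growfs, per the xfs_growfs_data_private hunk above) so the hot allocation path only does comparisons. A tiny worked example with a hypothetical block count:

#include <stdint.h>
#include <stdio.h>

#define XFS_LOWSP_MAX	5	/* 1% .. 5% thresholds */

int main(void)
{
	uint64_t dblocks = 26214400;	/* hypothetical 100GiB fs of 4k blocks */
	uint64_t low_space[XFS_LOWSP_MAX];

	for (int i = 0; i < XFS_LOWSP_MAX; i++) {
		low_space[i] = dblocks / 100 * (i + 1);	/* i=0 -> 1%, i=4 -> 5% */
		printf("low_space[%d] = %llu blocks\n",
		       i, (unsigned long long)low_space[i]);
	}
	return 0;
}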
/* /*
* Set whether we're using inode alignment. * Set whether we're using inode alignment.
*/ */
...@@ -1196,6 +1214,9 @@ xfs_mountfs( ...@@ -1196,6 +1214,9 @@ xfs_mountfs(
*/ */
xfs_set_rw_sizes(mp); xfs_set_rw_sizes(mp);
/* set the low space thresholds for dynamic preallocation */
xfs_set_low_space_thresholds(mp);
/* /*
* Set the inode cluster size. * Set the inode cluster size.
* This may still be overridden by the file system * This may still be overridden by the file system
......
...@@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, ...@@ -103,6 +103,16 @@ extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
xfs_mod_incore_sb(mp, field, delta, rsvd) xfs_mod_incore_sb(mp, field, delta, rsvd)
#endif #endif
/* dynamic preallocation free space thresholds, 5% down to 1% */
enum {
XFS_LOWSP_1_PCNT = 0,
XFS_LOWSP_2_PCNT,
XFS_LOWSP_3_PCNT,
XFS_LOWSP_4_PCNT,
XFS_LOWSP_5_PCNT,
XFS_LOWSP_MAX,
};
typedef struct xfs_mount { typedef struct xfs_mount {
struct super_block *m_super; struct super_block *m_super;
xfs_tid_t m_tid; /* next unused tid for fs */ xfs_tid_t m_tid; /* next unused tid for fs */
...@@ -202,6 +212,8 @@ typedef struct xfs_mount { ...@@ -202,6 +212,8 @@ typedef struct xfs_mount {
__int64_t m_update_flags; /* sb flags we need to update __int64_t m_update_flags; /* sb flags we need to update
on the next remount,rw */ on the next remount,rw */
struct shrinker m_inode_shrink; /* inode reclaim shrinker */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
} xfs_mount_t; } xfs_mount_t;
/* /*
...@@ -379,6 +391,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); ...@@ -379,6 +391,8 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
extern int xfs_dev_is_read_only(struct xfs_mount *, char *); extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
extern void xfs_set_low_space_thresholds(struct xfs_mount *);
#endif /* __KERNEL__ */ #endif /* __KERNEL__ */
extern void xfs_mod_sb(struct xfs_trans *, __int64_t); extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
......
...@@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs( ...@@ -1350,7 +1350,7 @@ xfs_trans_fill_vecs(
* they could be immediately flushed and we'd have to race with the flusher * they could be immediately flushed and we'd have to race with the flusher
* trying to pull the item from the AIL as we add it. * trying to pull the item from the AIL as we add it.
*/ */
void static void
xfs_trans_item_committed( xfs_trans_item_committed(
struct xfs_log_item *lip, struct xfs_log_item *lip,
xfs_lsn_t commit_lsn, xfs_lsn_t commit_lsn,
...@@ -1425,6 +1425,83 @@ xfs_trans_committed( ...@@ -1425,6 +1425,83 @@ xfs_trans_committed(
xfs_trans_free(tp); xfs_trans_free(tp);
} }
static inline void
xfs_log_item_batch_insert(
struct xfs_ail *ailp,
struct xfs_log_item **log_items,
int nr_items,
xfs_lsn_t commit_lsn)
{
int i;
spin_lock(&ailp->xa_lock);
/* xfs_trans_ail_update_bulk drops ailp->xa_lock */
xfs_trans_ail_update_bulk(ailp, log_items, nr_items, commit_lsn);
for (i = 0; i < nr_items; i++)
IOP_UNPIN(log_items[i], 0);
}
/*
* Bulk operation version of xfs_trans_committed that takes a log vector of
* items to insert into the AIL. This uses bulk AIL insertion techniques to
* minimise lock traffic.
*/
void
xfs_trans_committed_bulk(
struct xfs_ail *ailp,
struct xfs_log_vec *log_vector,
xfs_lsn_t commit_lsn,
int aborted)
{
#define LOG_ITEM_BATCH_SIZE 32
struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE];
struct xfs_log_vec *lv;
int i = 0;
/* unpin all the log items */
for (lv = log_vector; lv; lv = lv->lv_next ) {
struct xfs_log_item *lip = lv->lv_item;
xfs_lsn_t item_lsn;
if (aborted)
lip->li_flags |= XFS_LI_ABORTED;
item_lsn = IOP_COMMITTED(lip, commit_lsn);
/* item_lsn of -1 means the item was freed */
if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0)
continue;
if (item_lsn != commit_lsn) {
/*
* Not a bulk update option due to unusual item_lsn.
* Push into AIL immediately, rechecking the lsn once
* we have the ail lock. Then unpin the item.
*/
spin_lock(&ailp->xa_lock);
if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0)
xfs_trans_ail_update(ailp, lip, item_lsn);
else
spin_unlock(&ailp->xa_lock);
IOP_UNPIN(lip, 0);
continue;
}
/* Item is a candidate for bulk AIL insert. */
log_items[i++] = lv->lv_item;
if (i >= LOG_ITEM_BATCH_SIZE) {
xfs_log_item_batch_insert(ailp, log_items,
LOG_ITEM_BATCH_SIZE, commit_lsn);
i = 0;
}
}
/* make sure we insert the remainder! */
if (i)
xfs_log_item_batch_insert(ailp, log_items, i, commit_lsn);
}
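xfs_trans_committed_bulk() above accumulates up to LOG_ITEM_BATCH_SIZE items and pays the AIL lock cost once per batch instead of once per item. The batching skeleton on its own, with a mutex standing in for the AIL lock (illustrative sketch):

#include <pthread.h>
#include <stdio.h>

#define BATCH_SIZE 32

static pthread_mutex_t ail_lock = PTHREAD_MUTEX_INITIALIZER;

/* one lock round-trip covers the whole batch */
static void batch_insert(const int *items, int nr)
{
	pthread_mutex_lock(&ail_lock);
	for (int i = 0; i < nr; i++)
		printf("insert item %d\n", items[i]);
	pthread_mutex_unlock(&ail_lock);
}

static void committed_bulk(const int *all, int count)
{
	int batch[BATCH_SIZE];
	int n = 0;

	for (int i = 0; i < count; i++) {
		batch[n++] = all[i];
		if (n == BATCH_SIZE) {
			batch_insert(batch, n);
			n = 0;
		}
	}
	if (n)			/* don't forget the remainder */
		batch_insert(batch, n);
}

int main(void)
{
	int items[70];

	for (int i = 0; i < 70; i++)
		items[i] = i;
	committed_bulk(items, 70);	/* 3 lock acquisitions, not 70 */
	return 0;
}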
/* /*
* Called from the trans_commit code when we notice that * Called from the trans_commit code when we notice that
* the filesystem is in the middle of a forced shutdown. * the filesystem is in the middle of a forced shutdown.
......
...@@ -294,8 +294,8 @@ struct xfs_log_item_desc { ...@@ -294,8 +294,8 @@ struct xfs_log_item_desc {
#define XFS_ALLOC_BTREE_REF 2 #define XFS_ALLOC_BTREE_REF 2
#define XFS_BMAP_BTREE_REF 2 #define XFS_BMAP_BTREE_REF 2
#define XFS_DIR_BTREE_REF 2 #define XFS_DIR_BTREE_REF 2
#define XFS_INO_REF 2
#define XFS_ATTR_BTREE_REF 1 #define XFS_ATTR_BTREE_REF 1
#define XFS_INO_REF 1
#define XFS_DQUOT_REF 1 #define XFS_DQUOT_REF 1
#ifdef __KERNEL__ #ifdef __KERNEL__
......
...@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp, ...@@ -69,12 +69,16 @@ xfs_trans_log_efi_extent(xfs_trans_t *tp,
tp->t_flags |= XFS_TRANS_DIRTY; tp->t_flags |= XFS_TRANS_DIRTY;
efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY;
next_extent = efip->efi_next_extent; /*
* atomic_inc_return gives us the value after the increment;
* we want to use it as an array index so we need to subtract 1 from
* it.
*/
next_extent = atomic_inc_return(&efip->efi_next_extent) - 1;
ASSERT(next_extent < efip->efi_format.efi_nextents); ASSERT(next_extent < efip->efi_format.efi_nextents);
extp = &(efip->efi_format.efi_extents[next_extent]); extp = &(efip->efi_format.efi_extents[next_extent]);
extp->ext_start = start_block; extp->ext_start = start_block;
extp->ext_len = ext_len; extp->ext_len = ext_len;
efip->efi_next_extent++;
} }
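Because atomic_inc_return() hands back the post-increment value, subtracting one recovers a unique slot index without holding any external lock. In C11 the same result falls out directly from atomic_fetch_add(), which returns the pre-increment value:

#include <stdatomic.h>
#include <stdio.h>

#define NEXTENTS 8

static atomic_int next_extent;		/* next free slot */
static int extents[NEXTENTS];

/* claim a unique slot; safe to call from multiple threads */
static int log_extent(int start_block)
{
	int slot = atomic_fetch_add(&next_extent, 1);	/* old value = my index */

	extents[slot] = start_block;
	return slot;
}

int main(void)
{
	printf("slot %d\n", log_extent(1000));	/* slot 0 */
	printf("slot %d\n", log_extent(2000));	/* slot 1 */
	return 0;
}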
......
...@@ -22,15 +22,17 @@ struct xfs_log_item; ...@@ -22,15 +22,17 @@ struct xfs_log_item;
struct xfs_log_item_desc; struct xfs_log_item_desc;
struct xfs_mount; struct xfs_mount;
struct xfs_trans; struct xfs_trans;
struct xfs_ail;
struct xfs_log_vec;
void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *);
void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
int flags); int flags);
void xfs_trans_item_committed(struct xfs_log_item *lip,
xfs_lsn_t commit_lsn, int aborted);
void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
xfs_lsn_t commit_lsn, int aborted);
/* /*
* AIL traversal cursor. * AIL traversal cursor.
* *
...@@ -73,12 +75,29 @@ struct xfs_ail { ...@@ -73,12 +75,29 @@ struct xfs_ail {
/* /*
* From xfs_trans_ail.c * From xfs_trans_ail.c
*/ */
void xfs_trans_ail_update(struct xfs_ail *ailp, void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
struct xfs_log_item *lip, xfs_lsn_t lsn) struct xfs_log_item **log_items, int nr_items,
__releases(ailp->xa_lock); xfs_lsn_t lsn) __releases(ailp->xa_lock);
void xfs_trans_ail_delete(struct xfs_ail *ailp, static inline void
struct xfs_log_item *lip) xfs_trans_ail_update(
__releases(ailp->xa_lock); struct xfs_ail *ailp,
struct xfs_log_item *lip,
xfs_lsn_t lsn) __releases(ailp->xa_lock)
{
xfs_trans_ail_update_bulk(ailp, &lip, 1, lsn);
}
void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
struct xfs_log_item **log_items, int nr_items)
__releases(ailp->xa_lock);
static inline void
xfs_trans_ail_delete(
struct xfs_ail *ailp,
xfs_log_item_t *lip) __releases(ailp->xa_lock)
{
xfs_trans_ail_delete_bulk(ailp, &lip, 1);
}
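The single-item update and delete now survive only as inline wrappers that pass an array of one, leaving a single code path to maintain. The wrapper idiom in plain C:

#include <stdio.h>

/* the one real implementation works on an array of items */
static void ail_delete_bulk(int **items, int nr_items)
{
	for (int i = 0; i < nr_items; i++)
		printf("delete item %d\n", *items[i]);
}

/* the old single-item interface becomes a trivial inline wrapper */
static inline void ail_delete(int *item)
{
	ail_delete_bulk(&item, 1);
}

int main(void)
{
	int a = 7;

	ail_delete(&a);
	return 0;
}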
void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
void xfs_trans_unlocked_item(struct xfs_ail *, void xfs_trans_unlocked_item(struct xfs_ail *,
xfs_log_item_t *); xfs_log_item_t *);
......
...@@ -964,29 +964,48 @@ xfs_release( ...@@ -964,29 +964,48 @@ xfs_release(
xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
} }
if (ip->i_d.di_nlink != 0) { if (ip->i_d.di_nlink == 0)
if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && return 0;
((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
ip->i_delayed_blks > 0)) &&
(ip->i_df.if_flags & XFS_IFEXTENTS)) &&
(!(ip->i_d.di_flags &
(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
/* if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
* If we can't get the iolock just skip truncating ((ip->i_size > 0) || (VN_CACHED(VFS_I(ip)) > 0 ||
* the blocks past EOF because we could deadlock ip->i_delayed_blks > 0)) &&
* with the mmap_sem otherwise. We'll get another (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
* chance to drop them once the last reference to (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
* the inode is dropped, so we'll never leak blocks
* permanently.
*/
error = xfs_free_eofblocks(mp, ip,
XFS_FREE_EOF_TRYLOCK);
if (error)
return error;
}
}
/*
* If we can't get the iolock just skip truncating the blocks
* past EOF because we could deadlock with the mmap_sem
* otherwise. We'll get another chance to drop them once the
* last reference to the inode is dropped, so we'll never leak
* blocks permanently.
*
* Further, check if the inode is being opened, written and
* closed frequently and we have delayed allocation blocks
* outstanding (e.g. streaming writes from the NFS server),
* truncating the blocks past EOF will cause fragmentation to
* occur.
*
* In this case don't do the truncation, either, but we have to
* be careful how we detect this case. Blocks beyond EOF show
* up as i_delayed_blks even when the inode is clean, so we
* need to truncate them away first before checking for a dirty
* release. Hence on the first dirty close we will still remove
* the speculative allocation, but after that we will leave it
* in place.
*/
if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
return 0;
error = xfs_free_eofblocks(mp, ip,
XFS_FREE_EOF_TRYLOCK);
if (error)
return error;
/* delalloc blocks after truncation means it really is dirty */
if (ip->i_delayed_blks)
xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
}
return 0; return 0;
} }
......