Merge master.kernel.org:/home/hch/BK/xfs/linux-2.5

into home.transmeta.com:/home/torvalds/v2.5/linux

Merge master.kernel.org:/home/hch/BK/xfs/linux-2.5
into home.transmeta.com:/home/torvalds/v2.5/linux
af61a2bd · Linus Torvalds · 91af0978 · ed7fa26b · af61a2bd · af61a2bd
Commit af61a2bd authored Feb 19, 2003 by Linus Torvalds
15 changed files
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -659,8 +659,7 @@ void buffer_insert_list(spinlock_t *lock,
 		struct buffer_head *bh, struct list_head *list)
 {
 	spin_lock(lock);
-	list_del(&bh->b_assoc_buffers);
-	list_add(&bh->b_assoc_buffers, list);
+	list_move_tail(&bh->b_assoc_buffers, list);
 	spin_unlock(lock);
 }


--- a/fs/xfs/linux/xfs_aops.c
+++ b/fs/xfs/linux/xfs_aops.c
@@ -53,12 +53,15 @@ map_blocks(
 		count = max_t(ssize_t, count, XFS_WRITE_IO_LOG);
 retry:
 	VOP_BMAP(vp, offset, count, flags, pbmapp, &nmaps, error);
-	if (flags & PBF_WRITE) {
-		if (unlikely((flags & PBF_DIRECT) && nmaps &&
+	if (error == EAGAIN)
+		return -error;
+	if (unlikely((flags & (PBF_WRITE|PBF_DIRECT)) ==
+					(PBF_WRITE|PBF_DIRECT) && nmaps &&
 					(pbmapp->pbm_flags & PBMF_DELAY))) {
 		flags = PBF_FILE_ALLOCATE;
 		goto retry;
 	}
+	if (flags & (PBF_WRITE|PBF_FILE_ALLOCATE)) {
 		VMODIFY(vp);
 	}
 	return -error;
@@ -309,6 +312,7 @@ convert_page(
 		if (startio && (offset < end)) {
 			bh_arr[index++] = bh;
 		} else {
+			set_buffer_dirty(bh);
 			unlock_buffer(bh);
 		}
 	} while (i++, (bh = bh->b_this_page) != head);
@@ -367,7 +371,7 @@ STATIC int
 delalloc_convert(
 	struct page	*page,
 	int		startio,
-	int			allocate_space)
+	int		unmapped) /* also implies page uptodate */
 {
 	struct inode		*inode = page->mapping->host;
 	struct buffer_head	*bh_arr[MAX_BUF_PER_PAGE], *bh, *head;
@@ -375,6 +379,9 @@ delalloc_convert(
 	unsigned long		p_offset = 0, end_index;
 	loff_t			offset, end_offset;
 	int			len, err, i, cnt = 0, uptodate = 1;
+	int			flags = startio ? 0 : PBF_TRYLOCK;
+	int			page_dirty = 1;
+

 	/* Are we off the end of the file ? */
 	end_index = inode->i_size >> PAGE_CACHE_SHIFT;
@@ -390,9 +397,6 @@ delalloc_convert(
 	if (end_offset > inode->i_size)
 		end_offset = inode->i_size;

-	if (startio && !page_has_buffers(page))
-		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
-
 	bh = head = page_buffers(page);
 	mp = NULL;

@@ -406,10 +410,14 @@ delalloc_convert(
 			mp = match_offset_to_mapping(page, &map, p_offset);
 		}

+		/*
+		 * First case: allocate space for a delalloc buffer head.
+		 * We can return EAGAIN here in the release page case.
+		 */
 		if (buffer_delay(bh)) {
 			if (!mp) {
 				err = map_blocks(inode, offset, len, &map,
-						PBF_FILE_ALLOCATE);
+					PBF_FILE_ALLOCATE | flags);
 				if (err) {
 					goto error;
 				}
@@ -422,11 +430,14 @@ delalloc_convert(
 				if (startio) {
 					bh_arr[cnt++] = bh;
 				} else {
+					set_buffer_dirty(bh);
 					unlock_buffer(bh);
 				}
+				page_dirty = 0;
 			}
 		} else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
-			   (allocate_space || startio)) {
+			   (unmapped || startio)) {
+
 			if (!buffer_mapped(bh)) {
 				int	size;

@@ -454,13 +465,16 @@ delalloc_convert(
 					if (startio) {
 						bh_arr[cnt++] = bh;
 					} else {
+						set_buffer_dirty(bh);
 						unlock_buffer(bh);
 					}
+					page_dirty = 0;
 				}
-			} else if (startio && buffer_mapped(bh)) {
-				if (buffer_uptodate(bh) && allocate_space) {
+			} else if (startio) {
+				if (buffer_uptodate(bh)) {
 					lock_buffer(bh);
 					bh_arr[cnt++] = bh;
+					page_dirty = 0;
 				}
 			}
 		}
@@ -482,10 +496,10 @@ delalloc_convert(

 	if (mp) {
 		cluster_write(inode, page->index + 1, mp,
-				startio, allocate_space);
+				startio, unmapped);
 	}

-	return 0;
+	return page_dirty;

 error:
 	for (i = 0; i < cnt; i++) {
@@ -494,12 +508,15 @@ delalloc_convert(
 	
 	/*
 	 * If it's delalloc and we have nowhere to put it,
-	 * throw it away.
+	 * throw it away, unless the lower layers told
+	 * us to try again.
 	 */
-	if (!allocate_space) {
+	if (err != -EAGAIN) {
+		if (!unmapped) {
 			block_invalidatepage(page, 0);
 		}
 		ClearPageUptodate(page);
+	}
 	return err;
 }

@@ -679,109 +696,172 @@ linvfs_readpages(
 }


-STATIC int
+STATIC void
 count_page_state(
 	struct page		*page,
-	int			*nr_delalloc,
-	int			*nr_unmapped)
+	int			*delalloc,
+	int			*unmapped)
 {
-	*nr_delalloc = *nr_unmapped = 0;
-
-	if (page_has_buffers(page)) {
 	struct buffer_head	*bh, *head;

+	*delalloc = *unmapped = 0;
+
 	bh = head = page_buffers(page);
 	do {
 		if (buffer_uptodate(bh) && !buffer_mapped(bh))
-				(*nr_unmapped)++;
+			(*unmapped) = 1;
 		else if (buffer_delay(bh))
-				(*nr_delalloc)++;
+			(*delalloc) = 1;
 	} while ((bh = bh->b_this_page) != head);
-
-		return 1;
-	}
-
-	return 0;
 }

+
+/*
+ * writepage: Called from one of two places:
+ *
+ * 1. we are flushing a delalloc buffer head.
+ *
+ * 2. we are writing out a dirty page. Typically the page dirty
+ *    state is cleared before we get here. In this case it is
+ *    conceivable we have no buffer heads.
+ *
+ * For delalloc space on the page we need to allocate space and
+ * flush it. For unmapped buffer heads on the page we should
+ * allocate space if the page is uptodate. For any other dirty
+ * buffer heads on the page we should flush them.
+ *
+ * If we detect that a transaction would be required to flush
+ * the page, we have to check the process flags first, if we
+ * are already in a transaction or disk I/O during allocations
+ * is off, we need to fail the writepage and redirty the page.
+ * We also need to set PF_NOIO ourselves.
+ */
 STATIC int
 linvfs_writepage(
 	struct page		*page,
 	struct writeback_control *wbc)
 {
 	int			error;
-	int			need_trans = 1;
-	int			nr_delalloc, nr_unmapped;
+	int			need_trans;
+	int			delalloc, unmapped;
+	struct inode		*inode = page->mapping->host;

-	if (count_page_state(page, &nr_delalloc, &nr_unmapped))
-		need_trans = nr_delalloc + nr_unmapped;
+	/*
+	 * We need a transaction if:
+	 *  1. There are delalloc buffers on the page
+	 *  2. The page is up to date and we have unmapped buffers
+	 *  3. The page is up to date and we have no buffers
+	 */
+	if (!page_has_buffers(page)) {
+		unmapped = 1;
+		need_trans = 1;
+	} else {
+		count_page_state(page, &delalloc, &unmapped);
+		if (!PageUptodate(page))
+			unmapped = 0;
+		need_trans = delalloc + unmapped;
+	}

+	/*
+	 * If we need a transaction and the process flags say
+	 * we are already in a transaction, or no IO is allowed
+	 * then mark the page dirty again and leave the page
+	 * as is.
+	 */
 	if ((current->flags & (PF_FSTRANS)) && need_trans)
 		goto out_fail;

+	/*
+	 * Delay hooking up buffer heads until we have
+	 * made our go/no-go decision.
+	 */
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, 1 << inode->i_blkbits, 0);
+	}
+
 	/*
 	 * Convert delalloc or unmapped space to real space and flush out
 	 * to disk.
 	 */
-	error = delalloc_convert(page, 1, nr_delalloc == 0);
-	if (unlikely(error))
-		unlock_page(page);
-	return error;
+	error = delalloc_convert(page, 1, unmapped);
+	if (error == -EAGAIN)
+		goto out_fail;
+	if (unlikely(error < 0))
+		goto out_unlock;
+
+	return 0;

 out_fail:
 	set_page_dirty(page);
 	unlock_page(page);
 	return 0;
-}
-
-STATIC int
-linvfs_prepare_write(
-	struct file		*file,
-	struct page		*page,
-	unsigned int		from,
-	unsigned int		to)
-{
-	if (file && (file->f_flags & O_SYNC)) {
-		return block_prepare_write(page, from, to,
-						linvfs_get_block_sync);
-	} else {
-		return block_prepare_write(page, from, to,
-						linvfs_get_block);
-	}
+out_unlock:
+	unlock_page(page);
+	return error;
 }

 /*
- * This gets a page into cleanable state - page locked on entry
- * kept locked on exit. If the page is marked dirty we should
- * not come this way.
+ * Called to move a page into cleanable state - and from there
+ * to be released. Possibly the page is already clean. We always
+ * have buffer heads in this call.
+ *
+ * Returns 0 if the page must be kept, else the try_to_free_buffers() result.
+ *
+ * Possible scenarios are:
+ *
+ * 1. We are being called to release a page which has been written
+ *    to via regular I/O. buffer heads will be dirty and possibly
+ *    delalloc. If there are no delalloc buffer heads in this case
+ *    then we can just try to free the buffers.
+ *
+ * 2. We are called to release a page which has been written via
+ *    mmap, all we need to do is ensure there is no delalloc
+ *    state in the buffer heads, if not we can let the caller
+ *    free them and we should come back later via writepage.
 */
 STATIC int
 linvfs_release_page(
 	struct page		*page,
 	int			gfp_mask)
 {
-	int			nr_delalloc, nr_unmapped;
+	int			delalloc, unmapped;

-	if (count_page_state(page, &nr_delalloc, &nr_unmapped)) {
-		if (!nr_delalloc)
+	count_page_state(page, &delalloc, &unmapped);
+	if (!delalloc)
 		goto free_buffers;
-	} 

-	if (gfp_mask & __GFP_FS) {
+	if (!(gfp_mask & __GFP_FS))
+		return 0;
+
 	/*
 	 * Convert delalloc space to real space, do not flush the
 	 * data out to disk, that will be done by the caller.
+	 * Never need to allocate space here - we will always
+	 * come back to writepage in that case.
 	 */
 	if (delalloc_convert(page, 0, 0) == 0)
 		goto free_buffers;
-	}
-
 	return 0;

 free_buffers:
 	return try_to_free_buffers(page);
 }

+STATIC int
+linvfs_prepare_write(
+	struct file		*file,
+	struct page		*page,
+	unsigned int		from,
+	unsigned int		to)
+{
+	if (file && (file->f_flags & O_SYNC)) {
+		return block_prepare_write(page, from, to,
+						linvfs_get_block_sync);
+	} else {
+		return block_prepare_write(page, from, to,
+						linvfs_get_block);
+	}
+}

 struct address_space_operations linvfs_aops = {
 	.readpage		= linvfs_readpage,

--- a/fs/xfs/linux/xfs_iomap.c
+++ b/fs/xfs/linux/xfs_iomap.c
@@ -120,7 +120,13 @@ xfs_iomap(
 	case PBF_FILE_ALLOCATE:
 		lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD;
 		bmap_flags = XFS_BMAPI_ENTIRE;
+		/* Attempt non-blocking lock */
+		if (flags & PBF_TRYLOCK) {
+			if (!XFS_ILOCK_NOWAIT(mp, io, lockmode))
+				return XFS_ERROR(EAGAIN);
+		} else {
 			XFS_ILOCK(mp, io, lockmode);
+		}
 		break;
 	case PBF_FILE_UNWRITTEN:
 		lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR;

--- a/fs/xfs/linux/xfs_super.c
+++ b/fs/xfs/linux/xfs_super.c
@@ -1009,13 +1009,13 @@ init_xfs_fs( void )
 	if (error < 0)
 		return error;

+	si_meminfo(&si);
+	xfs_physmem = si.totalram;
+
 	error = pagebuf_init();
 	if (error < 0)
 		goto out;

-	si_meminfo(&si);
-	xfs_physmem = si.totalram;
-
 	vn_init();
 	xfs_init();
 	dmapi_init();

--- a/fs/xfs/pagebuf/page_buf.c
+++ b/fs/xfs/pagebuf/page_buf.c
@@ -119,9 +119,9 @@ pb_trace_func(

 STATIC kmem_cache_t *pagebuf_cache;
 STATIC void pagebuf_daemon_wakeup(int);
+STATIC void pagebuf_delwri_queue(page_buf_t *, int);
 STATIC struct workqueue_struct *pagebuf_workqueue;

-
 /*
 * Pagebuf module configuration parameters, exported via
 * /proc/sys/vm/pagebuf
@@ -155,35 +155,37 @@ struct pbstats pbstats;
 * Pagebuf hashing
 */

-#define NBITS	5
-#define NHASH	(1<<NBITS)
-
+/* This structure must be a power of 2 long for the hash to work */
 typedef struct {
 	struct list_head	pb_hash;
 	int			pb_count;
 	spinlock_t		pb_hash_lock;
 } pb_hash_t;

-STATIC pb_hash_t	pbhash[NHASH];
+static pb_hash_t	*pbhash;
+static unsigned int	pb_hash_mask;
+static unsigned int	pb_hash_shift;
+static unsigned int	pb_order;
 #define pb_hash(pb)	&pbhash[pb->pb_hash_index]

-STATIC int
+/*
+ * This hash is the same one as used on the Linux buffer cache,
+ * see fs/buffer.c
+ */
+
+#define _hashfn(dev,block)      \
+        ((((dev)<<(pb_hash_shift - 6)) ^ ((dev)<<(pb_hash_shift - 9))) ^ \
+         (((block)<<(pb_hash_shift - 6)) ^ ((block) >> 13) ^ \
+          ((block) << (pb_hash_shift - 12))))
+
+static inline int
 _bhash(
 	dev_t		dev,
 	loff_t		base)
 {
-	int		bit, hval;
-
 	base >>= 9;
-	/*
-	 * dev_t is 16 bits, loff_t is always 64 bits
-	 */
-	base ^= dev;
-	for (bit = hval = 0; base != 0 && bit < sizeof(base) * 8; bit += NBITS) {
-		hval ^= (int)base & (NHASH-1);
-		base >>= NBITS;
-	}
-	return hval;
+	
+	return (_hashfn(dev, base) & pb_hash_mask);
 }

 /*
@@ -1516,7 +1518,7 @@ STATIC int pbd_active = 1;
 STATIC LIST_HEAD(pbd_delwrite_queue);
 STATIC spinlock_t pbd_delwrite_lock = SPIN_LOCK_UNLOCKED;

-void
+STATIC void
 pagebuf_delwri_queue(
 	page_buf_t		*pb,
 	int			unlock)
@@ -1862,7 +1864,39 @@ pagebuf_shaker(void)
 int __init
 pagebuf_init(void)
 {
-	int			i;
+	int		order, mempages, i;
+	unsigned int	nr_hash;
+	extern int	xfs_physmem;
+
+	mempages = xfs_physmem >>= 16;
+	mempages *= sizeof(pb_hash_t);
+	for (order = 0; (1 << order) < mempages; order++)
+		;
+
+	if (order > 3) order = 3;	/* cap us at 2K buckets */
+
+	do {
+		unsigned long tmp;
+
+		nr_hash = (PAGE_SIZE << order) / sizeof(pb_hash_t);	
+		nr_hash = 1 << (ffs(nr_hash) - 1);
+		pb_hash_mask =  (nr_hash - 1);
+		tmp = nr_hash;
+		pb_hash_shift = 0;
+		while((tmp >>= 1UL) != 0UL)
+			pb_hash_shift++;
+
+		pbhash = (pb_hash_t *)
+			__get_free_pages(GFP_KERNEL, order);
+		pb_order = order;
+	} while (pbhash == NULL && --order > 0);
+	printk("pagebuf cache hash table entries: %d (order: %d, %ld bytes)\n",
+		nr_hash, order, (PAGE_SIZE << order));
+
+	for(i = 0; i < nr_hash; i++) {
+		spin_lock_init(&pbhash[i].pb_hash_lock);
+		INIT_LIST_HEAD(&pbhash[i].pb_hash);
+	} 

 	pagebuf_table_header = register_sysctl_table(pagebuf_root_table, 1);

@@ -1880,11 +1914,6 @@ pagebuf_init(void)
 		return -ENOMEM;
 	}

-	for (i = 0; i < NHASH; i++) {
-		spin_lock_init(&pbhash[i].pb_hash_lock);
-		INIT_LIST_HEAD(&pbhash[i].pb_hash);
-	}
-
 #ifdef PAGEBUF_TRACE
 	pb_trace.buf = (pagebuf_trace_t *)kmalloc(
 			PB_TRACE_BUFSIZE * sizeof(pagebuf_trace_t), GFP_KERNEL);
@@ -1911,6 +1940,7 @@ pagebuf_terminate(void)

 	kmem_cache_destroy(pagebuf_cache);
 	kmem_shake_deregister(pagebuf_shaker);
+	free_pages((unsigned long)pbhash, pb_order);

 	unregister_sysctl_table(pagebuf_table_header);
 #ifdef	CONFIG_PROC_FS

--- a/fs/xfs/pagebuf/page_buf.h
+++ b/fs/xfs/pagebuf/page_buf.h
@@ -215,8 +215,8 @@ typedef struct page_buf_s {
 	unsigned short		pb_error;	/* error code on I/O */
 	unsigned short		pb_page_count;	/* size of page array */
 	unsigned short		pb_offset;	/* page offset in first page */
+	unsigned short		pb_hash_index;	/* hash table index	*/
 	unsigned char		pb_locked;	/* page array is locked */
-	unsigned char		pb_hash_index;	/* hash table index	*/
 	struct page		**pb_pages;	/* array of page pointers */
 	struct page		*pb_page_array[PB_PAGES]; /* inline pages */
 #ifdef PAGEBUF_LOCK_TRACKING
@@ -350,7 +350,6 @@ extern int pagebuf_ispin( page_buf_t *); /* check if pagebuf is pinned	*/

 /* Reading and writing pages */

-extern void pagebuf_delwri_queue(page_buf_t *, int);
 extern void pagebuf_delwri_dequeue(page_buf_t *);

 #define PBDF_WAIT    0x01

--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -86,7 +86,7 @@ cmn_err(register int level, char *fmt, ...)
 {
 	char	*fp = fmt;
 	int	len;
-	int	flags;
+	unsigned long flags;
 	va_list	ap;

 	level &= XFS_ERR_MASK;

--- a/fs/xfs/support/spin.h
+++ b/fs/xfs/support/spin.h
@@ -46,7 +46,6 @@
 typedef spinlock_t lock_t;

 #define spinlock_init(lock, name)	spin_lock_init(lock)
-#define init_spinlock(lock, name, ll)	spin_lock_init(lock)
 #define spinlock_destroy(lock)

 static inline unsigned long mutex_spinlock(lock_t *lock)

--- a/fs/xfs/xfs_dir_leaf.c
+++ b/fs/xfs/xfs_dir_leaf.c
@@ -648,7 +648,7 @@ xfs_dir_leaf_to_shortform(xfs_da_args_t *iargs)
 	retval = xfs_da_read_buf(iargs->trans, iargs->dp, 0, -1, &bp,
 					       XFS_DATA_FORK);
 	if (retval)
-		return(retval);
+		goto out;
 	ASSERT(bp != NULL);
 	memcpy(tmpbuffer, bp->data, XFS_LBSIZE(dp->i_mount));
 	leaf = (xfs_dir_leafblock_t *)tmpbuffer;

--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -33,7 +33,7 @@
 #define __XFS_ERROR_H__

 #define prdev(fmt,dev,args...) \
-	printk("XFS: device 0x%x- " fmt "\n", dev, ## args)
+	printk("XFS: device 0x%x- " fmt "\n", (unsigned)dev, ## args)

 #define XFS_ERECOVER	1	/* Failure to recover log */
 #define XFS_ELOGSTAT	2	/* Failure to stat log in user space */

--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -378,17 +378,26 @@ xfs_log_notify(xfs_mount_t	  *mp,		/* mount of partition */
 		iclog->ic_callback_tail = &(cb->cb_next);
 	}
 	LOG_UNLOCK(log, spl);
-	if (!abortflg) {
-		if (xlog_state_release_iclog(log, iclog)) {
-			xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
-			return EIO;
-		}
-	} else {
+	if (abortflg) {
 		cb->cb_func(cb->cb_arg, abortflg);
 	}
 	return 0;
 }	/* xfs_log_notify */

+int
+xfs_log_release_iclog(xfs_mount_t *mp,
+		      void	  *iclog_hndl)
+{
+	xlog_t *log = mp->m_log;
+	xlog_in_core_t	  *iclog = (xlog_in_core_t *)iclog_hndl;
+
+	if (xlog_state_release_iclog(log, iclog)) {
+		xfs_force_shutdown(mp, XFS_LOG_IO_ERROR);
+		return(EIO);
+	}
+
+	return 0;
+}

 /*
 * Initialize log manager data.	 This routine is intended to be called when

--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -164,6 +164,8 @@ void	  xfs_log_move_tail(struct xfs_mount	*mp,
 int	  xfs_log_notify(struct xfs_mount	*mp,
 			 void			*iclog,
 			 xfs_log_callback_t	*callback_entry);
+int	  xfs_log_release_iclog(struct xfs_mount *mp,
+			 void			 *iclog_hndl);
 int	  xfs_log_reserve(struct xfs_mount *mp,
 			  int		   length,
 			  int		   count,

--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1287,10 +1287,6 @@ xlog_recover_add_to_trans(xlog_recover_t	*trans,

 	if (!len)
 		return 0;
-	ptr = kmem_zalloc(len, 0);
-	memcpy(ptr, dp, len);
-
-	in_f = (xfs_inode_log_format_t *)ptr;
 	item = trans->r_itemq;
 	if (item == 0) {
 		ASSERT(*(uint *)dp == XFS_TRANS_HEADER_MAGIC);
@@ -1299,6 +1295,11 @@ xlog_recover_add_to_trans(xlog_recover_t	*trans,
 		memcpy(&trans->r_theader, dp, len); /* d, s, l */
 		return 0;
 	}
+
+	ptr = kmem_alloc(len, 0);
+	memcpy(ptr, dp, len);
+	in_f = (xfs_inode_log_format_t *)ptr;
+
 	if (item->ri_prev->ri_total != 0 &&
 	     item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
 		xlog_recover_add_item(&trans->r_itemq);

--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -272,7 +272,7 @@ xfs_mount_validate_sb(
 		cmn_err(CE_WARN,
 		"XFS: Only page-sized (%d) or less blocksizes currently work.",
 			PAGE_SIZE);
-		return XFS_ERROR(EWRONGFS);
+		return XFS_ERROR(ENOSYS);
 	}

 	return 0;
@@ -459,10 +459,22 @@ xfs_readsb(xfs_mount_t *mp)
 	}

 	/*
-	 * Re-read the superblock so that our buffer is correctly sized.
-	 * We only need to do this if sector size on-disk is different.
+	 * We must be able to do sector-sized and sector-aligned IO.
+	 */
+	if (sector_size > mp->m_sb.sb_sectsize) {
+		cmn_err(CE_WARN,
+			"XFS: device supports only %u byte sectors (not %u)",
+			sector_size, mp->m_sb.sb_sectsize);
+		XFS_BUF_UNMANAGE(bp);
+		xfs_buf_relse(bp);
+		return XFS_ERROR(ENOSYS);
+	}
+
+	/*
+	 * If device sector size is smaller than the superblock size,
+	 * re-read the superblock so the buffer is correctly sized.
 	 */
-	if (sector_size != mp->m_sb.sb_sectsize) {
+	if (sector_size < mp->m_sb.sb_sectsize) {
 		XFS_BUF_UNMANAGE(bp);
 		xfs_buf_relse(bp);
 		sector_size = mp->m_sb.sb_sectsize;

--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -808,19 +808,6 @@ xfs_trans_commit(
 		return XFS_ERROR(EIO);
 	}

-	/*
-	 * Once all the items of the transaction have been copied
-	 * to the in core log we can release them.  Do that here.
-	 * This will free descriptors pointing to items which were
-	 * not logged since there is nothing more to do with them.
-	 * For items which were logged, we will keep pointers to them
-	 * so they can be unpinned after the transaction commits to disk.
-	 * This will also stamp each modified meta-data item with
-	 * the commit lsn of this transaction for dependency tracking
-	 * purposes.
-	 */
-	xfs_trans_unlock_items(tp, commit_lsn);
-
 	/*
 	 * Once the transaction has committed, unused
 	 * reservations need to be released and changes to
@@ -856,12 +843,36 @@ xfs_trans_commit(
 	tp->t_logcb.cb_arg = tp;

 	/* We need to pass the iclog buffer which was used for the
-	 * transaction commit record into this function, attach
-	 * the callback to it, and then release it. This will guarantee
-	 * that we do callbacks on the transaction in the correct order.
+	 * transaction commit record into this function, and attach
+	 * the callback to it. The callback must be attached before
+	 * the items are unlocked to avoid racing with other threads
+	 * waiting for an item to unlock.
 	 */
 	error = xfs_log_notify(mp, commit_iclog, &(tp->t_logcb));
 #endif
+
+	/*
+	 * Once all the items of the transaction have been copied
+	 * to the in core log and the callback is attached, the
+	 * items can be unlocked.
+	 *
+	 * This will free descriptors pointing to items which were
+	 * not logged since there is nothing more to do with them.
+	 * For items which were logged, we will keep pointers to them
+	 * so they can be unpinned after the transaction commits to disk.
+	 * This will also stamp each modified meta-data item with
+	 * the commit lsn of this transaction for dependency tracking
+	 * purposes.
+	 */
+	xfs_trans_unlock_items(tp, commit_lsn);
+
+	/*
+	 * Now that the xfs_trans_committed callback has been attached,
+	 * and the items are released we can finally allow the iclog to
+	 * go to disk.
+	 */
+	error = xfs_log_release_iclog(mp, commit_iclog);
+
 	/*
 	 * If the transaction needs to be synchronous, then force the
 	 * log out now and wait for it.