Merge branch 'xfs-4.8-misc-fixes-4' into for-next

f2bdfda9 · Dave Chinner · dc4113d2 · 72ccbbe1 · f2bdfda9 · f2bdfda9
Commit f2bdfda9 authored Jul 22, 2016 by Dave Chinner
11 changed files
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -356,7 +356,6 @@ xfs_da3_split(
 	struct xfs_da_state_blk	*newblk;
 	struct xfs_da_state_blk	*addblk;
 	struct xfs_da_intnode	*node;
-	struct xfs_buf		*bp;
 	int			max;
 	int			action = 0;
 	int			error;
@@ -397,7 +396,9 @@ xfs_da3_split(
 				break;
 			}
 			/*
-			 * Entry wouldn't fit, split the leaf again.
+			 * Entry wouldn't fit, split the leaf again. The new
+			 * extrablk will be consumed by xfs_da3_node_split if
+			 * the node is split.
 			 */
 			state->extravalid = 1;
 			if (state->inleaf) {
@@ -445,6 +446,14 @@ xfs_da3_split(
 	if (!addblk)
 		return 0;
+	/*
+	 * xfs_da3_node_split() should have consumed any extra blocks we added
+	 * during a double leaf split in the attr fork. This is guaranteed as
+	 * we can't be here if the attr fork only has a single leaf block.
+	 */
+	ASSERT(state->extravalid == 0 ||
+	       state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
 	/*
 	 * Split the root node.
 	 */
@@ -457,41 +466,31 @@ xfs_da3_split(
 	}
 	/*
-	 * Update pointers to the node which used to be block 0 and
+	 * Update pointers to the node which used to be block 0 and just got
-	 * just got bumped because of the addition of a new root node.
+	 * bumped because of the addition of a new root node.  Note that the
-	 * There might be three blocks involved if a double split occurred,
+	 * original block 0 could be at any position in the list of blocks in
-	 * and the original block 0 could be at any position in the list.
+	 * the tree.
 	 *
-	 * Note: the magic numbers and sibling pointers are in the same
+	 * Note: the magic numbers and sibling pointers are in the same physical
-	 * physical place for both v2 and v3 headers (by design). Hence it
+	 * place for both v2 and v3 headers (by design). Hence it doesn't matter
-	 * doesn't matter which version of the xfs_da_intnode structure we use
+	 * which version of the xfs_da_intnode structure we use here as the
-	 * here as the result will be the same using either structure.
+	 * result will be the same using either structure.
 	 */
 	node = oldblk->bp->b_addr;
 	if (node->hdr.info.forw) {
-		if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) {
+		ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
-			bp = addblk->bp;
+		node = addblk->bp->b_addr;
-		} else {
-			ASSERT(state->extravalid);
-			bp = state->extrablk.bp;
-		}
-		node = bp->b_addr;
 		node->hdr.info.back = cpu_to_be32(oldblk->blkno);
-		xfs_trans_log_buf(state->args->trans, bp,
+		xfs_trans_log_buf(state->args->trans, addblk->bp,
 				  XFS_DA_LOGRANGE(node, &node->hdr.info,
 				  sizeof(node->hdr.info)));
 	}
 	node = oldblk->bp->b_addr;
 	if (node->hdr.info.back) {
-		if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) {
+		ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
-			bp = addblk->bp;
+		node = addblk->bp->b_addr;
-		} else {
-			ASSERT(state->extravalid);
-			bp = state->extrablk.bp;
-		}
-		node = bp->b_addr;
 		node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
-		xfs_trans_log_buf(state->args->trans, bp,
+		xfs_trans_log_buf(state->args->trans, addblk->bp,
 				  XFS_DA_LOGRANGE(node, &node->hdr.info,
 				  sizeof(node->hdr.info)));
 	}

--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -87,6 +87,12 @@ xfs_find_bdev_for_inode(
 * We're now finished for good with this page.  Update the page state via the
 * associated buffer_heads, paying attention to the start and end offsets that
 * we need to process on the page.
+ *
+ * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+ * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
+ * the page at all, as we may be racing with memory reclaim and it can free both
+ * the bufferhead chain and the page as it will see the page as clean and
+ * unused.
 */
 static void
 xfs_finish_page_writeback(
@@ -95,8 +101,9 @@ xfs_finish_page_writeback(
 	int			error)
 {
 	unsigned int		end = bvec->bv_offset + bvec->bv_len - 1;
-	struct buffer_head	*head, *bh;
+	struct buffer_head	*head, *bh, *next;
 	unsigned int		off = 0;
+	unsigned int		bsize;
 	ASSERT(bvec->bv_offset < PAGE_SIZE);
 	ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
@@ -105,15 +112,17 @@ xfs_finish_page_writeback(
 	bh = head = page_buffers(bvec->bv_page);
+	bsize = bh->b_size;
 	do {
+		next = bh->b_this_page;
 		if (off < bvec->bv_offset)
 			goto next_bh;
 		if (off > end)
 			break;
 		bh->b_end_io(bh, !error);
 next_bh:
-		off += bh->b_size;
+		off += bsize;
-	} while ((bh = bh->b_this_page) != head);
+	} while ((bh = next) != head);
 }
 /*
@@ -1040,6 +1049,20 @@ xfs_vm_releasepage(
 	trace_xfs_releasepage(page->mapping->host, page, 0, 0);
+	/*
+	 * mm accommodates an old ext3 case where clean pages might not have had
+	 * the dirty bit cleared. Thus, it can send actual dirty pages to
+	 * ->releasepage() via shrink_active_list(). Conversely,
+	 * block_invalidatepage() can send pages that are still marked dirty
+	 * but otherwise have invalidated buffers.
+	 *
+	 * We've historically freed buffers on the latter. Instead, quietly
+	 * filter out all dirty pages to avoid spurious buffer state warnings.
+	 * This can likely be removed once shrink_active_list() is fixed.
+	 */
+	if (PageDirty(page))
+		return 0;
 	xfs_count_page_state(page, &delalloc, &unwritten);
 	if (WARN_ON_ONCE(delalloc))

--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -957,6 +957,7 @@ xfs_buf_item_free(
 	xfs_buf_log_item_t	*bip)
 {
 	xfs_buf_item_free_format(bip);
+	kmem_free(bip->bli_item.li_lv_shadow);
 	kmem_zone_free(xfs_buf_item_zone, bip);
 }

--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -74,6 +74,7 @@ xfs_qm_dqdestroy(
 {
 	ASSERT(list_empty(&dqp->q_lru));
+	kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
 	mutex_destroy(&dqp->q_qlock);
 	XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);

--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed(
 	spin_lock(&ailp->xa_lock);
 	xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
+	kmem_free(qfs->qql_item.li_lv_shadow);
+	kmem_free(lip->li_lv_shadow);
 	kmem_free(qfs);
 	kmem_free(qfe);
 	return (xfs_lsn_t)-1;

--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -40,6 +40,7 @@ void
 xfs_efi_item_free(
 	struct xfs_efi_log_item	*efip)
 {
+	kmem_free(efip->efi_item.li_lv_shadow);
 	if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
 		kmem_free(efip);
 	else
@@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
 STATIC void
 xfs_efd_item_free(struct xfs_efd_log_item *efdp)
 {
+	kmem_free(efdp->efd_item.li_lv_shadow);
 	if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
 		kmem_free(efdp);
 	else

--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -327,7 +327,7 @@ xfs_file_dio_aio_read(
 	return ret;
 }
-STATIC ssize_t
+static noinline ssize_t
 xfs_file_dax_read(
 	struct kiocb		*iocb,
 	struct iov_iter		*to)
@@ -706,7 +706,7 @@ xfs_file_dio_aio_write(
 	return ret;
 }
-STATIC ssize_t
+static noinline ssize_t
 xfs_file_dax_write(
 	struct kiocb		*iocb,
 	struct iov_iter		*from)

--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -651,6 +651,7 @@ void
 xfs_inode_item_destroy(
 	xfs_inode_t	*ip)
 {
+	kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
 	kmem_zone_free(xfs_ili_zone, ip->i_itemp);
 }

--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -78,6 +78,157 @@ xlog_cil_init_post_recovery(
 	log->l_cilp->xc_ctx->sequence = 1;
 }
+static inline int
+xlog_cil_iovec_space(
+	uint	niovecs)
+{
+	return round_up((sizeof(struct xfs_log_vec) +
+					niovecs * sizeof(struct xfs_log_iovec)),
+			sizeof(uint64_t));
+}
+/*
+ * Allocate or pin log vector buffers for CIL insertion.
+ *
+ * The CIL currently uses disposable buffers for copying a snapshot of the
+ * modified items into the log during a push. The biggest problem with this is
+ * the requirement to allocate the disposable buffer during the commit if:
+ *	a) it does not exist; or
+ *	b) it is too small
+ *
+ * If we do this allocation within xlog_cil_insert_format_items(), it is done
+ * under the xc_ctx_lock, which means that a CIL push cannot occur during
+ * the memory allocation. This means that we have a potential deadlock situation
+ * under low memory conditions when we have lots of dirty metadata pinned in
+ * the CIL and we need a CIL commit to occur to free memory.
+ *
+ * To avoid this, we need to move the memory allocation outside the
+ * xc_ctx_lock, but because the log vector buffers are disposable, that opens
+ * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
+ * vector buffers between the check and the formatting of the item into the
+ * log vector buffer within the xc_ctx_lock.
+ *
+ * Because the log vector buffer needs to be unchanged during the CIL push
+ * process, we cannot share the buffer between the transaction commit (which
+ * modifies the buffer) and the CIL push context that is writing the changes
+ * into the log. This means skipping preallocation of buffer space is
+ * unreliable, but we most definitely do not want to be allocating and freeing
+ * buffers unnecessarily during commits when overwrites can be done safely.
+ *
+ * The simplest solution to this problem is to allocate a shadow buffer when a
+ * log item is committed for the second time, and then to only use this buffer
+ * if necessary. The buffer can remain attached to the log item until such time
+ * it is needed, and this is the buffer that is reallocated to match the size of
+ * the incoming modification. Then during the formatting of the item we can swap
+ * the active buffer with the new one if we can't reuse the existing buffer. We
+ * don't free the old buffer as it may be reused on the next modification if
+ * its size is right, otherwise we'll free and reallocate it at that point.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and attaches the vector to the log item in preparation
+ * for the formatting step which occurs under the xc_ctx_lock.
+ *
+ * While this means the memory footprint goes up, it avoids the repeated
+ * alloc/free pattern that repeated modifications of an item would otherwise
+ * cause, and hence minimises the CPU overhead of such behaviour.
+ */
+static void
+xlog_cil_alloc_shadow_bufs(
+	struct xlog		*log,
+	struct xfs_trans	*tp)
+{
+	struct xfs_log_item_desc *lidp;
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+		struct xfs_log_item *lip = lidp->lid_item;
+		struct xfs_log_vec *lv;
+		int	niovecs = 0;
+		int	nbytes = 0;
+		int	buf_size;
+		bool	ordered = false;
+		/* Skip items which aren't dirty in this transaction. */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY))
+			continue;
+		/* get number of vecs and size of data to be stored */
+		lip->li_ops->iop_size(lip, &niovecs, &nbytes);
+		/*
+		 * Ordered items need to be tracked but we do not wish to write
+		 * them. We need a logvec to track the object, but we do not
+		 * need an iovec or buffer to be allocated for copying data.
+		 */
+		if (niovecs == XFS_LOG_VEC_ORDERED) {
+			ordered = true;
+			niovecs = 0;
+			nbytes = 0;
+		}
+		/*
+		 * We 64-bit align the length of each iovec so that the start
+		 * of the next one is naturally aligned.  We'll need to
+		 * account for that slack space here. Then round nbytes up
+		 * to 64-bit alignment so that the initial buffer alignment is
+		 * easy to calculate and verify.
+		 */
+		nbytes += niovecs * sizeof(uint64_t);
+		nbytes = round_up(nbytes, sizeof(uint64_t));
+		/*
+		 * The data buffer needs to start 64-bit aligned, so round up
+		 * that space to ensure we can align it appropriately and not
+		 * overrun the buffer.
+		 */
+		buf_size = nbytes + xlog_cil_iovec_space(niovecs);
+		/*
+		 * if we have no shadow buffer, or it is too small, we need to
+		 * reallocate it.
+		 */
+		if (!lip->li_lv_shadow ||
+		    buf_size > lip->li_lv_shadow->lv_size) {
+			/*
+			 * We free and allocate here as a realloc would copy
+			 * unnecessary data. We don't use kmem_zalloc() for the
+			 * same reason - we don't need to zero the data area in
+			 * the buffer, only the log vector header and the iovec
+			 * storage.
+			 */
+			kmem_free(lip->li_lv_shadow);
+			lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS);
+			memset(lv, 0, xlog_cil_iovec_space(niovecs));
+			lv->lv_item = lip;
+			lv->lv_size = buf_size;
+			if (ordered)
+				lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+			else
+				lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
+			lip->li_lv_shadow = lv;
+		} else {
+			/* same or smaller, optimise common overwrite case */
+			lv = lip->li_lv_shadow;
+			if (ordered)
+				lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+			else
+				lv->lv_buf_len = 0;
+			lv->lv_bytes = 0;
+			lv->lv_next = NULL;
+		}
+		/* Ensure the lv is set up according to ->iop_size */
+		lv->lv_niovecs = niovecs;
+		/* The allocated data region lies beyond the iovec region */
+		lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
+	}
+}
 /*
 * Prepare the log item for insertion into the CIL. Calculate the difference in
 * log space and vectors it will consume, and if it is a new item pin it as
@@ -100,16 +251,19 @@ xfs_cil_prepare_item(
 	/*
 	 * If there is no old LV, this is the first time we've seen the item in
 	 * this CIL context and so we need to pin it. If we are replacing the
-	 * old_lv, then remove the space it accounts for and free it.
+	 * old_lv, then remove the space it accounts for and make it the shadow
+	 * buffer for later freeing. In both cases we are now switching to the
+	 * shadow buffer, so update the pointer to it appropriately.
 	 */
-	if (!old_lv)
+	if (!old_lv) {
 		lv->lv_item->li_ops->iop_pin(lv->lv_item);
-	else if (old_lv != lv) {
+		lv->lv_item->li_lv_shadow = NULL;
+	} else if (old_lv != lv) {
 		ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
 		*diff_len -= old_lv->lv_bytes;
 		*diff_iovecs -= old_lv->lv_niovecs;
-		kmem_free(old_lv);
+		lv->lv_item->li_lv_shadow = old_lv;
 	}
 	/* attach new log vector to log item */
@@ -133,11 +287,13 @@ xfs_cil_prepare_item(
 * write it out asynchronously without needing to relock the object that was
 * modified at the time it gets written into the iclog.
 *
- * This function builds a vector for the changes in each log item in the
+ * This function takes the prepared log vectors attached to each log item, and
- * transaction. It then works out the length of the buffer needed for each log
+ * formats the changes into the log vector buffer. The buffer it uses is
- * item, allocates them and formats the vector for the item into the buffer.
+ * dependent on the current state of the vector in the CIL - the shadow lv is
- * The buffer is then attached to the log item are then inserted into the
+ * guaranteed to be large enough for the current modification, but we will only
- * Committed Item List for tracking until the next checkpoint is written out.
+ * use that if we can't reuse the existing lv. If we can't reuse the existing
+ * lv, then simply swap it out for the shadow lv. We don't free it - that is
+ * done lazily either by the next modification or the freeing of the log item.
 *
 * We don't set up region headers during this process; we simply copy the
 * regions into the flat buffer. We can do this because we still have to do a
@@ -170,59 +326,29 @@ xlog_cil_insert_format_items(
 	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
 		struct xfs_log_item *lip = lidp->lid_item;
 		struct xfs_log_vec *lv;
-		struct xfs_log_vec *old_lv;
+		struct xfs_log_vec *old_lv = NULL;
-		int	niovecs = 0;
+		struct xfs_log_vec *shadow;
-		int	nbytes = 0;
-		int	buf_size;
 		bool	ordered = false;
 		/* Skip items which aren't dirty in this transaction. */
 		if (!(lidp->lid_flags & XFS_LID_DIRTY))
 			continue;
-		/* get number of vecs and size of data to be stored */
-		lip->li_ops->iop_size(lip, &niovecs, &nbytes);
-		/* Skip items that do not have any vectors for writing */
-		if (!niovecs)
-			continue;
 		/*
-		 * Ordered items need to be tracked but we do not wish to write
+		 * The formatting size information is already attached to
-		 * them. We need a logvec to track the object, but we do not
+		 * the shadow lv on the log item.
-		 * need an iovec or buffer to be allocated for copying data.
 		 */
-		if (niovecs == XFS_LOG_VEC_ORDERED) {
+		shadow = lip->li_lv_shadow;
+		if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
 			ordered = true;
-			niovecs = 0;
-			nbytes = 0;
-		}
-		/*
-		 * We 64-bit align the length of each iovec so that the start
-		 * of the next one is naturally aligned.  We'll need to
-		 * account for that slack space here. Then round nbytes up
-		 * to 64-bit alignment so that the initial buffer alignment is
-		 * easy to calculate and verify.
-		 */
-		nbytes += niovecs * sizeof(uint64_t);
-		nbytes = round_up(nbytes, sizeof(uint64_t));
-		/* grab the old item if it exists for reservation accounting */
-		old_lv = lip->li_lv;
-		/*
+		/* Skip items that do not have any vectors for writing */
-		 * The data buffer needs to start 64-bit aligned, so round up
+		if (!shadow->lv_niovecs && !ordered)
-		 * that space to ensure we can align it appropriately and not
+			continue;
-		 * overrun the buffer.
-		 */
-		buf_size = nbytes +
-			   round_up((sizeof(struct xfs_log_vec) +
-				     niovecs * sizeof(struct xfs_log_iovec)),
-				    sizeof(uint64_t));
 		/* compare to existing item size */
-		if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
+		old_lv = lip->li_lv;
+		if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
 			/* same or smaller, optimise common overwrite case */
 			lv = lip->li_lv;
 			lv->lv_next = NULL;
@@ -236,32 +362,29 @@ xlog_cil_insert_format_items(
 			 */
 			*diff_iovecs -= lv->lv_niovecs;
 			*diff_len -= lv->lv_bytes;
+			/* Ensure the lv is set up according to ->iop_size */
+			lv->lv_niovecs = shadow->lv_niovecs;
+			/* reset the lv buffer information for new formatting */
+			lv->lv_buf_len = 0;
+			lv->lv_bytes = 0;
+			lv->lv_buf = (char *)lv +
+					xlog_cil_iovec_space(lv->lv_niovecs);
 		} else {
-			/* allocate new data chunk */
+			/* switch to shadow buffer! */
-			lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+			lv = shadow;
 			lv->lv_item = lip;
-			lv->lv_size = buf_size;
 			if (ordered) {
 				/* track as an ordered logvec */
 				ASSERT(lip->li_lv == NULL);
-				lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
 				goto insert;
 			}
-			lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
 		}
-		/* Ensure the lv is set up according to ->iop_size */
-		lv->lv_niovecs = niovecs;
-		/* The allocated data region lies beyond the iovec region */
-		lv->lv_buf_len = 0;
-		lv->lv_bytes = 0;
-		lv->lv_buf = (char *)lv + buf_size - nbytes;
 		ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
 		lip->li_ops->iop_format(lip, lv);
 insert:
-		ASSERT(lv->lv_buf_len <= nbytes);
 		xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
 	}
 }
@@ -783,6 +906,13 @@ xfs_log_commit_cil(
 	struct xlog		*log = mp->m_log;
 	struct xfs_cil		*cil = log->l_cilp;
+	/*
+	 * Do all necessary memory allocation before we lock the CIL.
+	 * This ensures the allocation does not deadlock with a CIL
+	 * push in memory reclaim (e.g. from kswapd).
+	 */
+	xlog_cil_alloc_shadow_bufs(log, tp);
 	/* lock out background commit */
 	down_read(&cil->xc_ctx_lock);

--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1573,10 +1573,6 @@ xfs_fs_fill_super(
 		}
 	}
-	if (xfs_sb_version_hassparseinodes(&mp->m_sb))
-		xfs_alert(mp,
-	"EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
 	error = xfs_mountfs(mp);
 	if (error)
 		goto out_filestream_unmount;

--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -52,6 +52,7 @@ typedef struct xfs_log_item {
 	/* delayed logging */
 	struct list_head		li_cil;		/* CIL pointers */
 	struct xfs_log_vec		*li_lv;		/* active log vector */
+	struct xfs_log_vec		*li_lv_shadow;	/* standby vector */
 	xfs_lsn_t			li_seq;		/* CIL commit seq */
 } xfs_log_item_t;