Commit 45ff8b47 authored by Dave Chinner's avatar Dave Chinner Committed by Dave Chinner

xfs: can't use kmem_zalloc() for attribute buffers

Because heap allocation of 64kB buffers will fail:

....
 XFS: fs_mark(8414) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8417) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8409) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8428) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8430) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8437) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8433) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8406) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8412) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8432) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
 XFS: fs_mark(8424) possible memory allocation deadlock size 65768 in kmem_alloc (mode:0x2d40)
....

I'd use kvmalloc() instead, but....

- 48.19% xfs_attr_create_intent
  - 46.89% xfs_attri_init
     - kvmalloc_node
	- 46.04% __kmalloc_node
	   - kmalloc_large_node
	      - 45.99% __alloc_pages
		 - 39.39% __alloc_pages_slowpath.constprop.0
		    - 38.89% __alloc_pages_direct_compact
		       - 38.71% try_to_compact_pages
			  - compact_zone_order
			  - compact_zone
			     - 21.09% isolate_migratepages_block
				  10.31% PageHuge
				  5.82% set_pfnblock_flags_mask
				  0.86% get_pfnblock_flags_mask
			     - 4.48% __reset_isolation_suitable
				  4.44% __reset_isolation_pfn
			     - 3.56% __pageblock_pfn_to_page
				  1.33% pfn_to_online_page
			       2.83% get_pfnblock_flags_mask
			     - 0.87% migrate_pages
				  0.86% compaction_alloc
			       0.84% find_suitable_fallback
		 - 6.60% get_page_from_freelist
		      4.99% clear_page_erms
		    - 1.19% _raw_spin_lock_irqsave
		       - do_raw_spin_lock
			    __pv_queued_spin_lock_slowpath
	- 0.86% __vmalloc_node_range
	     0.65% __alloc_pages_bulk

.... this is just yet another reminder of how much kvmalloc() sucks.
So lift xlog_cil_kvmalloc(), rename it to xlog_kvmalloc() and use
that instead....

We also clean up the attribute name and value lengths as they no
longer need to be rounded out to sizes compatible with log vectors.
Signed-off-by: default avatarDave Chinner <dchinner@redhat.com>
Reviewed-by: default avatarAllison Henderson <allison.henderson@oracle.com>
Reviewed-by: default avatarDarrick J. Wong <djwong@kernel.org>
Signed-off-by: default avatarDave Chinner <david@fromorbit.com>
parent 51e6104f
......@@ -44,7 +44,7 @@ xfs_attri_item_free(
struct xfs_attri_log_item *attrip)
{
kmem_free(attrip->attri_item.li_lv_shadow);
kmem_free(attrip);
kvfree(attrip);
}
/*
......@@ -119,11 +119,11 @@ xfs_attri_item_format(
sizeof(struct xfs_attri_log_format));
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_NAME,
attrip->attri_name,
xlog_calc_iovec_len(attrip->attri_name_len));
attrip->attri_name_len);
if (attrip->attri_value_len > 0)
xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ATTR_VALUE,
attrip->attri_value,
xlog_calc_iovec_len(attrip->attri_value_len));
attrip->attri_value_len);
}
/*
......@@ -163,26 +163,21 @@ xfs_attri_init(
{
struct xfs_attri_log_item *attrip;
uint32_t name_vec_len = 0;
uint32_t value_vec_len = 0;
uint32_t buffer_size;
if (name_len)
name_vec_len = xlog_calc_iovec_len(name_len);
if (value_len)
value_vec_len = xlog_calc_iovec_len(value_len);
buffer_size = name_vec_len + value_vec_len;
uint32_t buffer_size = name_len + value_len;
if (buffer_size) {
attrip = kmem_zalloc(sizeof(struct xfs_attri_log_item) +
buffer_size, KM_NOFS);
if (attrip == NULL)
return NULL;
/*
* This could be over 64kB in length, so we have to use
* kvmalloc() for this. But kvmalloc() utterly sucks, so we
* use own version.
*/
attrip = xlog_kvmalloc(sizeof(struct xfs_attri_log_item) +
buffer_size);
} else {
attrip = kmem_cache_zalloc(xfs_attri_cache,
GFP_NOFS | __GFP_NOFAIL);
attrip = kmem_cache_alloc(xfs_attri_cache,
GFP_NOFS | __GFP_NOFAIL);
}
memset(attrip, 0, sizeof(struct xfs_attri_log_item));
attrip->attri_name_len = name_len;
if (name_len)
......@@ -195,7 +190,7 @@ xfs_attri_init(
if (value_len)
attrip->attri_value = ((char *)attrip) +
sizeof(struct xfs_attri_log_item) +
name_vec_len;
name_len;
else
attrip->attri_value = NULL;
......
......@@ -134,39 +134,6 @@ xlog_cil_iovec_space(
sizeof(uint64_t));
}
/*
* shadow buffers can be large, so we need to use kvmalloc() here to ensure
* success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts to fall
* back to vmalloc, so we can't actually do anything useful with gfp flags to
* control the kmalloc() behaviour within kvmalloc(). Hence kmalloc() will do
* direct reclaim and compaction in the slow path, both of which are
* horrendously expensive. We just want kmalloc to fail fast and fall back to
* vmalloc if it can't get somethign straight away from the free lists or buddy
* allocator. Hence we have to open code kvmalloc outselves here.
*
* Also, we are in memalloc_nofs_save task context here, so despite the use of
* GFP_KERNEL here, we are actually going to be doing GFP_NOFS allocations. This
* is actually the only way to make vmalloc() do GFP_NOFS allocations, so lets
* just all pretend this is a GFP_KERNEL context operation....
*/
static inline void *
xlog_cil_kvmalloc(
size_t buf_size)
{
gfp_t flags = GFP_KERNEL;
void *p;
flags &= ~__GFP_DIRECT_RECLAIM;
flags |= __GFP_NOWARN | __GFP_NORETRY;
do {
p = kmalloc(buf_size, flags);
if (!p)
p = vmalloc(buf_size);
} while (!p);
return p;
}
/*
* Allocate or pin log vector buffers for CIL insertion.
*
......@@ -283,7 +250,7 @@ xlog_cil_alloc_shadow_bufs(
* storage.
*/
kmem_free(lip->li_lv_shadow);
lv = xlog_cil_kvmalloc(buf_size);
lv = xlog_kvmalloc(buf_size);
memset(lv, 0, xlog_cil_iovec_space(niovecs));
......
......@@ -651,4 +651,38 @@ xlog_valid_lsn(
return valid;
}
/*
* Log vector and shadow buffers can be large, so we need to use kvmalloc() here
* to ensure success. Unfortunately, kvmalloc() only allows GFP_KERNEL contexts
* to fall back to vmalloc, so we can't actually do anything useful with gfp
* flags to control the kmalloc() behaviour within kvmalloc(). Hence kmalloc()
* will do direct reclaim and compaction in the slow path, both of which are
* horrendously expensive. We just want kmalloc to fail fast and fall back to
* vmalloc if it can't get somethign straight away from the free lists or
* buddy allocator. Hence we have to open code kvmalloc outselves here.
*
* This assumes that the caller uses memalloc_nofs_save task context here, so
* despite the use of GFP_KERNEL here, we are going to be doing GFP_NOFS
* allocations. This is actually the only way to make vmalloc() do GFP_NOFS
* allocations, so lets just all pretend this is a GFP_KERNEL context
* operation....
*/
static inline void *
xlog_kvmalloc(
size_t buf_size)
{
gfp_t flags = GFP_KERNEL;
void *p;
flags &= ~__GFP_DIRECT_RECLAIM;
flags |= __GFP_NOWARN | __GFP_NORETRY;
do {
p = kmalloc(buf_size, flags);
if (!p)
p = vmalloc(buf_size);
} while (!p);
return p;
}
#endif /* __XFS_LOG_PRIV_H__ */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment