Commit 2c813ad6 authored by Darrick J. Wong, committed by Dave Chinner

xfs: support btrees with overlapping intervals for keys

On a filesystem with both reflink and reverse mapping enabled, it's
possible to have multiple rmap records referring to the same blocks on
disk.  When overlapping intervals are possible, querying a classic
btree to find all records intersecting a given interval is inefficient
because we cannot use the left side of the search interval to filter
out non-matching records the same way that we can use the existing
btree key to filter out records coming after the right side of the
search interval.  This will become important once we want to use the
rmap btree to rebuild BMBTs, or implement the (future) fsmap ioctl.

(For the non-overlapping case, we can perform such queries trivially
by starting at the left side of the interval and walking the tree
until we pass the right side.)

Therefore, extend the btree code to come closer to supporting
intervals as a first-class record attribute.  This involves widening
the btree node's key space to store both the lowest key reachable via
the node pointer (as the btree does now) and the highest key reachable
via the same pointer and teaching the btree modifying functions to
keep the highest-key records up to date.

This behavior can be turned on via a new btree ops flag so that btrees
that cannot store overlapping intervals don't pay the overhead costs
in terms of extra code and disk format changes.

When we're deleting a record in a btree that supports overlapped
interval records and the deletion results in two btree blocks being
joined, we defer updating the high/low keys until after all possible
joins (at higher levels in the tree) have finished.  At this point,
the btree pointers at all levels have been updated to remove the empty
blocks and we can update the low and high keys.

When we're doing this, we must be careful to update the keys of all
node pointers up to the root instead of stopping at the first set of
keys that don't need updating.  This is because it's possible for a
single deletion to cause joining of multiple levels of tree, and so
we need to update everything going back to the root.

The diff_two_keys functions return < 0, 0, or > 0 if key1 is less than,
equal to, or greater than key2, respectively.  This is consistent
with the rest of the kernel and the C library.

In btree_updkeys(), we need to evaluate the force_all parameter before
running the key diff to avoid reading uninitialized memory when we're
forcing a key update.  This happens when we've allocated an empty slot
at level N + 1 to point to a new block at level N and we're in the
process of filling out the new keys.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
parent 70b22659
This diff is collapsed.
......@@ -44,6 +44,20 @@ union xfs_btree_key {
xfs_inobt_key_t inobt;
};
/*
* In-core key that holds both low and high keys for overlapped btrees.
* The two keys are packed next to each other on disk, so do the same
* in memory. Preserve the existing xfs_btree_key as a single key to
* avoid the mental model breakage that would happen if we passed a
* bigkey into a function that operates on a single key.
*
* NOTE(review): every member visible here is one of the single-key
* types also used by union xfs_btree_key, so as declared this union is
* no wider than a single key. Presumably a btree type that sets
* XFS_BTREE_OVERLAPPING adds a double-width (low + high) member here;
* confirm before relying on sizeof(union xfs_btree_bigkey) to hold two
* keys.
*/
union xfs_btree_bigkey {
struct xfs_bmbt_key bmbt;
xfs_bmdr_key_t bmbr; /* bmbt root block */
xfs_alloc_key_t alloc;
struct xfs_inobt_key inobt;
};
union xfs_btree_rec {
xfs_bmbt_rec_t bmbt;
xfs_bmdr_rec_t bmbr; /* bmbt root block */
......@@ -162,11 +176,21 @@ struct xfs_btree_ops {
union xfs_btree_rec *rec);
void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr);
void (*init_high_key_from_rec)(union xfs_btree_key *key,
union xfs_btree_rec *rec);
/* difference between key value and cursor value */
__int64_t (*key_diff)(struct xfs_btree_cur *cur,
union xfs_btree_key *key);
/*
* Compare key1 against key2 -- positive if key1 > key2,
* negative if key1 < key2, and zero if equal.
*/
__int64_t (*diff_two_keys)(struct xfs_btree_cur *cur,
union xfs_btree_key *key1,
union xfs_btree_key *key2);
const struct xfs_buf_ops *buf_ops;
#if defined(DEBUG) || defined(XFS_WARN)
......@@ -249,6 +273,7 @@ typedef struct xfs_btree_cur
#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
#define XFS_BTREE_CRC_BLOCKS (1<<3) /* uses extended btree blocks */
#define XFS_BTREE_OVERLAPPING (1<<4) /* overlapping intervals */
#define XFS_BTREE_NOERROR 0
......@@ -493,5 +518,10 @@ void xfs_btree_get_leaf_keys(struct xfs_btree_cur *cur,
void xfs_btree_get_node_keys(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, union xfs_btree_key *key);
int xfs_btree_update_keys(struct xfs_btree_cur *cur, int level);
void xfs_btree_get_leaf_keys_overlapped(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, union xfs_btree_key *key);
void xfs_btree_get_node_keys_overlapped(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, union xfs_btree_key *key);
int xfs_btree_update_keys_overlapped(struct xfs_btree_cur *cur, int level);
#endif /* __XFS_BTREE_H__ */
......@@ -38,6 +38,7 @@ struct xlog_recover_item;
struct xfs_buf_log_format;
struct xfs_inode_log_format;
struct xfs_bmbt_irec;
struct xfs_btree_cur;
DECLARE_EVENT_CLASS(xfs_attr_list_class,
TP_PROTO(struct xfs_attr_list_context *ctx),
......@@ -2185,6 +2186,41 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
DEFINE_DISCARD_EVENT(xfs_discard_exclude);
DEFINE_DISCARD_EVENT(xfs_discard_busy);
/*
* btree cursor events
*
* Event class that snapshots a btree cursor at one level. Each event
* records:
*   dev     - cur->bc_mp->m_super->s_dev, printed as major:minor
*   btnum   - cur->bc_btnum
*   level   - the level argument, printed as level/nlevels together
*             with cur->bc_nlevels
*   ptr     - cur->bc_ptrs[level]
*   daddr   - bp->b_bn of the buffer passed in, or -1 if bp is NULL
*
* level is used directly as an index into cur->bc_ptrs[]; no bounds
* check is done here, so the caller must pass a valid level.
*/
DECLARE_EVENT_CLASS(xfs_btree_cur_class,
TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
TP_ARGS(cur, level, bp),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_btnum_t, btnum)
__field(int, level)
__field(int, nlevels)
__field(int, ptr)
__field(xfs_daddr_t, daddr)
),
TP_fast_assign(
__entry->dev = cur->bc_mp->m_super->s_dev;
__entry->btnum = cur->bc_btnum;
__entry->level = level;
__entry->nlevels = cur->bc_nlevels;
__entry->ptr = cur->bc_ptrs[level];
__entry->daddr = bp ? bp->b_bn : -1;
),
TP_printk("dev %d:%d btnum %d level %d/%d ptr %d daddr 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->btnum,
__entry->level,
__entry->nlevels,
__entry->ptr,
(unsigned long long)__entry->daddr)
)
/*
* Define a tracepoint called "name" that belongs to xfs_btree_cur_class
* and takes the same (cur, level, bp) arguments as the class.
*/
#define DEFINE_BTREE_CUR_EVENT(name) \
DEFINE_EVENT(xfs_btree_cur_class, name, \
TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp), \
TP_ARGS(cur, level, bp))
/* The only event of this class instantiated in this hunk. */
DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys);
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment