Commit 024512e7 authored by Matthew Sakai, committed by Mike Snitzer

dm vdo: implement the delta index

The delta index is a space- and memory-efficient alternative to a hash table.
Instead of storing the entire key for each entry, the entries are sorted by
key and only the difference between adjacent keys (the delta) is stored.
If the keys are evenly distributed, the size of the deltas follows an
exponential distribution, and the deltas can use a Huffman code to take up
even less space.
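As an illustration (a simplified sketch with hypothetical names, not code added by this commit), delta-encoding a sorted array of keys looks like:

/* Hypothetical sketch: delta-encode a sorted array of keys. */
static void encode_deltas(const u32 *sorted_keys, u32 count, u32 *deltas)
{
	u32 i;

	if (count == 0)
		return;

	/* The first delta is measured from the start of the key space. */
	deltas[0] = sorted_keys[0];
	for (i = 1; i < count; i++)
		deltas[i] = sorted_keys[i] - sorted_keys[i - 1];
}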

This structure allows the index to use many fewer bytes per entry than a
traditional hash table, but it is slightly more expensive to look up
entries, because a request must read and sum every entry in a list of
deltas in order to find a given record. The delta index reduces this lookup
cost by splitting its key space into many sub-lists, each starting at a
fixed key value, so that each individual list is short.
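A lookup in one sub-list can then be sketched as summing deltas until the target key is reached or passed (again a hypothetical helper, not the actual implementation):

/* Hypothetical sketch: check one short sub-list for a key by summing deltas. */
static bool sublist_contains(const u32 *deltas, u32 count, u32 list_start_key, u32 target_key)
{
	u32 key = list_start_key;
	u32 i;

	for (i = 0; i < count; i++) {
		key += deltas[i];
		if (key == target_key)
			return true;
		if (key > target_key)
			return false;
	}

	return false;
}
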
Co-developed-by: J. corwin Coburn <corwin@hurlbutnet.net>
Signed-off-by: J. corwin Coburn <corwin@hurlbutnet.net>
Co-developed-by: Michael Sclafani <dm-devel@lists.linux.dev>
Signed-off-by: Michael Sclafani <dm-devel@lists.linux.dev>
Co-developed-by: Thomas Jaskiewicz <tom@jaskiewicz.us>
Signed-off-by: Thomas Jaskiewicz <tom@jaskiewicz.us>
Signed-off-by: Matthew Sakai <msakai@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
parent b46d79bd
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2023 Red Hat
*/
#ifndef UDS_DELTA_INDEX_H
#define UDS_DELTA_INDEX_H
#include <linux/cache.h>
#include "config.h"
#include "io-factory.h"
#include "numeric.h"
#include "time-utils.h"
/*
* A delta index is a key-value store, where each entry maps an address (the key) to a payload (the
* value). The entries are sorted by address, and only the delta between successive addresses is
* stored in the entry. The addresses are assumed to be uniformly distributed, and the deltas are
* therefore exponentially distributed.
*
* A delta_index can either be mutable or immutable depending on its expected use. The immutable
* form of a delta index is used for the indexes of closed chapters committed to the volume. The
* mutable form of a delta index is used by the volume index, and also by the chapter index in an
* open chapter. Like the index as a whole, each mutable delta index is divided into a number of
* independent zones.
*/
struct delta_list {
	/* The offset of the delta list start, in bits */
	u64 start;
	/* The number of bits in the delta list */
	u16 size;
	/* Where the last search "found" the key, in bits */
	u16 save_offset;
	/* The key for the record just before save_offset */
	u32 save_key;
};

struct delta_zone {
	/* The delta list memory */
	u8 *memory;
	/* The delta list headers */
	struct delta_list *delta_lists;
	/* Temporary starts of delta lists */
	u64 *new_offsets;
	/* Buffered writer for saving an index */
	struct buffered_writer *buffered_writer;
	/* The size of delta list memory */
	size_t size;
	/* Nanoseconds spent rebalancing */
	ktime_t rebalance_time;
	/* Number of memory rebalances */
	u32 rebalance_count;
	/* The number of bits in a stored value */
	u8 value_bits;
	/* The number of bits in the minimal key code */
	u16 min_bits;
	/* The number of keys used in a minimal code */
	u32 min_keys;
	/* The number of keys used for another code bit */
	u32 incr_keys;
	/* The number of records in the index */
	u64 record_count;
	/* The number of collision records */
	u64 collision_count;
	/* The number of records removed */
	u64 discard_count;
	/* The number of UDS_OVERFLOW errors detected */
	u64 overflow_count;
	/* The index of the first delta list */
	u32 first_list;
	/* The number of delta lists */
	u32 list_count;
	/* Tag belonging to this delta index */
	u8 tag;
} __aligned(L1_CACHE_BYTES);

struct delta_list_save_info {
	/* Tag identifying which delta index this list is in */
	u8 tag;
	/* Bit offset of the start of the list data */
	u8 bit_offset;
	/* Number of bytes of list data */
	u16 byte_count;
	/* The delta list number within the delta index */
	u32 index;
} __packed;

struct delta_index {
	/* The zones */
	struct delta_zone *delta_zones;
	/* The number of zones */
	unsigned int zone_count;
	/* The number of delta lists */
	u32 list_count;
	/* Maximum lists per zone */
	u32 lists_per_zone;
	/* Total memory allocated to this index */
	size_t memory_size;
	/* The number of non-empty lists at load time per zone */
	u32 load_lists[MAX_ZONES];
	/* True if this index is mutable */
	bool mutable;
	/* Tag belonging to this delta index */
	u8 tag;
};

/*
* A delta_index_page describes a single page of a chapter index. The delta_index field allows the
* page to be treated as an immutable delta_index. We use the delta_zone field to treat the chapter
* index page as a single zone index, without the need for an additional memory allocation.
*/
struct delta_index_page {
	struct delta_index delta_index;
	/* These values are loaded from the DeltaPageHeader */
	u32 lowest_list_number;
	u32 highest_list_number;
	u64 virtual_chapter_number;
	/* This structure describes the single zone of a delta index page. */
	struct delta_zone delta_zone;
};

/*
* Notes on the delta_index_entries:
*
* The fields documented as "public" can be read by any code that uses a delta_index. The fields
* documented as "private" carry information between delta_index method calls and should not be
* used outside the delta_index module.
*
* (1) The delta_index_entry is used like an iterator when searching a delta list.
*
* (2) It is also the result of a successful search and can be used to refer to the element found
* by the search.
*
* (3) It is also the result of an unsuccessful search and can be used to refer to the insertion
* point for a new record.
*
* (4) If at_end is true, the delta_list entry can only be used as the insertion point for a new
* record at the end of the list.
*
* (5) If at_end is false and is_collision is true, the delta_list entry fields refer to a
* collision entry in the list, and the delta_list entry can be used as a reference to this
* entry.
*
* (6) If at_end is false and is_collision is false, the delta_list entry fields refer to a
* non-collision entry in the list. Such delta_list entries can be used as a reference to a
* found entry, or an insertion point for a non-collision entry before this entry, or an
* insertion point for a collision entry that collides with this entry.
*/
struct delta_index_entry {
	/* Public fields */
	/* The key for this entry */
	u32 key;
	/* We are after the last list entry */
	bool at_end;
	/* This record is a collision */
	bool is_collision;

	/* Private fields */
	/* This delta list overflowed */
	bool list_overflow;
	/* The number of bits used for the value */
	u8 value_bits;
	/* The number of bits used for the entire entry */
	u16 entry_bits;
	/* The delta index zone */
	struct delta_zone *delta_zone;
	/* The delta list containing the entry */
	struct delta_list *delta_list;
	/* The delta list number */
	u32 list_number;
	/* Bit offset of this entry within the list */
	u16 offset;
	/* The delta between this and previous entry */
	u32 delta;
	/* Temporary delta list for immutable indices */
	struct delta_list temp_delta_list;
};

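/*
 * Illustrative sketch (not part of this header): using a delta_index_entry as
 * an iterator over one delta list. The index, list_number, and key variables
 * are hypothetical caller state; this is roughly the pattern a lookup follows,
 * stopping at a matching entry or at the insertion point.
 *
 *	struct delta_index_entry entry;
 *	int result;
 *
 *	result = uds_start_delta_index_search(index, list_number, key, &entry);
 *	if (result != UDS_SUCCESS)
 *		return result;
 *	do {
 *		result = uds_next_delta_index_entry(&entry);
 *		if (result != UDS_SUCCESS)
 *			return result;
 *	} while (!entry.at_end && (entry.key < key));
 *
 * At this point entry refers either to a matching (or colliding) key, or to
 * the insertion point for a new record (see the notes above).
 */
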
struct delta_index_stats {
	/* Number of bytes allocated */
	size_t memory_allocated;
	/* Nanoseconds spent rebalancing */
	ktime_t rebalance_time;
	/* Number of memory rebalances */
	u32 rebalance_count;
	/* The number of records in the index */
	u64 record_count;
	/* The number of collision records */
	u64 collision_count;
	/* The number of records removed */
	u64 discard_count;
	/* The number of UDS_OVERFLOW errors detected */
	u64 overflow_count;
	/* The number of delta lists */
	u32 list_count;
};

int __must_check uds_initialize_delta_index(struct delta_index *delta_index,
		unsigned int zone_count, u32 list_count,
		u32 mean_delta, u32 payload_bits,
		size_t memory_size, u8 tag);

int __must_check uds_initialize_delta_index_page(struct delta_index_page *delta_index_page,
		u64 expected_nonce, u32 mean_delta,
		u32 payload_bits, u8 *memory,
		size_t memory_size);

void uds_uninitialize_delta_index(struct delta_index *delta_index);

void uds_reset_delta_index(const struct delta_index *delta_index);

int __must_check uds_pack_delta_index_page(const struct delta_index *delta_index,
		u64 header_nonce, u8 *memory,
		size_t memory_size,
		u64 virtual_chapter_number, u32 first_list,
		u32 *list_count);

int __must_check uds_start_restoring_delta_index(struct delta_index *delta_index,
		struct buffered_reader **buffered_readers,
		unsigned int reader_count);

int __must_check uds_finish_restoring_delta_index(struct delta_index *delta_index,
		struct buffered_reader **buffered_readers,
		unsigned int reader_count);

int __must_check uds_check_guard_delta_lists(struct buffered_reader **buffered_readers,
		unsigned int reader_count);

int __must_check uds_start_saving_delta_index(const struct delta_index *delta_index,
		unsigned int zone_number,
		struct buffered_writer *buffered_writer);

int __must_check uds_finish_saving_delta_index(const struct delta_index *delta_index,
		unsigned int zone_number);

int __must_check uds_write_guard_delta_list(struct buffered_writer *buffered_writer);

size_t __must_check uds_compute_delta_index_save_bytes(u32 list_count,
		size_t memory_size);

int __must_check uds_start_delta_index_search(const struct delta_index *delta_index,
		u32 list_number, u32 key,
		struct delta_index_entry *iterator);

int __must_check uds_next_delta_index_entry(struct delta_index_entry *delta_entry);

int __must_check uds_remember_delta_index_offset(const struct delta_index_entry *delta_entry);

int __must_check uds_get_delta_index_entry(const struct delta_index *delta_index,
		u32 list_number, u32 key, const u8 *name,
		struct delta_index_entry *delta_entry);

int __must_check uds_get_delta_entry_collision(const struct delta_index_entry *delta_entry,
		u8 *name);

u32 __must_check uds_get_delta_entry_value(const struct delta_index_entry *delta_entry);

int __must_check uds_set_delta_entry_value(const struct delta_index_entry *delta_entry, u32 value);

int __must_check uds_put_delta_index_entry(struct delta_index_entry *delta_entry, u32 key,
		u32 value, const u8 *name);

int __must_check uds_remove_delta_index_entry(struct delta_index_entry *delta_entry);

void uds_get_delta_index_stats(const struct delta_index *delta_index,
		struct delta_index_stats *stats);

size_t __must_check uds_compute_delta_index_size(u32 entry_count, u32 mean_delta,
		u32 payload_bits);

u32 uds_get_delta_index_page_count(u32 entry_count, u32 list_count, u32 mean_delta,
		u32 payload_bits, size_t bytes_per_page);

void uds_log_delta_index_entry(struct delta_index_entry *delta_entry);

#endif /* UDS_DELTA_INDEX_H */
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2023 Red Hat
*/
#ifndef UDS_HASH_UTILS_H
#define UDS_HASH_UTILS_H
#include "geometry.h"
#include "numeric.h"
#include "uds.h"
/* Utilities for extracting portions of a request name for various uses. */
/* How various portions of a record name are apportioned. */
enum {
	VOLUME_INDEX_BYTES_OFFSET = 0,
	VOLUME_INDEX_BYTES_COUNT = 8,
	CHAPTER_INDEX_BYTES_OFFSET = 8,
	CHAPTER_INDEX_BYTES_COUNT = 6,
	SAMPLE_BYTES_OFFSET = 14,
	SAMPLE_BYTES_COUNT = 2,
};

static inline u64 uds_extract_chapter_index_bytes(const struct uds_record_name *name)
{
	const u8 *chapter_bits = &name->name[CHAPTER_INDEX_BYTES_OFFSET];
	u64 bytes = (u64) get_unaligned_be16(chapter_bits) << 32;

	bytes |= get_unaligned_be32(chapter_bits + 2);
	return bytes;
}

static inline u64 uds_extract_volume_index_bytes(const struct uds_record_name *name)
{
	return get_unaligned_be64(&name->name[VOLUME_INDEX_BYTES_OFFSET]);
}

static inline u32 uds_extract_sampling_bytes(const struct uds_record_name *name)
{
	return get_unaligned_be16(&name->name[SAMPLE_BYTES_OFFSET]);
}

/* Compute the chapter delta list for a given name. */
static inline u32 uds_hash_to_chapter_delta_list(const struct uds_record_name *name,
		const struct geometry *geometry)
{
	return ((uds_extract_chapter_index_bytes(name) >> geometry->chapter_address_bits) &
		((1 << geometry->chapter_delta_list_bits) - 1));
}

/* Compute the chapter delta address for a given name. */
static inline u32 uds_hash_to_chapter_delta_address(const struct uds_record_name *name,
		const struct geometry *geometry)
{
	return uds_extract_chapter_index_bytes(name) & ((1 << geometry->chapter_address_bits) - 1);
}

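/*
 * For example, with a hypothetical geometry where chapter_address_bits is 10
 * and chapter_delta_list_bits is 12, the low 10 bits of the extracted chapter
 * index bytes select the delta address and the next 12 bits select the delta
 * list.
 */
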
static inline unsigned int uds_name_to_hash_slot(const struct uds_record_name *name,
		unsigned int slot_count)
{
	return (unsigned int) (uds_extract_chapter_index_bytes(name) % slot_count);
}
#endif /* UDS_HASH_UTILS_H */