Commit 4e7ff039 authored by Matthew Sakai, committed by Mike Snitzer

dm vdo: implement the chapter volume store

The volume store structures manage the reading and writing of chapter
pages. When a chapter is closed, it is packed into a read-only structure,
split across several pages, and written to storage.

The volume store also contains a cache and specialized queues that sort and
batch requests by the page they need, in order to minimize latency and I/O
requests when records have to be read from storage. The cache and queues
also coordinate with the volume index to ensure that the volume does not
waste resources reading pages that are no longer valid.

Co-developed-by: J. corwin Coburn <corwin@hurlbutnet.net>
Signed-off-by: J. corwin Coburn <corwin@hurlbutnet.net>
Co-developed-by: Michael Sclafani <dm-devel@lists.linux.dev>
Signed-off-by: Michael Sclafani <dm-devel@lists.linux.dev>
Co-developed-by: Thomas Jaskiewicz <tom@jaskiewicz.us>
Signed-off-by: Thomas Jaskiewicz <tom@jaskiewicz.us>
Co-developed-by: John Wiele <jwiele@redhat.com>
Signed-off-by: John Wiele <jwiele@redhat.com>
Signed-off-by: Matthew Sakai <msakai@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
parent 6afc7bca
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2023 Red Hat
*/
#include "index-page-map.h"
#include "errors.h"
#include "hash-utils.h"
#include "logger.h"
#include "memory-alloc.h"
#include "numeric.h"
#include "permassert.h"
#include "string-utils.h"
#include "uds-threads.h"
#include "uds.h"
/*
* The index page map is conceptually a two-dimensional array indexed by chapter number and index
* page number within the chapter. Each entry contains the number of the last delta list on that
* index page. In order to save memory, the information for the last page in each chapter is not
* recorded, as it is known from the geometry.
*/
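/*
 * A hypothetical example (values are illustrative only): with a geometry of 6 index pages
 * per chapter, each chapter records 5 entries and the last page is implied. If chapter 2's
 * entries were { 40, 95, 150, 200, 255 }, then delta lists 0-40 live on index page 0,
 * lists 41-95 on page 1, and so on; any list numbered above 255 must be on the final,
 * unrecorded index page.
 */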
static const u8 PAGE_MAP_MAGIC[] = "ALBIPM02";
enum {
PAGE_MAP_MAGIC_LENGTH = sizeof(PAGE_MAP_MAGIC) - 1,
};
static inline u32 get_entry_count(const struct geometry *geometry)
{
return geometry->chapters_per_volume * (geometry->index_pages_per_chapter - 1);
}
int uds_make_index_page_map(const struct geometry *geometry,
struct index_page_map **map_ptr)
{
int result;
struct index_page_map *map;
result = uds_allocate(1, struct index_page_map, "page map", &map);
if (result != UDS_SUCCESS)
return result;
map->geometry = geometry;
map->entries_per_chapter = geometry->index_pages_per_chapter - 1;
result = uds_allocate(get_entry_count(geometry), u16, "Index Page Map Entries",
&map->entries);
if (result != UDS_SUCCESS) {
uds_free_index_page_map(map);
return result;
}
*map_ptr = map;
return UDS_SUCCESS;
}
void uds_free_index_page_map(struct index_page_map *map)
{
if (map != NULL) {
uds_free(map->entries);
uds_free(map);
}
}
void uds_update_index_page_map(struct index_page_map *map, u64 virtual_chapter_number,
u32 chapter_number, u32 index_page_number,
u32 delta_list_number)
{
size_t slot;
map->last_update = virtual_chapter_number;
if (index_page_number == map->entries_per_chapter)
return;
slot = (chapter_number * map->entries_per_chapter) + index_page_number;
map->entries[slot] = delta_list_number;
}
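/* Find the index page of the chapter that would contain the delta list for the given record name. */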
u32 uds_find_index_page_number(const struct index_page_map *map,
const struct uds_record_name *name, u32 chapter_number)
{
u32 delta_list_number = uds_hash_to_chapter_delta_list(name, map->geometry);
u32 slot = chapter_number * map->entries_per_chapter;
u32 page;
for (page = 0; page < map->entries_per_chapter; page++) {
if (delta_list_number <= map->entries[slot + page])
break;
}
return page;
}
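/* Report the inclusive range of delta list numbers stored on the given index page of a chapter. */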
void uds_get_list_number_bounds(const struct index_page_map *map, u32 chapter_number,
u32 index_page_number, u32 *lowest_list,
u32 *highest_list)
{
u32 slot = chapter_number * map->entries_per_chapter;
*lowest_list = ((index_page_number == 0) ?
0 :
map->entries[slot + index_page_number - 1] + 1);
*highest_list = ((index_page_number < map->entries_per_chapter) ?
map->entries[slot + index_page_number] :
map->geometry->delta_lists_per_chapter - 1);
}
u64 uds_compute_index_page_map_save_size(const struct geometry *geometry)
{
return PAGE_MAP_MAGIC_LENGTH + sizeof(u64) + sizeof(u16) * get_entry_count(geometry);
}
int uds_write_index_page_map(struct index_page_map *map, struct buffered_writer *writer)
{
int result;
u8 *buffer;
size_t offset = 0;
u64 saved_size = uds_compute_index_page_map_save_size(map->geometry);
u32 i;
result = uds_allocate(saved_size, u8, "page map data", &buffer);
if (result != UDS_SUCCESS)
return result;
memcpy(buffer, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH);
offset += PAGE_MAP_MAGIC_LENGTH;
encode_u64_le(buffer, &offset, map->last_update);
for (i = 0; i < get_entry_count(map->geometry); i++)
encode_u16_le(buffer, &offset, map->entries[i]);
result = uds_write_to_buffered_writer(writer, buffer, offset);
uds_free(buffer);
if (result != UDS_SUCCESS)
return result;
return uds_flush_buffered_writer(writer);
}
int uds_read_index_page_map(struct index_page_map *map, struct buffered_reader *reader)
{
int result;
u8 magic[PAGE_MAP_MAGIC_LENGTH];
u8 *buffer;
size_t offset = 0;
u64 saved_size = uds_compute_index_page_map_save_size(map->geometry);
u32 i;
result = uds_allocate(saved_size, u8, "page map data", &buffer);
if (result != UDS_SUCCESS)
return result;
result = uds_read_from_buffered_reader(reader, buffer, saved_size);
if (result != UDS_SUCCESS) {
uds_free(buffer);
return result;
}
memcpy(&magic, buffer, PAGE_MAP_MAGIC_LENGTH);
offset += PAGE_MAP_MAGIC_LENGTH;
if (memcmp(magic, PAGE_MAP_MAGIC, PAGE_MAP_MAGIC_LENGTH) != 0) {
uds_free(buffer);
return UDS_CORRUPT_DATA;
}
decode_u64_le(buffer, &offset, &map->last_update);
for (i = 0; i < get_entry_count(map->geometry); i++)
decode_u16_le(buffer, &offset, &map->entries[i]);
uds_free(buffer);
uds_log_debug("read index page map, last update %llu",
(unsigned long long) map->last_update);
return UDS_SUCCESS;
}
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2023 Red Hat
*/
#ifndef UDS_INDEX_PAGE_MAP_H
#define UDS_INDEX_PAGE_MAP_H
#include "geometry.h"
#include "io-factory.h"
/*
* The index maintains a page map which records how the chapter delta lists are distributed among
* the index pages for each chapter, allowing the volume to be efficient about reading only pages
* that it knows it will need.
*/
struct index_page_map {
const struct geometry *geometry;
u64 last_update;
u32 entries_per_chapter;
u16 *entries;
};
int __must_check uds_make_index_page_map(const struct geometry *geometry,
struct index_page_map **map_ptr);
void uds_free_index_page_map(struct index_page_map *map);
int __must_check uds_read_index_page_map(struct index_page_map *map,
struct buffered_reader *reader);
int __must_check uds_write_index_page_map(struct index_page_map *map,
struct buffered_writer *writer);
void uds_update_index_page_map(struct index_page_map *map, u64 virtual_chapter_number,
u32 chapter_number, u32 index_page_number,
u32 delta_list_number);
u32 __must_check uds_find_index_page_number(const struct index_page_map *map,
const struct uds_record_name *name,
u32 chapter_number);
void uds_get_list_number_bounds(const struct index_page_map *map, u32 chapter_number,
u32 index_page_number, u32 *lowest_list,
u32 *highest_list);
u64 uds_compute_index_page_map_save_size(const struct geometry *geometry);
#endif /* UDS_INDEX_PAGE_MAP_H */
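The sketch below (not part of the patch) shows how the index page map API declared above might be exercised: create a map for a geometry, record which delta list ends an index page as chapters are written, and then ask which page could hold a record name. The geometry, record name, and the chapter and list values are hypothetical.

/* Illustrative only; assumes UDS_SUCCESS and struct uds_record_name come in via the include chain. */
#include "index-page-map.h"

static int example_track_and_find(const struct geometry *geometry,
                                  const struct uds_record_name *name)
{
        struct index_page_map *map;
        u32 page;
        int result;

        result = uds_make_index_page_map(geometry, &map);
        if (result != UDS_SUCCESS)
                return result;

        /* Record that index page 0 of chapter 7 ends with delta list 40 (hypothetical values). */
        uds_update_index_page_map(map, 7, 7, 0, 40);

        /* Ask which index page of chapter 7 could hold this record name's delta list. */
        page = uds_find_index_page_number(map, name, 7);

        uds_free_index_page_map(map);
        return (int) page;
}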
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2023 Red Hat
*/
#include "radix-sort.h"
#include <linux/limits.h>
#include <linux/types.h>
#include "memory-alloc.h"
#include "string-utils.h"
/*
* This implementation allocates one large object to do the sorting, which can be reused as many
* times as desired. The amount of memory required is logarithmically proportional to the number of
* keys to be sorted.
*/
enum {
/* Piles smaller than this are handled with a simple insertion sort. */
INSERTION_SORT_THRESHOLD = 12,
};
/* Sort keys are pointers to immutable fixed-length arrays of bytes. */
typedef const u8 *sort_key_t;
/*
* The keys are separated into piles based on the byte in each key at the current offset, so the
* number of keys with each byte must be counted.
*/
struct histogram {
/* The number of non-empty bins */
u16 used;
/* The index (key byte) of the first non-empty bin */
u16 first;
/* The index (key byte) of the last non-empty bin */
u16 last;
/* The number of occurrences of each specific byte */
u32 size[256];
};
/*
* Sub-tasks are manually managed on a stack, both for performance and to put a logarithmic bound
* on the stack space needed.
*/
struct task {
/* Pointer to the first key to sort. */
sort_key_t *first_key;
/* Pointer to the last key to sort. */
sort_key_t *last_key;
/* The offset into the key at which to continue sorting. */
u16 offset;
/* The number of bytes remaining in the sort keys. */
u16 length;
};
struct radix_sorter {
unsigned int count;
struct histogram bins;
sort_key_t *pile[256];
struct task *end_of_stack;
struct task insertion_list[256];
struct task stack[];
};
/* Compare a segment of two fixed-length keys starting at an offset. */
static inline int compare(sort_key_t key1, sort_key_t key2, u16 offset, u16 length)
{
return memcmp(&key1[offset], &key2[offset], length);
}
/* Insert the next unsorted key into an array of sorted keys. */
static inline void insert_key(const struct task task, sort_key_t *next)
{
/* Pull the unsorted key out, freeing up the array slot. */
sort_key_t unsorted = *next;
/* Compare the key to the preceding sorted entries, shifting down ones that are larger. */
while ((--next >= task.first_key) &&
(compare(unsorted, next[0], task.offset, task.length) < 0))
next[1] = next[0];
/* Insert the key into the last slot that was cleared, sorting it. */
next[1] = unsorted;
}
/*
* Sort a range of key segments using an insertion sort. This simple sort is faster than the
* 256-way radix sort when the number of keys to sort is small.
*/
static inline void insertion_sort(const struct task task)
{
sort_key_t *next;
for (next = task.first_key + 1; next <= task.last_key; next++)
insert_key(task, next);
}
/* Push a sorting task onto a task stack. */
static inline void push_task(struct task **stack_pointer, sort_key_t *first_key,
u32 count, u16 offset, u16 length)
{
struct task *task = (*stack_pointer)++;
task->first_key = first_key;
task->last_key = &first_key[count - 1];
task->offset = offset;
task->length = length;
}
static inline void swap_keys(sort_key_t *a, sort_key_t *b)
{
sort_key_t c = *a;
*a = *b;
*b = c;
}
/*
* Count the number of times each byte value appears in the arrays of keys to sort at the current
* offset, keeping track of the number of non-empty bins, and the index of the first and last
* non-empty bin.
*/
static inline void measure_bins(const struct task task, struct histogram *bins)
{
sort_key_t *key_ptr;
/*
* Subtle invariant: bins->used and bins->size[] are zero because the sorting code clears
* it all out as it goes. Even though this structure is re-used, we don't need to pay to
* zero it before starting a new tally.
*/
bins->first = U8_MAX;
bins->last = 0;
for (key_ptr = task.first_key; key_ptr <= task.last_key; key_ptr++) {
/* Increment the count for the byte in the key at the current offset. */
u8 bin = (*key_ptr)[task.offset];
u32 size = ++bins->size[bin];
/* Track non-empty bins. */
if (size == 1) {
bins->used += 1;
if (bin < bins->first)
bins->first = bin;
if (bin > bins->last)
bins->last = bin;
}
}
}
/*
* Convert the bin sizes to pointers to where each pile goes.
*
* pile[0] = first_key + bin->size[0],
* pile[1] = pile[0] + bin->size[1], etc.
*
* After the keys are moved to the appropriate pile, we'll need to sort each of the piles by the
* next radix position. A new task is put on the stack for each pile containing lots of keys, or a
* new task is put on the list for each pile containing few keys.
*
* @stack: pointer to the top of the stack
* @end_of_stack: the end of the stack
* @list: pointer to the head of the list
* @pile: array for pointers to the end of each pile
* @bins: the histogram of the sizes of each pile
* @first_key: the first key of the stack
* @offset: the next radix position to sort by
* @length: the number of bytes remaining in the sort keys
*
* Return: UDS_SUCCESS or an error code
*/
static inline int push_bins(struct task **stack, struct task *end_of_stack,
struct task **list, sort_key_t *pile[],
struct histogram *bins, sort_key_t *first_key,
u16 offset, u16 length)
{
sort_key_t *pile_start = first_key;
int bin;
for (bin = bins->first; ; bin++) {
u32 size = bins->size[bin];
/* Skip empty piles. */
if (size == 0)
continue;
/* There's no need to sort empty keys. */
if (length > 0) {
if (size > INSERTION_SORT_THRESHOLD) {
if (*stack >= end_of_stack)
return UDS_BAD_STATE;
push_task(stack, pile_start, size, offset, length);
} else if (size > 1) {
push_task(list, pile_start, size, offset, length);
}
}
pile_start += size;
pile[bin] = pile_start;
if (--bins->used == 0)
break;
}
return UDS_SUCCESS;
}
int uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter)
{
int result;
unsigned int stack_size = count / INSERTION_SORT_THRESHOLD;
struct radix_sorter *radix_sorter;
result = uds_allocate_extended(struct radix_sorter, stack_size, struct task,
__func__, &radix_sorter);
if (result != UDS_SUCCESS)
return result;
radix_sorter->count = count;
radix_sorter->end_of_stack = radix_sorter->stack + stack_size;
*sorter = radix_sorter;
return UDS_SUCCESS;
}
void uds_free_radix_sorter(struct radix_sorter *sorter)
{
uds_free(sorter);
}
/*
* Sort pointers to fixed-length keys (arrays of bytes) using a radix sort. The sort implementation
* is unstable, so the relative ordering of equal keys is not preserved.
*/
int uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[],
unsigned int count, unsigned short length)
{
struct task start;
struct histogram *bins = &sorter->bins;
sort_key_t **pile = sorter->pile;
struct task *task_stack = sorter->stack;
/* All zero-length keys are identical and therefore already sorted. */
if ((count == 0) || (length == 0))
return UDS_SUCCESS;
/* The initial task is to sort the entire length of all the keys. */
start = (struct task) {
.first_key = keys,
.last_key = &keys[count - 1],
.offset = 0,
.length = length,
};
if (count <= INSERTION_SORT_THRESHOLD) {
insertion_sort(start);
return UDS_SUCCESS;
}
if (count > sorter->count)
return UDS_INVALID_ARGUMENT;
/*
* Repeatedly consume a sorting task from the stack and process it, pushing new sub-tasks
* onto the stack for each radix-sorted pile. When all tasks and sub-tasks have been
* processed, the stack will be empty and all the keys in the starting task will be fully
* sorted.
*/
for (*task_stack = start; task_stack >= sorter->stack; task_stack--) {
const struct task task = *task_stack;
struct task *insertion_task_list;
int result;
sort_key_t *fence;
sort_key_t *end;
measure_bins(task, bins);
/*
* Now that we know how large each bin is, generate pointers for each of the piles
* and push a new task to sort each pile by the next radix byte.
*/
insertion_task_list = sorter->insertion_list;
result = push_bins(&task_stack, sorter->end_of_stack,
&insertion_task_list, pile, bins, task.first_key,
task.offset + 1, task.length - 1);
if (result != UDS_SUCCESS) {
memset(bins, 0, sizeof(*bins));
return result;
}
/* Now bins->used is zero again. */
/*
* Don't bother processing the last pile: when piles 0..N-1 are all in place, then
* pile N must also be in place.
*/
end = task.last_key - bins->size[bins->last];
bins->size[bins->last] = 0;
for (fence = task.first_key; fence <= end; ) {
u8 bin;
sort_key_t key = *fence;
/*
* The radix byte of the key tells us which pile it belongs in. Swap it for
* an unprocessed item just below that pile, and repeat.
*/
while (--pile[bin = key[task.offset]] > fence)
swap_keys(pile[bin], &key);
/*
* The pile reached the fence. Put the key at the bottom of that pile,
* completing it, and advance the fence to the next pile.
*/
*fence = key;
fence += bins->size[bin];
bins->size[bin] = 0;
}
/* Now bins->size[] is all zero again. */
/*
* When the number of keys in a task gets small enough, it is faster to use an
* insertion sort than to keep subdividing into tiny piles.
*/
while (--insertion_task_list >= sorter->insertion_list)
insertion_sort(*insertion_task_list);
}
return UDS_SUCCESS;
}
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2023 Red Hat
*/
#ifndef UDS_RADIX_SORT_H
#define UDS_RADIX_SORT_H
/*
* Radix sort is implemented using an American Flag sort, an unstable, in-place 8-bit radix
* exchange sort. This is adapted from the algorithm in the paper by Peter M. McIlroy, Keith
* Bostic, and M. Douglas McIlroy, "Engineering Radix Sort".
*
* http://www.usenix.org/publications/compsystems/1993/win_mcilroy.pdf
*/
struct radix_sorter;
int __must_check uds_make_radix_sorter(unsigned int count, struct radix_sorter **sorter);
void uds_free_radix_sorter(struct radix_sorter *sorter);
int __must_check uds_radix_sort(struct radix_sorter *sorter, const unsigned char *keys[],
unsigned int count, unsigned short length);
#endif /* UDS_RADIX_SORT_H */
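The sketch below (not part of the patch) shows how the radix sorter API declared above is typically driven: allocate a sorter sized for the largest batch, then sort an array of pointers to fixed-length keys. The key array and lengths are hypothetical.

/* Illustrative only; assumes UDS_SUCCESS is available via the include chain. */
#include "radix-sort.h"

static int example_sort_keys(const unsigned char *keys[], unsigned int count,
                             unsigned short key_length)
{
        struct radix_sorter *sorter;
        int result;

        result = uds_make_radix_sorter(count, &sorter);
        if (result != UDS_SUCCESS)
                return result;

        /* Sorts the pointers in place; equal keys may be reordered (the sort is unstable). */
        result = uds_radix_sort(sorter, keys, count, key_length);
        uds_free_radix_sorter(sorter);
        return result;
}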
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2023 Red Hat
*/
#ifndef UDS_VOLUME_H
#define UDS_VOLUME_H
#include <linux/atomic.h>
#include <linux/cache.h>
#include <linux/dm-bufio.h>
#include <linux/limits.h>
#include "chapter-index.h"
#include "config.h"
#include "geometry.h"
#include "index-layout.h"
#include "index-page-map.h"
#include "permassert.h"
#include "radix-sort.h"
#include "sparse-cache.h"
#include "uds.h"
#include "uds-threads.h"
/*
* The volume manages deduplication records on permanent storage. The term "volume" can also refer
* to the region of permanent storage where the records (and the chapters containing them) are
* stored. The volume handles all I/O to this region by reading, caching, and writing chapter pages
* as necessary.
*/
enum index_lookup_mode {
/* Always do lookups in all chapters normally */
LOOKUP_NORMAL,
/* Only do a subset of lookups needed when rebuilding an index */
LOOKUP_FOR_REBUILD,
};
struct queued_read {
bool invalid;
bool reserved;
u32 physical_page;
struct uds_request *first_request;
struct uds_request *last_request;
};
struct __aligned(L1_CACHE_BYTES) search_pending_counter {
u64 atomic_value;
};
struct cached_page {
/* Whether this page is currently being read asynchronously */
bool read_pending;
/* The physical page stored in this cache entry */
u32 physical_page;
/* The value of the volume clock when this page was last used */
s64 last_used;
/* The cached page buffer */
struct dm_buffer *buffer;
/* The chapter index page, meaningless for record pages */
struct delta_index_page index_page;
};
struct page_cache {
/* The number of zones */
unsigned int zone_count;
/* The number of volume pages that can be cached */
u32 indexable_pages;
/* The maximum number of simultaneously cached pages */
u16 cache_slots;
/* An index for each physical page noting where it is in the cache */
u16 *index;
/* The array of cached pages */
struct cached_page *cache;
/* A counter for each zone tracking if a search is occurring there */
struct search_pending_counter *search_pending_counters;
/* The read queue entries as a circular array */
struct queued_read *read_queue;
/* All entries above this point are constant after initialization. */
/*
* These values are all indexes into the array of read queue entries. New entries in the
* read queue are enqueued at read_queue_last. To dequeue entries, a reader thread gets the
* lock and then claims the entry pointed to by read_queue_next_read and increments that
* value. After the read is completed, the reader thread calls release_read_queue_entry(),
* which increments read_queue_first until it points to a pending read, or is equal to
* read_queue_next_read. This means that if multiple reads are outstanding,
* read_queue_first might not advance until the last of the reads finishes.
*/
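/*
 * For example (hypothetical state): with read_queue_first = 3, read_queue_next_read = 5,
 * and read_queue_last = 7, entries 3 and 4 have been claimed by reader threads but not yet
 * released, entries 5 and 6 are waiting to be claimed, and the next new request will be
 * enqueued at slot 7.
 */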
u16 read_queue_first;
u16 read_queue_next_read;
u16 read_queue_last;
atomic64_t clock;
};
struct volume {
struct geometry *geometry;
struct dm_bufio_client *client;
u64 nonce;
size_t cache_size;
/* A single page worth of records, for sorting */
const struct uds_volume_record **record_pointers;
/* Sorter for sorting records within each page */
struct radix_sorter *radix_sorter;
struct sparse_cache *sparse_cache;
struct page_cache page_cache;
struct index_page_map *index_page_map;
struct mutex read_threads_mutex;
struct cond_var read_threads_cond;
struct cond_var read_threads_read_done_cond;
struct thread **reader_threads;
unsigned int read_thread_count;
bool read_threads_exiting;
enum index_lookup_mode lookup_mode;
unsigned int reserved_buffers;
};
int __must_check uds_make_volume(const struct configuration *config,
struct index_layout *layout,
struct volume **new_volume);
void uds_free_volume(struct volume *volume);
int __must_check uds_replace_volume_storage(struct volume *volume,
struct index_layout *layout,
struct block_device *bdev);
int __must_check uds_find_volume_chapter_boundaries(struct volume *volume,
u64 *lowest_vcn, u64 *highest_vcn,
bool *is_empty);
int __must_check uds_search_volume_page_cache(struct volume *volume,
struct uds_request *request,
bool *found);
int __must_check uds_search_volume_page_cache_for_rebuild(struct volume *volume,
const struct uds_record_name *name,
u64 virtual_chapter,
bool *found);
int __must_check uds_search_cached_record_page(struct volume *volume,
struct uds_request *request, u32 chapter,
u16 record_page_number, bool *found);
void uds_forget_chapter(struct volume *volume, u64 chapter);
int __must_check uds_write_chapter(struct volume *volume,
struct open_chapter_index *chapter_index,
const struct uds_volume_record records[]);
void uds_prefetch_volume_chapter(const struct volume *volume, u32 chapter);
int __must_check uds_read_chapter_index_from_volume(const struct volume *volume,
u64 virtual_chapter,
struct dm_buffer *volume_buffers[],
struct delta_index_page index_pages[]);
int __must_check uds_get_volume_record_page(struct volume *volume, u32 chapter,
u32 page_number, u8 **data_ptr);
int __must_check uds_get_volume_index_page(struct volume *volume, u32 chapter,
u32 page_number,
struct delta_index_page **page_ptr);
#endif /* UDS_VOLUME_H */
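The sketch below (not part of the patch) follows the volume lifecycle implied by the declarations above: create the volume from a configuration and layout, service one lookup through the page cache, and tear the volume down. The configuration, layout, and request objects are assumed to have been set up elsewhere.

/* Illustrative only; assumes UDS_SUCCESS and struct uds_request are available via the include chain. */
#include "volume.h"

static int example_volume_lookup(const struct configuration *config,
                                 struct index_layout *layout,
                                 struct uds_request *request)
{
        struct volume *volume;
        bool found = false;
        int result;

        result = uds_make_volume(config, layout, &volume);
        if (result != UDS_SUCCESS)
                return result;

        /* Search cached chapter pages (reading from storage as needed) for the request. */
        result = uds_search_volume_page_cache(volume, request, &found);

        uds_free_volume(volume);
        return result;
}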