Commit aba4dc79 authored by Sergei Golubchik

mhnsw: configurable parameters

1. introduce alpha. the value of 1.1 is optimal, so hard-code it.

2. hard-code ef_construction=10, best by test

3. rename hnsw_max_connection_per_layer to mhnsw_max_edges_per_node
   (max_connection is rather ambiguous in MariaDB) and add a help text

4. rename hnsw_ef_search to mhnsw_min_limit and add a help text
parent 6ca33338
......@@ -402,11 +402,6 @@ The following specify which files/extra groups are read (specified before remain
height-balanced, DOUBLE_PREC_HB - double precision
height-balanced, JSON_HB - height-balanced, stored as
JSON
--hnsw-ef-constructor=#
hnsw_ef_constructor
--hnsw-ef-search=# hnsw_ef_search
--hnsw-max-connection-per-layer=#
hnsw_max_connection_per_layer
--host-cache-size=# How many host names should be cached to avoid resolving
(Automatically configured unless set explicitly)
--idle-readonly-transaction-timeout=#
......@@ -695,6 +690,15 @@ The following specify which files/extra groups are read (specified before remain
Unused. Deprecated, will be removed in a future release.
--metadata-locks-hash-instances=#
Unused. Deprecated, will be removed in a future release.
--mhnsw-max-edges-per-node=#
Larger values means slower INSERT, larger index size and
higher memory consumption, but better search results
--mhnsw-min-limit=# Defines the minimal number of result candidates to look
for in the vector index for ORDER BY ... LIMIT N queries.
The search will never search for less rows than that,
even if LIMIT is smaller. This notably improves the
search quality at low LIMIT values, at the expense of
search time
--min-examined-row-limit=#
Alias for log_slow_min_examined_row_limit. Don't write
queries to slow log that examine fewer rows than that
......@@ -1705,9 +1709,6 @@ gtid-strict-mode FALSE
help TRUE
histogram-size 254
histogram-type JSON_HB
hnsw-ef-constructor 10
hnsw-ef-search 10
hnsw-max-connection-per-layer 50
host-cache-size 279
idle-readonly-transaction-timeout 0
idle-transaction-timeout 0
......@@ -1796,6 +1797,8 @@ max-write-lock-count 18446744073709551615
memlock FALSE
metadata-locks-cache-size 1024
metadata-locks-hash-instances 8
mhnsw-max-edges-per-node 6
mhnsw-min-limit 20
min-examined-row-limit 0
mrr-buffer-size 262144
myisam-block-size 1024
......
......@@ -2162,6 +2162,26 @@ NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY YES
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME MHNSW_MAX_EDGES_PER_NODE
VARIABLE_SCOPE SESSION
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT Larger values means slower INSERT, larger index size and higher memory consumption, but better search results
NUMERIC_MIN_VALUE 3
NUMERIC_MAX_VALUE 200
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME MHNSW_MIN_LIMIT
VARIABLE_SCOPE SESSION
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT Defines the minimal number of result candidates to look for in the vector index for ORDER BY ... LIMIT N queries. The search will never search for less rows than that, even if LIMIT is smaller. This notably improves the search quality at low LIMIT values, at the expense of search time
NUMERIC_MIN_VALUE 1
NUMERIC_MAX_VALUE 65535
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME MIN_EXAMINED_ROW_LIMIT
VARIABLE_SCOPE SESSION
VARIABLE_TYPE BIGINT UNSIGNED
......
......@@ -1422,36 +1422,6 @@ NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST SINGLE_PREC_HB,DOUBLE_PREC_HB,JSON_HB
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME HNSW_EF_CONSTRUCTOR
VARIABLE_SCOPE SESSION
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT hnsw_ef_constructor
NUMERIC_MIN_VALUE 0
NUMERIC_MAX_VALUE 4294967295
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME HNSW_EF_SEARCH
VARIABLE_SCOPE SESSION
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT hnsw_ef_search
NUMERIC_MIN_VALUE 0
NUMERIC_MAX_VALUE 4294967295
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME HNSW_MAX_CONNECTION_PER_LAYER
VARIABLE_SCOPE SESSION
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT hnsw_max_connection_per_layer
NUMERIC_MIN_VALUE 0
NUMERIC_MAX_VALUE 4294967295
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME HOSTNAME
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE VARCHAR
......@@ -2402,6 +2372,26 @@ NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY YES
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME MHNSW_MAX_EDGES_PER_NODE
VARIABLE_SCOPE SESSION
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT Larger values means slower INSERT, larger index size and higher memory consumption, but better search results
NUMERIC_MIN_VALUE 3
NUMERIC_MAX_VALUE 200
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME MHNSW_MIN_LIMIT
VARIABLE_SCOPE SESSION
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT Defines the minimal number of result candidates to look for in the vector index for ORDER BY ... LIMIT N queries. The search will never search for less rows than that, even if LIMIT is smaller. This notably improves the search quality at low LIMIT values, at the expense of search time
NUMERIC_MIN_VALUE 1
NUMERIC_MAX_VALUE 65535
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME MIN_EXAMINED_ROW_LIMIT
VARIABLE_SCOPE SESSION
VARIABLE_TYPE BIGINT UNSIGNED
......
......@@ -920,9 +920,8 @@ typedef struct system_variables
Charset_collation_map_st character_set_collations;
/* Temporary for HNSW tests */
uint hnsw_max_connection_per_layer;
uint hnsw_ef_constructor;
uint hnsw_ef_search;
uint mhnsw_max_edges_per_node;
uint mhnsw_min_limit;
} SV;
/**
......
......@@ -7384,22 +7384,18 @@ static Sys_var_enum Sys_block_encryption_mode(
SESSION_VAR(block_encryption_mode), CMD_LINE(REQUIRED_ARG),
block_encryption_mode_values, DEFAULT(0));
/* Temporary for HNSW tests */
static Sys_var_uint Sys_hnsw_ef_search(
"hnsw_ef_search",
"hnsw_ef_search",
SESSION_VAR(hnsw_ef_search), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(0, UINT_MAX), DEFAULT(10),
BLOCK_SIZE(1));
static Sys_var_uint Sys_hnsw_ef_constructor(
"hnsw_ef_constructor",
"hnsw_ef_constructor",
SESSION_VAR(hnsw_ef_constructor), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(0, UINT_MAX), DEFAULT(10),
BLOCK_SIZE(1));
static Sys_var_uint Sys_hnsw_max_connection_per_layer(
"hnsw_max_connection_per_layer",
"hnsw_max_connection_per_layer",
SESSION_VAR(hnsw_max_connection_per_layer), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(0, UINT_MAX), DEFAULT(50),
BLOCK_SIZE(1));
/*
  mhnsw_min_limit: session variable (formerly hnsw_ef_search).

  Accepts values from 1 to 65535, defaults to 20, and is settable on the
  command line with a required argument. mhnsw_first() reads it as the
  ef_search value for the vector index search.
*/
static Sys_var_uint Sys_mhnsw_min_limit(
"mhnsw_min_limit",
"Defines the minimal number of result candidates to look for in the "
"vector index for ORDER BY ... LIMIT N queries. The search will never "
"search for less rows than that, even if LIMIT is smaller. "
"This notably improves the search quality at low LIMIT values, "
"at the expense of search time",
SESSION_VAR(mhnsw_min_limit), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(1, 65535), DEFAULT(20), BLOCK_SIZE(1));
/*
  mhnsw_max_edges_per_node: session variable (formerly
  hnsw_max_connection_per_layer).

  Accepts values from 3 to 200, defaults to 6, and is settable on the
  command line with a required argument. It is read in three places:
    - mhnsw_insert() uses 1 / log(value) as NORMALIZATION_FACTOR;
    - mhnsw_insert() uses it as the per-layer neighbor cap, doubled
      on layer 0;
    - mhnsw_hlindex_table_def() uses 2 * ref_length * value when
      formatting the hlindex table definition.
*/
static Sys_var_uint Sys_mhnsw_max_edges_per_node(
"mhnsw_max_edges_per_node",
"Larger values means slower INSERT, larger index size and higher "
"memory consumption, but better search results",
SESSION_VAR(mhnsw_max_edges_per_node), CMD_LINE(REQUIRED_ARG),
VALID_RANGE(3, 200), DEFAULT(6), BLOCK_SIZE(1));
......@@ -21,6 +21,10 @@
#include "key.h"
#include <scope.h>
// Algorithm parameters
// Both values are hard-coded rather than user-configurable; the commit
// message describes 1.1 as optimal and 10 as best by test.

// Factor applied in select_neighbors(): a candidate is discarded when
// distance_to(neighbor) * alpha < target_dist for an already selected
// neighbor.
static constexpr float alpha = 1.1f;
// Passed to search_layer() by mhnsw_insert() on every layer from the new
// node's layer down to 0 (replaces the hnsw_ef_constructor variable).
static constexpr uint ef_construction= 10;
class MHNSW_Context;
class FVector: public Sql_alloc
......@@ -230,7 +234,7 @@ static int select_neighbors(MHNSW_Context *ctx, size_t layer,
bool discard= false;
for (const FVectorNode &neigh : neighbors)
{
if ((discard= vec->distance_to(neigh) < target_dist))
if ((discard= vec->distance_to(neigh) * alpha < target_dist))
break;
}
if (!discard)
......@@ -427,7 +431,7 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
if (res->length() == 0 || res->length() % 4)
return bad_value_on_insert(vec_field);
const double NORMALIZATION_FACTOR= 1 / std::log(thd->variables.hnsw_max_connection_per_layer);
const double NORMALIZATION_FACTOR= 1 / std::log(thd->variables.mhnsw_max_edges_per_node);
table->file->position(table->record[0]);
......@@ -495,15 +499,13 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
for (longlong cur_layer= new_node_layer; cur_layer >= 0; cur_layer--)
{
if (int err= search_layer(&ctx, start_nodes,
thd->variables.hnsw_ef_constructor, cur_layer,
uint max_neighbors= (cur_layer == 0) // heuristics from the paper
? thd->variables.mhnsw_max_edges_per_node * 2
: thd->variables.mhnsw_max_edges_per_node;
if (int err= search_layer(&ctx, start_nodes, ef_construction, cur_layer,
&candidates))
return err;
uint max_neighbors= (cur_layer == 0) // heuristics from the paper
? thd->variables.hnsw_max_connection_per_layer * 2
: thd->variables.hnsw_max_connection_per_layer;
if (int err= select_neighbors(&ctx, cur_layer, target, candidates,
max_neighbors))
return err;
......@@ -567,8 +569,7 @@ int mhnsw_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit)
FVector target(&ctx, res->ptr());
ctx.target= &target;
ulonglong ef_search= std::max<ulonglong>( //XXX why not always limit?
thd->variables.hnsw_ef_search, limit);
uint ef_search= thd->variables.mhnsw_min_limit;
for (size_t cur_layer= max_layer; cur_layer > 0; cur_layer--)
{
......@@ -619,6 +620,6 @@ const LEX_CSTRING mhnsw_hlindex_table_def(THD *thd, uint ref_length)
size_t len= sizeof(templ) + 32;
char *s= thd->alloc(len);
len= my_snprintf(s, len, templ, ref_length, 2 * ref_length *
thd->variables.hnsw_max_connection_per_layer);
thd->variables.mhnsw_max_edges_per_node);
return {s, len};
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment