Commit 09989347 authored by Vicențiu Ciorbaru, committed by Sergei Golubchik

Initial HNSW implementation

This commit includes the work done in collaboration with Hugo Wen from
Amazon:

    MDEV-33408 Alter HNSW graph storage and fix memory leak

    This commit changes the way HNSW graph information is stored in the
    second table. Instead of storing connections as separate records, it now
    stores neighbors for each node, leading to significant performance
    improvements and storage savings.

    Compared with the previous approach, insert speed is 5 times faster,
    search speed improves by 23%, and storage usage is reduced by 73%, based
    on ann-benchmark tests with the random-xs-20-euclidean and
    random-s-100-euclidean datasets.

    Additionally, in the previous code, vector objects were not released
    after use, resulting in excessive memory consumption (over 20GB for
    building the index with 90,000 records) and preventing tests with large
    datasets. Vectors are now released appropriately in the insert and
    search functions. Note that some vectors still need to be cleaned up
    after search query completion; this needs to be addressed in a future
    commit.

    All new code of the whole pull request, including one or several files
    that are either new files or modified ones, are contributed under the
    BSD-new license. I am contributing on behalf of my employer Amazon Web
    Services, Inc.

As well as the commit:

    Introduce session variables to manage HNSW index parameters

    Three variables:

    hnsw_max_connection_per_layer
    hnsw_ef_constructor
    hnsw_ef_search

    The ann-benchmark tool has also been updated to support these variables
    in commit https://github.com/HugoWenTD/ann-benchmarks/commit/e09784e on
    branch https://github.com/HugoWenTD/ann-benchmarks/tree/mariadb-configurable

    All new code of the whole pull request, including one or several files
    that are either new files or modified ones, are contributed under the
    BSD-new license. I am contributing on behalf of my employer Amazon Web
    Services, Inc.
Co-authored-by: Hugo Wen <wenhug@amazon.com>
parent 6aaaf96e
...@@ -402,6 +402,11 @@ The following specify which files/extra groups are read (specified before remain ...@@ -402,6 +402,11 @@ The following specify which files/extra groups are read (specified before remain
height-balanced, DOUBLE_PREC_HB - double precision height-balanced, DOUBLE_PREC_HB - double precision
height-balanced, JSON_HB - height-balanced, stored as height-balanced, JSON_HB - height-balanced, stored as
JSON JSON
--hnsw-ef-constructor
hnsw_ef_constructor
--hnsw-ef-search hnsw_ef_search
--hnsw-max-connection-per-layer
hnsw_max_connection_per_layer
--host-cache-size=# How many host names should be cached to avoid resolving --host-cache-size=# How many host names should be cached to avoid resolving
(Automatically configured unless set explicitly) (Automatically configured unless set explicitly)
--idle-readonly-transaction-timeout=# --idle-readonly-transaction-timeout=#
...@@ -1700,6 +1705,9 @@ gtid-strict-mode FALSE ...@@ -1700,6 +1705,9 @@ gtid-strict-mode FALSE
help TRUE help TRUE
histogram-size 254 histogram-size 254
histogram-type JSON_HB histogram-type JSON_HB
hnsw-ef-constructor 10
hnsw-ef-search 10
hnsw-max-connection-per-layer 50
host-cache-size 279 host-cache-size 279
idle-readonly-transaction-timeout 0 idle-readonly-transaction-timeout 0
idle-transaction-timeout 0 idle-transaction-timeout 0
......
...@@ -1422,6 +1422,36 @@ NUMERIC_BLOCK_SIZE NULL ...@@ -1422,6 +1422,36 @@ NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST SINGLE_PREC_HB,DOUBLE_PREC_HB,JSON_HB ENUM_VALUE_LIST SINGLE_PREC_HB,DOUBLE_PREC_HB,JSON_HB
READ_ONLY NO READ_ONLY NO
COMMAND_LINE_ARGUMENT REQUIRED COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME HNSW_EF_CONSTRUCTOR
VARIABLE_SCOPE SESSION
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT hnsw_ef_constructor
NUMERIC_MIN_VALUE 0
NUMERIC_MAX_VALUE 4294967295
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT NONE
VARIABLE_NAME HNSW_EF_SEARCH
VARIABLE_SCOPE SESSION
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT hnsw_ef_search
NUMERIC_MIN_VALUE 0
NUMERIC_MAX_VALUE 4294967295
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT NONE
VARIABLE_NAME HNSW_MAX_CONNECTION_PER_LAYER
VARIABLE_SCOPE SESSION
VARIABLE_TYPE INT UNSIGNED
VARIABLE_COMMENT hnsw_max_connection_per_layer
NUMERIC_MIN_VALUE 0
NUMERIC_MAX_VALUE 4294967295
NUMERIC_BLOCK_SIZE 1
ENUM_VALUE_LIST NULL
READ_ONLY NO
COMMAND_LINE_ARGUMENT NONE
VARIABLE_NAME HOSTNAME VARIABLE_NAME HOSTNAME
VARIABLE_SCOPE GLOBAL VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE VARCHAR VARIABLE_TYPE VARCHAR
......
...@@ -6620,7 +6620,6 @@ class Item_int_with_ref :public Item_int ...@@ -6620,7 +6620,6 @@ class Item_int_with_ref :public Item_int
#include "item_subselect.h" #include "item_subselect.h"
#include "item_xmlfunc.h" #include "item_xmlfunc.h"
#include "item_jsonfunc.h" #include "item_jsonfunc.h"
#include "item_vectorfunc.h"
#include "item_create.h" #include "item_create.h"
#include "item_vers.h" #include "item_vers.h"
#endif #endif
......
...@@ -36,6 +36,7 @@ ...@@ -36,6 +36,7 @@
#include "sp.h" #include "sp.h"
#include "sql_time.h" #include "sql_time.h"
#include "sql_type_geom.h" #include "sql_type_geom.h"
#include "item_vectorfunc.h"
#include <mysql/plugin_function.h> #include <mysql/plugin_function.h>
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include <my_global.h> #include <my_global.h>
#include "item.h" #include "item.h"
#include "item_vectorfunc.h"
key_map Item_func_vec_distance::part_of_sortkey() const key_map Item_func_vec_distance::part_of_sortkey() const
{ {
...@@ -48,8 +49,18 @@ double Item_func_vec_distance::val_real() ...@@ -48,8 +49,18 @@ double Item_func_vec_distance::val_real()
return 0; return 0;
float *v1= (float*)r1->ptr(); float *v1= (float*)r1->ptr();
float *v2= (float*)r2->ptr(); float *v2= (float*)r2->ptr();
return euclidean_vec_distance(v1, v2, (r1->length()) / sizeof(float));
}
/*
  Euclidean (L2) distance between two float arrays.

  @param v1     first array; v_len elements are read from it
  @param v2     second array; v_len elements are read from it
  @param v_len  number of float elements to compare

  @return square root of the sum of squared element differences

  Each element difference is computed in float; the squares are
  accumulated in a double.
*/
double euclidean_vec_distance(float *v1, float *v2, size_t v_len)
{
float *p1= v1;
float *p2= v2;
double d= 0; double d= 0;
for (uint i=0; i < r1->length() / sizeof(float); i++) for (size_t i= 0; i < v_len; p1++, p2++, i++)
d+= (v1[i] - v2[i])*(v1[i] - v2[i]); {
float dist= *p1 - *p2;
d+= dist * dist;
}
return sqrt(d); return sqrt(d);
} }
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
/* This file defines all vector functions */ /* This file defines all vector functions */
#include <my_global.h>
#include "item.h"
#include "lex_string.h" #include "lex_string.h"
#include "item_func.h" #include "item_func.h"
...@@ -34,6 +36,7 @@ class Item_func_vec_distance: public Item_real_func ...@@ -34,6 +36,7 @@ class Item_func_vec_distance: public Item_real_func
{ {
return check_argument_types_or_binary(NULL, 0, arg_count); return check_argument_types_or_binary(NULL, 0, arg_count);
} }
public: public:
Item_func_vec_distance(THD *thd, Item *a, Item *b) Item_func_vec_distance(THD *thd, Item *a, Item *b)
:Item_real_func(thd, a, b) {} :Item_real_func(thd, a, b) {}
...@@ -51,6 +54,9 @@ class Item_func_vec_distance: public Item_real_func ...@@ -51,6 +54,9 @@ class Item_func_vec_distance: public Item_real_func
key_map part_of_sortkey() const override; key_map part_of_sortkey() const override;
Item *do_get_copy(THD *thd) const override Item *do_get_copy(THD *thd) const override
{ return get_item_copy<Item_func_vec_distance>(thd, this); } { return get_item_copy<Item_func_vec_distance>(thd, this); }
virtual ~Item_func_vec_distance() {};
}; };
double euclidean_vec_distance(float *v1, float *v2, size_t v_len);
#endif #endif
...@@ -9876,7 +9876,7 @@ int TABLE::hlindex_open(uint nr) ...@@ -9876,7 +9876,7 @@ int TABLE::hlindex_open(uint nr)
mysql_mutex_unlock(&s->LOCK_share); mysql_mutex_unlock(&s->LOCK_share);
TABLE *table= (TABLE*)alloc_root(&mem_root, sizeof(*table)); TABLE *table= (TABLE*)alloc_root(&mem_root, sizeof(*table));
if (!table || if (!table ||
open_table_from_share(in_use, s->hlindex, &empty_clex_str, db_stat, 0, open_table_from_share(in_use, s->hlindex, &empty_clex_str, db_stat, EXTRA_RECORD,
in_use->open_options, table, 0)) in_use->open_options, table, 0))
return 1; return 1;
hlindex= table; hlindex= table;
...@@ -9931,7 +9931,7 @@ int TABLE::hlindex_first(uint nr, Item *item, ulonglong limit) ...@@ -9931,7 +9931,7 @@ int TABLE::hlindex_first(uint nr, Item *item, ulonglong limit)
DBUG_ASSERT(hlindex->in_use == in_use); DBUG_ASSERT(hlindex->in_use == in_use);
return mhnsw_first(this, item, limit); return mhnsw_first(this, key_info + s->keys, item, limit);
} }
int TABLE::hlindex_next() int TABLE::hlindex_next()
......
...@@ -918,6 +918,11 @@ typedef struct system_variables ...@@ -918,6 +918,11 @@ typedef struct system_variables
my_bool binlog_alter_two_phase; my_bool binlog_alter_two_phase;
Charset_collation_map_st character_set_collations; Charset_collation_map_st character_set_collations;
/* Temporary for HNSW tests */
uint hnsw_max_connection_per_layer;
uint hnsw_ef_constructor;
uint hnsw_ef_search;
} SV; } SV;
/** /**
......
...@@ -7383,3 +7383,23 @@ static Sys_var_enum Sys_block_encryption_mode( ...@@ -7383,3 +7383,23 @@ static Sys_var_enum Sys_block_encryption_mode(
"AES_ENCRYPT() and AES_DECRYPT() functions", "AES_ENCRYPT() and AES_DECRYPT() functions",
SESSION_VAR(block_encryption_mode), CMD_LINE(REQUIRED_ARG), SESSION_VAR(block_encryption_mode), CMD_LINE(REQUIRED_ARG),
block_encryption_mode_values, DEFAULT(0)); block_encryption_mode_values, DEFAULT(0));
/* Temporary for HNSW tests */
/*
  Session variable hnsw_ef_search: unsigned int, valid range 0..UINT_MAX,
  default 10, block size 1. The help text currently only repeats the
  variable name.

  Presumably the "ef" candidate-list size used while searching the HNSW
  graph -- confirm against the mhnsw implementation.

  NOTE(review): CMD_LINE(NO_ARG) registers the command-line option as
  taking no value, which looks unintended for a numeric variable
  (REQUIRED_ARG is probably meant) -- confirm.
  NOTE(review): 0 is accepted as a valid value; confirm it is meaningful.
*/
static Sys_var_uint Sys_hnsw_ef_search(
"hnsw_ef_search",
"hnsw_ef_search",
SESSION_VAR(hnsw_ef_search), CMD_LINE(NO_ARG),
VALID_RANGE(0, UINT_MAX), DEFAULT(10),
BLOCK_SIZE(1));
/*
  Session variable hnsw_ef_constructor: unsigned int, valid range
  0..UINT_MAX, default 10, block size 1. The help text currently only
  repeats the variable name.

  Presumably the "ef" candidate-list size used while building (inserting
  into) the HNSW graph -- confirm against the mhnsw implementation.

  NOTE(review): CMD_LINE(NO_ARG) registers the command-line option as
  taking no value, which looks unintended for a numeric variable
  (REQUIRED_ARG is probably meant) -- confirm.
  NOTE(review): 0 is accepted as a valid value; confirm it is meaningful.
*/
static Sys_var_uint Sys_hnsw_ef_constructor(
"hnsw_ef_constructor",
"hnsw_ef_constructor",
SESSION_VAR(hnsw_ef_constructor), CMD_LINE(NO_ARG),
VALID_RANGE(0, UINT_MAX), DEFAULT(10),
BLOCK_SIZE(1));
/*
  Session variable hnsw_max_connection_per_layer: unsigned int, valid
  range 0..UINT_MAX, default 50, block size 1. The help text currently
  only repeats the variable name.

  Presumably the maximum number of neighbors kept per node on each HNSW
  layer -- confirm against the mhnsw implementation.

  NOTE(review): CMD_LINE(NO_ARG) registers the command-line option as
  taking no value, which looks unintended for a numeric variable
  (REQUIRED_ARG is probably meant) -- confirm.
  NOTE(review): 0 is accepted as a valid value; confirm it is meaningful.
*/
static Sys_var_uint Sys_hnsw_max_connection_per_layer(
"hnsw_max_connection_per_layer",
"hnsw_max_connection_per_layer",
SESSION_VAR(hnsw_max_connection_per_layer), CMD_LINE(NO_ARG),
VALID_RANGE(0, UINT_MAX), DEFAULT(50),
BLOCK_SIZE(1));
This diff is collapsed.
...@@ -15,10 +15,14 @@ ...@@ -15,10 +15,14 @@
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA
*/ */
#include <my_global.h>
#include "item.h"
#include "m_string.h"
#include "structs.h"
#include "table.h" #include "table.h"
extern const LEX_CSTRING mhnsw_hlindex_table; extern const LEX_CSTRING mhnsw_hlindex_table;
int mhnsw_insert(TABLE *table, KEY *keyinfo); int mhnsw_insert(TABLE *table, KEY *keyinfo);
int mhnsw_first(TABLE *table, Item *dist, ulonglong limit); int mhnsw_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit);
int mhnsw_next(TABLE *table); int mhnsw_next(TABLE *table);
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment