Commit 48f4e9f9 authored by Sergei Golubchik's avatar Sergei Golubchik

mhnsw: store coordinates in 16 bits, not 32

use int16_t instead of floats, they're faster and smaller.
but perform intermediate SIMD calculations with floats to avoid overflows.
recall drop with such scheme is below 0.002, often none.

int8_t would've been better but the precision loss is too big
and recall degrades too much.
parent 96210d23
...@@ -29,10 +29,6 @@ ulonglong mhnsw_cache_size; ...@@ -29,10 +29,6 @@ ulonglong mhnsw_cache_size;
static constexpr float alpha = 1.1f; static constexpr float alpha = 1.1f;
static constexpr uint ef_construction= 10; static constexpr uint ef_construction= 10;
// SIMD definitions
#define SIMD_word (256/8)
#define SIMD_floats (SIMD_word/sizeof(float))
enum Graph_table_fields { enum Graph_table_fields {
FIELD_LAYER, FIELD_TREF, FIELD_VEC, FIELD_NEIGHBORS FIELD_LAYER, FIELD_TREF, FIELD_VEC, FIELD_NEIGHBORS
}; };
...@@ -44,19 +40,110 @@ class MHNSW_Context; ...@@ -44,19 +40,110 @@ class MHNSW_Context;
class FVectorNode; class FVectorNode;
/* /*
One vector, an array of ctx->vec_len floats One vector, an array of coordinates in ctx->vec_len dimensions
Aligned on 32-byte (SIMD_word) boundary for SIMD, vector lenght
is zero-padded to multiples of 8, for the same reason.
*/ */
class FVector #pragma pack(push, 1)
struct FVector
{ {
public: static constexpr size_t data_header= sizeof(float);
FVector(MHNSW_Context *ctx_, MEM_ROOT *root, const void *vec_); static constexpr size_t alloc_header= data_header + sizeof(float);
float *vec;
protected: float abs2, scale;
FVector() : vec(nullptr) {} int16_t dims[4];
uchar *data() const { return (uchar*)(&scale); }
static size_t data_size(size_t n)
{ return data_header + n*2; }
static size_t data_to_value_size(size_t data_size)
{ return (data_size - data_header)*2; }
static const FVector *create(void *mem, const void *src, size_t src_len)
{
float scale=0, *v= (float *)src;
size_t vec_len= src_len / sizeof(float);
for (size_t i= 0; i < vec_len; i++)
if (std::abs(scale) < std::abs(v[i]))
scale= v[i];
FVector *vec= align_ptr(mem);
vec->scale= scale ? scale/32767 : 1;
for (size_t i= 0; i < vec_len; i++)
vec->dims[i] = static_cast<int16_t>(std::round(v[i] / vec->scale));
vec->postprocess(vec_len);
return vec;
}
void postprocess(size_t vec_len)
{
fix_tail(vec_len);
abs2= scale * scale * dot_product(dims, dims, vec_len) / 2;
}
#ifdef INTEL_SIMD_IMPLEMENTATION
/************* AVX2 *****************************************************/
static constexpr size_t AVX2_bytes= 256/8;
static constexpr size_t AVX2_dims= AVX2_bytes/sizeof(int16_t);
INTEL_SIMD_IMPLEMENTATION
static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
{
typedef float v8f __attribute__((vector_size(AVX2_bytes)));
union { v8f v; __m256 i; } tmp;
__m256i *p1= (__m256i*)v1;
__m256i *p2= (__m256i*)v2;
v8f d= {0};
for (size_t i= 0; i < (len + AVX2_dims-1)/AVX2_dims; p1++, p2++, i++)
{
tmp.i= _mm256_cvtepi32_ps(_mm256_madd_epi16(*p1, *p2));
d+= tmp.v;
}
return d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
}
INTEL_SIMD_IMPLEMENTATION
static size_t alloc_size(size_t n)
{ return alloc_header + MY_ALIGN(n*2, AVX2_bytes) + AVX2_bytes - 1; }
INTEL_SIMD_IMPLEMENTATION
static FVector *align_ptr(void *ptr)
{ return (FVector*)(MY_ALIGN(((intptr)ptr) + alloc_header, AVX2_bytes)
- alloc_header); }
INTEL_SIMD_IMPLEMENTATION
void fix_tail(size_t vec_len)
{
bzero(dims + vec_len, (MY_ALIGN(vec_len, AVX2_dims) - vec_len)*2);
}
#endif
/************* no-SIMD default ******************************************/
DEFAULT_IMPLEMENTATION
static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
{
int64_t d= 0;
for (size_t i= 0; i < len; i++)
d+= int32_t(v1[i]) * int32_t(v2[i]);
return static_cast<float>(d);
}
DEFAULT_IMPLEMENTATION
static size_t alloc_size(size_t n) { return alloc_header + n*2; }
DEFAULT_IMPLEMENTATION
static FVector *align_ptr(void *ptr) { return (FVector*)ptr; }
DEFAULT_IMPLEMENTATION
void fix_tail(size_t) { }
float distance_to(const FVector *other, size_t vec_len) const
{
return abs2 + other->abs2 - scale * other->scale *
dot_product(dims, other->dims, vec_len);
}
}; };
#pragma pack(pop)
/* /*
An array of pointers to graph nodes An array of pointers to graph nodes
...@@ -86,30 +173,6 @@ struct Neighborhood: public Sql_alloc ...@@ -86,30 +173,6 @@ struct Neighborhood: public Sql_alloc
}; };
#ifdef INTEL_SIMD_IMPLEMENTATION
INTEL_SIMD_IMPLEMENTATION
float vec_distance(float *v1, float *v2, size_t len)
{
typedef float v8f __attribute__((vector_size(SIMD_word)));
v8f *p1= (v8f*)v1;
v8f *p2= (v8f*)v2;
v8f d= {0};
for (size_t i= 0; i < len/SIMD_floats; p1++, p2++, i++)
{
v8f dist= *p1 - *p2;
d+= dist * dist;
}
return d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
}
#endif
DEFAULT_IMPLEMENTATION
float vec_distance(float *v1, float *v2, size_t len)
{
return euclidean_vec_distance(v1, v2, len);
}
/* /*
One node in a graph = one row in the graph table One node in a graph = one row in the graph table
...@@ -132,14 +195,15 @@ float vec_distance(float *v1, float *v2, size_t len) ...@@ -132,14 +195,15 @@ float vec_distance(float *v1, float *v2, size_t len)
is constrained by mhnsw_cache_size, so every byte matters here is constrained by mhnsw_cache_size, so every byte matters here
*/ */
#pragma pack(push, 1) #pragma pack(push, 1)
class FVectorNode: public FVector class FVectorNode
{ {
private: private:
MHNSW_Context *ctx; MHNSW_Context *ctx;
float *make_vec(const void *v); const FVector *make_vec(const void *v);
int alloc_neighborhood(uint8_t layer); int alloc_neighborhood(uint8_t layer);
public: public:
const FVector *vec= nullptr;
Neighborhood *neighbors= nullptr; Neighborhood *neighbors= nullptr;
uint8_t max_layer; uint8_t max_layer;
bool stored:1, deleted:1; bool stored:1, deleted:1;
...@@ -147,7 +211,7 @@ class FVectorNode: public FVector ...@@ -147,7 +211,7 @@ class FVectorNode: public FVector
FVectorNode(MHNSW_Context *ctx_, const void *gref_); FVectorNode(MHNSW_Context *ctx_, const void *gref_);
FVectorNode(MHNSW_Context *ctx_, const void *tref_, uint8_t layer, FVectorNode(MHNSW_Context *ctx_, const void *tref_, uint8_t layer,
const void *vec_); const void *vec_);
float distance_to(const FVector &other) const; float distance_to(const FVector *other) const;
int load(TABLE *graph); int load(TABLE *graph);
int load_from_record(TABLE *graph); int load_from_record(TABLE *graph);
int save(TABLE *graph); int save(TABLE *graph);
...@@ -192,7 +256,7 @@ class MHNSW_Context : public Sql_alloc ...@@ -192,7 +256,7 @@ class MHNSW_Context : public Sql_alloc
void *alloc_node_internal() void *alloc_node_internal()
{ {
return alloc_root(&root, sizeof(FVectorNode) + gref_len + tref_len return alloc_root(&root, sizeof(FVectorNode) + gref_len + tref_len
+ vec_len * sizeof(float) + SIMD_word - 1); + FVector::alloc_size(vec_len));
} }
protected: protected:
...@@ -252,7 +316,7 @@ class MHNSW_Context : public Sql_alloc ...@@ -252,7 +316,7 @@ class MHNSW_Context : public Sql_alloc
void set_lengths(size_t len) void set_lengths(size_t len)
{ {
byte_len= len; byte_len= len;
vec_len= MY_ALIGN(byte_len/sizeof(float), SIMD_floats); vec_len= len / sizeof(float);
} }
static int acquire(MHNSW_Context **ctx, TABLE *table, bool for_update); static int acquire(MHNSW_Context **ctx, TABLE *table, bool for_update);
...@@ -505,42 +569,26 @@ int MHNSW_Context::acquire(MHNSW_Context **ctx, TABLE *table, bool for_update) ...@@ -505,42 +569,26 @@ int MHNSW_Context::acquire(MHNSW_Context **ctx, TABLE *table, bool for_update)
return err; return err;
graph->file->position(graph->record[0]); graph->file->position(graph->record[0]);
(*ctx)->set_lengths(graph->field[FIELD_VEC]->value_length()); (*ctx)->set_lengths(FVector::data_to_value_size(graph->field[FIELD_VEC]->value_length()));
(*ctx)->start= (*ctx)->get_node(graph->file->ref); (*ctx)->start= (*ctx)->get_node(graph->file->ref);
return (*ctx)->start->load_from_record(graph); return (*ctx)->start->load_from_record(graph);
} }
/* copy the vector, aligned and padded for SIMD */ /* copy the vector, preprocessed as needed */
static float *make_vec(void *mem, const void *src, size_t src_len) const FVector *FVectorNode::make_vec(const void *v)
{
auto dst= (float*)MY_ALIGN((intptr)mem, SIMD_word);
memcpy(dst, src, src_len);
const size_t start= src_len/sizeof(float);
for (size_t i= start; i < MY_ALIGN(start, SIMD_floats); i++)
dst[i]=0.0f;
return dst;
}
FVector::FVector(MHNSW_Context *ctx, MEM_ROOT *root, const void *vec_)
{
vec= make_vec(alloc_root(root, ctx->vec_len * sizeof(float) + SIMD_word - 1),
vec_, ctx->byte_len);
}
float *FVectorNode::make_vec(const void *v)
{ {
return ::make_vec(tref() + tref_len(), v, ctx->byte_len); return FVector::create(tref() + tref_len(), v, ctx->byte_len);
} }
FVectorNode::FVectorNode(MHNSW_Context *ctx_, const void *gref_) FVectorNode::FVectorNode(MHNSW_Context *ctx_, const void *gref_)
: FVector(), ctx(ctx_), stored(true), deleted(false) : ctx(ctx_), stored(true), deleted(false)
{ {
memcpy(gref(), gref_, gref_len()); memcpy(gref(), gref_, gref_len());
} }
FVectorNode::FVectorNode(MHNSW_Context *ctx_, const void *tref_, uint8_t layer, FVectorNode::FVectorNode(MHNSW_Context *ctx_, const void *tref_, uint8_t layer,
const void *vec_) const void *vec_)
: FVector(), ctx(ctx_), stored(false), deleted(false) : ctx(ctx_), stored(false), deleted(false)
{ {
DBUG_ASSERT(tref_); DBUG_ASSERT(tref_);
memset(gref(), 0xff, gref_len()); // important: larger than any real gref memset(gref(), 0xff, gref_len()); // important: larger than any real gref
...@@ -550,9 +598,9 @@ FVectorNode::FVectorNode(MHNSW_Context *ctx_, const void *tref_, uint8_t layer, ...@@ -550,9 +598,9 @@ FVectorNode::FVectorNode(MHNSW_Context *ctx_, const void *tref_, uint8_t layer,
alloc_neighborhood(layer); alloc_neighborhood(layer);
} }
float FVectorNode::distance_to(const FVector &other) const float FVectorNode::distance_to(const FVector *other) const
{ {
return vec_distance(vec, other.vec, ctx->vec_len); return vec->distance_to(other, ctx->vec_len);
} }
int FVectorNode::alloc_neighborhood(uint8_t layer) int FVectorNode::alloc_neighborhood(uint8_t layer)
...@@ -603,9 +651,11 @@ int FVectorNode::load_from_record(TABLE *graph) ...@@ -603,9 +651,11 @@ int FVectorNode::load_from_record(TABLE *graph)
if (unlikely(!v)) if (unlikely(!v))
return my_errno= HA_ERR_CRASHED; return my_errno= HA_ERR_CRASHED;
if (v->length() != ctx->byte_len) if (v->length() != FVector::data_size(ctx->vec_len))
return my_errno= HA_ERR_CRASHED; return my_errno= HA_ERR_CRASHED;
float *vec_ptr= make_vec(v->ptr()); FVector *vec_ptr= FVector::align_ptr(tref() + tref_len());
memcpy(vec_ptr->data(), v->ptr(), v->length());
vec_ptr->postprocess(ctx->vec_len);
longlong layer= graph->field[FIELD_LAYER]->val_int(); longlong layer= graph->field[FIELD_LAYER]->val_int();
if (layer > 100) // 10e30 nodes at M=2, more at larger M's if (layer > 100) // 10e30 nodes at M=2, more at larger M's
...@@ -676,13 +726,13 @@ struct Visited : public Sql_alloc ...@@ -676,13 +726,13 @@ struct Visited : public Sql_alloc
class VisitedSet class VisitedSet
{ {
MEM_ROOT *root; MEM_ROOT *root;
const FVector &target; const FVector *target;
PatternedSimdBloomFilter<FVectorNode> map; PatternedSimdBloomFilter<FVectorNode> map;
const FVectorNode *nodes[8]= {0,0,0,0,0,0,0,0}; const FVectorNode *nodes[8]= {0,0,0,0,0,0,0,0};
size_t idx= 1; // to record 0 in the filter size_t idx= 1; // to record 0 in the filter
public: public:
uint count= 0; uint count= 0;
VisitedSet(MEM_ROOT *root, const FVector &target, uint size) : VisitedSet(MEM_ROOT *root, const FVector *target, uint size) :
root(root), target(target), map(size, 0.01f) {} root(root), target(target), map(size, 0.01f) {}
Visited *create(FVectorNode *node) Visited *create(FVectorNode *node)
{ {
...@@ -730,10 +780,10 @@ static int select_neighbors(MHNSW_Context *ctx, TABLE *graph, size_t layer, ...@@ -730,10 +780,10 @@ static int select_neighbors(MHNSW_Context *ctx, TABLE *graph, size_t layer,
FVectorNode *node= candidates.links[i]; FVectorNode *node= candidates.links[i];
if (int err= node->load(graph)) if (int err= node->load(graph))
return err; return err;
pq.push(new (root) Visited(node, node->distance_to(target))); pq.push(new (root) Visited(node, node->distance_to(target.vec)));
} }
if (extra_candidate) if (extra_candidate)
pq.push(new (root) Visited(extra_candidate, extra_candidate->distance_to(target))); pq.push(new (root) Visited(extra_candidate, extra_candidate->distance_to(target.vec)));
DBUG_ASSERT(pq.elements()); DBUG_ASSERT(pq.elements());
neighbors.num= 0; neighbors.num= 0;
...@@ -745,7 +795,7 @@ static int select_neighbors(MHNSW_Context *ctx, TABLE *graph, size_t layer, ...@@ -745,7 +795,7 @@ static int select_neighbors(MHNSW_Context *ctx, TABLE *graph, size_t layer,
const float target_dista= vec->distance_to_target / alpha; const float target_dista= vec->distance_to_target / alpha;
bool discard= false; bool discard= false;
for (size_t i=0; i < neighbors.num; i++) for (size_t i=0; i < neighbors.num; i++)
if ((discard= node->distance_to(*neighbors.links[i]) < target_dista)) if ((discard= node->distance_to(neighbors.links[i]->vec) < target_dista))
break; break;
if (!discard) if (!discard)
target.push_neighbor(layer, node); target.push_neighbor(layer, node);
...@@ -774,7 +824,7 @@ int FVectorNode::save(TABLE *graph) ...@@ -774,7 +824,7 @@ int FVectorNode::save(TABLE *graph)
graph->field[FIELD_TREF]->set_notnull(); graph->field[FIELD_TREF]->set_notnull();
graph->field[FIELD_TREF]->store_binary(tref(), tref_len()); graph->field[FIELD_TREF]->store_binary(tref(), tref_len());
} }
graph->field[FIELD_VEC]->store_binary((uchar*)vec, ctx->byte_len); graph->field[FIELD_VEC]->store_binary(vec->data(), FVector::data_size(ctx->vec_len));
size_t total_size= 0; size_t total_size= 0;
for (size_t i=0; i <= max_layer; i++) for (size_t i=0; i <= max_layer; i++)
...@@ -834,7 +884,7 @@ static int update_second_degree_neighbors(MHNSW_Context *ctx, TABLE *graph, ...@@ -834,7 +884,7 @@ static int update_second_degree_neighbors(MHNSW_Context *ctx, TABLE *graph,
return 0; return 0;
} }
static int search_layer(MHNSW_Context *ctx, TABLE *graph, const FVector &target, static int search_layer(MHNSW_Context *ctx, TABLE *graph, const FVector *target,
Neighborhood *start_nodes, uint ef, size_t layer, Neighborhood *start_nodes, uint ef, size_t layer,
Neighborhood *result, bool skip_deleted) Neighborhood *result, bool skip_deleted)
{ {
...@@ -998,8 +1048,8 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo) ...@@ -998,8 +1048,8 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
for (cur_layer= max_layer; cur_layer > target_layer; cur_layer--) for (cur_layer= max_layer; cur_layer > target_layer; cur_layer--)
{ {
if (int err= search_layer(ctx, graph, *target, &start_nodes, 1, cur_layer, if (int err= search_layer(ctx, graph, target->vec, &start_nodes, 1,
&candidates, false)) cur_layer, &candidates, false))
return err; return err;
std::swap(start_nodes, candidates); std::swap(start_nodes, candidates);
} }
...@@ -1007,7 +1057,7 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo) ...@@ -1007,7 +1057,7 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
for (; cur_layer >= 0; cur_layer--) for (; cur_layer >= 0; cur_layer--)
{ {
uint max_neighbors= ctx->max_neighbors(cur_layer); uint max_neighbors= ctx->max_neighbors(cur_layer);
if (int err= search_layer(ctx, graph, *target, &start_nodes, if (int err= search_layer(ctx, graph, target->vec, &start_nodes,
ef_construction, cur_layer, &candidates, false)) ef_construction, cur_layer, &candidates, false))
return err; return err;
...@@ -1063,13 +1113,19 @@ int mhnsw_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit) ...@@ -1063,13 +1113,19 @@ int mhnsw_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit)
/* /*
if the query vector is NULL or invalid, VEC_DISTANCE will return if the query vector is NULL or invalid, VEC_DISTANCE will return
NULL, so the result is basically unsorted, we can return rows NULL, so the result is basically unsorted, we can return rows
in any order. For simplicity let's sort by the start_node. in any order. Let's use some hardcoded value here
*/ */
if (!res || ctx->byte_len != res->length()) if (!res || ctx->byte_len != res->length())
(res= &buf)->set((char*)start_nodes.links[0]->vec, ctx->byte_len, &my_charset_bin); {
res= &buf;
buf.alloc(ctx->byte_len);
for (size_t i=0; i < ctx->vec_len; i++)
((float*)buf.ptr())[i]= i == 0;
}
const longlong max_layer= start_nodes.links[0]->max_layer; const longlong max_layer= start_nodes.links[0]->max_layer;
FVector target(ctx, thd->mem_root, res->ptr()); auto target= FVector::create(thd->alloc(FVector::alloc_size(ctx->vec_len)),
res->ptr(), res->length());
if (int err= graph->file->ha_rnd_init(0)) if (int err= graph->file->ha_rnd_init(0))
return err; return err;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment