Commit 1bca1fc5 authored by Sergei Golubchik's avatar Sergei Golubchik

mhnsw: store coordinates in 16 bits, not 32

use int16_t instead of floats, they're faster and smaller.
but perform intermediate SIMD calculations with floats to avoid overflows.
recall drop with such scheme is below 0.002, often none.

int8_t would've been better but the precision loss is too big
and recall degrades too much.
parent d029a2e0
...@@ -38,8 +38,8 @@ static const double alpha = 1.1; ...@@ -38,8 +38,8 @@ static const double alpha = 1.1;
static const uint clo_nei_threshold= 10000; static const uint clo_nei_threshold= 10000;
// SIMD definitions // SIMD definitions
#define SIMD_word (256/8) #define SIMD_bytes (256/8)
#define SIMD_floats (SIMD_word/sizeof(float)) #define SIMD_dims (SIMD_bytes/sizeof(int16_t))
enum Graph_table_fields { enum Graph_table_fields {
FIELD_LAYER, FIELD_TREF, FIELD_VEC, FIELD_NEIGHBORS FIELD_LAYER, FIELD_TREF, FIELD_VEC, FIELD_NEIGHBORS
...@@ -51,17 +51,26 @@ enum Graph_table_indices { ...@@ -51,17 +51,26 @@ enum Graph_table_indices {
class MHNSW_Context; class MHNSW_Context;
class FVectorNode; class FVectorNode;
/* #pragma pack(push, 1)
One vector, an array of ctx->vec_len floats struct vector
{
float abs2, scale;
int16_t dims[4];
static constexpr size_t alloc_size(size_t n)
{ return sizeof(float)*2+MY_ALIGN(n*2, SIMD_bytes) + SIMD_bytes - 1; }
static constexpr size_t data_size(size_t n)
{ return sizeof(float)*2+n*2; }
};
#pragma pack(pop)
Aligned on 32-byte (SIMD_word) boundary for SIMD, vector lenght /*
is zero-padded to multiples of 8, for the same reason. One vector, an array of coordinates in ctx->vec_len dimensions
*/ */
class FVector class FVector
{ {
public: public:
FVector(MHNSW_Context *ctx_, MEM_ROOT *root, const void *vec_); FVector(MHNSW_Context *ctx_, MEM_ROOT *root, const void *vec_);
float *vec; vector *vec;
protected: protected:
FVector() : vec(nullptr) {} FVector() : vec(nullptr) {}
}; };
...@@ -98,26 +107,30 @@ struct Neighborhood: public Sql_alloc ...@@ -98,26 +107,30 @@ struct Neighborhood: public Sql_alloc
#if __GNUC__ > 7 #if __GNUC__ > 7
__attribute__ ((target ("avx2,avx,fma"))) __attribute__ ((target ("avx2,avx")))
float vec_distance(float *v1, float *v2, size_t len) float dot_product(int16_t *v1, int16_t *v2, size_t len)
{ {
typedef float v8f __attribute__((vector_size(SIMD_word))); typedef float v8f __attribute__((vector_size(SIMD_bytes)));
v8f *p1= (v8f*)v1; union { v8f v; __m256 i; } tmp;
v8f *p2= (v8f*)v2; __m256i *p1= (__m256i*)v1;
__m256i *p2= (__m256i*)v2;
v8f d= {0}; v8f d= {0};
for (size_t i= 0; i < len/SIMD_floats; p1++, p2++, i++) for (size_t i= 0; i < len/SIMD_dims; p1++, p2++, i++)
{ {
v8f dist= *p1 - *p2; tmp.i= _mm256_cvtepi32_ps(_mm256_madd_epi16(*p1, *p2));
d+= dist * dist; d+= tmp.v;
} }
return d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7]; return d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
} }
#endif #endif
__attribute__ ((target ("default"))) __attribute__ ((target ("default")))
float vec_distance(float *v1, float *v2, size_t len) float dot_product(int16_t *v1, int16_t *v2, size_t len)
{ {
return euclidean_vec_distance(v1, v2, len); int64_t d= 0;
for (size_t i= 0; i < len; i++)
d+= int32_t(v1[i]) * int32_t(v2[i]);
return d;
} }
...@@ -148,7 +161,7 @@ class FVectorNode: public FVector ...@@ -148,7 +161,7 @@ class FVectorNode: public FVector
private: private:
MHNSW_Context *ctx; MHNSW_Context *ctx;
float *make_vec(const void *v); vector *make_vec(const void *v);
int alloc_neighborhood(size_t layer); int alloc_neighborhood(size_t layer);
public: public:
Neighborhood *neighbors= nullptr; Neighborhood *neighbors= nullptr;
...@@ -204,7 +217,7 @@ class MHNSW_Context : public Sql_alloc ...@@ -204,7 +217,7 @@ class MHNSW_Context : public Sql_alloc
void *alloc_node_internal() void *alloc_node_internal()
{ {
return alloc_root(&root, sizeof(FVectorNode) + gref_len + tref_len return alloc_root(&root, sizeof(FVectorNode) + gref_len + tref_len
+ vec_len * sizeof(float) + SIMD_word - 1); + vector::alloc_size(vec_len));
} }
protected: protected:
...@@ -276,7 +289,7 @@ class MHNSW_Context : public Sql_alloc ...@@ -276,7 +289,7 @@ class MHNSW_Context : public Sql_alloc
void set_lengths(size_t len) void set_lengths(size_t len)
{ {
byte_len= len; byte_len= len;
vec_len= MY_ALIGN(byte_len/sizeof(float), SIMD_floats); vec_len= len / sizeof(float);
} }
static int acquire(MHNSW_Context **ctx, TABLE *table, bool for_update); static int acquire(MHNSW_Context **ctx, TABLE *table, bool for_update);
...@@ -522,29 +535,39 @@ int MHNSW_Context::acquire(MHNSW_Context **ctx, TABLE *table, bool for_update) ...@@ -522,29 +535,39 @@ int MHNSW_Context::acquire(MHNSW_Context **ctx, TABLE *table, bool for_update)
return err; return err;
graph->file->position(graph->record[0]); graph->file->position(graph->record[0]);
(*ctx)->set_lengths(graph->field[FIELD_VEC]->value_length()); (*ctx)->set_lengths((graph->field[FIELD_VEC]->value_length()-8)*2);
(*ctx)->start= (*ctx)->get_node(graph->file->ref); (*ctx)->start= (*ctx)->get_node(graph->file->ref);
return (*ctx)->start->load_from_record(graph); return (*ctx)->start->load_from_record(graph);
} }
/* copy the vector, aligned and padded for SIMD */ /* copy the vector, preprocessed as needed */
static float *make_vec(void *mem, const void *src, size_t src_len) static vector *make_vec(void *mem, const void *src, size_t src_len)
{ {
auto dst= (float*)MY_ALIGN((intptr)mem, SIMD_word); auto vec= (vector*)(MY_ALIGN(((intptr)mem)+8, SIMD_bytes) - 8);
memcpy(dst, src, src_len); float abs2= 0, scale=0, *v= (float *)src;
const size_t start= src_len/sizeof(float); size_t vec_len= src_len / sizeof(float);
for (size_t i= start; i < MY_ALIGN(start, SIMD_floats); i++) for (size_t i= 0; i < vec_len; i++)
dst[i]=0.0f; {
return dst; abs2+= v[i]*v[i];
if (std::abs(scale) < std::abs(v[i]))
scale= v[i];
}
vec->abs2= abs2/2;
vec->scale= scale ? scale/32767 : 1;
for (size_t i= 0; i < vec_len; i++)
vec->dims[i] = std::round(v[i] / vec->scale);
bzero(vec->dims + vec_len, (MY_ALIGN(vec_len, SIMD_dims) - vec_len)*2);
return vec;
} }
FVector::FVector(MHNSW_Context *ctx, MEM_ROOT *root, const void *vec_) FVector::FVector(MHNSW_Context *ctx, MEM_ROOT *root, const void *vec_)
{ {
vec= make_vec(alloc_root(root, ctx->vec_len * sizeof(float) + SIMD_word - 1), vec= make_vec(alloc_root(root, vector::alloc_size(ctx->vec_len)),
vec_, ctx->byte_len); vec_, ctx->byte_len);
} }
float *FVectorNode::make_vec(const void *v) vector *FVectorNode::make_vec(const void *v)
{ {
return ::make_vec(tref() + tref_len(), v, ctx->byte_len); return ::make_vec(tref() + tref_len(), v, ctx->byte_len);
} }
...@@ -569,7 +592,8 @@ FVectorNode::FVectorNode(MHNSW_Context *ctx_, const void *tref_, size_t layer, ...@@ -569,7 +592,8 @@ FVectorNode::FVectorNode(MHNSW_Context *ctx_, const void *tref_, size_t layer,
float FVectorNode::distance_to(const FVector &other) const float FVectorNode::distance_to(const FVector &other) const
{ {
return vec_distance(vec, other.vec, ctx->vec_len); return vec->abs2 + other.vec->abs2 - vec->scale * other.vec->scale *
dot_product(vec->dims, other.vec->dims, MY_ALIGN(ctx->vec_len, SIMD_dims));
} }
int FVectorNode::alloc_neighborhood(size_t layer) int FVectorNode::alloc_neighborhood(size_t layer)
...@@ -620,9 +644,11 @@ int FVectorNode::load_from_record(TABLE *graph) ...@@ -620,9 +644,11 @@ int FVectorNode::load_from_record(TABLE *graph)
if (unlikely(!v)) if (unlikely(!v))
return my_errno= HA_ERR_CRASHED; return my_errno= HA_ERR_CRASHED;
if (v->length() != ctx->byte_len) if (v->length() != vector::data_size(ctx->vec_len))
return my_errno= HA_ERR_CRASHED; return my_errno= HA_ERR_CRASHED;
float *vec_ptr= make_vec(v->ptr()); auto vec_ptr= (vector*)(MY_ALIGN(((intptr)tref() + tref_len())+8, SIMD_bytes) - 8);
memcpy(vec_ptr, v->ptr(), v->length());
bzero(vec_ptr->dims + ctx->vec_len, 2*(MY_ALIGN(ctx->vec_len, SIMD_dims) - ctx->vec_len));
size_t layer= graph->field[FIELD_LAYER]->val_int(); size_t layer= graph->field[FIELD_LAYER]->val_int();
if (layer > 100) // 10e30 nodes at M=2, more at larger M's if (layer > 100) // 10e30 nodes at M=2, more at larger M's
...@@ -803,7 +829,7 @@ int FVectorNode::save(TABLE *graph) ...@@ -803,7 +829,7 @@ int FVectorNode::save(TABLE *graph)
graph->field[FIELD_TREF]->set_notnull(); graph->field[FIELD_TREF]->set_notnull();
graph->field[FIELD_TREF]->store_binary(tref(), tref_len()); graph->field[FIELD_TREF]->store_binary(tref(), tref_len());
} }
graph->field[FIELD_VEC]->store_binary((uchar*)vec, ctx->byte_len); graph->field[FIELD_VEC]->store_binary((uchar*)vec, vector::data_size(ctx->vec_len));
size_t total_size= 0; size_t total_size= 0;
for (size_t i=0; i <= max_layer; i++) for (size_t i=0; i <= max_layer; i++)
...@@ -1095,10 +1121,15 @@ int mhnsw_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit) ...@@ -1095,10 +1121,15 @@ int mhnsw_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit)
/* /*
if the query vector is NULL or invalid, VEC_DISTANCE will return if the query vector is NULL or invalid, VEC_DISTANCE will return
NULL, so the result is basically unsorted, we can return rows NULL, so the result is basically unsorted, we can return rows
in any order. For simplicity let's sort by the start_node. in any order. Let's use some hardcoded value here
*/ */
if (!res || ctx->byte_len != res->length()) if (!res || ctx->byte_len != res->length())
(res= &buf)->set((char*)start_nodes.links[0]->vec, ctx->byte_len, &my_charset_bin); {
res= &buf;
buf.alloc(ctx->byte_len);
for (size_t i=0; i < ctx->vec_len; i++)
((float*)buf.ptr())[i]= i == 0;
}
const longlong max_layer= start_nodes.links[0]->max_layer; const longlong max_layer= start_nodes.links[0]->max_layer;
FVector target(ctx, thd->mem_root, res->ptr()); FVector target(ctx, thd->mem_root, res->ptr());
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment