Commit 1bca1fc5 authored by Sergei Golubchik's avatar Sergei Golubchik

mhnsw: store coordinates in 16 bits, not 32

use int16_t instead of floats, they're faster and smaller.
but perform intermediate SIMD calculations with floats to avoid overflows.
recall drop with such scheme is below 0.002, often none.

int8_t would've been better but the precision loss is too big
and recall degrades too much.
parent d029a2e0
......@@ -38,8 +38,8 @@ static const double alpha = 1.1;
static const uint clo_nei_threshold= 10000;
// SIMD definitions
#define SIMD_word (256/8)
#define SIMD_floats (SIMD_word/sizeof(float))
#define SIMD_bytes (256/8)
#define SIMD_dims (SIMD_bytes/sizeof(int16_t))
enum Graph_table_fields {
FIELD_LAYER, FIELD_TREF, FIELD_VEC, FIELD_NEIGHBORS
......@@ -51,17 +51,26 @@ enum Graph_table_indices {
class MHNSW_Context;
class FVectorNode;
/*
One vector, an array of ctx->vec_len floats
#pragma pack(push, 1)
struct vector
{
float abs2, scale;
int16_t dims[4];
static constexpr size_t alloc_size(size_t n)
{ return sizeof(float)*2+MY_ALIGN(n*2, SIMD_bytes) + SIMD_bytes - 1; }
static constexpr size_t data_size(size_t n)
{ return sizeof(float)*2+n*2; }
};
#pragma pack(pop)
Aligned on 32-byte (SIMD_word) boundary for SIMD, vector lenght
is zero-padded to multiples of 8, for the same reason.
/*
One vector, an array of coordinates in ctx->vec_len dimensions
*/
class FVector
{
public:
FVector(MHNSW_Context *ctx_, MEM_ROOT *root, const void *vec_);
float *vec;
vector *vec;
protected:
FVector() : vec(nullptr) {}
};
......@@ -98,26 +107,30 @@ struct Neighborhood: public Sql_alloc
#if __GNUC__ > 7
__attribute__ ((target ("avx2,avx,fma")))
float vec_distance(float *v1, float *v2, size_t len)
__attribute__ ((target ("avx2,avx")))
float dot_product(int16_t *v1, int16_t *v2, size_t len)
{
typedef float v8f __attribute__((vector_size(SIMD_word)));
v8f *p1= (v8f*)v1;
v8f *p2= (v8f*)v2;
typedef float v8f __attribute__((vector_size(SIMD_bytes)));
union { v8f v; __m256 i; } tmp;
__m256i *p1= (__m256i*)v1;
__m256i *p2= (__m256i*)v2;
v8f d= {0};
for (size_t i= 0; i < len/SIMD_floats; p1++, p2++, i++)
for (size_t i= 0; i < len/SIMD_dims; p1++, p2++, i++)
{
v8f dist= *p1 - *p2;
d+= dist * dist;
tmp.i= _mm256_cvtepi32_ps(_mm256_madd_epi16(*p1, *p2));
d+= tmp.v;
}
return d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
}
#endif
__attribute__ ((target ("default")))
float vec_distance(float *v1, float *v2, size_t len)
float dot_product(int16_t *v1, int16_t *v2, size_t len)
{
return euclidean_vec_distance(v1, v2, len);
int64_t d= 0;
for (size_t i= 0; i < len; i++)
d+= int32_t(v1[i]) * int32_t(v2[i]);
return d;
}
......@@ -148,7 +161,7 @@ class FVectorNode: public FVector
private:
MHNSW_Context *ctx;
float *make_vec(const void *v);
vector *make_vec(const void *v);
int alloc_neighborhood(size_t layer);
public:
Neighborhood *neighbors= nullptr;
......@@ -204,7 +217,7 @@ class MHNSW_Context : public Sql_alloc
void *alloc_node_internal()
{
return alloc_root(&root, sizeof(FVectorNode) + gref_len + tref_len
+ vec_len * sizeof(float) + SIMD_word - 1);
+ vector::alloc_size(vec_len));
}
protected:
......@@ -276,7 +289,7 @@ class MHNSW_Context : public Sql_alloc
void set_lengths(size_t len)
{
byte_len= len;
vec_len= MY_ALIGN(byte_len/sizeof(float), SIMD_floats);
vec_len= len / sizeof(float);
}
static int acquire(MHNSW_Context **ctx, TABLE *table, bool for_update);
......@@ -522,29 +535,39 @@ int MHNSW_Context::acquire(MHNSW_Context **ctx, TABLE *table, bool for_update)
return err;
graph->file->position(graph->record[0]);
(*ctx)->set_lengths(graph->field[FIELD_VEC]->value_length());
(*ctx)->set_lengths((graph->field[FIELD_VEC]->value_length()-8)*2);
(*ctx)->start= (*ctx)->get_node(graph->file->ref);
return (*ctx)->start->load_from_record(graph);
}
/* copy the vector, aligned and padded for SIMD */
static float *make_vec(void *mem, const void *src, size_t src_len)
/* copy the vector, preprocessed as needed */
static vector *make_vec(void *mem, const void *src, size_t src_len)
{
auto dst= (float*)MY_ALIGN((intptr)mem, SIMD_word);
memcpy(dst, src, src_len);
const size_t start= src_len/sizeof(float);
for (size_t i= start; i < MY_ALIGN(start, SIMD_floats); i++)
dst[i]=0.0f;
return dst;
auto vec= (vector*)(MY_ALIGN(((intptr)mem)+8, SIMD_bytes) - 8);
float abs2= 0, scale=0, *v= (float *)src;
size_t vec_len= src_len / sizeof(float);
for (size_t i= 0; i < vec_len; i++)
{
abs2+= v[i]*v[i];
if (std::abs(scale) < std::abs(v[i]))
scale= v[i];
}
vec->abs2= abs2/2;
vec->scale= scale ? scale/32767 : 1;
for (size_t i= 0; i < vec_len; i++)
vec->dims[i] = std::round(v[i] / vec->scale);
bzero(vec->dims + vec_len, (MY_ALIGN(vec_len, SIMD_dims) - vec_len)*2);
return vec;
}
FVector::FVector(MHNSW_Context *ctx, MEM_ROOT *root, const void *vec_)
{
vec= make_vec(alloc_root(root, ctx->vec_len * sizeof(float) + SIMD_word - 1),
vec= make_vec(alloc_root(root, vector::alloc_size(ctx->vec_len)),
vec_, ctx->byte_len);
}
float *FVectorNode::make_vec(const void *v)
vector *FVectorNode::make_vec(const void *v)
{
return ::make_vec(tref() + tref_len(), v, ctx->byte_len);
}
......@@ -569,7 +592,8 @@ FVectorNode::FVectorNode(MHNSW_Context *ctx_, const void *tref_, size_t layer,
float FVectorNode::distance_to(const FVector &other) const
{
return vec_distance(vec, other.vec, ctx->vec_len);
return vec->abs2 + other.vec->abs2 - vec->scale * other.vec->scale *
dot_product(vec->dims, other.vec->dims, MY_ALIGN(ctx->vec_len, SIMD_dims));
}
int FVectorNode::alloc_neighborhood(size_t layer)
......@@ -620,9 +644,11 @@ int FVectorNode::load_from_record(TABLE *graph)
if (unlikely(!v))
return my_errno= HA_ERR_CRASHED;
if (v->length() != ctx->byte_len)
if (v->length() != vector::data_size(ctx->vec_len))
return my_errno= HA_ERR_CRASHED;
float *vec_ptr= make_vec(v->ptr());
auto vec_ptr= (vector*)(MY_ALIGN(((intptr)tref() + tref_len())+8, SIMD_bytes) - 8);
memcpy(vec_ptr, v->ptr(), v->length());
bzero(vec_ptr->dims + ctx->vec_len, 2*(MY_ALIGN(ctx->vec_len, SIMD_dims) - ctx->vec_len));
size_t layer= graph->field[FIELD_LAYER]->val_int();
if (layer > 100) // 10e30 nodes at M=2, more at larger M's
......@@ -803,7 +829,7 @@ int FVectorNode::save(TABLE *graph)
graph->field[FIELD_TREF]->set_notnull();
graph->field[FIELD_TREF]->store_binary(tref(), tref_len());
}
graph->field[FIELD_VEC]->store_binary((uchar*)vec, ctx->byte_len);
graph->field[FIELD_VEC]->store_binary((uchar*)vec, vector::data_size(ctx->vec_len));
size_t total_size= 0;
for (size_t i=0; i <= max_layer; i++)
......@@ -1095,10 +1121,15 @@ int mhnsw_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit)
/*
if the query vector is NULL or invalid, VEC_DISTANCE will return
NULL, so the result is basically unsorted, we can return rows
in any order. For simplicity let's sort by the start_node.
in any order. Let's use some hardcoded value here
*/
if (!res || ctx->byte_len != res->length())
(res= &buf)->set((char*)start_nodes.links[0]->vec, ctx->byte_len, &my_charset_bin);
{
res= &buf;
buf.alloc(ctx->byte_len);
for (size_t i=0; i < ctx->vec_len; i++)
((float*)buf.ptr())[i]= i == 0;
}
const longlong max_layer= start_nodes.links[0]->max_layer;
FVector target(ctx, thd->mem_root, res->ptr());
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment