Commit f9307eaa authored by Sergei Golubchik's avatar Sergei Golubchik

AVX-512 support

parent 8547d088
...@@ -33,6 +33,9 @@ SOFTWARE. ...@@ -33,6 +33,9 @@ SOFTWARE.
#if __GNUC__ > 7 #if __GNUC__ > 7
#define DEFAULT_IMPLEMENTATION __attribute__ ((target ("default"))) #define DEFAULT_IMPLEMENTATION __attribute__ ((target ("default")))
#define AVX2_IMPLEMENTATION __attribute__ ((target ("avx2,avx,fma"))) #define AVX2_IMPLEMENTATION __attribute__ ((target ("avx2,avx,fma")))
/* The AVX-512 target attribute is only defined when __GNUC__ is above 9;
   code guarded by #ifdef AVX512_IMPLEMENTATION is compiled only in that case.
   The attribute enables the avx512f and avx512bw instruction sets. */
#if __GNUC__ > 9
#define AVX512_IMPLEMENTATION __attribute__ ((target ("avx512f,avx512bw")))
#endif
#endif #endif
#endif #endif
#ifndef DEFAULT_IMPLEMENTATION #ifndef DEFAULT_IMPLEMENTATION
...@@ -169,6 +172,9 @@ struct PatternedSimdBloomFilter ...@@ -169,6 +172,9 @@ struct PatternedSimdBloomFilter
uint8_t res_bits = static_cast<uint8_t>(_mm256_movemask_epi8(_mm256_set1_epi64x(res_bytes)) & 0xff); uint8_t res_bits = static_cast<uint8_t>(_mm256_movemask_epi8(_mm256_set1_epi64x(res_bytes)) & 0xff);
return res_bits; return res_bits;
} }
/* AVX-512 version can be (and was) implemented, but the speedup is,
basically, unnoticeable, well below the noise level */
#endif #endif
/******************************************************** /********************************************************
......
...@@ -156,6 +156,39 @@ struct FVector ...@@ -156,6 +156,39 @@ struct FVector
} }
#endif #endif
#ifdef AVX512_IMPLEMENTATION
/************* AVX512 ****************************************************/
/* size of one 512-bit block in bytes */
static constexpr size_t AVX512_bytes= 512/8;
/* number of int16_t elements that fit into one AVX512_bytes block */
static constexpr size_t AVX512_dims= AVX512_bytes/sizeof(int16_t);
/* subdist_part has to be a whole multiple of AVX512_dims */
static_assert(subdist_part % AVX512_dims == 0);
/*
  Dot product of two int16_t vectors, processed AVX512_dims elements at a
  time. The number of steps is len rounded up to whole 512-bit blocks, so
  both buffers are read up to that rounded-up length.

  NOTE(review): the blocks are read through plain __m512i dereferences;
  presumably callers pass buffers that are AVX512_bytes-aligned and padded
  (see align_ptr() / fix_tail()) -- confirm against the callers.
*/
AVX512_IMPLEMENTATION
static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
{
  const __m512i *lhs= (const __m512i*)v1;
  const __m512i *rhs= (const __m512i*)v2;
  const size_t blocks= (len + AVX512_dims-1)/AVX512_dims;
  __m512 acc= _mm512_setzero_ps();
  for (size_t n= 0; n < blocks; n++)
  {
    /* multiply int16 pairs and add adjacent products into int32 lanes */
    __m512i pair_sums= _mm512_madd_epi16(lhs[n], rhs[n]);
    /* accumulate the partial sums as floats */
    acc= _mm512_add_ps(acc, _mm512_cvtepi32_ps(pair_sums));
  }
  /* horizontal sum of all float lanes */
  return _mm512_reduce_add_ps(acc);
}
/*
  Number of bytes to allocate for a vector of n elements: alloc_header,
  plus n*2 bytes rounded up to a multiple of AVX512_bytes, plus
  AVX512_bytes-1 bytes of slack.

  NOTE(review): the factor 2 is presumably sizeof(int16_t), and the slack
  presumably leaves room for align_ptr() to shift the start -- confirm.
*/
AVX512_IMPLEMENTATION
static size_t alloc_size(size_t n)
{
  const size_t payload= MY_ALIGN(n*2, AVX512_bytes);
  return alloc_header + payload + AVX512_bytes - 1;
}
/*
  Returns an FVector pointer inside the buffer at ptr, chosen so that the
  address alloc_header bytes past the returned pointer is a multiple of
  AVX512_bytes.
*/
AVX512_IMPLEMENTATION
static FVector *align_ptr(void *ptr)
{
  /* round up the address that follows the header ... */
  intptr past_header= ((intptr)ptr) + alloc_header;
  /* ... and step back by the header size to get the object start */
  return (FVector*)(MY_ALIGN(past_header, AVX512_bytes) - alloc_header);
}
/*
  Zeroes the elements of dims from index vec_len up to the next multiple
  of AVX512_dims.

  NOTE(review): the factor 2 is presumably the size in bytes of one dims
  element, and the zeroed tail presumably keeps the rounded-up loop in
  dot_product() from adding garbage -- confirm.
*/
AVX512_IMPLEMENTATION
void fix_tail(size_t vec_len)
{
  const size_t padded_len= MY_ALIGN(vec_len, AVX512_dims);
  bzero(dims + vec_len, (padded_len - vec_len)*2);
}
#endif
/************* no-SIMD default ******************************************/ /************* no-SIMD default ******************************************/
DEFAULT_IMPLEMENTATION DEFAULT_IMPLEMENTATION
static float dot_product(const int16_t *v1, const int16_t *v2, size_t len) static float dot_product(const int16_t *v1, const int16_t *v2, size_t len)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment