Commit 151fc0ed authored by Yuqi Gu, committed by Marko Mäkelä

MDEV-23495: Refine Arm64 PMULL runtime check in MariaDB

Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030).

PR #1645 offered a fix for this issue, but it did not handle the case where the
target platform supports crc32 but does not support PMULL.

In that case, the code should still use the Arm64 crc32 instructions (__crc32c*)
and only skip the parallel computation (pmull/vmull), rather than skipping the
hardware crc32 computation altogether.

This patch also removes the unnecessary CRC32_ZERO branch from 'crc32c_aarch64'
and cleans up the indentation and coding style.

Change-Id: I76371a6bd767b4985600e8cca10983d71b7e9459
Signed-off-by: Yuqi Gu <yuqi.gu@arm.com>
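The detection matrix described above (no crc32: software fallback; crc32 only: plain hardware CRC-32C; crc32 + pmull: hardware CRC-32C with parallel block folding) can be exercised with a minimal standalone sketch. This is not part of the patch; it assumes Linux's getauxval() and the aarch64 HWCAP bit values used in the diff, which are only meaningful on aarch64:

#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP_CRC32
# define HWCAP_CRC32 (1 << 7)  /* aarch64 CRC32 instructions */
#endif
#ifndef HWCAP_PMULL
# define HWCAP_PMULL (1 << 4)  /* aarch64 64-bit polynomial multiply (pmull) */
#endif

int main(void)
{
  unsigned long hwcap= getauxval(AT_HWCAP);

  if (!(hwcap & HWCAP_CRC32))
    puts("no hardware crc32: fall back to software CRC-32C");
  else if (hwcap & HWCAP_PMULL)
    puts("crc32 + pmull: hardware CRC-32C with parallel block folding");
  else
    puts("crc32 only: hardware CRC-32C, skip the pmull/vmull folding");
  return 0;
}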
parent 0b4ed0b7
@@ -911,9 +911,7 @@ extern MYSQL_PLUGIN_IMPORT my_crc32_t my_checksum;
 #if defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
 int crc32_aarch64_available(void);
-#if defined(HAVE_ARMV8_CRYPTO)
-int crc32c_aarch64_available(void);
-#endif
+const char *crc32c_aarch64_available(void);
 #endif
 #ifdef DBUG_ASSERT_EXISTS
...
@@ -8,40 +8,49 @@
 #include <asm/hwcap.h>
 #ifndef HWCAP_CRC32
-#define HWCAP_CRC32 (1 << 7)
+# define HWCAP_CRC32 (1 << 7)
 #endif
+#ifndef HWCAP_PMULL
+# define HWCAP_PMULL (1 << 4)
+#endif
+
+static int pmull_supported;

 /* ARM made crc32 default from ARMv8.1 but optional in ARMv8A
-   so the runtime check. */
+ * Runtime check API.
+ */
 int crc32_aarch64_available(void)
 {
   unsigned long auxv= getauxval(AT_HWCAP);
   return (auxv & HWCAP_CRC32) != 0;
 }

-#if defined(HAVE_ARMV8_CRYPTO)
-
-#ifndef HWCAP_PMULL
-#define HWCAP_PMULL (1 << 4)
-#endif
-
-/* Check if target ARM machine support crc32 + pmull for computing crc32c */
-int crc32c_aarch64_available(void)
+const char *crc32c_aarch64_available(void)
 {
-  return !(~getauxval(AT_HWCAP) & (HWCAP_CRC32 | HWCAP_PMULL));
+  unsigned long auxv= getauxval(AT_HWCAP);
+  if (!(auxv & HWCAP_CRC32))
+    return NULL;
+  pmull_supported= (auxv & HWCAP_PMULL) != 0;
+  if (pmull_supported)
+    return "Using ARMv8 crc32 + pmull instructions";
+  else
+    return "Using ARMv8 crc32 instructions";
 }
-#endif /* HAVE_ARMV8_CRYPTO */

-#endif /* HAVE_ARMV8_CRC */
+#endif /* __GNUC__ && HAVE_ARMV8_CRC */

 #ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
 /* Request crc extension capabilities from the assembler */
 asm(".arch_extension crc");
-#ifdef HAVE_ARMV8_CRYPTO
+# ifdef HAVE_ARMV8_CRYPTO
 /* crypto extension */
 asm(".arch_extension crypto");
-#endif
+# endif

 #define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
 #define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
@@ -53,9 +62,6 @@ asm(".arch_extension crypto");
 __asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
 __asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));

-#define CRC32C3X8_ZERO \
-__asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0));
-
 #else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

 /* Intrinsics header*/
@@ -72,9 +78,6 @@ asm(".arch_extension crypto");
 crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
 crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));

-#define CRC32C3X8_ZERO \
-crc0 = __crc32cd(crc0, (const uint64_t)0);
-
 #endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

 #define CRC32C7X3X8(buffer, ITR) do {\
@@ -85,17 +88,7 @@ asm(".arch_extension crypto");
 CRC32C3X8(buffer, ((ITR) * 7 + 4)) \
 CRC32C3X8(buffer, ((ITR) * 7 + 5)) \
 CRC32C3X8(buffer, ((ITR) * 7 + 6)) \
 } while(0)

-#define CRC32C7X3X8_ZERO do {\
-CRC32C3X8_ZERO \
-CRC32C3X8_ZERO \
-CRC32C3X8_ZERO \
-CRC32C3X8_ZERO \
-CRC32C3X8_ZERO \
-CRC32C3X8_ZERO \
-CRC32C3X8_ZERO \
-} while(0)
-
 #define PREF4X64L1(buffer, PREF_OFFSET, ITR) \
 __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
@@ -121,37 +114,46 @@ asm(".arch_extension crypto");
 PREF4X64L2(buffer,(PREF_OFFSET), 8) \
 PREF4X64L2(buffer,(PREF_OFFSET), 12)

 uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
 {
   uint32_t crc0, crc1, crc2;
-  int64_t length = (int64_t)len;
+  int64_t length= (int64_t)len;

-  crc = 0xFFFFFFFFU;
-  if (buffer) {
-
-  /* Crypto extension Support
-   * Process 1024 Bytes (per block)
+  crc= 0xFFFFFFFFU;
+
+  /* Pmull runtime check here.
+   * Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030).
+   *
+   * Consider the condition that the target platform does support hardware crc32
+   * but not support PMULL. In this condition, it should leverage the aarch64
+   * crc32 instruction (__crc32c) and just only skip parallel computation (pmull/vmull)
+   * rather than skip all hardware crc32 instruction of computation.
    */
+  if (pmull_supported)
+  {
+    /* The following Macro (HAVE_ARMV8_CRYPTO) is used for compiling check */
 #ifdef HAVE_ARMV8_CRYPTO
-  /* Intrinsics Support */
-#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
-  const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
+    /* Crypto extension Support
+     * Parallel computation with 1024 Bytes (per block)
+     * Intrinsics Support
+     */
+# ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+    const poly64_t k1= 0xe417f38a, k2= 0x8f158014;
   uint64_t t0, t1;

   /* Process per block size of 1024 Bytes
    * A block size = 8 + 42*3*sizeof(uint64_t) + 8
    */
-  while ((length -= 1024) >= 0) {
+  while ((length-= 1024) >= 0)
+  {
     /* Prefetch 3*1024 data for avoiding L2 cache miss */
     PREF1KL2(buffer, 1024*3);
     /* Do first 8 bytes here for better pipelining */
-    crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
-    crc1 = 0;
-    crc2 = 0;
-    buffer += sizeof(uint64_t);
+    crc0= __crc32cd(crc, *(const uint64_t *)buffer);
+    crc1= 0;
+    crc2= 0;
+    buffer+= sizeof(uint64_t);

     /* Process block inline
      * Process crc0 last to avoid dependency with above
@@ -163,7 +165,7 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
     CRC32C7X3X8(buffer, 4);
     CRC32C7X3X8(buffer, 5);

-    buffer += 42*3*sizeof(uint64_t);
+    buffer+= 42*3*sizeof(uint64_t);

     /* Prefetch data for following block to avoid L1 cache miss */
     PREF1KL1(buffer, 1024);
@@ -172,18 +174,18 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
      * crc1 multiply by K2
      * crc0 multiply by K1
      */
-    t1 = (uint64_t)vmull_p64(crc1, k2);
-    t0 = (uint64_t)vmull_p64(crc0, k1);
-    crc = __crc32cd(crc2, *(const uint64_t *)buffer);
-    crc1 = __crc32cd(0, t1);
-    crc ^= crc1;
-    crc0 = __crc32cd(0, t0);
-    crc ^= crc0;
-    buffer += sizeof(uint64_t);
+    t1= (uint64_t)vmull_p64(crc1, k2);
+    t0= (uint64_t)vmull_p64(crc0, k1);
+    crc= __crc32cd(crc2, *(const uint64_t *)buffer);
+    crc1= __crc32cd(0, t1);
+    crc^= crc1;
+    crc0= __crc32cd(0, t0);
+    crc^= crc0;
+    buffer+= sizeof(uint64_t);
   }

-#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+# else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

   /*No intrinsics*/
   __asm__("mov x16, #0xf38a \n\t"
@@ -194,13 +196,14 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
           "mov v0.2d[0], x16 \n\t"
           :::"x16");

-  while ((length -= 1024) >= 0) {
+  while ((length-= 1024) >= 0)
+  {
     PREF1KL2(buffer, 1024*3);
     __asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
             :[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
-    crc1 = 0;
-    crc2 = 0;
-    buffer += sizeof(uint64_t);
+    crc1= 0;
+    crc2= 0;
+    buffer+= sizeof(uint64_t);

     CRC32C7X3X8(buffer, 0);
     CRC32C7X3X8(buffer, 1);
@@ -209,7 +212,7 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
     CRC32C7X3X8(buffer, 4);
     CRC32C7X3X8(buffer, 5);

-    buffer += 42*3*sizeof(uint64_t);
+    buffer+= 42*3*sizeof(uint64_t);

     PREF1KL1(buffer, 1024);
     __asm__("mov v2.2d[0], %x[c1] \n\t"
             "pmull v2.1q, v2.1d, v0.1d \n\t"
@@ -224,94 +227,41 @@ uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
             "eor %w[c], %w[c], %w[c0] \n\t"
             :[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
             :[v]"r"(*((const uint64_t *)buffer)));
-    buffer += sizeof(uint64_t);
+    buffer+= sizeof(uint64_t);
   }

-#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+# endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */

   /* Done if Input data size is aligned with 1024 */
-  if(!(length += 1024))
-    return (~crc);
+  if (!(length+= 1024))
+    return ~crc;

 #endif /* HAVE_ARMV8_CRYPTO */
+  } // end if pmull_supported

-  while ((length -= sizeof(uint64_t)) >= 0) {
-    CRC32CX(crc, *(uint64_t *)buffer);
-    buffer += sizeof(uint64_t);
-  }
-
-  /* The following is more efficient than the straight loop */
-  if (length & sizeof(uint32_t)) {
-    CRC32CW(crc, *(uint32_t *)buffer);
-    buffer += sizeof(uint32_t);
-  }
-
-  if (length & sizeof(uint16_t)) {
-    CRC32CH(crc, *(uint16_t *)buffer);
-    buffer += sizeof(uint16_t);
-  }
-
-  if (length & sizeof(uint8_t))
-    CRC32CB(crc, *buffer);
-
-  } else {
-#ifdef HAVE_ARMV8_CRYPTO
-#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
-    const poly64_t k1 = 0xe417f38a;
-    uint64_t t0;
-    while ((length -= 1024) >= 0) {
-      crc0 = __crc32cd(crc, 0);
-      CRC32C7X3X8_ZERO;
-      CRC32C7X3X8_ZERO;
-      CRC32C7X3X8_ZERO;
-      CRC32C7X3X8_ZERO;
-      CRC32C7X3X8_ZERO;
-      CRC32C7X3X8_ZERO;
-      /* Merge crc0 into crc: crc0 multiply by K1 */
-      t0 = (uint64_t)vmull_p64(crc0, k1);
-      crc = __crc32cd(0, t0);
-    }
-#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
-    __asm__("mov x16, #0xf38a \n\t"
-            "movk x16, #0xe417, lsl 16 \n\t"
-            "mov v1.2d[0], x16 \n\t"
-            :::"x16");
-    while ((length -= 1024) >= 0) {
-      __asm__("crc32cx %w[c0], %w[c], xzr\n\t"
-              :[c0]"=r"(crc0):[c]"r"(crc));
-      CRC32C7X3X8_ZERO;
-      CRC32C7X3X8_ZERO;
-      CRC32C7X3X8_ZERO;
-      CRC32C7X3X8_ZERO;
-      CRC32C7X3X8_ZERO;
-      CRC32C7X3X8_ZERO;
-      __asm__("mov v3.2d[0], %x[c0] \n\t"
-              "pmull v3.1q, v3.1d, v1.1d \n\t"
-              "mov %x[c0], v3.2d[0] \n\t"
-              "crc32cx %w[c], wzr, %x[c0] \n\t"
-              :[c]"=r"(crc)
-              :[c0]"r"(crc0));
-    }
-#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
-    if(!(length += 1024))
-      return (~crc);
-#endif /* HAVE_ARMV8_CRYPTO */
-    while ((length -= sizeof(uint64_t)) >= 0)
-      CRC32CX(crc, 0);
-
-  /* The following is more efficient than the straight loop */
-  if (length & sizeof(uint32_t))
-    CRC32CW(crc, 0);
-
-  if (length & sizeof(uint16_t))
-    CRC32CH(crc, 0);
-
-  if (length & sizeof(uint8_t))
-    CRC32CB(crc, 0);
-  }
-
-  return (~crc);
+  while ((length-= sizeof(uint64_t)) >= 0)
+  {
+    CRC32CX(crc, *(uint64_t *)buffer);
+    buffer+= sizeof(uint64_t);
+  }
+
+  /* The following is more efficient than the straight loop */
+  if (length & sizeof(uint32_t))
+  {
+    CRC32CW(crc, *(uint32_t *)buffer);
+    buffer+= sizeof(uint32_t);
+  }
+
+  if (length & sizeof(uint16_t))
+  {
+    CRC32CH(crc, *(uint16_t *)buffer);
+    buffer+= sizeof(uint16_t);
+  }
+
+  if (length & sizeof(uint8_t))
+    CRC32CB(crc, *buffer);
+
+  return ~crc;
 }

 /* There are multiple approaches to calculate crc.
...
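As a side note on the block comment retained above ("A block size = 8 + 42*3*sizeof(uint64_t) + 8"), the arithmetic works out to 8 + 1008 + 8 = 1024 bytes per folded block. A tiny standalone check (illustrative only, not from the patch):

#include <stdint.h>

/* Per-block layout used by the pmull path:
   8 leading bytes + 42*3 64-bit words + 8 trailing bytes = 1024 bytes. */
_Static_assert(8 + 42 * 3 * sizeof(uint64_t) + 8 == 1024,
               "crc32c_aarch64 consumes exactly 1024 bytes per block");

int main(void) { return 0; }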
@@ -342,11 +342,11 @@ allocations, would not hurt if called twice, but would be pointless. */
 void ut_crc32_init()
 {
 #ifndef HAVE_CRC32_VPMSUM
-# if defined(__GNUC__) && defined(HAVE_ARMV8_CRC) && defined(HAVE_ARMV8_CRYPTO)
-  if (crc32c_aarch64_available())
+# if defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
+  if (const char *crc32c_implementation= crc32c_aarch64_available())
   {
     ut_crc32_low= crc32c_aarch64;
-    ut_crc32_implementation= "Using ARMv8 crc32 + pmull instructions";
+    ut_crc32_implementation= crc32c_implementation;
     return;
   }
 # elif defined(TRY_SSE4_2)
...
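For spot-checking whichever path gets selected above (crc32-only or crc32 + pmull), a plain bit-at-a-time CRC-32C reference is handy. A minimal sketch, assuming the standard Castagnoli polynomial (reflected form 0x82F63B78), whose well-known check value for "123456789" is 0xE3069283:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Bitwise CRC-32C (Castagnoli) reference implementation. */
static uint32_t crc32c_sw(uint32_t crc, const unsigned char *buf, size_t len)
{
  crc= ~crc;
  while (len--)
  {
    crc^= *buf++;
    for (int k= 0; k < 8; k++)
      crc= (crc >> 1) ^ (0x82F63B78U & -(crc & 1U));
  }
  return ~crc;
}

int main(void)
{
  const unsigned char msg[]= "123456789";
  /* Expected output: E3069283 */
  printf("%08X\n", (unsigned) crc32c_sw(0, msg, sizeof(msg) - 1));
  return 0;
}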