Commit 151fc0ed authored by Yuqi Gu, committed by Marko Mäkelä

MDEV-23495: Refine Arm64 PMULL runtime check in MariaDB

Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030).

PR #1645 offers a solution to fix this issue, but it does not consider the case
where the target platform supports crc32 but not PMULL.

In that case, the code should still leverage the Arm64 crc32 instruction (__crc32c) and
only skip the parallel computation (pmull/vmull), rather than skipping hardware
crc32 computation entirely.

This patch also removes the unnecessary CRC32_ZERO branch in 'crc32c_aarch64' for MariaDB
and cleans up the indentation and coding style.

Change-Id: I76371a6bd767b4985600e8cca10983d71b7e9459
Signed-off-by: Yuqi Gu <yuqi.gu@arm.com>
parent 0b4ed0b7
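For context, the detection logic this patch implements boils down to reading AT_HWCAP on Linux/AArch64. The following stand-alone sketch is illustrative only (the function name probe_crc32c and the returned strings are not part of MariaDB):

#include <stddef.h>     /* NULL */
#include <sys/auxv.h>   /* getauxval(), AT_HWCAP */
#include <asm/hwcap.h>  /* HWCAP_CRC32, HWCAP_PMULL */

#ifndef HWCAP_CRC32
# define HWCAP_CRC32 (1 << 7)
#endif
#ifndef HWCAP_PMULL
# define HWCAP_PMULL (1 << 4)
#endif

/* Illustrative sketch: report which CRC-32C code path the running CPU allows.
   NULL      -> no hardware crc32 at all (software fallback)
   otherwise -> hardware crc32, with or without pmull-based parallelism */
static const char *probe_crc32c(void)
{
  unsigned long hwcap= getauxval(AT_HWCAP);
  if (!(hwcap & HWCAP_CRC32))
    return NULL;
  if (hwcap & HWCAP_PMULL)
    return "crc32 + pmull";   /* full parallel path */
  return "crc32 only";        /* e.g. Raspberry Pi 4, MDEV-23030 */
}

The patched crc32c_aarch64_available() in the diff below performs the same checks, caches the PMULL result in pmull_supported, and returns a human-readable description for the caller.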
......@@ -911,9 +911,7 @@ extern MYSQL_PLUGIN_IMPORT my_crc32_t my_checksum;
#if defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
int crc32_aarch64_available(void);
#if defined(HAVE_ARMV8_CRYPTO)
int crc32c_aarch64_available(void);
#endif
const char *crc32c_aarch64_available(void);
#endif
#ifdef DBUG_ASSERT_EXISTS
......
......@@ -8,40 +8,49 @@
#include <asm/hwcap.h>
#ifndef HWCAP_CRC32
#define HWCAP_CRC32 (1 << 7)
# define HWCAP_CRC32 (1 << 7)
#endif
#ifndef HWCAP_PMULL
# define HWCAP_PMULL (1 << 4)
#endif
static int pmull_supported;
/* ARM made crc32 default from ARMv8.1 but optional in ARMv8A
so the runtime check. */
* Runtime check API.
*/
int crc32_aarch64_available(void)
{
unsigned long auxv= getauxval(AT_HWCAP);
return (auxv & HWCAP_CRC32) != 0;
}
#if defined(HAVE_ARMV8_CRYPTO)
const char *crc32c_aarch64_available(void)
{
unsigned long auxv= getauxval(AT_HWCAP);
#ifndef HWCAP_PMULL
#define HWCAP_PMULL (1 << 4)
#endif
if (!(auxv & HWCAP_CRC32))
return NULL;
/* Check if target ARM machine support crc32 + pmull for computing crc32c */
int crc32c_aarch64_available(void)
{
return !(~getauxval(AT_HWCAP) & (HWCAP_CRC32 | HWCAP_PMULL));
pmull_supported= (auxv & HWCAP_PMULL) != 0;
if (pmull_supported)
return "Using ARMv8 crc32 + pmull instructions";
else
return "Using ARMv8 crc32 instructions";
}
#endif /* HAVE_ARMV8_CRYPTO */
#endif /* HAVE_ARMV8_CRC */
#endif /* __GNUC__ && HAVE_ARMV8_CRC */
#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
/* Request crc extension capabilities from the assembler */
asm(".arch_extension crc");
#ifdef HAVE_ARMV8_CRYPTO
# ifdef HAVE_ARMV8_CRYPTO
/* crypto extension */
asm(".arch_extension crypto");
#endif
# endif
#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
......@@ -49,12 +58,9 @@ asm(".arch_extension crypto");
#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
#define CRC32C3X8(buffer, ITR) \
__asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
__asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
__asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));
#define CRC32C3X8_ZERO \
__asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0));
__asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
__asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
__asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));
#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
......@@ -68,250 +74,194 @@ asm(".arch_extension crypto");
#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))
#define CRC32C3X8(buffer, ITR) \
crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));
#define CRC32C3X8_ZERO \
crc0 = __crc32cd(crc0, (const uint64_t)0);
crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
#define CRC32C7X3X8(buffer, ITR) do {\
CRC32C3X8(buffer, ((ITR) * 7 + 0)) \
CRC32C3X8(buffer, ((ITR) * 7 + 1)) \
CRC32C3X8(buffer, ((ITR) * 7 + 2)) \
CRC32C3X8(buffer, ((ITR) * 7 + 3)) \
CRC32C3X8(buffer, ((ITR) * 7 + 4)) \
CRC32C3X8(buffer, ((ITR) * 7 + 5)) \
CRC32C3X8(buffer, ((ITR) * 7 + 6)) \
} while(0)
#define CRC32C7X3X8_ZERO do {\
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
CRC32C3X8_ZERO \
} while(0)
CRC32C3X8(buffer, ((ITR) * 7 + 0)) \
CRC32C3X8(buffer, ((ITR) * 7 + 1)) \
CRC32C3X8(buffer, ((ITR) * 7 + 2)) \
CRC32C3X8(buffer, ((ITR) * 7 + 3)) \
CRC32C3X8(buffer, ((ITR) * 7 + 4)) \
CRC32C3X8(buffer, ((ITR) * 7 + 5)) \
CRC32C3X8(buffer, ((ITR) * 7 + 6)) \
} while(0)
#define PREF4X64L1(buffer, PREF_OFFSET, ITR) \
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
#define PREF1KL1(buffer, PREF_OFFSET) \
PREF4X64L1(buffer,(PREF_OFFSET), 0) \
PREF4X64L1(buffer,(PREF_OFFSET), 4) \
PREF4X64L1(buffer,(PREF_OFFSET), 8) \
PREF4X64L1(buffer,(PREF_OFFSET), 12)
PREF4X64L1(buffer,(PREF_OFFSET), 0) \
PREF4X64L1(buffer,(PREF_OFFSET), 4) \
PREF4X64L1(buffer,(PREF_OFFSET), 8) \
PREF4X64L1(buffer,(PREF_OFFSET), 12)
#define PREF4X64L2(buffer, PREF_OFFSET, ITR) \
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
#define PREF1KL2(buffer, PREF_OFFSET) \
PREF4X64L2(buffer,(PREF_OFFSET), 0) \
PREF4X64L2(buffer,(PREF_OFFSET), 4) \
PREF4X64L2(buffer,(PREF_OFFSET), 8) \
PREF4X64L2(buffer,(PREF_OFFSET), 12)
PREF4X64L2(buffer,(PREF_OFFSET), 0) \
PREF4X64L2(buffer,(PREF_OFFSET), 4) \
PREF4X64L2(buffer,(PREF_OFFSET), 8) \
PREF4X64L2(buffer,(PREF_OFFSET), 12)
uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
{
uint32_t crc0, crc1, crc2;
int64_t length = (int64_t)len;
crc = 0xFFFFFFFFU;
if (buffer) {
uint32_t crc0, crc1, crc2;
int64_t length= (int64_t)len;
crc= 0xFFFFFFFFU;
/* Pmull runtime check here.
* Raspberry Pi 4 supports crc32 but doesn't support pmull (MDEV-23030).
*
* If the target platform supports hardware crc32 but not PMULL, we should
* still use the aarch64 crc32 instruction (__crc32c) and only skip the
* parallel computation (pmull/vmull), rather than skipping hardware crc32
* computation entirely.
*/
if (pmull_supported)
{
/* HAVE_ARMV8_CRYPTO is the compile-time check for the crypto extension */
#ifdef HAVE_ARMV8_CRYPTO
/* Crypto extension Support
* Process 1024 Bytes (per block)
* Parallel computation with 1024 Bytes (per block)
* Intrinsics Support
*/
#ifdef HAVE_ARMV8_CRYPTO
/* Intrinsics Support */
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
uint64_t t0, t1;
/* Process per block size of 1024 Bytes
* A block size = 8 + 42*3*sizeof(uint64_t) + 8
*/
while ((length -= 1024) >= 0) {
/* Prefetch 3*1024 data for avoiding L2 cache miss */
PREF1KL2(buffer, 1024*3);
/* Do first 8 bytes here for better pipelining */
crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
crc1 = 0;
crc2 = 0;
buffer += sizeof(uint64_t);
/* Process block inline
* Process crc0 last to avoid dependency with above
*/
CRC32C7X3X8(buffer, 0);
CRC32C7X3X8(buffer, 1);
CRC32C7X3X8(buffer, 2);
CRC32C7X3X8(buffer, 3);
CRC32C7X3X8(buffer, 4);
CRC32C7X3X8(buffer, 5);
buffer += 42*3*sizeof(uint64_t);
/* Prefetch data for following block to avoid L1 cache miss */
PREF1KL1(buffer, 1024);
/* Last 8 bytes
* Merge crc0 and crc1 into crc2
* crc1 multiply by K2
* crc0 multiply by K1
*/
t1 = (uint64_t)vmull_p64(crc1, k2);
t0 = (uint64_t)vmull_p64(crc0, k1);
crc = __crc32cd(crc2, *(const uint64_t *)buffer);
crc1 = __crc32cd(0, t1);
crc ^= crc1;
crc0 = __crc32cd(0, t0);
crc ^= crc0;
buffer += sizeof(uint64_t);
}
#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
/*No intrinsics*/
__asm__("mov x16, #0xf38a \n\t"
"movk x16, #0xe417, lsl 16 \n\t"
"mov v1.2d[0], x16 \n\t"
"mov x16, #0x8014 \n\t"
"movk x16, #0x8f15, lsl 16 \n\t"
"mov v0.2d[0], x16 \n\t"
:::"x16");
while ((length -= 1024) >= 0) {
PREF1KL2(buffer, 1024*3);
__asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
:[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
crc1 = 0;
crc2 = 0;
buffer += sizeof(uint64_t);
CRC32C7X3X8(buffer, 0);
CRC32C7X3X8(buffer, 1);
CRC32C7X3X8(buffer, 2);
CRC32C7X3X8(buffer, 3);
CRC32C7X3X8(buffer, 4);
CRC32C7X3X8(buffer, 5);
buffer += 42*3*sizeof(uint64_t);
PREF1KL1(buffer, 1024);
__asm__("mov v2.2d[0], %x[c1] \n\t"
"pmull v2.1q, v2.1d, v0.1d \n\t"
"mov v3.2d[0], %x[c0] \n\t"
"pmull v3.1q, v3.1d, v1.1d \n\t"
"crc32cx %w[c], %w[c2], %x[v] \n\t"
"mov %x[c1], v2.2d[0] \n\t"
"crc32cx %w[c1], wzr, %x[c1] \n\t"
"eor %w[c], %w[c], %w[c1] \n\t"
"mov %x[c0], v3.2d[0] \n\t"
"crc32cx %w[c0], wzr, %x[c0] \n\t"
"eor %w[c], %w[c], %w[c0] \n\t"
:[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
:[v]"r"(*((const uint64_t *)buffer)));
buffer += sizeof(uint64_t);
}
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
/* Done if Input data size is aligned with 1024 */
if(!(length += 1024))
return (~crc);
# ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
const poly64_t k1= 0xe417f38a, k2= 0x8f158014;
uint64_t t0, t1;
/* Process per block size of 1024 Bytes
* A block size = 8 + 42*3*sizeof(uint64_t) + 8
*/
while ((length-= 1024) >= 0)
{
/* Prefetch 3*1024 data for avoiding L2 cache miss */
PREF1KL2(buffer, 1024*3);
/* Do first 8 bytes here for better pipelining */
crc0= __crc32cd(crc, *(const uint64_t *)buffer);
crc1= 0;
crc2= 0;
buffer+= sizeof(uint64_t);
/* Process block inline
* Process crc0 last to avoid dependency with above
*/
CRC32C7X3X8(buffer, 0);
CRC32C7X3X8(buffer, 1);
CRC32C7X3X8(buffer, 2);
CRC32C7X3X8(buffer, 3);
CRC32C7X3X8(buffer, 4);
CRC32C7X3X8(buffer, 5);
buffer+= 42*3*sizeof(uint64_t);
/* Prefetch data for following block to avoid L1 cache miss */
PREF1KL1(buffer, 1024);
/* Last 8 bytes
* Merge crc0 and crc1 into crc2
* crc1 multiply by K2
* crc0 multiply by K1
*/
t1= (uint64_t)vmull_p64(crc1, k2);
t0= (uint64_t)vmull_p64(crc0, k1);
crc= __crc32cd(crc2, *(const uint64_t *)buffer);
crc1= __crc32cd(0, t1);
crc^= crc1;
crc0= __crc32cd(0, t0);
crc^= crc0;
buffer+= sizeof(uint64_t);
}
# else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
/*No intrinsics*/
__asm__("mov x16, #0xf38a \n\t"
"movk x16, #0xe417, lsl 16 \n\t"
"mov v1.2d[0], x16 \n\t"
"mov x16, #0x8014 \n\t"
"movk x16, #0x8f15, lsl 16 \n\t"
"mov v0.2d[0], x16 \n\t"
:::"x16");
while ((length-= 1024) >= 0)
{
PREF1KL2(buffer, 1024*3);
__asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
:[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
crc1= 0;
crc2= 0;
buffer+= sizeof(uint64_t);
CRC32C7X3X8(buffer, 0);
CRC32C7X3X8(buffer, 1);
CRC32C7X3X8(buffer, 2);
CRC32C7X3X8(buffer, 3);
CRC32C7X3X8(buffer, 4);
CRC32C7X3X8(buffer, 5);
buffer+= 42*3*sizeof(uint64_t);
PREF1KL1(buffer, 1024);
__asm__("mov v2.2d[0], %x[c1] \n\t"
"pmull v2.1q, v2.1d, v0.1d \n\t"
"mov v3.2d[0], %x[c0] \n\t"
"pmull v3.1q, v3.1d, v1.1d \n\t"
"crc32cx %w[c], %w[c2], %x[v] \n\t"
"mov %x[c1], v2.2d[0] \n\t"
"crc32cx %w[c1], wzr, %x[c1] \n\t"
"eor %w[c], %w[c], %w[c1] \n\t"
"mov %x[c0], v3.2d[0] \n\t"
"crc32cx %w[c0], wzr, %x[c0] \n\t"
"eor %w[c], %w[c], %w[c0] \n\t"
:[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
:[v]"r"(*((const uint64_t *)buffer)));
buffer+= sizeof(uint64_t);
}
# endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
/* Done if Input data size is aligned with 1024 */
if (!(length+= 1024))
return ~crc;
#endif /* HAVE_ARMV8_CRYPTO */
while ((length -= sizeof(uint64_t)) >= 0) {
CRC32CX(crc, *(uint64_t *)buffer);
buffer += sizeof(uint64_t);
}
/* The following is more efficient than the straight loop */
if (length & sizeof(uint32_t)) {
CRC32CW(crc, *(uint32_t *)buffer);
buffer += sizeof(uint32_t);
}
if (length & sizeof(uint16_t)) {
CRC32CH(crc, *(uint16_t *)buffer);
buffer += sizeof(uint16_t);
}
if (length & sizeof(uint8_t))
CRC32CB(crc, *buffer);
} else {
#ifdef HAVE_ARMV8_CRYPTO
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
const poly64_t k1 = 0xe417f38a;
uint64_t t0;
while ((length -= 1024) >= 0) {
crc0 = __crc32cd(crc, 0);
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
/* Merge crc0 into crc: crc0 multiply by K1 */
t0 = (uint64_t)vmull_p64(crc0, k1);
crc = __crc32cd(0, t0);
}
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
__asm__("mov x16, #0xf38a \n\t"
"movk x16, #0xe417, lsl 16 \n\t"
"mov v1.2d[0], x16 \n\t"
:::"x16");
while ((length -= 1024) >= 0) {
__asm__("crc32cx %w[c0], %w[c], xzr\n\t"
:[c0]"=r"(crc0):[c]"r"(crc));
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
CRC32C7X3X8_ZERO;
__asm__("mov v3.2d[0], %x[c0] \n\t"
"pmull v3.1q, v3.1d, v1.1d \n\t"
"mov %x[c0], v3.2d[0] \n\t"
"crc32cx %w[c], wzr, %x[c0] \n\t"
:[c]"=r"(crc)
:[c0]"r"(crc0));
}
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
if(!(length += 1024))
return (~crc);
#endif /* HAVE_ARMV8_CRYPTO */
while ((length -= sizeof(uint64_t)) >= 0)
CRC32CX(crc, 0);
} // end if pmull_supported
/* The following is more efficient than the straight loop */
if (length & sizeof(uint32_t))
CRC32CW(crc, 0);
while ((length-= sizeof(uint64_t)) >= 0)
{
CRC32CX(crc, *(uint64_t *)buffer);
buffer+= sizeof(uint64_t);
}
if (length & sizeof(uint16_t))
CRC32CH(crc, 0);
/* The following is more efficient than the straight loop */
if (length & sizeof(uint32_t))
{
CRC32CW(crc, *(uint32_t *)buffer);
buffer+= sizeof(uint32_t);
}
if (length & sizeof(uint8_t))
CRC32CB(crc, 0);
}
if (length & sizeof(uint16_t))
{
CRC32CH(crc, *(uint16_t *)buffer);
buffer+= sizeof(uint16_t);
}
return (~crc);
if (length & sizeof(uint8_t))
CRC32CB(crc, *buffer);
return ~crc;
}
/* There are multiple approaches to calculate crc.
......
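As a reading aid for the hunk above: when pmull_supported is false, the patched crc32c_aarch64() skips the 1024-byte parallel blocks and degenerates to plain hardware CRC32C, 8 bytes at a time plus a 4/2/1-byte tail. Below is a minimal sketch of that serial path, assuming GCC/Clang targeting AArch64 with the crc extension (e.g. -march=armv8-a+crc) so that <arm_acle.h> provides the __crc32c* intrinsics; the name crc32c_serial and the memcpy-based loads are illustrative, not MariaDB code:

#include <stdint.h>
#include <string.h>     /* memcpy */
#include <arm_acle.h>   /* __crc32cd, __crc32cw, __crc32ch, __crc32cb */

static uint32_t crc32c_serial(const unsigned char *buffer, uint64_t len)
{
  uint32_t crc= 0xFFFFFFFFU;          /* same seeding as crc32c_aarch64() */
  while (len >= sizeof(uint64_t))
  {
    uint64_t v;
    memcpy(&v, buffer, sizeof v);     /* tolerate unaligned input */
    crc= __crc32cd(crc, v);
    buffer+= sizeof v;
    len-= sizeof v;
  }
  if (len & sizeof(uint32_t))         /* remaining length is < 8 here */
  {
    uint32_t v;
    memcpy(&v, buffer, sizeof v);
    crc= __crc32cw(crc, v);
    buffer+= sizeof v;
  }
  if (len & sizeof(uint16_t))
  {
    uint16_t v;
    memcpy(&v, buffer, sizeof v);
    crc= __crc32ch(crc, v);
    buffer+= sizeof v;
  }
  if (len & sizeof(uint8_t))
    crc= __crc32cb(crc, *buffer);
  return ~crc;
}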
......@@ -342,11 +342,11 @@ allocations, would not hurt if called twice, but would be pointless. */
void ut_crc32_init()
{
#ifndef HAVE_CRC32_VPMSUM
# if defined(__GNUC__) && defined(HAVE_ARMV8_CRC) && defined(HAVE_ARMV8_CRYPTO)
if (crc32c_aarch64_available())
# if defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
if (const char *crc32c_implementation= crc32c_aarch64_available())
{
ut_crc32_low= crc32c_aarch64;
ut_crc32_implementation= "Using ARMv8 crc32 + pmull instructions";
ut_crc32_implementation= crc32c_implementation;
return;
}
# elif defined(TRY_SSE4_2)
......
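A short sketch of how a caller can consume the new const char * return value at init time; the names crc32c_impl, crc32c_impl_name, crc32c_software and crc32c_init are hypothetical, but the shape mirrors what ut_crc32_init() does in the hunk above with ut_crc32_low and ut_crc32_implementation:

#include <stdint.h>

typedef uint32_t (*crc32c_func)(uint32_t crc, const unsigned char *buf, uint64_t len);

/* Assumed portable fallback plus the hardware routine from the patch. */
extern uint32_t crc32c_software(uint32_t crc, const unsigned char *buf, uint64_t len);
extern uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buf, uint64_t len);
extern const char *crc32c_aarch64_available(void);

static crc32c_func  crc32c_impl=      crc32c_software;
static const char  *crc32c_impl_name= "Using generic crc32 routine";

static void crc32c_init(void)
{
#if defined(__GNUC__) && defined(HAVE_ARMV8_CRC)
  const char *name= crc32c_aarch64_available();
  if (name)                        /* NULL means no hardware crc32 at all */
  {
    crc32c_impl= crc32c_aarch64;   /* works with or without pmull */
    crc32c_impl_name= name;        /* reports which variant was picked */
  }
#endif
}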