Performance improvements in "from latin1" and "to utf8" conversion.

Mini-benchmarking demonstrates up to a 10% improvement in latin1->utf8 conversion. Modified: @ strings/ctype-latin1.c: redundant test in ctype-latin1.c removed; @ strings/ctype-utf8.c: my_uni_utf8 rewritten in a more efficient way.

Performance improvements in "from latin1" and "to utf8" conversion.
Mini-benchmarking demonstrates up to a 10% improvement in latin1->utf8 conversion. Modified: @ strings/ctype-latin1.c: redundant test in ctype-latin1.c removed; @ strings/ctype-utf8.c: my_uni_utf8 rewritten in a more efficient way.
f5c3c285 · Alexander Barkov · 62222eb5 · f5c3c285 · f5c3c285
Commit f5c3c285 authored Mar 12, 2013 by Alexander Barkov
Show whitespace changes
Inline Side-by-side

Showing with 33 additions and 41 deletions

strings/ctype-latin1.c strings/ctype-latin1.c +8 -3

strings/ctype-utf8.c strings/ctype-utf8.c +25 -38

No files found.
--- a/strings/ctype-latin1.c
+++ b/strings/ctype-latin1.c
@@ -364,9 +364,14 @@ int my_mb_wc_latin1(CHARSET_INFO *cs  __attribute__((unused)),
 {
  if (str >= end)
    return MY_CS_TOOSMALL;
+  /*
-  *wc=cs_to_uni[*str];
+    There are no unassigned characters in latin1.
-  return (!wc[0] && str[0]) ? -1 : 1;
+    Every code point in latin1 is mapped to some Unicode code point.
+    We can always return 1; there is no need to check the value of cs_to_uni[*str].
+  */
+  *wc= cs_to_uni[*str];
+  DBUG_ASSERT(wc[0] || !str[0]);
+  return 1;
 }
 static

--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -2404,46 +2404,33 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
 static int my_uni_utf8 (CHARSET_INFO *cs __attribute__((unused)),
                        my_wc_t wc, uchar *r, uchar *e)
 {
-  int count;
+  if (wc < 0x80)
+  {
    if (r >= e)
      return MY_CS_TOOSMALL;
+    *r= (uchar) wc;
-  if (wc < 0x80)
+    return 1;
-    count = 1;
-  else if (wc < 0x800)
-    count = 2;
-  else if (wc < 0x10000)
-    count = 3;
-#ifdef UNICODE_32BIT
-  else if (wc < 0x200000)
-    count = 4;
-  else if (wc < 0x4000000)
-    count = 5;
-  else if (wc <= 0x7fffffff)
-    count = 6;
-#endif
-    else return MY_CS_ILUNI;
-  /*
-    e is a character after the string r, not the last character of it.
-    Because of it (r+count > e), not (r+count-1 >e )
-   */
-  if ( r+count > e )
-    return MY_CS_TOOSMALLN(count);
-  switch (count) {
-    /* Fall through all cases!!! */
-#ifdef UNICODE_32BIT
-    case 6: r[5] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x4000000;
-    case 5: r[4] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x200000;
-    case 4: r[3] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x10000;
-#endif
-    case 3: r[2] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x800;
-    case 2: r[1] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0xc0;
-    case 1: r[0] = (uchar) wc;
  }
-  return count;
+  if (wc < 0x800)
+  {
+    if (r + 2 > e)
+      return MY_CS_TOOSMALLN(2);
+    /* U+0080..U+07FF:  00000xxx.xxyyyyyy -> 110xxxxx 10yyyyyy */
+    *r++= (uchar) (0xC0 | (wc >> 6));
+    *r=   (uchar) (0x80 | (wc & 0x3F));
+    return 2;
+  }
+  if (wc < 0x10000)
+  {
+    if (r + 3 > e)
+      return MY_CS_TOOSMALLN(3);
+    /* U+0800..U+FFFF: xxxxyyyy.yyzzzzzz  -> 1110xxxx 10yyyyyy 10zzzzzz */
+    *r++= (uchar) (0xE0 | (wc >> 12));
+    *r++= (uchar) (0x80 | ((wc >> 6) & 0x3f));
+    *r=   (uchar) (0x80 | (wc & 0x3f));
+    return 3;
+  }
+  return MY_CS_ILUNI;
 }