Commit f5c3c285 authored by Alexander Barkov

Performance improvements in "from latin1" and "to utf8" conversion.

Mini-benchmarking demonstrates up to 10% improvement in latin1->utf8
conversion.

modified:
  @ strings/ctype-latin1.c
  redundant test in ctype-latin1.c removed

  @ strings/ctype-utf8.c
  my_uni_utf8 rewritten in a more efficient way
parent 62222eb5
...@@ -364,9 +364,14 @@ int my_mb_wc_latin1(CHARSET_INFO *cs __attribute__((unused)), ...@@ -364,9 +364,14 @@ int my_mb_wc_latin1(CHARSET_INFO *cs __attribute__((unused)),
{ {
if (str >= end) if (str >= end)
return MY_CS_TOOSMALL; return MY_CS_TOOSMALL;
/*
*wc=cs_to_uni[*str]; There are no unassigned characters in latin1.
return (!wc[0] && str[0]) ? -1 : 1; Every code point in latin1 is mapped to some Unicode code point.
We can always return 1, no need to check the value of cs_to_uni[*str].
*/
*wc= cs_to_uni[*str];
DBUG_ASSERT(wc[0] || !str[0]);
return 1;
} }
static static
......
...@@ -2404,46 +2404,33 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)), ...@@ -2404,46 +2404,33 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
static int my_uni_utf8 (CHARSET_INFO *cs __attribute__((unused)), static int my_uni_utf8 (CHARSET_INFO *cs __attribute__((unused)),
my_wc_t wc, uchar *r, uchar *e) my_wc_t wc, uchar *r, uchar *e)
{ {
int count; if (wc < 0x80)
{
if (r >= e) if (r >= e)
return MY_CS_TOOSMALL; return MY_CS_TOOSMALL;
*r= (uchar) wc;
if (wc < 0x80) return 1;
count = 1;
else if (wc < 0x800)
count = 2;
else if (wc < 0x10000)
count = 3;
#ifdef UNICODE_32BIT
else if (wc < 0x200000)
count = 4;
else if (wc < 0x4000000)
count = 5;
else if (wc <= 0x7fffffff)
count = 6;
#endif
else return MY_CS_ILUNI;
/*
e is a character after the string r, not the last character of it.
Because of it (r+count > e), not (r+count-1 >e )
*/
if ( r+count > e )
return MY_CS_TOOSMALLN(count);
switch (count) {
/* Fall through all cases!!! */
#ifdef UNICODE_32BIT
case 6: r[5] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x4000000;
case 5: r[4] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x200000;
case 4: r[3] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x10000;
#endif
case 3: r[2] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x800;
case 2: r[1] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0xc0;
case 1: r[0] = (uchar) wc;
} }
return count; if (wc < 0x800)
{
if (r + 2 > e)
return MY_CS_TOOSMALLN(2);
/* U+0080..U+07FF: 00000xxx.xxyyyyyy -> 110xxxxx 10yyyyyy */
*r++= (uchar) (0xC0 | (wc >> 6));
*r= (uchar) (0x80 | (wc & 0x3F));
return 2;
}
if (wc < 0x10000)
{
if (r + 3 > e)
return MY_CS_TOOSMALLN(3);
/* U+0800..U+FFFF: xxxxyyyy.yyzzzzzz -> 1110xxxx 10yyyyyy 10zzzzzz */
*r++= (uchar) (0xE0 | (wc >> 12));
*r++= (uchar) (0x80 | ((wc >> 6) & 0x3f));
*r= (uchar) (0x80 | (wc & 0x3f));
return 3;
}
return MY_CS_ILUNI;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment