Commit f5c3c285 authored by Alexander Barkov

Performance improvements in "from latin1" and "to utf8" conversion.

Mini-benchmarking demonstrates up to 10% improvement in latin1->utf8
conversion.

modified:
  @ strings/ctype-latin1.c
  redundant test in ctype-latin1.c removed

  @ strings/ctype-utf8.c
  my_uni_utf8 rewritten in a more efficient way
parent 62222eb5
......@@ -364,9 +364,14 @@ int my_mb_wc_latin1(CHARSET_INFO *cs __attribute__((unused)),
{
if (str >= end)
return MY_CS_TOOSMALL;
*wc=cs_to_uni[*str];
return (!wc[0] && str[0]) ? -1 : 1;
/*
There are no unassigned characters in latin1.
Every code point in latin1 is mapped to some Unicode code point.
We can always return 1; there is no need to check the value of cs_to_uni[*str].
*/
*wc= cs_to_uni[*str];
DBUG_ASSERT(wc[0] || !str[0]);
return 1;
}
static
......
......@@ -2404,46 +2404,33 @@ static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
static int my_uni_utf8 (CHARSET_INFO *cs __attribute__((unused)),
my_wc_t wc, uchar *r, uchar *e)
{
int count;
if (wc < 0x80)
{
if (r >= e)
return MY_CS_TOOSMALL;
if (wc < 0x80)
count = 1;
else if (wc < 0x800)
count = 2;
else if (wc < 0x10000)
count = 3;
#ifdef UNICODE_32BIT
else if (wc < 0x200000)
count = 4;
else if (wc < 0x4000000)
count = 5;
else if (wc <= 0x7fffffff)
count = 6;
#endif
else return MY_CS_ILUNI;
/*
e points to the position just past the end of the buffer r, not to its last
character. That is why the check is (r+count > e), not (r+count-1 > e).
*/
if ( r+count > e )
return MY_CS_TOOSMALLN(count);
switch (count) {
/* Fall through all cases!!! */
#ifdef UNICODE_32BIT
case 6: r[5] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x4000000;
case 5: r[4] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x200000;
case 4: r[3] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x10000;
#endif
case 3: r[2] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0x800;
case 2: r[1] = (uchar) (0x80 | (wc & 0x3f)); wc = wc >> 6; wc |= 0xc0;
case 1: r[0] = (uchar) wc;
*r= (uchar) wc;
return 1;
}
return count;
if (wc < 0x800)
{
if (r + 2 > e)
return MY_CS_TOOSMALLN(2);
/* U+0080..U+07FF: 00000xxx.xxyyyyyy -> 110xxxxx 10yyyyyy */
*r++= (uchar) (0xC0 | (wc >> 6));
*r= (uchar) (0x80 | (wc & 0x3F));
return 2;
}
if (wc < 0x10000)
{
if (r + 3 > e)
return MY_CS_TOOSMALLN(3);
/* U+0800..U+FFFF: xxxxyyyy.yyzzzzzz -> 1110xxxx 10yyyyyy 10zzzzzz */
*r++= (uchar) (0xE0 | (wc >> 12));
*r++= (uchar) (0x80 | ((wc >> 6) & 0x3f));
*r= (uchar) (0x80 | (wc & 0x3f));
return 3;
}
return MY_CS_ILUNI;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment