Commit 24044566 authored by Alexander Barkov

Merging my_convert() from 10.0-serg

modified:
  include/m_ctype.h
  mysys/ma_dyncol.c
  mysys/string.c
  sql/sql_string.cc
  sql/sql_string.h
  strings/ctype.c
parent 5f6380ad
...@@ -591,6 +591,10 @@ my_bool my_charset_is_ascii_compatible(CHARSET_INFO *cs); ...@@ -591,6 +591,10 @@ my_bool my_charset_is_ascii_compatible(CHARSET_INFO *cs);
extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n, extern size_t my_vsnprintf_ex(CHARSET_INFO *cs, char *to, size_t n,
const char* fmt, va_list ap); const char* fmt, va_list ap);
/*
  Convert the string 'from' (from_length bytes, character set from_cs)
  into character set to_cs, storing the result in 'to' (a buffer of
  to_length bytes). The number of conversion errors is stored in *errors.
  Returns the number of bytes written to 'to'.
*/
uint32 my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
const char *from, uint32 from_length,
CHARSET_INFO *from_cs, uint *errors);
#define _MY_U 01 /* Upper case */ #define _MY_U 01 /* Upper case */
#define _MY_L 02 /* Lower case */ #define _MY_L 02 /* Lower case */
#define _MY_NMR 04 /* Numeral (digit) */ #define _MY_NMR 04 /* Numeral (digit) */
......
...@@ -3853,20 +3853,19 @@ mariadb_dyncol_val_str(DYNAMIC_STRING *str, DYNAMIC_COLUMN_VALUE *val, ...@@ -3853,20 +3853,19 @@ mariadb_dyncol_val_str(DYNAMIC_STRING *str, DYNAMIC_COLUMN_VALUE *val,
if (!quote) if (!quote)
{ {
/* convert to the destination */ /* convert to the destination */
str->length+= copy_and_convert_extended(str->str, bufflen, str->length+= my_convert(str->str, bufflen,
cs, cs,
from, (uint32)len, from, (uint32)len,
val->x.string.charset, val->x.string.charset,
&dummy_errors); &dummy_errors);
return ER_DYNCOL_OK; return ER_DYNCOL_OK;
} }
if ((alloc= (char *)my_malloc(bufflen, MYF(0)))) if ((alloc= (char *)my_malloc(bufflen, MYF(0))))
{ {
len= len= my_convert(alloc, bufflen, cs,
copy_and_convert_extended(alloc, bufflen, cs, from, (uint32)len,
from, (uint32)len, val->x.string.charset,
val->x.string.charset, &dummy_errors);
&dummy_errors);
from= alloc; from= alloc;
} }
else else
......
...@@ -223,77 +223,3 @@ void dynstr_reassociate(DYNAMIC_STRING *str, char **ptr, size_t *length, ...@@ -223,77 +223,3 @@ void dynstr_reassociate(DYNAMIC_STRING *str, char **ptr, size_t *length,
*alloc_length= str->max_length; *alloc_length= str->max_length;
str->str=0; str->str=0;
} }
/*
  Convert a string from one character set to another, one character
  at a time, through the from_cs->cset->mb_wc and to_cs->cset->wc_mb
  handlers.

  SYNOPSIS
    copy_and_convert_extended()
    to            Store result here
    to_length     Size of the 'to' buffer, in bytes
    to_cs         Character set of result string
    from          Copy from here
    from_length   Length of from string
    from_cs       From character set
    errors        OUT: number of conversion errors counted

  NOTES
    'to' must be big enough as from_length * to_cs->mbmaxlen

    Every character that cannot be decoded or encoded is replaced by '?'
    and counted as one error.

  RETURN
    length of bytes copied to 'to'
*/

uint32
copy_and_convert_extended(char *to, uint32 to_length, CHARSET_INFO *to_cs,
                          const char *from, uint32 from_length,
                          CHARSET_INFO *from_cs,
                          uint *errors)
{
  my_charset_conv_mb_wc decode= from_cs->cset->mb_wc;
  my_charset_conv_wc_mb encode= to_cs->cset->wc_mb;
  const uchar *src_end= (const uchar*) from+from_length;
  uchar *dst_end= (uchar*) to+to_length;
  char *dst_begin= to;
  uint bad_chars= 0;
  my_wc_t wc;
  int res;

  for (;;)
  {
    /* Step 1: decode one character from the source string. */
    res= (*decode)(from_cs, &wc, (uchar*) from, src_end);
    if (res > 0)
      from+= res;
    else if (res == MY_CS_ILSEQ || res > MY_CS_TOOSMALL)
    {
      /*
        MY_CS_ILSEQ: skip a single byte.
        Otherwise a correct multibyte sequence was detected that has no
        Unicode mapping; its length is reported as -res.
        Either way the character is replaced by '?'.
      */
      from+= (res == MY_CS_ILSEQ) ? 1 : -res;
      wc= '?';
      bad_chars++;
    }
    else
      break;                                    /* Not enough characters */

    /* Step 2: encode the character into the destination buffer. */
    res= (*encode)(to_cs, wc, (uchar*) to, dst_end);
    if (res <= 0)
    {
      /* Only MY_CS_ILUNI on a character other than '?' is retried. */
      if (res != MY_CS_ILUNI || wc == '?')
        break;
      bad_chars++;
      wc= '?';
      res= (*encode)(to_cs, wc, (uchar*) to, dst_end);
      if (res <= 0)
        break;
    }
    to+= res;
  }

  *errors= bad_chars;
  return (uint32) (to - dst_begin);
}
...@@ -776,67 +776,6 @@ String *copy_if_not_alloced(String *to,String *from,uint32 from_length) ...@@ -776,67 +776,6 @@ String *copy_if_not_alloced(String *to,String *from,uint32 from_length)
Help functions Help functions
****************************************************************************/ ****************************************************************************/
/*
  Convert a string between two character sets.

  Optimized for quick copying of ASCII characters in the range 0x00..0x7F:
  the leading run of ASCII bytes is copied verbatim, and only the rest of
  the string (starting at the first byte above 0x7F) is passed to
  copy_and_convert_extended().

  Returns the number of bytes stored in 'to'; *errors receives the number
  of conversion errors (0 when the whole copied part was ASCII).
*/

uint32
copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
                 const char *from, uint32 from_length, CHARSET_INFO *from_cs,
                 uint *errors)
{
  /*
    The byte-copy shortcut is only valid when both character sets are
    ASCII compatible; otherwise go straight to the slow
    mb_wc->wc_mb method.
  */
  if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
    return copy_and_convert_extended(to, to_length, to_cs,
                                     from, from_length, from_cs, errors);

  const uint32 ascii_limit= min(to_length, from_length);
  uint32 remaining= ascii_limit;

#if defined(__i386__) || defined(__x86_64__)
  /*
    On i386/x86_64 a non-aligned memory block can be referred to as
    UINT32, so four bytes are tested and copied at once. According to
    the original author this gives about 10% performance improvement
    compared to the byte-by-byte loop.
  */
  while (remaining >= 4 && !((*(uint32*)from) & 0x80808080))
  {
    *((uint32*) to)= *((const uint32*) from);
    remaining-= 4;
    from+= 4;
    to+= 4;
  }
#endif

  /* Byte-by-byte copy up to the first non-ASCII character. */
  while (remaining && *((unsigned char*) from) <= 0x7F)
  {
    *to++= *from++;
    remaining--;
  }

  if (!remaining)
  {
    /* Everything that fits was plain ASCII. */
    *errors= 0;
    return ascii_limit;
  }

  /* A non-ASCII character: convert the tail with the generic routine. */
  const uint32 copied_length= ascii_limit - remaining;
  return copied_length + copy_and_convert_extended(to,
                                                   to_length - copied_length,
                                                   to_cs,
                                                   from,
                                                   from_length - copied_length,
                                                   from_cs,
                                                   errors);
}
/** /**
Copy string with HEX-encoding of "bad" characters. Copy string with HEX-encoding of "bad" characters.
......
...@@ -34,9 +34,13 @@ typedef struct st_mem_root MEM_ROOT; ...@@ -34,9 +34,13 @@ typedef struct st_mem_root MEM_ROOT;
int sortcmp(const String *a,const String *b, CHARSET_INFO *cs); int sortcmp(const String *a,const String *b, CHARSET_INFO *cs);
String *copy_if_not_alloced(String *a,String *b,uint32 arg_length); String *copy_if_not_alloced(String *a,String *b,uint32 arg_length);
uint32 copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs, inline uint32 copy_and_convert(char *to, uint32 to_length,
const char *from, uint32 from_length, const CHARSET_INFO *to_cs,
CHARSET_INFO *from_cs, uint *errors); const char *from, uint32 from_length,
const CHARSET_INFO *from_cs, uint *errors)
{
return my_convert(to, to_length, to_cs, from, from_length, from_cs, errors);
}
uint32 well_formed_copy_nchars(CHARSET_INFO *to_cs, uint32 well_formed_copy_nchars(CHARSET_INFO *to_cs,
char *to, uint to_length, char *to, uint to_length,
CHARSET_INFO *from_cs, CHARSET_INFO *from_cs,
......
...@@ -430,3 +430,144 @@ my_charset_is_ascii_compatible(CHARSET_INFO *cs) ...@@ -430,3 +430,144 @@ my_charset_is_ascii_compatible(CHARSET_INFO *cs)
} }
return 1; return 1;
} }
/*
  Convert a string between two character sets.
  'to' must be large enough to store (from_length * to_cs->mbmaxlen) bytes.

  Characters are decoded with from_cs->cset->mb_wc and encoded with
  to_cs->cset->wc_mb one at a time. A character that cannot be decoded
  or encoded is replaced by '?' and counted as an error.

  @param  to[OUT]      Store result here
  @param  to_length    Size of "to" buffer
  @param  to_cs        Character set of result string
  @param  from         Copy from here
  @param  from_length  Length of the "from" string
  @param  from_cs      Character set of the "from" string
  @param  errors[OUT]  Number of conversion errors

  @return Number of bytes copied to 'to' string
*/

static uint32
my_convert_internal(char *to, uint32 to_length,
                    CHARSET_INFO *to_cs,
                    const char *from, uint32 from_length,
                    CHARSET_INFO *from_cs, uint *errors)
{
  my_charset_conv_mb_wc read_char= from_cs->cset->mb_wc;
  my_charset_conv_wc_mb write_char= to_cs->cset->wc_mb;
  const uchar *input_end= (const uchar*) from + from_length;
  uchar *output_end= (uchar*) to + to_length;
  char *output_start= to;
  uint nerrors= 0;
  my_wc_t wc;

  for (;;)
  {
    int nread, nwritten;

    nread= (*read_char)(from_cs, &wc, (uchar*) from, input_end);
    if (nread > 0)
    {
      from+= nread;
    }
    else
    {
      if (nread != MY_CS_ILSEQ && nread <= MY_CS_TOOSMALL)
        break;                                  /* Not enough characters */
      /*
        MY_CS_ILSEQ: step over one byte.
        Any other value here means a correct multibyte sequence was
        detected which doesn't have Unicode mapping; -nread is its length.
      */
      if (nread == MY_CS_ILSEQ)
        from++;
      else
        from+= (-nread);
      nerrors++;
      wc= '?';
    }

    nwritten= (*write_char)(to_cs, wc, (uchar*) to, output_end);
    if (nwritten <= 0)
    {
      /* Give up unless the failure is MY_CS_ILUNI on a real character. */
      if (nwritten != MY_CS_ILUNI || wc == '?')
        break;
      /* Retry exactly once, with '?' in place of the character. */
      nerrors++;
      wc= '?';
      nwritten= (*write_char)(to_cs, wc, (uchar*) to, output_end);
      if (nwritten <= 0)
        break;
    }
    to+= nwritten;
  }

  *errors= nerrors;
  return (uint32) (to - output_start);
}
/*
  Convert a string between two character sets.
  Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
  'to' must be large enough to store (from_length * to_cs->mbmaxlen) bytes.

  When both character sets are ASCII compatible, the leading run of
  ASCII bytes is copied verbatim and only the remainder, starting at the
  first byte above 0x7F, is handed to my_convert_internal().

  @param  to[OUT]      Store result here
  @param  to_length    Size of "to" buffer
  @param  to_cs        Character set of result string
  @param  from         Copy from here
  @param  from_length  Length of the "from" string
  @param  from_cs      Character set of the "from" string
  @param  errors[OUT]  Number of conversion errors

  @return Number of bytes copied to 'to' string
*/

uint32
my_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
           const char *from, uint32 from_length,
           CHARSET_INFO *from_cs, uint *errors)
{
  uint32 ascii_limit, remaining, copied_length;

  /*
    The byte-copy shortcut requires both character sets to be ASCII
    compatible; otherwise use the slow mb_wc->wc_mb method right away.
  */
  if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
    return my_convert_internal(to, to_length, to_cs,
                               from, from_length, from_cs, errors);

  ascii_limit= MY_MIN(to_length, from_length);
  remaining= ascii_limit;

#if defined(__i386__) || defined(__x86_64__)
  /*
    On i386/x86_64 a non-aligned memory block can be referred to as
    UINT32, so four bytes are tested and copied at once. According to
    the original author this gives about 10% performance improvement
    compared to the byte-by-byte loop.
  */
  while (remaining >= 4 && !((*(uint32*)from) & 0x80808080))
  {
    *((uint32*) to)= *((const uint32*) from);
    remaining-= 4;
    from+= 4;
    to+= 4;
  }
#endif /* __i386__ || __x86_64__ */

  /* Byte-by-byte copy up to the first non-ASCII character. */
  while (remaining && *((unsigned char*) from) <= 0x7F)
  {
    *to++= *from++;
    remaining--;
  }

  if (!remaining)
  {
    /* Everything that fits was plain ASCII. */
    *errors= 0;
    return ascii_limit;
  }

  /* A non-ASCII character: convert the tail with the generic routine. */
  copied_length= ascii_limit - remaining;
  return copied_length + my_convert_internal(to, to_length - copied_length,
                                             to_cs,
                                             from, from_length - copied_length,
                                             from_cs,
                                             errors);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment