Commit 482d3bfd authored by Alexander Barkov's avatar Alexander Barkov

Backporting WL#3759 Optimize identifier conversion in client-server protocol

This patch provides performance improvements:
- send_fields() when character_set_results = latin1
  is now about twice as fast for column/table/database
  names consisting of ASCII characters.

Changes:

- Protocol doesn't use "convert" temporary buffer anymore,
  and converts strings directly to "packet".

- General conversion optimization: quick conversion
  of ASCII strings was added.

modified files:

include/m_ctype.h
- Adding a new flag.
- Adding a new function prototype

libmysqld/lib_sql.cc
- Adding quick conversion method for embedded library:
  conversion is now done directly to result buffer,
  without using a temporary buffer.

mysys/charset.c
- Mark all dynamic ucs2 character sets as non-ASCII
- Mark some dynamic 7bit and 8bit charsets as non-ASCII
  (for example swe7 is not fully ASCII compatible).

sql/protocol.cc
- Adding quick method to convert a string directly
  into protocol buffer, without using a temporary buffer.

sql/protocol.h
- Adding a new method prototype

sql/sql_string.cc
  Optimization for conversion between two ASCII-compatible charsets:
- quickly convert ASCII strings,
  switch to mb_wc->wc_mb method only when a non-ASCII character is met.
- copy four ASCII characters at once on i386

strings/conf_to_src.c
- Marking non-ASCII character sets with a flag.

strings/ctype-extra.c
- Regenerating ctype-extra.c by running "conf_to_src".

strings/ctype-uca.c
- Marking UCS2 character set as non-ASCII.

strings/ctype-ucs2.c
- Marking UCS2 character set as non-ASCII.

strings/ctype.c
- A new function to detect if a 7bit or 8bit character set
  is ascii compatible.
parent 3a5c4ab4
......@@ -87,6 +87,7 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
#define MY_CS_CSSORT 1024 /* if case sensitive sort order */
#define MY_CS_HIDDEN 2048 /* don't display in SHOW */
#define MY_CS_PUREASCII 4096 /* if a charset is pure ascii */
#define MY_CS_NONASCII 8192 /* if not ASCII-compatible */
#define MY_CHARSET_UNDEFINED 0
/* Character repertoire flags */
......@@ -474,6 +475,7 @@ my_bool my_charset_is_ascii_based(CHARSET_INFO *cs);
my_bool my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs);
uint my_charset_repertoire(CHARSET_INFO *cs);
my_bool my_charset_is_ascii_compatible(CHARSET_INFO *cs);
#define _MY_U 01 /* Upper case */
#define _MY_L 02 /* Lower case */
......
......@@ -1175,3 +1175,27 @@ int vprint_msg_to_log(enum loglevel level __attribute__((unused)),
mysql_server_last_errno= CR_UNKNOWN_ERROR;
return 0;
}
bool Protocol::net_store_data(const uchar *from, size_t length,
CHARSET_INFO *from_cs, CHARSET_INFO *to_cs)
{
uint conv_length= to_cs->mbmaxlen * length / from_cs->mbminlen;
uint dummy_error;
char *field_buf;
if (!thd->mysql) // bootstrap file handling
return false;
if (!(field_buf= (char*) alloc_root(alloc, conv_length + sizeof(uint) + 1)))
return true;
*next_field= field_buf + sizeof(uint);
length= copy_and_convert(*next_field, conv_length, to_cs,
(const char*) from, length, from_cs, &dummy_error);
*(uint *) field_buf= length;
(*next_field)[length]= 0;
if (next_mysql_field->max_length < length)
next_mysql_field->max_length= length;
++next_field;
++next_mysql_field;
return false;
}
......@@ -248,6 +248,7 @@ static int add_collation(CHARSET_INFO *cs)
{
#if defined(HAVE_CHARSET_ucs2) && defined(HAVE_UCA_COLLATIONS)
copy_uca_collation(newcs, &my_charset_ucs2_unicode_ci);
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
#endif
}
else if (!strcmp(cs->csname, "utf8"))
......@@ -280,6 +281,8 @@ static int add_collation(CHARSET_INFO *cs)
if (my_charset_is_8bit_pure_ascii(all_charsets[cs->number]))
all_charsets[cs->number]->state|= MY_CS_PUREASCII;
if (!my_charset_is_ascii_compatible(cs))
all_charsets[cs->number]->state|= MY_CS_NONASCII;
}
}
else
......
......@@ -58,6 +58,64 @@ bool Protocol_binary::net_store_data(const uchar *from, size_t length)
}
/*
net_store_data() - extended version with character set conversion.
It is optimized for short strings whose length after
conversion is guaranteed to be less than 251, which occupies
exactly one byte to store length. It allows not to use
the "convert" member as a temporary buffer, conversion
is done directly to the "packet" member.
The limit 251 is good enough to optimize send_result_set_metadata()
because column, table, database names fit into this limit.
*/
#ifndef EMBEDDED_LIBRARY
bool Protocol::net_store_data(const uchar *from, size_t length,
CHARSET_INFO *from_cs, CHARSET_INFO *to_cs)
{
uint dummy_errors;
/* Calculate maxumum possible result length */
uint conv_length= to_cs->mbmaxlen * length / from_cs->mbminlen;
if (conv_length > 250)
{
/*
For strings with conv_length greater than 250 bytes
we don't know how many bytes we will need to store length: one or two,
because we don't know result length until conversion is done.
For example, when converting from utf8 (mbmaxlen=3) to latin1,
conv_length=300 means that the result length can vary between 100 to 300.
length=100 needs one byte, length=300 needs to bytes.
Thus conversion directly to "packet" is not worthy.
Let's use "convert" as a temporary buffer.
*/
return (convert->copy((const char*) from, length, from_cs,
to_cs, &dummy_errors) ||
net_store_data((const uchar*) convert->ptr(), convert->length()));
}
ulong packet_length= packet->length();
ulong new_length= packet_length + conv_length + 1;
if (new_length > packet->alloced_length() && packet->realloc(new_length))
return 1;
char *length_pos= (char*) packet->ptr() + packet_length;
char *to= length_pos + 1;
to+= copy_and_convert(to, conv_length, to_cs,
(const char*) from, length, from_cs, &dummy_errors);
net_store_length((uchar*) length_pos, to - length_pos - 1);
packet->length((uint) (to - packet->ptr()));
return 0;
}
#endif
/**
Send a error string to client.
......@@ -827,10 +885,10 @@ bool Protocol::store_string_aux(const char *from, size_t length,
fromcs != &my_charset_bin &&
tocs != &my_charset_bin)
{
uint dummy_errors;
return (convert->copy(from, length, fromcs, tocs, &dummy_errors) ||
net_store_data((uchar*) convert->ptr(), convert->length()));
/* Store with conversion */
return net_store_data((uchar*) from, length, fromcs, tocs);
}
/* Store without conversion */
return net_store_data((uchar*) from, length);
}
......
......@@ -43,6 +43,8 @@ protected:
MYSQL_FIELD *next_mysql_field;
MEM_ROOT *alloc;
#endif
bool net_store_data(const uchar *from, size_t length,
CHARSET_INFO *fromcs, CHARSET_INFO *tocs);
bool store_string_aux(const char *from, size_t length,
CHARSET_INFO *fromcs, CHARSET_INFO *tocs);
public:
......
......@@ -794,9 +794,10 @@ String *copy_if_not_alloced(String *to,String *from,uint32 from_length)
*/
uint32
copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
const char *from, uint32 from_length, CHARSET_INFO *from_cs,
static uint32
copy_and_convert_extended(char *to, uint32 to_length, CHARSET_INFO *to_cs,
const char *from, uint32 from_length,
CHARSET_INFO *from_cs,
uint *errors)
{
int cnvres;
......@@ -849,6 +850,65 @@ outp:
}
/*
Optimized for quick copying of ASCII characters in the range 0x00..0x7F.
*/
/*
  Convert a string from from_cs to to_cs, with a fast path for bytes in
  the range 0x00..0x7F.

  Bytes are copied verbatim for as long as they are below 0x80. At the
  first byte above 0x7F the remaining input is handed over to
  copy_and_convert_extended(). When either character set carries
  MY_CS_NONASCII the whole job is delegated to copy_and_convert_extended().

  Returns the number of bytes copied verbatim plus whatever
  copy_and_convert_extended() returned for the rest. *errors is set to 0
  when the verbatim copy handled everything.
*/
uint32
copy_and_convert(char *to, uint32 to_length, CHARSET_INFO *to_cs,
                 const char *from, uint32 from_length, CHARSET_INFO *from_cs,
                 uint *errors)
{
  /* No shortcut unless both character sets are ASCII compatible */
  if ((to_cs->state | from_cs->state) & MY_CS_NONASCII)
    return copy_and_convert_extended(to, to_length, to_cs,
                                     from, from_length, from_cs, errors);

  /* The verbatim copy can never go past the shorter of the two buffers */
  uint32 ascii_span= min(to_length, from_length);
  uint32 remaining= ascii_span;

#if defined(__i386__)
  /*
    i386 only: non-aligned memory may be accessed as UINT32 there,
    so four bytes are tested and copied at once for as long as none
    of them has its high bit set.
  */
  while (remaining >= 4 && !((*(uint32*)from) & 0x80808080))
  {
    *((uint32*) to)= *((const uint32*) from);
    remaining-= 4;
    from+= 4;
    to+= 4;
  }
#endif

  /* Byte-by-byte copy, up to the first non-ASCII character */
  while (remaining)
  {
    if (*((unsigned char*) from) > 0x7F)
    {
      /* Let the mb_wc->wc_mb method finish from the current position */
      uint32 done= ascii_span - remaining;
      return done + copy_and_convert_extended(to, to_length - done,
                                              to_cs,
                                              from, from_length - done,
                                              from_cs,
                                              errors);
    }
    *to++= *from++;
    remaining--;
  }

  *errors= 0;
  return ascii_span;
}
/**
Copy string with HEX-encoding of "bad" characters.
......
......@@ -184,11 +184,12 @@ void dispcset(FILE *f,CHARSET_INFO *cs)
{
fprintf(f,"{\n");
fprintf(f," %d,%d,%d,\n",cs->number,0,0);
fprintf(f," MY_CS_COMPILED%s%s%s%s,\n",
fprintf(f," MY_CS_COMPILED%s%s%s%s%s,\n",
cs->state & MY_CS_BINSORT ? "|MY_CS_BINSORT" : "",
cs->state & MY_CS_PRIMARY ? "|MY_CS_PRIMARY" : "",
is_case_sensitive(cs) ? "|MY_CS_CSSORT" : "",
my_charset_is_8bit_pure_ascii(cs) ? "|MY_CS_PUREASCII" : "");
my_charset_is_8bit_pure_ascii(cs) ? "|MY_CS_PUREASCII" : "",
!my_charset_is_ascii_compatible(cs) ? "|MY_CS_NONASCII": "");
if (cs->name)
{
......
......@@ -6,7 +6,7 @@
./conf_to_src ../sql/share/charsets/ > FILE
*/
/* Copyright (C) 2000-2007 MySQL AB
/* Copyright 2000-2008 MySQL AB, 2008 Sun Microsystems, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -6804,7 +6804,7 @@ CHARSET_INFO compiled_charsets[] = {
#ifdef HAVE_CHARSET_swe7
{
10,0,0,
MY_CS_COMPILED|MY_CS_PRIMARY,
MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_NONASCII,
"swe7", /* cset name */
"swe7_swedish_ci", /* coll name */
"", /* comment */
......@@ -8454,7 +8454,7 @@ CHARSET_INFO compiled_charsets[] = {
#ifdef HAVE_CHARSET_swe7
{
82,0,0,
MY_CS_COMPILED|MY_CS_BINSORT,
MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_NONASCII,
"swe7", /* cset name */
"swe7_bin", /* coll name */
"", /* comment */
......@@ -8550,72 +8550,6 @@ CHARSET_INFO compiled_charsets[] = {
}
,
#endif
#ifdef HAVE_CHARSET_geostd8
{
92,0,0,
MY_CS_COMPILED|MY_CS_PRIMARY,
"geostd8", /* cset name */
"geostd8_general_ci", /* coll name */
"", /* comment */
NULL, /* tailoring */
ctype_geostd8_general_ci, /* ctype */
to_lower_geostd8_general_ci, /* lower */
to_upper_geostd8_general_ci, /* upper */
sort_order_geostd8_general_ci, /* sort_order */
NULL, /* contractions */
NULL, /* sort_order_big*/
to_uni_geostd8_general_ci, /* to_uni */
NULL, /* from_uni */
my_unicase_default, /* caseinfo */
NULL, /* state map */
NULL, /* ident map */
1, /* strxfrm_multiply*/
1, /* caseup_multiply*/
1, /* casedn_multiply*/
1, /* mbminlen */
1, /* mbmaxlen */
0, /* min_sort_char */
255, /* max_sort_char */
' ', /* pad_char */
0, /* escape_with_backslash_is_dangerous */
&my_charset_8bit_handler,
&my_collation_8bit_simple_ci_handler,
}
,
#endif
#ifdef HAVE_CHARSET_geostd8
{
93,0,0,
MY_CS_COMPILED|MY_CS_BINSORT,
"geostd8", /* cset name */
"geostd8_bin", /* coll name */
"", /* comment */
NULL, /* tailoring */
ctype_geostd8_bin, /* ctype */
to_lower_geostd8_bin, /* lower */
to_upper_geostd8_bin, /* upper */
NULL, /* sort_order */
NULL, /* contractions */
NULL, /* sort_order_big*/
to_uni_geostd8_bin, /* to_uni */
NULL, /* from_uni */
my_unicase_default, /* caseinfo */
NULL, /* state map */
NULL, /* ident map */
1, /* strxfrm_multiply*/
1, /* caseup_multiply*/
1, /* casedn_multiply*/
1, /* mbminlen */
1, /* mbmaxlen */
0, /* min_sort_char */
255, /* max_sort_char */
' ', /* pad_char */
0, /* escape_with_backslash_is_dangerous */
&my_charset_8bit_handler,
&my_collation_8bit_bin_handler,
}
,
#endif
#ifdef HAVE_CHARSET_latin1
{
94,0,0,
......
This diff is collapsed.
......@@ -1712,7 +1712,7 @@ MY_CHARSET_HANDLER my_charset_ucs2_handler=
CHARSET_INFO my_charset_ucs2_general_ci=
{
35,0,0, /* number */
MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE,
MY_CS_COMPILED|MY_CS_PRIMARY|MY_CS_STRNXFRM|MY_CS_UNICODE|MY_CS_NONASCII,
"ucs2", /* cs name */
"ucs2_general_ci", /* name */
"", /* comment */
......@@ -1744,7 +1744,7 @@ CHARSET_INFO my_charset_ucs2_general_ci=
CHARSET_INFO my_charset_ucs2_bin=
{
90,0,0, /* number */
MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE,
MY_CS_COMPILED|MY_CS_BINSORT|MY_CS_UNICODE|MY_CS_NONASCII,
"ucs2", /* cs name */
"ucs2_bin", /* name */
"", /* comment */
......
......@@ -405,3 +405,23 @@ my_charset_is_8bit_pure_ascii(CHARSET_INFO *cs)
}
return 1;
}
/*
Shared function between conf_to_src and mysys.
Check if a 8bit character set is compatible with
ascii on the range 0x00..0x7F.
*/
/*
  Return 1 when the character set either has no tab_to_uni table or
  its tab_to_uni table maps every code in 0..127 to itself;
  return 0 as soon as one of those codes maps to something else.
*/
my_bool
my_charset_is_ascii_compatible(CHARSET_INFO *cs)
{
  uint code= 0;

  if (cs->tab_to_uni)
  {
    /* Stop at the first code whose mapping is not the identity */
    while (code < 128)
    {
      if (cs->tab_to_uni[code] != code)
        return 0;
      code++;
    }
  }
  return 1;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment