Bug#20404: SHOW CREATE TABLE fails with Turkish I

Problem: SHOW CREATE TABLE printed garbage in the table name for tables having TURKISH I (i.e. LATIN CAPITAL LETTER I WITH DOT ABOVE) when lower_case_table_names=1. Reason: In some cases during lower/upper conversion in utf8, the result string can be shorter than the original string (this includes the above letter). The old implementation of caseup_str() and casedn_str() didn't handle the result length properly, assuming that the length cannot change. This fix changes the result type of cs->cset->casedn_str() and cs->cset->caseup_str() from VOID to UINT, to return the result length, and puts the '\0' terminator in the proper place. Also, my_caseup_str_utf8() and my_casedn_str_utf8() were rewritten not to use strlen(), for performance purposes. This was done by adding new functions - my_utf8_uni_no_range() and my_uni_utf8_no_range() - for null-terminated strings. include/m_ctype.h: Changing return type from void to uint for caseup_str() and casedn_str() mysql-test/r/lowercase_table.result: Adding test case mysql-test/t/lowercase_table.test: Adding test case sql/sql_parse.cc: Set table->table.length to result of my_casedn_str(). strings/ctype-bin.c: Changing return type from void to uint for caseup_str() and casedn_str() strings/ctype-mb.c: Changing return type from void to uint for caseup_str() and casedn_str() strings/ctype-simple.c: Changing return type from void to uint for caseup_str() and casedn_str() strings/ctype-ucs2.c: Changing return type from void to uint for caseup_str() and casedn_str() strings/ctype-utf8.c: Changing return type from void to uint for caseup_str() and casedn_str(). Optimization, to get rid of strlen(): Adding my_utf8_uni_no_range() and my_uni_utf8_no_range() - for null-terminated strings.

Bug#20404: SHOW CREATE TABLE fails with Turkish I
Problem: SHOW CREATE TABLE printed garbage in the table name for tables having TURKISH I (i.e. LATIN CAPITAL LETTER I WITH DOT ABOVE) when lower_case_table_names=1. Reason: In some cases during lower/upper conversion in utf8, the result string can be shorter than the original string (this includes the above letter). The old implementation of caseup_str() and casedn_str() didn't handle the result length properly, assuming that the length cannot change. This fix changes the result type of cs->cset->casedn_str() and cs->cset->caseup_str() from VOID to UINT, to return the result length, and puts the '\0' terminator in the proper place. Also, my_caseup_str_utf8() and my_casedn_str_utf8() were rewritten not to use strlen(), for performance purposes. This was done by adding new functions - my_utf8_uni_no_range() and my_uni_utf8_no_range() - for null-terminated strings. include/m_ctype.h: Changing return type from void to uint for caseup_str() and casedn_str() mysql-test/r/lowercase_table.result: Adding test case mysql-test/t/lowercase_table.test: Adding test case sql/sql_parse.cc: Set table->table.length to result of my_casedn_str(). strings/ctype-bin.c: Changing return type from void to uint for caseup_str() and casedn_str() strings/ctype-mb.c: Changing return type from void to uint for caseup_str() and casedn_str() strings/ctype-simple.c: Changing return type from void to uint for caseup_str() and casedn_str() strings/ctype-ucs2.c: Changing return type from void to uint for caseup_str() and casedn_str() strings/ctype-utf8.c: Changing return type from void to uint for caseup_str() and casedn_str(). Optimization, to get rid of strlen(): Adding my_utf8_uni_no_range() and my_uni_utf8_no_range() - for null-terminated strings.
e463ee94 · unknown · 4453dc48 · e463ee94 · e463ee94 · e463ee94
Commit e463ee94 authored Oct 30, 2006 by unknown
9 changed files
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -178,8 +178,8 @@ typedef struct my_charset_handler_st
 	       unsigned char *s,unsigned char *e);
  /* Functions for case and sort convertion */
-  void    (*caseup_str)(struct charset_info_st *, char *);
+  uint    (*caseup_str)(struct charset_info_st *, char *);
-  void    (*casedn_str)(struct charset_info_st *, char *);
+  uint    (*casedn_str)(struct charset_info_st *, char *);
  uint    (*caseup)(struct charset_info_st *, char *src, uint srclen,
                                              char *dst, uint dstlen);
  uint    (*casedn)(struct charset_info_st *, char *src, uint srclen,
@@ -311,8 +311,8 @@ extern uint my_instr_simple(struct charset_info_st *,
 /* Functions for 8bit */
-extern void my_caseup_str_8bit(CHARSET_INFO *, char *);
+extern uint my_caseup_str_8bit(CHARSET_INFO *, char *);
-extern void my_casedn_str_8bit(CHARSET_INFO *, char *);
+extern uint my_casedn_str_8bit(CHARSET_INFO *, char *);
 extern uint my_caseup_8bit(CHARSET_INFO *, char *src, uint srclen,
                                           char *dst, uint dstlen);
 extern uint my_casedn_8bit(CHARSET_INFO *, char *src, uint srclen,
@@ -399,8 +399,8 @@ int my_mbcharlen_8bit(CHARSET_INFO *, uint c);
 /* Functions for multibyte charsets */
-extern void my_caseup_str_mb(CHARSET_INFO *, char *);
+extern uint my_caseup_str_mb(CHARSET_INFO *, char *);
-extern void my_casedn_str_mb(CHARSET_INFO *, char *);
+extern uint my_casedn_str_mb(CHARSET_INFO *, char *);
 extern uint my_caseup_mb(CHARSET_INFO *, char *src, uint srclen,
                                         char *dst, uint dstlen);
 extern uint my_casedn_mb(CHARSET_INFO *, char *src, uint srclen,

--- a/mysql-test/r/lowercase_table.result
+++ b/mysql-test/r/lowercase_table.result
@@ -84,3 +84,27 @@ create table t2 like T1;
 drop table t1, t2;
 show tables;
 Tables_in_test
+set names utf8;
+drop table if exists İ,İİ;
+create table İ (s1 int);
+show create table İ;
+Table	Create Table
+İ	CREATE TABLE `i` (
+  `s1` int(11) default NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+show tables;
+Tables_in_test
+i
+drop table İ;
+create table İİ (s1 int);
+show create table İİ;
+Table	Create Table
+İİ	CREATE TABLE `ii` (
+  `s1` int(11) default NULL
+) ENGINE=MyISAM DEFAULT CHARSET=latin1
+show tables;
+Tables_in_test
+ii
+drop table İİ;
+set names latin1;
+End of 5.0 tests
--- a/mysql-test/t/lowercase_table.test
+++ b/mysql-test/t/lowercase_table.test
@@ -85,3 +85,23 @@ drop table t1, t2;
 show tables;
 # End of 4.1 tests
+#
+# Bug#20404: SHOW CREATE TABLE fails with Turkish I
+#
+set names utf8;
+--disable_warnings
+drop table if exists İ,İİ;
+--enable_warnings
+create table İ (s1 int);
+show create table İ;
+show tables;
+drop table İ;
+create table İİ (s1 int);
+show create table İİ;
+show tables;
+drop table İİ;
+set names latin1;
+--echo End of 5.0 tests
--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@ -6177,7 +6177,7 @@ TABLE_LIST *st_select_lex::add_table_to_list(THD *thd,
  ptr->alias= alias_str;
  if (lower_case_table_names && table->table.length)
-    my_casedn_str(files_charset_info, table->table.str);
+    table->table.length= my_casedn_str(files_charset_info, table->table.str);
  ptr->table_name=table->table.str;
  ptr->table_name_length=table->table.length;
  ptr->lock_type=   lock_type;

--- a/strings/ctype-bin.c
+++ b/strings/ctype-bin.c
@@ -211,9 +211,10 @@ static int my_strnncollsp_8bit_bin(CHARSET_INFO * cs __attribute__((unused)),
 /* This function is used for all conversion functions */
-static void my_case_str_bin(CHARSET_INFO *cs __attribute__((unused)),
+static uint my_case_str_bin(CHARSET_INFO *cs __attribute__((unused)),
 			    char *str __attribute__((unused)))
 {
+  return 0;
 }
 static uint my_case_bin(CHARSET_INFO *cs __attribute__((unused)),

--- a/strings/ctype-mb.c
+++ b/strings/ctype-mb.c
@@ -21,40 +21,44 @@
 #ifdef USE_MB
-void my_caseup_str_mb(CHARSET_INFO * cs, char *str)
+uint my_caseup_str_mb(CHARSET_INFO * cs, char *str)
 {
  register uint32 l;
-  register uchar *map=cs->to_upper;
+  register uchar *map= cs->to_upper;
+  char *str_orig= str;
  while (*str)
  {
    /* Pointing after the '\0' is safe here. */
-    if ((l=my_ismbchar(cs, str, str + cs->mbmaxlen)))
+    if ((l= my_ismbchar(cs, str, str + cs->mbmaxlen)))
-      str+=l;
+      str+= l;
    else
    { 
-      *str=(char) map[(uchar)*str];
+      *str= (char) map[(uchar)*str];
      str++;
    }
  }
+  return str - str_orig;
 }
-void my_casedn_str_mb(CHARSET_INFO * cs, char *str)
+uint my_casedn_str_mb(CHARSET_INFO * cs, char *str)
 {
  register uint32 l;
-  register uchar *map=cs->to_lower;
+  register uchar *map= cs->to_lower;
+  char *str_orig= str;
  while (*str)
  {
    /* Pointing after the '\0' is safe here. */
-    if ((l=my_ismbchar(cs, str, str + cs->mbmaxlen)))
+    if ((l= my_ismbchar(cs, str, str + cs->mbmaxlen)))
-      str+=l;
+      str+= l;
    else
    {
-      *str=(char) map[(uchar)*str];
+      *str= (char) map[(uchar)*str];
      str++;
    }
  }
+  return str - str_orig;
 }
 uint my_caseup_mb(CHARSET_INFO * cs, char *src, uint srclen,

--- a/strings/ctype-simple.c
+++ b/strings/ctype-simple.c
@@ -188,20 +188,26 @@ int my_strnncollsp_simple(CHARSET_INFO * cs, const uchar *a, uint a_length,
 }
-void my_caseup_str_8bit(CHARSET_INFO * cs,char *str)
+uint my_caseup_str_8bit(CHARSET_INFO * cs,char *str)
 {
-  register uchar *map=cs->to_upper;
+  register uchar *map= cs->to_upper;
-  while ((*str = (char) map[(uchar) *str]) != 0)
+  char *str_orig= str;
+  while ((*str= (char) map[(uchar) *str]) != 0)
    str++;
+  return str - str_orig;
 }
-void my_casedn_str_8bit(CHARSET_INFO * cs,char *str)
+uint my_casedn_str_8bit(CHARSET_INFO * cs,char *str)
 {
-  register uchar *map=cs->to_lower;
+  register uchar *map= cs->to_lower;
-  while ((*str = (char) map[(uchar)*str]) != 0)
+  char *str_orig= str;
+  while ((*str= (char) map[(uchar) *str]) != 0)
    str++;
+  return str - str_orig;
 }
 uint my_caseup_8bit(CHARSET_INFO * cs, char *src, uint srclen,
                    char *dst __attribute__((unused)),
                    uint dstlen __attribute__((unused)))

--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -159,13 +159,13 @@ static void my_hash_sort_ucs2(CHARSET_INFO *cs, const uchar *s, uint slen,
 }
-static void my_caseup_str_ucs2(CHARSET_INFO * cs  __attribute__((unused)), 
+static uint my_caseup_str_ucs2(CHARSET_INFO * cs  __attribute__((unused)), 
 			       char * s __attribute__((unused)))
 {
+  return 0;
 }
 static uint my_casedn_ucs2(CHARSET_INFO *cs, char *src, uint srclen,
                           char *dst __attribute__((unused)),
                           uint dstlen __attribute__((unused)))
@@ -188,9 +188,11 @@ static uint my_casedn_ucs2(CHARSET_INFO *cs, char *src, uint srclen,
  return srclen;
 }
-static void my_casedn_str_ucs2(CHARSET_INFO *cs __attribute__((unused)), 
+static uint my_casedn_str_ucs2(CHARSET_INFO *cs __attribute__((unused)), 
 			       char * s __attribute__((unused)))
 {
+  return 0;
 }

--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -2045,6 +2045,52 @@ static int my_utf8_uni(CHARSET_INFO *cs __attribute__((unused)),
  return MY_CS_ILSEQ;
 }
+/*
+  The same as above, but without range check
+  for example, for a null-terminated string
+*/
+static int my_utf8_uni_no_range(CHARSET_INFO *cs __attribute__((unused)),
+                                my_wc_t * pwc, const uchar *s)
+{
+  unsigned char c;
+  c= s[0];
+  if (c < 0x80)
+  {
+    *pwc = c;
+    return 1;
+  }
+  if (c < 0xc2)
+    return MY_CS_ILSEQ;
+  if (c < 0xe0)
+  {
+    if (!((s[1] ^ 0x80) < 0x40))
+      return MY_CS_ILSEQ;
+    *pwc = ((my_wc_t) (c & 0x1f) << 6) | (my_wc_t) (s[1] ^ 0x80);
+    return 2;
+  }
+  if (c < 0xf0)
+  {
+    if (!((s[1] ^ 0x80) < 0x40 &&
+          (s[2] ^ 0x80) < 0x40 &&
+          (c >= 0xe1 || s[1] >= 0xa0)))
+      return MY_CS_ILSEQ;
+    *pwc= ((my_wc_t) (c & 0x0f) << 12)   |
+          ((my_wc_t) (s[1] ^ 0x80) << 6) |
+           (my_wc_t) (s[2] ^ 0x80);
+    return 3;
+  }
+  return MY_CS_ILSEQ;
+}
 static int my_uni_utf8 (CHARSET_INFO *cs __attribute__((unused)) ,
                 my_wc_t wc, uchar *r, uchar *e)
 {
@@ -2091,6 +2137,34 @@ static int my_uni_utf8 (CHARSET_INFO *cs __attribute__((unused)) ,
 }
+/*
+  The same as above, but without range check.
+*/
+static int my_uni_utf8_no_range(CHARSET_INFO *cs __attribute__((unused)),
+                                my_wc_t wc, uchar *r)
+{
+  int count;
+  if (wc < 0x80)
+    count= 1;
+  else if (wc < 0x800)
+    count= 2;
+  else if (wc < 0x10000)
+    count= 3;
+  else
+    return MY_CS_ILUNI;
+  switch (count)
+  {
+    /* Fall through all cases!!! */
+    case 3: r[2]= (uchar) (0x80 | (wc & 0x3f)); wc= wc >> 6; wc |= 0x800;
+    case 2: r[1]= (uchar) (0x80 | (wc & 0x3f)); wc= wc >> 6; wc |= 0xc0;
+    case 1: r[0]= (uchar) wc;
+  }
+  return count;
+}
 static uint my_caseup_utf8(CHARSET_INFO *cs, char *src, uint srclen,
                                             char *dst, uint dstlen)
 {
@@ -2141,10 +2215,26 @@ static void my_hash_sort_utf8(CHARSET_INFO *cs, const uchar *s, uint slen,
 }
-static void my_caseup_str_utf8(CHARSET_INFO * cs, char * s)
+static uint my_caseup_str_utf8(CHARSET_INFO *cs, char *src)
 {
-  uint len= (uint) strlen(s);
+  my_wc_t wc;
-  my_caseup_utf8(cs, s, len, s, len);
+  int srcres, dstres;
+  char *dst= src, *dst0= src;
+  MY_UNICASE_INFO **uni_plane= cs->caseinfo;
+  DBUG_ASSERT(cs->caseup_multiply == 1);
+  while (*src &&
+         (srcres= my_utf8_uni_no_range(cs, &wc, (uchar *) src)) > 0)
+  {
+    int plane= (wc>>8) & 0xFF;
+    wc= uni_plane[plane] ? uni_plane[plane][wc & 0xFF].toupper : wc;
+    if ((dstres= my_uni_utf8_no_range(cs, wc, (uchar*) dst)) <= 0)
+      break;
+    src+= srcres;
+    dst+= dstres;
+  }
+  *dst= '\0';
+  return (uint) (dst - dst0);
 }
@@ -2170,10 +2260,43 @@ static uint my_casedn_utf8(CHARSET_INFO *cs, char *src, uint srclen,
  return (uint) (dst - dst0);
 }
-static void my_casedn_str_utf8(CHARSET_INFO *cs, char * s)
+static uint my_casedn_str_utf8(CHARSET_INFO *cs, char *src)
 {
-  uint len= (uint) strlen(s);
+  my_wc_t wc;
-  my_casedn_utf8(cs, s, len, s, len);
+  int srcres, dstres;
+  char *dst= src, *dst0= src;
+  MY_UNICASE_INFO **uni_plane= cs->caseinfo;
+  DBUG_ASSERT(cs->casedn_multiply == 1);
+  while (*src &&
+         (srcres= my_utf8_uni_no_range(cs, &wc, (uchar *) src)) > 0)
+  {
+    int plane= (wc>>8) & 0xFF;
+    wc= uni_plane[plane] ? uni_plane[plane][wc & 0xFF].tolower : wc;
+    if ((dstres= my_uni_utf8_no_range(cs, wc, (uchar*) dst)) <= 0)
+      break;
+    src+= srcres;
+    dst+= dstres;
+  }
+  /*
+   In rare cases lower string can be shorter than
+   the original string, for example:
+   "U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE"
+   (which is 0xC4B0 in utf8, i.e. two bytes)
+   is converted into
+   "U+0069 LATIN SMALL LETTER I"
+   (which is 0x69 in utf8, i.e. one byte)
+   So, we need to put '\0' terminator after converting.
+  */
+  *dst= '\0';
+  return (uint) (dst - dst0);
 }