Bug #6040 can't retrieve records with umlaut characters in case insensitive manner

1596d465 · bar@mysql.com · a95c1173 · 1596d465 · 1596d465 · 1596d465
Commit 1596d465 authored Oct 18, 2004 by bar@mysql.com
5 changed files
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -365,6 +365,11 @@ uint my_instr_mb(struct charset_info_st *,
                 const char *s, uint s_length,
                 my_match_t *match, uint nmatch);

+int my_wildcmp_unicode(CHARSET_INFO *cs,
+                       const char *str, const char *str_end,
+                       const char *wildstr, const char *wildend,
+                       int escape, int w_one, int w_many,
+                       MY_UNICASE_INFO **weights);

 extern my_bool my_parse_charset_xml(const char *bug, uint len,
 				    int (*add)(CHARSET_INFO *cs));

--- a/mysql-test/r/ctype_utf8.result
+++ b/mysql-test/r/ctype_utf8.result
@@ -63,6 +63,15 @@ select 'A' like 'a' collate utf8_bin;
 select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%');
 _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%')
 1
+select convert(_latin1'Gnter Andr' using utf8) like CONVERT(_latin1'GNTER%' USING utf8);
+convert(_latin1'Gnter Andr' using utf8) like CONVERT(_latin1'GNTER%' USING utf8)
+1
+select CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8);
+CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8)
+1
+select CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8);
+CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8)
+1
 SELECT 'a' = 'a ';
 'a' = 'a '
 1

--- a/mysql-test/t/ctype_utf8.test
+++ b/mysql-test/t/ctype_utf8.test
@@ -33,6 +33,14 @@ select 'A' like 'a';
 select 'A' like 'a' collate utf8_bin;
 select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%');

+# Bug #6040: can't retrieve records with umlaut
+# characters in case insensitive manner.
+# Case insensitive search LIKE comparison
+# was broken for multibyte characters:
+select convert(_latin1'Gnter Andr' using utf8) like CONVERT(_latin1'GNTER%' USING utf8);
+select CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8);
+select CONVERT(_koi8r'' USING utf8) LIKE CONVERT(_koi8r'' USING utf8);
+
 #
 # Check the following:
 # "a"  == "a "

--- a/strings/ctype-ucs2.c
+++ b/strings/ctype-ucs2.c
@@ -1231,172 +1231,14 @@ uint my_lengthsp_ucs2(CHARSET_INFO *cs __attribute__((unused)),
 }


-/*
-** Compare string against string with wildcard
-**	0 if matched
-**	-1 if not matched with wildcard
-**	 1 if matched with wildcard
-*/
-
-static
-int my_wildcmp_ucs2(CHARSET_INFO *cs,
-		    const char *str,const char *str_end,
-		    const char *wildstr,const char *wildend,
-		    int escape, int w_one, int w_many,
-		    MY_UNICASE_INFO **weights)
-{
-  int result= -1;			/* Not found, using wildcards */
-  my_wc_t s_wc, w_wc;
-  int scan, plane;
-  
-  while (wildstr != wildend)
-  {
-    
-    while (1)
-    {
-      scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			(const uchar*)wildend);
-      if (scan <= 0)
-        return 1;
-      
-      if (w_wc ==  (my_wc_t)escape)
-      {
-        wildstr+= scan;
-        scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			  (const uchar*)wildend);
-        if (scan <= 0)
-          return 1;
-      }
-      
-      if (w_wc == (my_wc_t)w_many)
-      {
-        result= 1;				/* Found an anchor char */
-        break;
-      }
-      
-      wildstr+= scan;
-      scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str, (const uchar*)str_end);
-      if (scan <=0)
-        return 1;
-      str+= scan;
-      
-      if (w_wc == (my_wc_t)w_one)
-      {
-        result= 1;				/* Found an anchor char */
-      }
-      else
-      {
-        if (weights)
-        {
-          plane=(s_wc>>8) & 0xFF;
-          s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
-          plane=(w_wc>>8) & 0xFF;
-          w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
-        }
-        if (s_wc != w_wc)
-          return 1;				/* No match */
-      }
-      if (wildstr == wildend)
-	return (str != str_end);		/* Match if both are at end */
-    }
-    
-    
-    if (w_wc == (my_wc_t)w_many)
-    {						/* Found w_many */
-    
-      /* Remove any '%' and '_' from the wild search string */
-      for ( ; wildstr != wildend ; )
-      {
-        scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			  (const uchar*)wildend);
-        if (scan <= 0)
-          return 1;
-        
-	if (w_wc == (my_wc_t)w_many)
-	{
-	  wildstr+= scan;
-	  continue;
-	} 
-	
-	if (w_wc == (my_wc_t)w_one)
-	{
-	  wildstr+= scan;
-	  scan= my_ucs2_uni(cs, &s_wc, (const uchar*)str,
-			    (const uchar*)str_end);
-          if (scan <=0)
-            return 1;
-          str+= scan;
-	  continue;
-	}
-	break;					/* Not a wild character */
-      }
-      
-      if (wildstr == wildend)
-	return 0;				/* Ok if w_many is last */
-      
-      if (str == str_end)
-	return -1;
-      
-      scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			(const uchar*)wildend);
-      if (scan <= 0)
-        return 1;
-      
-      if (w_wc ==  (my_wc_t)escape)
-      {
-        wildstr+= scan;
-        scan= my_ucs2_uni(cs,&w_wc, (const uchar*)wildstr,
-			  (const uchar*)wildend);
-        if (scan <= 0)
-          return 1;
-      }
-      
-      while (1)
-      {
-        /* Skip until the first character from wildstr is found */
-        while (str != str_end)
-        {
-          scan= my_ucs2_uni(cs,&s_wc, (const uchar*)str,
-			    (const uchar*)str_end);
-          if (scan <= 0)
-            return 1;
-          if (weights)
-          {
-            plane=(s_wc>>8) & 0xFF;
-            s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
-            plane=(w_wc>>8) & 0xFF;
-            w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
-          }
-          
-          if (s_wc == w_wc)
-            break;
-          str+= scan;
-        }
-        if (str == str_end)
-          return -1;
-        
-        result= my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,escape,
-                                w_one,w_many,weights);
-        
-        if (result <= 0)
-          return result;
-        
-        str+= scan;
-      } 
-    }
-  }
-  return (str != str_end ? 1 : 0);
-}
-
-
 static
 int my_wildcmp_ucs2_ci(CHARSET_INFO *cs,
 		    const char *str,const char *str_end,
 		    const char *wildstr,const char *wildend,
 		    int escape, int w_one, int w_many)
 {
-  return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,
-                         escape,w_one,w_many,uni_plane); 
+  return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
+                            escape,w_one,w_many,uni_plane); 
 }


@@ -1406,8 +1248,8 @@ int my_wildcmp_ucs2_bin(CHARSET_INFO *cs,
 		    const char *wildstr,const char *wildend,
 		    int escape, int w_one, int w_many)
 {
-  return my_wildcmp_ucs2(cs,str,str_end,wildstr,wildend,
-                         escape,w_one,w_many,NULL); 
+  return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
+                            escape,w_one,w_many,NULL); 
 }



--- a/strings/ctype-utf8.c
+++ b/strings/ctype-utf8.c
@@ -1518,6 +1518,161 @@ MY_UNICASE_INFO *uni_plane[256]={

 };

+
+/*
+** Compare string against string with wildcard
+** This function is used in UTF8 and UCS2
+**
+**	0 if matched
+**	-1 if not matched with wildcard
+**	 1 if matched with wildcard
+*/
+
+int my_wildcmp_unicode(CHARSET_INFO *cs,
+		       const char *str,const char *str_end,
+		       const char *wildstr,const char *wildend,
+		       int escape, int w_one, int w_many,
+		       MY_UNICASE_INFO **weights)
+{
+  int result= -1;			/* Not found, using wildcards */
+  my_wc_t s_wc, w_wc;
+  int scan, plane;
+  int (*mb_wc)(struct charset_info_st *cs, my_wc_t *wc,
+               const unsigned char *s,const unsigned char *e);
+  mb_wc= cs->cset->mb_wc;
+  
+  while (wildstr != wildend)
+  {
+    while (1)
+    {
+      if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                       (const uchar*)wildend)) <= 0)
+        return 1;
+      
+      if (w_wc ==  (my_wc_t)escape)
+      {
+        wildstr+= scan;
+        if ((scan= mb_wc(cs,&w_wc, (const uchar*)wildstr,
+                         (const uchar*)wildend)) <= 0)
+          return 1;
+      }
+      
+      if (w_wc == (my_wc_t)w_many)
+      {
+        result= 1;				/* Found an anchor char */
+        break;
+      }
+      
+      wildstr+= scan;
+      if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
+                       (const uchar*)str_end)) <=0)
+        return 1;
+      str+= scan;
+      
+      if (w_wc == (my_wc_t)w_one)
+      {
+        result= 1;				/* Found an anchor char */
+      }
+      else
+      {
+        if (weights)
+        {
+          plane=(s_wc>>8) & 0xFF;
+          s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
+          plane=(w_wc>>8) & 0xFF;
+          w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
+        }
+        if (s_wc != w_wc)
+          return 1;				/* No match */
+      }
+      if (wildstr == wildend)
+	return (str != str_end);		/* Match if both are at end */
+    }
+    
+    
+    if (w_wc == (my_wc_t)w_many)
+    {						/* Found w_many */
+    
+      /* Remove any '%' and '_' from the wild search string */
+      for ( ; wildstr != wildend ; )
+      {
+        if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                         (const uchar*)wildend)) <= 0)
+          return 1;
+        
+	if (w_wc == (my_wc_t)w_many)
+	{
+	  wildstr+= scan;
+	  continue;
+	} 
+	
+	if (w_wc == (my_wc_t)w_one)
+	{
+	  wildstr+= scan;
+          if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
+                           (const uchar*)str_end)) <=0)
+            return 1;
+          str+= scan;
+	  continue;
+	}
+	break;					/* Not a wild character */
+      }
+      
+      if (wildstr == wildend)
+	return 0;				/* Ok if w_many is last */
+      
+      if (str == str_end)
+	return -1;
+      
+      if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                       (const uchar*)wildend)) <=0)
+        return 1;
+      
+      if (w_wc ==  (my_wc_t)escape)
+      {
+        wildstr+= scan;
+        if ((scan= mb_wc(cs, &w_wc, (const uchar*)wildstr,
+                         (const uchar*)wildend)) <=0)
+          return 1;
+      }
+      
+      while (1)
+      {
+        /* Skip until the first character from wildstr is found */
+        while (str != str_end)
+        {
+          if ((scan= mb_wc(cs, &s_wc, (const uchar*)str,
+                           (const uchar*)str_end)) <=0)
+            return 1;
+          if (weights)
+          {
+            plane=(s_wc>>8) & 0xFF;
+            s_wc = weights[plane] ? weights[plane][s_wc & 0xFF].sort : s_wc;
+            plane=(w_wc>>8) & 0xFF;
+            w_wc = weights[plane] ? weights[plane][w_wc & 0xFF].sort : w_wc;
+          }
+          
+          if (s_wc == w_wc)
+            break;
+          str+= scan;
+        }
+        if (str == str_end)
+          return -1;
+        
+        result= my_wildcmp_unicode(cs, str, str_end, wildstr, wildend,
+                                   escape, w_one, w_many,
+                                   weights);
+        
+        if (result <= 0)
+          return result;
+        
+        str+= scan;
+      } 
+    }
+  }
+  return (str != str_end ? 1 : 0);
+}
+
 #endif


@@ -1992,6 +2147,17 @@ static int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t)
  return  my_strncasecmp_utf8(cs, s, t, len);
 }

+static
+int my_wildcmp_utf8(CHARSET_INFO *cs,
+		    const char *str,const char *str_end,
+		    const char *wildstr,const char *wildend,
+		    int escape, int w_one, int w_many)
+{
+  return my_wildcmp_unicode(cs,str,str_end,wildstr,wildend,
+                            escape,w_one,w_many,uni_plane); 
+}
+
+
 static int my_strnxfrm_utf8(CHARSET_INFO *cs,
                            uchar *dst, uint dstlen,
                            const uchar *src, uint srclen)
@@ -2060,7 +2226,7 @@ static MY_COLLATION_HANDLER my_collation_ci_handler =
    my_strnncollsp_utf8,
    my_strnxfrm_utf8,
    my_like_range_mb,
-    my_wildcmp_mb,
+    my_wildcmp_utf8,
    my_strcasecmp_utf8,
    my_instr_mb,
    my_hash_sort_utf8