MDEV-11255 LDML: allow defining 2-level UCA collations

0259b3cb · Alexander Barkov · 90c5b2f5 · 0259b3cb · 0259b3cb · 0259b3cb
Commit 0259b3cb authored Nov 08, 2016 by Alexander Barkov
7 changed files
--- a/mysql-test/r/ctype_ldml.result
+++ b/mysql-test/r/ctype_ldml.result
--- a/mysql-test/std_data/ldml/Index.xml
+++ b/mysql-test/std_data/ldml/Index.xml
@@ -1167,6 +1167,45 @@
      </rules>
   </collation>

+    <collation name="utf8_czech_test_w2" id="370" version="5.2.0">
+      <settings strength="2"/>
+      <rules>
+        <reset>C</reset><p>\u010D</p><t>\u010C</t>
+        <reset>H</reset><p>ch</p><t>Ch</t><t>CH</t>
+        <reset>R</reset><p>\u0159</p><t>\u0158</t>
+        <reset>S</reset><p>\u0161</p><t>\u0160</t>
+        <reset>Z</reset><p>\u017E</p><t>\u017D</t>
+      </rules>
+    </collation>
+
+    <collation name="utf8_czech_test_nopad_w2" id="371" version="5.2.0" flag="nopad">
+      <settings strength="2"/>
+      <rules>
+        <reset>C</reset><p>\u010D</p><t>\u010C</t>
+        <reset>H</reset><p>ch</p><t>Ch</t><t>CH</t>
+        <reset>R</reset><p>\u0159</p><t>\u0158</t>
+        <reset>S</reset><p>\u0161</p><t>\u0160</t>
+        <reset>Z</reset><p>\u017E</p><t>\u017D</t>
+      </rules>
+    </collation>
+
+    <!--
+       This collation definition is bad.
+       It uses Unicode-4.0.0 (the default version), and requests strength="2".
+       Unicode-4.0.0 does not have information about the secondary weight level.
+       The version="5.2.0" collation attribute was forgotten in this definition.
+    -->
+    <collation name="utf8_czech_test_bad_w2" id="372">
+      <settings strength="2"/>
+      <rules>
+        <reset>C</reset><p>\u010D</p><t>\u010C</t>
+        <reset>H</reset><p>ch</p><t>Ch</t><t>CH</t>
+        <reset>R</reset><p>\u0159</p><t>\u0158</t>
+        <reset>S</reset><p>\u0161</p><t>\u0160</t>
+        <reset>Z</reset><p>\u017E</p><t>\u017D</t>
+      </rules>
+    </collation>
+
  </charset>
  
 </charsets>
--- a/mysql-test/suite/innodb/r/innodb_ctype_ldml.result
+++ b/mysql-test/suite/innodb/r/innodb_ctype_ldml.result
--- a/mysql-test/t/ctype_ldml.test
+++ b/mysql-test/t/ctype_ldml.test
@@ -499,3 +499,106 @@ SELECT HEX(a), REPLACE(a,' ','<SP>') FROM t1 WHERE a='a';
 SELECT HEX(a), REPLACE(a,' ','<SP>') FROM t1 ORDER BY a;
 SELECT HEX(a), REPLACE(a,' ','<SP>') FROM t1 ORDER BY a DESC;
 DROP TABLE t1;
+
+
+SET NAMES utf8 COLLATE utf8_czech_test_w2;
+CREATE TABLE t1 AS SELECT SPACE(10) AS c1 LIMIT 0;
+--source include/ctype_unicode_latin.inc
+INSERT INTO t1 VALUES ('a ');
+SELECT c1, HEX(WEIGHT_STRING(c1 LEVEL 1)), HEX(WEIGHT_STRING(c1 LEVEL 2)) FROM t1 ORDER BY c1, BINARY c1;
+SELECT c1, HEX(WEIGHT_STRING(c1 AS CHAR(3) LEVEL 1)), HEX(WEIGHT_STRING(c1 AS CHAR(3) LEVEL 2)) FROM t1 WHERE c1 BETWEEN 'a' AND 'aZ' ORDER BY c1, BINARY c1;
+DROP TABLE t1;
+
+SELECT 'a' = 'a ';
+SELECT 'a'  < 'á';
+SELECT 'áa' < 'ab';
+SELECT 'á'  < 'ä';
+SELECT 'äa' < 'áb';
+SELECT 'c'  < 'č';
+SELECT 'cb' < 'ča';
+SELECT 'd'  < 'ď';
+SELECT 'ďa' < 'db';
+SELECT 'e'  < 'é';
+SELECT 'éa' < 'eb';
+SELECT 'é'  < 'ě';
+SELECT 'ěa' < 'éb';
+SELECT 'i'  < 'í';
+SELECT 'ía' < 'ib';
+SELECT 'n'  < 'ň';
+SELECT 'ňa' < 'nb';
+SELECT 'o'  < 'ó';
+SELECT 'óa' < 'ob';
+SELECT 'ó'  < 'ö';
+SELECT 'öa' < 'ób';
+SELECT 'r'  < 'ř';
+SELECT 'rb' < 'řa';
+SELECT 's'  < 'š';
+SELECT 'sb' < 'ša';
+SELECT 't'  < 'ť';
+SELECT 'ťa' < 'tb';
+SELECT 'u'  < 'ú';
+SELECT 'úa' < 'ub';
+SELECT 'ú'  < 'ů';
+SELECT 'ůa' < 'úb';
+SELECT 'ů'  < 'ü';
+SELECT 'üa' < 'ůb';
+SELECT 'y'  < 'ý';
+SELECT 'ýa' < 'yb';
+SELECT 'z'  < 'ž';
+SELECT 'zb' < 'ža';
+SELECT 'hž' < 'ch';
+SELECT 'chž'< 'i';
+
+
+
+SET NAMES utf8 COLLATE utf8_czech_test_nopad_w2;
+CREATE TABLE t1 AS SELECT SPACE(10) AS c1 LIMIT 0;
+--source include/ctype_unicode_latin.inc
+INSERT INTO t1 VALUES ('a ');
+SELECT c1, HEX(WEIGHT_STRING(c1 LEVEL 1)), HEX(WEIGHT_STRING(c1 LEVEL 2)) FROM t1 ORDER BY c1, BINARY c1;
+SELECT c1, HEX(WEIGHT_STRING(c1 AS CHAR(3) LEVEL 1)), HEX(WEIGHT_STRING(c1 AS CHAR(3) LEVEL 2)) FROM t1 WHERE c1 BETWEEN 'a' AND 'aZ' ORDER BY c1, BINARY c1;
+DROP TABLE t1;
+
+SELECT 'a' = 'a ';
+SELECT 'a'  < 'á';
+SELECT 'áa' < 'ab';
+SELECT 'á'  < 'ä';
+SELECT 'äa' < 'áb';
+SELECT 'c'  < 'č';
+SELECT 'cb' < 'ča';
+SELECT 'd'  < 'ď';
+SELECT 'ďa' < 'db';
+SELECT 'e'  < 'é';
+SELECT 'éa' < 'eb';
+SELECT 'é'  < 'ě';
+SELECT 'ěa' < 'éb';
+SELECT 'i'  < 'í';
+SELECT 'ía' < 'ib';
+SELECT 'n'  < 'ň';
+SELECT 'ňa' < 'nb';
+SELECT 'o'  < 'ó';
+SELECT 'óa' < 'ob';
+SELECT 'ó'  < 'ö';
+SELECT 'öa' < 'ób';
+SELECT 'r'  < 'ř';
+SELECT 'rb' < 'řa';
+SELECT 's'  < 'š';
+SELECT 'sb' < 'ša';
+SELECT 't'  < 'ť';
+SELECT 'ťa' < 'tb';
+SELECT 'u'  < 'ú';
+SELECT 'úa' < 'ub';
+SELECT 'ú'  < 'ů';
+SELECT 'ůa' < 'úb';
+SELECT 'ů'  < 'ü';
+SELECT 'üa' < 'ůb';
+SELECT 'y'  < 'ý';
+SELECT 'ýa' < 'yb';
+SELECT 'z'  < 'ž';
+SELECT 'zb' < 'ža';
+SELECT 'hž' < 'ch';
+SELECT 'chž'< 'i';
+
+
+--error ER_UNKNOWN_COLLATION
+SELECT 'a' COLLATE utf8_czech_test_bad_w2;
--- a/mysys/charset.c
+++ b/mysys/charset.c
@@ -258,12 +258,38 @@ static my_bool simple_cs_is_full(CHARSET_INFO *cs)


 #if defined(HAVE_UCA_COLLATIONS) && (defined(HAVE_CHARSET_ucs2) || defined(HAVE_CHARSET_utf8))
+/**
+  Initialize a loaded collation.
+  @param [OUT] to     - The new charset_info_st structure to initialize.
+  @param [IN]  from   - A template collation, to fill the missing data from.
+  @param [IN]  loaded - The collation data loaded from the LDML file.
+                        Some data may be missing in "loaded".
+*/
 static void
-copy_uca_collation(struct charset_info_st *to, CHARSET_INFO *from)
+copy_uca_collation(struct charset_info_st *to, CHARSET_INFO *from,
+                   CHARSET_INFO *loaded)
 {
  to->cset= from->cset;
  to->coll= from->coll;
-  to->strxfrm_multiply= from->strxfrm_multiply;
+  /*
+    Single-level UCA collations have strxfrm_multiply=8.
+    In case of a multi-level UCA collation we use strxfrm_multiply=4.
+    That means MY_COLLATION_HANDLER::strnxfrmlen() will request the caller
+    to allocate a smaller buffer for each level, for performance purposes,
+    and to fit longer VARCHARs to @@max_sort_length.
+    This makes filesort produce non-precise order for some rare Unicode
+    characters that produce more than 4 weights (long expansions).
+    UCA requires 2 bytes per weight multiplied by the number of levels.
+    In case of a 2-level collation, each character requires 4*2=8 bytes.
+    Therefore, the longest VARCHAR that fits into the default @@max_sort_length
+    is 1024/8=VARCHAR(128). With strxfrm_multiply==8, only VARCHAR(64)
+    would fit.
+    Note, the built-in collation utf8_thai_520_w2 also uses strxfrm_multiply=4,
+    for the same purpose.
+    TODO: we could add a new LDML syntax to choose strxfrm_multiply value.
+  */
+  to->strxfrm_multiply= loaded->levels_for_order > 1 ?
+                        4 : from->strxfrm_multiply;
  to->min_sort_char= from->min_sort_char;
  to->max_sort_char= from->max_sort_char;
  to->mbminlen= from->mbminlen;
@@ -312,7 +338,8 @@ static int add_collation(struct charset_info_st *cs)
 #if defined(HAVE_CHARSET_ucs2) && defined(HAVE_UCA_COLLATIONS)
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
                                  &my_charset_ucs2_unicode_nopad_ci :
-                                  &my_charset_ucs2_unicode_ci);
+                                  &my_charset_ucs2_unicode_ci,
+                                  cs);
        newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
 #endif        
      }
@@ -321,7 +348,8 @@ static int add_collation(struct charset_info_st *cs)
 #if defined (HAVE_CHARSET_utf8) && defined(HAVE_UCA_COLLATIONS)
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
                                  &my_charset_utf8_unicode_nopad_ci :
-                                  &my_charset_utf8_unicode_ci);
+                                  &my_charset_utf8_unicode_ci,
+                                  cs);
        newcs->ctype= my_charset_utf8_unicode_ci.ctype;
        if (init_state_maps(newcs))
          return MY_XML_ERROR;
@@ -332,7 +360,8 @@ static int add_collation(struct charset_info_st *cs)
 #if defined (HAVE_CHARSET_utf8mb4) && defined(HAVE_UCA_COLLATIONS)
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
                                  &my_charset_utf8mb4_unicode_nopad_ci :
-                                  &my_charset_utf8mb4_unicode_ci);
+                                  &my_charset_utf8mb4_unicode_ci,
+                                  cs);
        newcs->ctype= my_charset_utf8mb4_unicode_ci.ctype;
        newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED;
 #endif
@@ -342,7 +371,8 @@ static int add_collation(struct charset_info_st *cs)
 #if defined (HAVE_CHARSET_utf16) && defined(HAVE_UCA_COLLATIONS)
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
                                  &my_charset_utf16_unicode_nopad_ci :
-                                  &my_charset_utf16_unicode_ci);
+                                  &my_charset_utf16_unicode_ci,
+                                  cs);
        newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
 #endif
      }
@@ -351,7 +381,8 @@ static int add_collation(struct charset_info_st *cs)
 #if defined (HAVE_CHARSET_utf32) && defined(HAVE_UCA_COLLATIONS)
        copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
                                  &my_charset_utf32_unicode_nopad_ci :
-                                  &my_charset_utf32_unicode_ci);
+                                  &my_charset_utf32_unicode_ci,
+                                  cs);
        newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
 #endif
      }

--- a/strings/ctype-uca.c
+++ b/strings/ctype-uca.c
--- a/strings/ctype.c
+++ b/strings/ctype.c
@@ -667,6 +667,8 @@ static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len)
  case _CS_ST_STRENGTH:
    /* 1, 2, 3, 4, 5, or primary, secondary, tertiary, quaternary, identical */
    rc= tailoring_append(st, "[strength %.*s]", len, attr);
+    if (len && attr[0] >= '1' && attr[0] <= '9')
+      i->cs.levels_for_order= attr[0] - '0';
    break;

  case _CS_ST_ALTERNATE: