Commit 0259b3cb authored by Alexander Barkov

MDEV-11255 LDML: allow defining 2-level UCA collations

parent 90c5b2f5
This diff is collapsed.
......@@ -1167,6 +1167,45 @@
</rules>
</collation>
<collation name="utf8_czech_test_w2" id="370" version="5.2.0">
<settings strength="2"/>
<rules>
<reset>C</reset><p>\u010D</p><t>\u010C</t>
<reset>H</reset><p>ch</p><t>Ch</t><t>CH</t>
<reset>R</reset><p>\u0159</p><t>\u0158</t>
<reset>S</reset><p>\u0161</p><t>\u0160</t>
<reset>Z</reset><p>\u017E</p><t>\u017D</t>
</rules>
</collation>
<collation name="utf8_czech_test_nopad_w2" id="371" version="5.2.0" flag="nopad">
<settings strength="2"/>
<rules>
<reset>C</reset><p>\u010D</p><t>\u010C</t>
<reset>H</reset><p>ch</p><t>Ch</t><t>CH</t>
<reset>R</reset><p>\u0159</p><t>\u0158</t>
<reset>S</reset><p>\u0161</p><t>\u0160</t>
<reset>Z</reset><p>\u017E</p><t>\u017D</t>
</rules>
</collation>
<!--
This collation definition is bad.
It uses Unicode-4.0.0 (the default version), and requests strength="2".
Unicode-4.0.0 does not have information about the secondary weight level.
The version="5.2.0" collation attribute was forgotten in this definition.
-->
<collation name="utf8_czech_test_bad_w2" id="372">
<settings strength="2"/>
<rules>
<reset>C</reset><p>\u010D</p><t>\u010C</t>
<reset>H</reset><p>ch</p><t>Ch</t><t>CH</t>
<reset>R</reset><p>\u0159</p><t>\u0158</t>
<reset>S</reset><p>\u0161</p><t>\u0160</t>
<reset>Z</reset><p>\u017E</p><t>\u017D</t>
</rules>
</collation>
</charset>
</charsets>
......@@ -499,3 +499,106 @@ SELECT HEX(a), REPLACE(a,' ','<SP>') FROM t1 WHERE a='a';
SELECT HEX(a), REPLACE(a,' ','<SP>') FROM t1 ORDER BY a;
SELECT HEX(a), REPLACE(a,' ','<SP>') FROM t1 ORDER BY a DESC;
DROP TABLE t1;
SET NAMES utf8 COLLATE utf8_czech_test_w2;
CREATE TABLE t1 AS SELECT SPACE(10) AS c1 LIMIT 0;
--source include/ctype_unicode_latin.inc
INSERT INTO t1 VALUES ('a ');
SELECT c1, HEX(WEIGHT_STRING(c1 LEVEL 1)), HEX(WEIGHT_STRING(c1 LEVEL 2)) FROM t1 ORDER BY c1, BINARY c1;
SELECT c1, HEX(WEIGHT_STRING(c1 AS CHAR(3) LEVEL 1)), HEX(WEIGHT_STRING(c1 AS CHAR(3) LEVEL 2)) FROM t1 WHERE c1 BETWEEN 'a' AND 'aZ' ORDER BY c1, BINARY c1;
DROP TABLE t1;
SELECT 'a' = 'a ';
SELECT 'a' < 'á';
SELECT 'áa' < 'ab';
SELECT 'á' < 'ä';
SELECT 'äa' < 'áb';
SELECT 'c' < 'č';
SELECT 'cb' < 'ča';
SELECT 'd' < 'ď';
SELECT 'ďa' < 'db';
SELECT 'e' < 'é';
SELECT 'éa' < 'eb';
SELECT 'é' < 'ě';
SELECT 'ěa' < 'éb';
SELECT 'i' < 'í';
SELECT 'ía' < 'ib';
SELECT 'n' < 'ň';
SELECT 'ňa' < 'nb';
SELECT 'o' < 'ó';
SELECT 'óa' < 'ob';
SELECT 'ó' < 'ö';
SELECT 'öa' < 'ób';
SELECT 'r' < 'ř';
SELECT 'rb' < 'řa';
SELECT 's' < 'š';
SELECT 'sb' < 'ša';
SELECT 't' < 'ť';
SELECT 'ťa' < 'tb';
SELECT 'u' < 'ú';
SELECT 'úa' < 'ub';
SELECT 'ú' < 'ů';
SELECT 'ůa' < 'úb';
SELECT 'ů' < 'ü';
SELECT 'üa' < 'ůb';
SELECT 'y' < 'ý';
SELECT 'ýa' < 'yb';
SELECT 'z' < 'ž';
SELECT 'zb' < 'ža';
SELECT 'hž' < 'ch';
SELECT 'chž'< 'i';
SET NAMES utf8 COLLATE utf8_czech_test_nopad_w2;
CREATE TABLE t1 AS SELECT SPACE(10) AS c1 LIMIT 0;
--source include/ctype_unicode_latin.inc
INSERT INTO t1 VALUES ('a ');
SELECT c1, HEX(WEIGHT_STRING(c1 LEVEL 1)), HEX(WEIGHT_STRING(c1 LEVEL 2)) FROM t1 ORDER BY c1, BINARY c1;
SELECT c1, HEX(WEIGHT_STRING(c1 AS CHAR(3) LEVEL 1)), HEX(WEIGHT_STRING(c1 AS CHAR(3) LEVEL 2)) FROM t1 WHERE c1 BETWEEN 'a' AND 'aZ' ORDER BY c1, BINARY c1;
DROP TABLE t1;
SELECT 'a' = 'a ';
SELECT 'a' < 'á';
SELECT 'áa' < 'ab';
SELECT 'á' < 'ä';
SELECT 'äa' < 'áb';
SELECT 'c' < 'č';
SELECT 'cb' < 'ča';
SELECT 'd' < 'ď';
SELECT 'ďa' < 'db';
SELECT 'e' < 'é';
SELECT 'éa' < 'eb';
SELECT 'é' < 'ě';
SELECT 'ěa' < 'éb';
SELECT 'i' < 'í';
SELECT 'ía' < 'ib';
SELECT 'n' < 'ň';
SELECT 'ňa' < 'nb';
SELECT 'o' < 'ó';
SELECT 'óa' < 'ob';
SELECT 'ó' < 'ö';
SELECT 'öa' < 'ób';
SELECT 'r' < 'ř';
SELECT 'rb' < 'řa';
SELECT 's' < 'š';
SELECT 'sb' < 'ša';
SELECT 't' < 'ť';
SELECT 'ťa' < 'tb';
SELECT 'u' < 'ú';
SELECT 'úa' < 'ub';
SELECT 'ú' < 'ů';
SELECT 'ůa' < 'úb';
SELECT 'ů' < 'ü';
SELECT 'üa' < 'ůb';
SELECT 'y' < 'ý';
SELECT 'ýa' < 'yb';
SELECT 'z' < 'ž';
SELECT 'zb' < 'ža';
SELECT 'hž' < 'ch';
SELECT 'chž'< 'i';
--error ER_UNKNOWN_COLLATION
SELECT 'a' COLLATE utf8_czech_test_bad_w2;
......@@ -258,12 +258,38 @@ static my_bool simple_cs_is_full(CHARSET_INFO *cs)
#if defined(HAVE_UCA_COLLATIONS) && (defined(HAVE_CHARSET_ucs2) || defined(HAVE_CHARSET_utf8))
/**
Initialize a loaded collation.
@param [OUT] to - The new charset_info_st structure to initialize.
@param [IN] from - A template collation, to fill the missing data from.
@param [IN] loaded - The collation data loaded from the LDML file.
Some data may be missing in "loaded".
*/
static void
copy_uca_collation(struct charset_info_st *to, CHARSET_INFO *from)
copy_uca_collation(struct charset_info_st *to, CHARSET_INFO *from,
CHARSET_INFO *loaded)
{
to->cset= from->cset;
to->coll= from->coll;
to->strxfrm_multiply= from->strxfrm_multiply;
/*
Single-level UCA collations have strxfrm_multiply=8.
In case of a multi-level UCA collation we use strxfrm_multiply=4.
That means MY_COLLATION_HANDLER::strnxfrmlen() will request the caller
to allocate a smaller buffer for each level, for performance purposes,
and to fit longer VARCHARs into @@max_sort_length.
This makes filesort produce non-precise order for some rare Unicode
characters that produce more than 4 weights (long expansions).
UCA requires 2 bytes per weight multiplied by the number of levels.
In case of a 2-level collation, each character requires 4*2=8 bytes.
Therefore, the longest VARCHAR that fits into the default @@max_sort_length
is 1024/8=VARCHAR(128). With strxfrm_multiply==8, only VARCHAR(64)
would fit.
Note, the built-in collation utf8_thai_520_w2 also uses strxfrm_multiply=4,
for the same purpose.
TODO: we could add a new LDML syntax to choose strxfrm_multiply value.
*/
to->strxfrm_multiply= loaded->levels_for_order > 1 ?
4 : from->strxfrm_multiply;
to->min_sort_char= from->min_sort_char;
to->max_sort_char= from->max_sort_char;
to->mbminlen= from->mbminlen;
......@@ -312,7 +338,8 @@ static int add_collation(struct charset_info_st *cs)
#if defined(HAVE_CHARSET_ucs2) && defined(HAVE_UCA_COLLATIONS)
copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
&my_charset_ucs2_unicode_nopad_ci :
&my_charset_ucs2_unicode_ci);
&my_charset_ucs2_unicode_ci,
cs);
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
#endif
}
......@@ -321,7 +348,8 @@ static int add_collation(struct charset_info_st *cs)
#if defined (HAVE_CHARSET_utf8) && defined(HAVE_UCA_COLLATIONS)
copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
&my_charset_utf8_unicode_nopad_ci :
&my_charset_utf8_unicode_ci);
&my_charset_utf8_unicode_ci,
cs);
newcs->ctype= my_charset_utf8_unicode_ci.ctype;
if (init_state_maps(newcs))
return MY_XML_ERROR;
......@@ -332,7 +360,8 @@ static int add_collation(struct charset_info_st *cs)
#if defined (HAVE_CHARSET_utf8mb4) && defined(HAVE_UCA_COLLATIONS)
copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
&my_charset_utf8mb4_unicode_nopad_ci :
&my_charset_utf8mb4_unicode_ci);
&my_charset_utf8mb4_unicode_ci,
cs);
newcs->ctype= my_charset_utf8mb4_unicode_ci.ctype;
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED;
#endif
......@@ -342,7 +371,8 @@ static int add_collation(struct charset_info_st *cs)
#if defined (HAVE_CHARSET_utf16) && defined(HAVE_UCA_COLLATIONS)
copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
&my_charset_utf16_unicode_nopad_ci :
&my_charset_utf16_unicode_ci);
&my_charset_utf16_unicode_ci,
cs);
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
#endif
}
......@@ -351,7 +381,8 @@ static int add_collation(struct charset_info_st *cs)
#if defined (HAVE_CHARSET_utf32) && defined(HAVE_UCA_COLLATIONS)
copy_uca_collation(newcs, newcs->state & MY_CS_NOPAD ?
&my_charset_utf32_unicode_nopad_ci :
&my_charset_utf32_unicode_ci);
&my_charset_utf32_unicode_ci,
cs);
newcs->state|= MY_CS_AVAILABLE | MY_CS_LOADED | MY_CS_NONASCII;
#endif
}
......
This diff is collapsed.
......@@ -667,6 +667,8 @@ static int cs_value(MY_XML_PARSER *st,const char *attr, size_t len)
case _CS_ST_STRENGTH:
/* 1, 2, 3, 4, 5, or primary, secondary, tertiary, quaternary, identical */
rc= tailoring_append(st, "[strength %.*s]", len, attr);
if (len && attr[0] >= '1' && attr[0] <= '9')
i->cs.levels_for_order= attr[0] - '0';
break;
case _CS_ST_ALTERNATE:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment