Commit 8da22a75 authored by Alexander Nozdrin

Bug#55980 Character sets: supplementary character _bin ordering is wrong

Problem:
- ORDER BY for utf8mb4_bin, utf16_bin and utf32_bin returned
  results in a wrong order, because old functions
  (supporting only BMP range) were used to handle these collations.
- Additionally, utf16_bin did not sort supplementary characters
  between U+D7FF and U+E000, as the WL#1213 specification requires.
parent 1b3d5da5
...@@ -539,6 +539,11 @@ size_t my_strnxfrm_unicode(CHARSET_INFO *, ...@@ -539,6 +539,11 @@ size_t my_strnxfrm_unicode(CHARSET_INFO *,
uchar *dst, size_t dstlen, uchar *dst, size_t dstlen,
const uchar *src, size_t srclen); const uchar *src, size_t srclen);
size_t my_strnxfrm_unicode_full_bin(CHARSET_INFO *,
uchar *dst, size_t dstlen,
const uchar *src, size_t srclen);
size_t my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *, size_t);
int my_wildcmp_unicode(CHARSET_INFO *cs, int my_wildcmp_unicode(CHARSET_INFO *cs,
const char *str, const char *str_end, const char *str, const char *str_end,
const char *wildstr, const char *wildend, const char *wildstr, const char *wildend,
......
#
# Testing filesort for full Unicode character sets
# with supplementary characters.
#
# This file is sourced by the utf16, utf32 and utf8mb4 tests right after
# "SET collation_connection='<charset>_bin'", so the same statements are
# run once per binary collation.
#
--echo #
--echo # Bug#55980 Character sets: supplementary character _bin ordering is wrong
--echo #
# Create an empty table with a single one-character column "a".
# SHOW CREATE TABLE records which character set/collation the column got.
CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
SHOW CREATE TABLE t1;
# Test data, given as utf8mb4 byte sequences:
#   0xEFBE9D   = U+FF9D   (BMP)
#   0xF0908E84 = U+10384  (supplementary)
#   0xCE85     = U+0385   (BMP)
#   0xF4808080 = U+100000 (supplementary)
INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
# Check the ordering while the table has no key on "a".
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
# Add a key on "a" and check that the same query returns the same order.
ALTER TABLE t1 ADD KEY(a);
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
DROP TABLE IF EXISTS t1;
...@@ -611,6 +611,31 @@ utf16_bin 00610009 ...@@ -611,6 +611,31 @@ utf16_bin 00610009
utf16_bin 0061 utf16_bin 0061
utf16_bin 00610020 utf16_bin 00610020
drop table t1; drop table t1;
#
# Bug#55980 Character sets: supplementary character _bin ordering is wrong
#
CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(1) CHARACTER SET utf16 COLLATE utf16_bin NOT NULL DEFAULT ''
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
HEX(a) HEX(CONVERT(a USING utf8mb4))
0385 CE85
D800DF84 F0908E84
DBC0DC00 F4808080
FF9D EFBE9D
ALTER TABLE t1 ADD KEY(a);
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
HEX(a) HEX(CONVERT(a USING utf8mb4))
0385 CE85
D800DF84 F0908E84
DBC0DC00 F4808080
FF9D EFBE9D
DROP TABLE IF EXISTS t1;
select @@collation_connection; select @@collation_connection;
@@collation_connection @@collation_connection
utf16_bin utf16_bin
......
...@@ -610,6 +610,31 @@ utf32_bin 0000006100000009 ...@@ -610,6 +610,31 @@ utf32_bin 0000006100000009
utf32_bin 00000061 utf32_bin 00000061
utf32_bin 0000006100000020 utf32_bin 0000006100000020
drop table t1; drop table t1;
#
# Bug#55980 Character sets: supplementary character _bin ordering is wrong
#
CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(1) CHARACTER SET utf32 COLLATE utf32_bin NOT NULL DEFAULT ''
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
HEX(a) HEX(CONVERT(a USING utf8mb4))
00000385 CE85
0000FF9D EFBE9D
00010384 F0908E84
00100000 F4808080
ALTER TABLE t1 ADD KEY(a);
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
HEX(a) HEX(CONVERT(a USING utf8mb4))
00000385 CE85
0000FF9D EFBE9D
00010384 F0908E84
00100000 F4808080
DROP TABLE IF EXISTS t1;
select @@collation_connection; select @@collation_connection;
@@collation_connection @@collation_connection
utf32_bin utf32_bin
......
...@@ -987,6 +987,31 @@ utf8mb4_bin 6109 ...@@ -987,6 +987,31 @@ utf8mb4_bin 6109
utf8mb4_bin 61 utf8mb4_bin 61
utf8mb4_bin 6120 utf8mb4_bin 6120
drop table t1; drop table t1;
#
# Bug#55980 Character sets: supplementary character _bin ordering is wrong
#
CREATE TABLE t1 AS SELECT REPEAT('a',1) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(1) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin NOT NULL DEFAULT ''
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_utf8mb4 0xEFBE9D),(_utf8mb4 0xF0908E84);
INSERT INTO t1 VALUES (_utf8mb4 0xCE85),(_utf8mb4 0xF4808080);
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
HEX(a) HEX(CONVERT(a USING utf8mb4))
CE85 CE85
EFBE9D EFBE9D
F0908E84 F0908E84
F4808080 F4808080
ALTER TABLE t1 ADD KEY(a);
SELECT HEX(a), HEX(CONVERT(a USING utf8mb4)) FROM t1 ORDER BY a;
HEX(a) HEX(CONVERT(a USING utf8mb4))
CE85 CE85
EFBE9D EFBE9D
F0908E84 F0908E84
F4808080 F4808080
DROP TABLE IF EXISTS t1;
select @@collation_connection; select @@collation_connection;
@@collation_connection @@collation_connection
utf8mb4_bin utf8mb4_bin
......
...@@ -326,6 +326,7 @@ SET collation_connection='utf16_general_ci'; ...@@ -326,6 +326,7 @@ SET collation_connection='utf16_general_ci';
SET NAMES latin1; SET NAMES latin1;
SET collation_connection='utf16_bin'; SET collation_connection='utf16_bin';
-- source include/ctype_filesort.inc -- source include/ctype_filesort.inc
-- source include/ctype_filesort2.inc
-- source include/ctype_like_escape.inc -- source include/ctype_like_escape.inc
# #
......
...@@ -328,6 +328,7 @@ SET collation_connection='utf32_general_ci'; ...@@ -328,6 +328,7 @@ SET collation_connection='utf32_general_ci';
SET NAMES latin1; SET NAMES latin1;
SET collation_connection='utf32_bin'; SET collation_connection='utf32_bin';
-- source include/ctype_filesort.inc -- source include/ctype_filesort.inc
-- source include/ctype_filesort2.inc
-- source include/ctype_like_escape.inc -- source include/ctype_like_escape.inc
# #
......
...@@ -733,6 +733,7 @@ SET collation_connection='utf8mb4_general_ci'; ...@@ -733,6 +733,7 @@ SET collation_connection='utf8mb4_general_ci';
-- source include/ctype_german.inc -- source include/ctype_german.inc
SET collation_connection='utf8mb4_bin'; SET collation_connection='utf8mb4_bin';
-- source include/ctype_filesort.inc -- source include/ctype_filesort.inc
-- source include/ctype_filesort2.inc
-- source include/ctype_like_escape.inc -- source include/ctype_like_escape.inc
# #
......
...@@ -1469,7 +1469,7 @@ my_strnncoll_utf16_bin(CHARSET_INFO *cs, ...@@ -1469,7 +1469,7 @@ my_strnncoll_utf16_bin(CHARSET_INFO *cs,
} }
if (s_wc != t_wc) if (s_wc != t_wc)
{ {
return s_wc > t_wc ? 1 : -1; return my_bincmp(s, s + s_res, t, t + t_res);
} }
s+= s_res; s+= s_res;
...@@ -1511,7 +1511,7 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs, ...@@ -1511,7 +1511,7 @@ my_strnncollsp_utf16_bin(CHARSET_INFO *cs,
if (s_wc != t_wc) if (s_wc != t_wc)
{ {
return s_wc > t_wc ? 1 : -1; return my_bincmp(s, s + s_res, t, t + t_res);
} }
s+= s_res; s+= s_res;
...@@ -1684,8 +1684,8 @@ static MY_COLLATION_HANDLER my_collation_utf16_bin_handler = ...@@ -1684,8 +1684,8 @@ static MY_COLLATION_HANDLER my_collation_utf16_bin_handler =
NULL, /* init */ NULL, /* init */
my_strnncoll_utf16_bin, my_strnncoll_utf16_bin,
my_strnncollsp_utf16_bin, my_strnncollsp_utf16_bin,
my_strnxfrm_unicode, my_strnxfrm_unicode_full_bin,
my_strnxfrmlen_simple, my_strnxfrmlen_unicode_full_bin,
my_like_range_utf16, my_like_range_utf16,
my_wildcmp_utf16_bin, my_wildcmp_utf16_bin,
my_strcasecmp_mb2_or_mb4, my_strcasecmp_mb2_or_mb4,
...@@ -2711,8 +2711,8 @@ static MY_COLLATION_HANDLER my_collation_utf32_bin_handler = ...@@ -2711,8 +2711,8 @@ static MY_COLLATION_HANDLER my_collation_utf32_bin_handler =
NULL, /* init */ NULL, /* init */
my_strnncoll_utf32_bin, my_strnncoll_utf32_bin,
my_strnncollsp_utf32_bin, my_strnncollsp_utf32_bin,
my_strnxfrm_unicode, my_strnxfrm_unicode_full_bin,
my_strnxfrmlen_utf32, my_strnxfrmlen_unicode_full_bin,
my_like_range_utf32, my_like_range_utf32,
my_wildcmp_utf32_bin, my_wildcmp_utf32_bin,
my_strcasecmp_mb2_or_mb4, my_strcasecmp_mb2_or_mb4,
......
...@@ -1893,7 +1893,13 @@ my_wildcmp_unicode(CHARSET_INFO *cs, ...@@ -1893,7 +1893,13 @@ my_wildcmp_unicode(CHARSET_INFO *cs,
/* /*
This function is shared between utf8mb3/utf8mb4/ucs2/utf16/utf32 Store sorting weights using 2 bytes per character.
This function is shared between
- utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin
which support BMP only (U+0000..U+FFFF).
- utf8mb4_general_ci, utf16_general_ci, utf32_general_ci,
which map all supplementary characters to weight 0xFFFD.
*/ */
size_t size_t
my_strnxfrm_unicode(CHARSET_INFO *cs, my_strnxfrm_unicode(CHARSET_INFO *cs,
...@@ -1937,6 +1943,70 @@ my_strnxfrm_unicode(CHARSET_INFO *cs, ...@@ -1937,6 +1943,70 @@ my_strnxfrm_unicode(CHARSET_INFO *cs,
} }
/*
  Store sorting weights using 3 bytes per character.
  This function is shared between utf8mb4_bin, utf16_bin, utf32_bin.

  SYNOPSIS
    my_strnxfrm_unicode_full_bin()
    cs      Character set handler; cs->cset->mb_wc() is used to decode
            "src", and cs->state must have MY_CS_BINSORT set.
    dst     Destination buffer for the weights.
    dstlen  Size of "dst" in bytes. The whole buffer is always filled.
    src     Source string (must not be NULL).
    srclen  Length of "src" in bytes.

  DESCRIPTION
    Every decoded character is written as a 3-byte big-endian weight.
    Decoding stops when mb_wc() returns a value <= 0 or when fewer than
    3 bytes are left in "dst". The rest of "dst" is then padded with the
    weight of the space character ([00][00][20]), and a trailing
    remainder of one or two bytes is zero-filled.

  RETURN
    Always "dstlen".
*/

size_t
my_strnxfrm_unicode_full_bin(CHARSET_INFO *cs,
                             uchar *dst, size_t dstlen,
                             const uchar *src, size_t srclen)
{
  my_wc_t wc;
  uchar *de= dst + dstlen;
  const uchar *se= src + srclen;

  LINT_INIT(wc);
  DBUG_ASSERT(src);
  DBUG_ASSERT(cs->state & MY_CS_BINSORT);

  /*
    Compare the remaining room against the weight size instead of
    computing "de - 2": for dstlen < 2 that pointer would lie before
    the start of the buffer, which is undefined behaviour.
  */
  while (de - dst >= 3)
  {
    int res;
    if ((res= cs->cset->mb_wc(cs, &wc, src, se)) <= 0)
      break;
    src+= res;

    if (cs->mbminlen == 2) /* utf16_bin */
    {
      /*
        Reorder code points to weights as follows:
        U+0000..U+D7FF    -> [00][00][00]..[00][D7][FF] BMP part #1
        U+10000..U+10FFFF -> [01][00][00]..[10][FF][FF] Supplementary
        U+E000..U+FFFF    -> [20][E0][00]..[20][FF][FF] BMP part #2
      */
      if (wc >= 0xE000 && wc <= 0xFFFF)
        wc+= 0x200000;
    }

    *dst++= (uchar) (wc >> 16);
    *dst++= (uchar) ((wc >> 8) & 0xFF);
    *dst++= (uchar) (wc & 0xFF);
  }

  while (de - dst >= 3) /* Fill the tail with keys for space character */
  {
    *dst++= 0x00;
    *dst++= 0x00;
    *dst++= 0x20;
  }

  /* Clear the last one or two bytes, if "dstlen" was not divisible by 3 */
  if (dst < de)
  {
    *dst++= 0x00;
    if (dst < de)
      *dst= 0x00;
  }

  return dstlen;
}
/*
  Return the size of the sort key produced for "len" input bytes by a
  collation that stores 3 weight bytes per character.

  The input length is converted to a character count by dividing by
  cs->mbmaxlen, and each character then accounts for 3 bytes.

  NOTE(review): the "+ 3" rounds the division up only when
  cs->mbmaxlen is 4 - presumably true for utf8mb4/utf16/utf32; confirm
  before reusing this for another character set.
*/
size_t
my_strnxfrmlen_unicode_full_bin(CHARSET_INFO *cs, size_t len)
{
  size_t nchars= (len + 3) / cs->mbmaxlen;
  return nchars * 3;
}
#endif /* HAVE_UNIDATA */ #endif /* HAVE_UNIDATA */
...@@ -5067,8 +5137,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler = ...@@ -5067,8 +5137,8 @@ static MY_COLLATION_HANDLER my_collation_utf8mb4_bin_handler =
NULL, /* init */ NULL, /* init */
my_strnncoll_mb_bin, my_strnncoll_mb_bin,
my_strnncollsp_mb_bin, my_strnncollsp_mb_bin,
my_strnxfrm_unicode, my_strnxfrm_unicode_full_bin,
my_strnxfrmlen_utf8mb4, my_strnxfrmlen_unicode_full_bin,
my_like_range_mb, my_like_range_mb,
my_wildcmp_mb_bin, my_wildcmp_mb_bin,
my_strcasecmp_mb_bin, my_strcasecmp_mb_bin,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment