Commit 63120090 authored by Alexander Barkov

MDEV-10262 ucs2_thai_520_w2: wrong implicit weights on the secondary level

parent 61492ea5
...@@ -131,6 +131,7 @@ typedef struct my_uca_level_info_st ...@@ -131,6 +131,7 @@ typedef struct my_uca_level_info_st
uchar *lengths; uchar *lengths;
uint16 **weights; uint16 **weights;
MY_CONTRACTIONS contractions; MY_CONTRACTIONS contractions;
uint levelno;
} MY_UCA_WEIGHT_LEVEL; } MY_UCA_WEIGHT_LEVEL;
......
...@@ -22,6 +22,13 @@ SELECT HEX(a), HEX(WEIGHT_STRING(a AS CHAR(4) LEVEL 2)) FROM t1; ...@@ -22,6 +22,13 @@ SELECT HEX(a), HEX(WEIGHT_STRING(a AS CHAR(4) LEVEL 2)) FROM t1;
SELECT HEX(a), HEX(WEIGHT_STRING(a AS CHAR(4) LEVEL 3)) FROM t1; SELECT HEX(a), HEX(WEIGHT_STRING(a AS CHAR(4) LEVEL 3)) FROM t1;
DROP TABLE t1; DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1;
INSERT INTO t1 VALUES (_ucs2 0x3400);
INSERT INTO t1 VALUES (_ucs2 0xF001);
SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0; CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1; SHOW CREATE TABLE t1;
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å'); INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
......
...@@ -14033,6 +14033,19 @@ Table Create Table ...@@ -14033,6 +14033,19 @@ Table Create Table
t1 CREATE TABLE `t1` ( t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf8 COLLATE utf8_thai_520_w2 NOT NULL `a` varchar(10) CHARACTER SET utf8 COLLATE utf8_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1 ) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_ucs2 0x3400);
INSERT INTO t1 VALUES (_ucs2 0xF001);
SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
ucs2 HEX(a) HEX(WEIGHT_STRING(a))
3400 E39080 FB80B4000020
F001 EF8081 FBC1F0010020
DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf8 COLLATE utf8_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å'); INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a; SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2)) a HEX(WEIGHT_STRING(a LEVEL 2))
...@@ -14714,6 +14727,19 @@ Table Create Table ...@@ -14714,6 +14727,19 @@ Table Create Table
t1 CREATE TABLE `t1` ( t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_thai_520_w2 NOT NULL `a` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1 ) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_ucs2 0x3400);
INSERT INTO t1 VALUES (_ucs2 0xF001);
SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
ucs2 HEX(a) HEX(WEIGHT_STRING(a))
3400 3400 FB80B4000020
F001 F001 FBC1F0010020
DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET ucs2 COLLATE ucs2_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å'); INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a; SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2)) a HEX(WEIGHT_STRING(a LEVEL 2))
......
...@@ -6663,6 +6663,19 @@ Table Create Table ...@@ -6663,6 +6663,19 @@ Table Create Table
t1 CREATE TABLE `t1` ( t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf16 COLLATE utf16_thai_520_w2 NOT NULL `a` varchar(10) CHARACTER SET utf16 COLLATE utf16_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1 ) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_ucs2 0x3400);
INSERT INTO t1 VALUES (_ucs2 0xF001);
SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
ucs2 HEX(a) HEX(WEIGHT_STRING(a))
3400 3400 FB80B4000020
F001 F001 FBC1F0010020
DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf16 COLLATE utf16_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å'); INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a; SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2)) a HEX(WEIGHT_STRING(a LEVEL 2))
......
...@@ -6683,6 +6683,19 @@ Table Create Table ...@@ -6683,6 +6683,19 @@ Table Create Table
t1 CREATE TABLE `t1` ( t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf32 COLLATE utf32_thai_520_w2 NOT NULL `a` varchar(10) CHARACTER SET utf32 COLLATE utf32_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1 ) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_ucs2 0x3400);
INSERT INTO t1 VALUES (_ucs2 0xF001);
SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
ucs2 HEX(a) HEX(WEIGHT_STRING(a))
3400 00003400 FB80B4000020
F001 0000F001 FBC1F0010020
DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf32 COLLATE utf32_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å'); INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a; SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2)) a HEX(WEIGHT_STRING(a LEVEL 2))
......
...@@ -5373,6 +5373,19 @@ Table Create Table ...@@ -5373,6 +5373,19 @@ Table Create Table
t1 CREATE TABLE `t1` ( t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_thai_520_w2 NOT NULL `a` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1 ) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES (_ucs2 0x3400);
INSERT INTO t1 VALUES (_ucs2 0xF001);
SELECT HEX(CONVERT(a USING ucs2)) AS ucs2, HEX(a), HEX(WEIGHT_STRING(a)) FROM t1;
ucs2 HEX(a) HEX(WEIGHT_STRING(a))
3400 E39080 FB80B4000020
F001 EF8081 FBC1F0010020
DROP TABLE t1;
CREATE TABLE t1 AS SELECT SPACE(10) AS a LIMIT 0;
SHOW CREATE TABLE t1;
Table Create Table
t1 CREATE TABLE `t1` (
`a` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_thai_520_w2 NOT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å'); INSERT INTO t1 VALUES ('A'),('À'),('Á'),('Â'),('Ã'),('Ä'),('Å');
SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a; SELECT a, HEX(WEIGHT_STRING(a LEVEL 2)) FROM t1 ORDER BY a;
a HEX(WEIGHT_STRING(a LEVEL 2)) a HEX(WEIGHT_STRING(a LEVEL 2))
......
...@@ -6539,7 +6539,8 @@ MY_UCA_INFO my_uca_v400= ...@@ -6539,7 +6539,8 @@ MY_UCA_INFO my_uca_v400=
0, /* nitems */ 0, /* nitems */
NULL, /* item */ NULL, /* item */
NULL /* flags */ NULL /* flags */
} },
0 /* levelno */
}, },
}, },
...@@ -30084,7 +30085,8 @@ MY_UCA_INFO my_uca_v520_th= ...@@ -30084,7 +30085,8 @@ MY_UCA_INFO my_uca_v520_th=
THAI_CONTRACTIONS, /* nitems */ THAI_CONTRACTIONS, /* nitems */
thai_contractions, /* item */ thai_contractions, /* item */
NULL /* flags */ NULL /* flags */
} },
0 /* levelno */
}, },
{ {
0x10FFFF, /* maxchar */ 0x10FFFF, /* maxchar */
...@@ -30094,7 +30096,8 @@ MY_UCA_INFO my_uca_v520_th= ...@@ -30094,7 +30096,8 @@ MY_UCA_INFO my_uca_v520_th=
THAI_CONTRACTIONS_W2, /* nitems */ THAI_CONTRACTIONS_W2, /* nitems */
thai_contractions_w2, /* item */ thai_contractions_w2, /* item */
NULL /* flags */ NULL /* flags */
} },
1 /* levelno */
}, },
}, },
...@@ -30127,8 +30130,9 @@ MY_UCA_INFO my_uca_v520= ...@@ -30127,8 +30130,9 @@ MY_UCA_INFO my_uca_v520=
{ /* Contractions: */ { /* Contractions: */
0, /* nitems */ 0, /* nitems */
NULL, /* item */ NULL, /* item */
NULL /* flags */ NULL /* flags */
} },
0 /* levelno */
}, },
}, },
...@@ -31529,37 +31533,88 @@ my_uca_previous_context_find(my_uca_scanner *scanner, ...@@ -31529,37 +31533,88 @@ my_uca_previous_context_find(my_uca_scanner *scanner,
/****************************************************************/ /****************************************************************/
/**
Implicit weights for a code CP are constructed as follows:
[.AAAA.0020.0002][.BBBB.0000.0000]
where:
AAAA= BASE + (CP >> 15);
BBBB= (CP & 0x7FFF) | 0x8000;
There are two weights in the primary level (AAAA followed by BBBB).
There is one weight on other levels:
- 0020 on the secondary level
- 0002 on the tertiary level
*/
/**
  Select the BASE constant from which the leading implicit primary
  weight of a code point is built.

  UCA defines BASE as:
  - FB40 for Unified_Ideograph=True AND
         ((Block=CJK_Unified_Ideograph) OR
          (Block=CJK_Compatibility_Ideographs))
  - FB80 for Unified_Ideograph=True AND NOT
         ((Block=CJK_Unified_Ideograph) OR
          (Block=CJK_Compatibility_Ideographs))
  - FBC0 for any other code point

  This function does not consult Unicode properties; it tests the code
  point against two fixed, non-overlapping ranges instead.

  TODO: it seems we're not handling BASE correctly:
  - check what are those blocks
  - there are more Unified Ideograph blocks in the latest Unicode versions

  @param code  The code point to classify
  @return      0xFB40 for 0x4E00..0x9FA5,
               0xFB80 for 0x3400..0x4DB5,
               0xFBC0 for everything else
*/
static inline uint16
my_uca_implicit_weight_base(my_wc_t code)
{
  uint16 base= 0xFBC0;                 /* Default: any other code point */

  if (0x4E00 <= code && code <= 0x9FA5)
    base= 0xFB40;
  else if (0x3400 <= code && code <= 0x4DB5)
    base= 0xFB80;

  return base;
}
/** /**
Return implicit UCA weight Return an implicit UCA weight for the primary level.
Used for characters that do not have assigned UCA weights. Used for characters that do not have assigned UCA weights.
@param scanner UCA weight scanner @param scanner UCA weight scanner
@return The leading implicit weight. @return The leading implicit weight.
The second weight is stored in scanner->implicit[0]
and is later returned on the next my_uca_scanner_next_any() call.
*/ */
static inline int static inline int
my_uca_scanner_next_implicit(my_uca_scanner *scanner) my_uca_scanner_next_implicit_primary(my_uca_scanner *scanner)
{ {
scanner->code= (scanner->page << 8) + scanner->code; my_wc_t wc= (scanner->page << 8) + scanner->code;
scanner->implicit[0]= (scanner->code & 0x7FFF) | 0x8000; scanner->implicit[0]= (wc & 0x7FFF) | 0x8000; /* The second weight */
scanner->implicit[1]= 0; scanner->implicit[1]= 0; /* 0 terminator */
scanner->wbeg= scanner->implicit; scanner->wbeg= scanner->implicit;
return my_uca_implicit_weight_base(wc) + (wc >> 15);
scanner->page= scanner->page >> 7;
if (scanner->code >= 0x3400 && scanner->code <= 0x4DB5)
scanner->page+= 0xFB80;
else if (scanner->code >= 0x4E00 && scanner->code <= 0x9FA5)
scanner->page+= 0xFB40;
else
scanner->page+= 0xFBC0;
return scanner->page;
} }
/**
  Produce the implicit weight for the level the scanner is working on,
  chosen by scanner->level->levelno.

  - Level 0 (primary) is delegated to my_uca_scanner_next_implicit_primary().
  - Level 1 (secondary) yields 0x0020.
  - Level 2 (tertiary) yields 0x0002.

  For every level other than the primary one, scanner->wbeg is pointed
  at nochar before returning (presumably an empty weight sequence, so no
  further weight follows the returned one; confirm against the
  definition of nochar).

  @param scanner  UCA weight scanner
  @return         The implicit weight for the current level.
                  An unexpected level number hits DBUG_ASSERT(0);
                  if execution continues past it, 0 is returned.
*/
static inline int
my_uca_scanner_next_implicit(my_uca_scanner *scanner)
{
  /* The primary level manages scanner->wbeg on its own */
  if (scanner->level->levelno == 0)
    return my_uca_scanner_next_implicit_primary(scanner);

  scanner->wbeg= nochar;

  switch (scanner->level->levelno) {
  case 1:
    return 0x0020;                     /* Secondary level */
  case 2:
    return 0x0002;                     /* Tertiary level */
  default:
    break;
  }

  /* Unknown level number: not expected to be reached */
  DBUG_ASSERT(0);
  return 0;
}
/* /*
The same two functions for any character set The same two functions for any character set
*/ */
...@@ -33829,6 +33884,7 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level, ...@@ -33829,6 +33884,7 @@ init_weight_level(MY_CHARSET_LOADER *loader, MY_COLL_RULES *rules, int level,
size_t i, npages= (src->maxchar + 1) / 256; size_t i, npages= (src->maxchar + 1) / 256;
dst->maxchar= src->maxchar; dst->maxchar= src->maxchar;
dst->levelno= src->levelno;
if (check_rules(loader, rules, dst, src)) if (check_rules(loader, rules, dst, src))
return TRUE; return TRUE;
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment