Commit 63e1d22f authored by unknown's avatar unknown

UTF8 now processes space as the PAD character correctly.

parent 403948cb
...@@ -12,3 +12,27 @@ select * from t1; ...@@ -12,3 +12,27 @@ select * from t1;
id id
000000000001 000000000001
drop table t1; drop table t1;
SELECT 'a' = 'a ';
'a' = 'a '
1
SELECT 'a\0' < 'a';
'a\0' < 'a'
1
SELECT 'a\0' < 'a ';
'a\0' < 'a '
1
SELECT 'a\t' < 'a';
'a\t' < 'a'
1
SELECT 'a\t' < 'a ';
'a\t' < 'a '
1
CREATE TABLE t1 (a char(10) not null);
INSERT INTO t1 VALUES ('a'),('a\0'),('a\t'),('a ');
SELECT hex(a),STRCMP(a,'a'), STRCMP(a,'a ') FROM t1;
hex(a) STRCMP(a,'a') STRCMP(a,'a ')
61 0 0
6100 -1 -1
6109 -1 -1
61 0 0
DROP TABLE t1;
...@@ -63,6 +63,30 @@ select 'A' like 'a' collate utf8_bin; ...@@ -63,6 +63,30 @@ select 'A' like 'a' collate utf8_bin;
select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%'); select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%');
_utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%') _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%')
1 1
SELECT 'a' = 'a ';
'a' = 'a '
1
SELECT 'a\0' < 'a';
'a\0' < 'a'
1
SELECT 'a\0' < 'a ';
'a\0' < 'a '
1
SELECT 'a\t' < 'a';
'a\t' < 'a'
1
SELECT 'a\t' < 'a ';
'a\t' < 'a '
1
CREATE TABLE t1 (a char(10) character set utf8 not null);
INSERT INTO t1 VALUES ('a'),('a\0'),('a\t'),('a ');
SELECT hex(a),STRCMP(a,'a'), STRCMP(a,'a ') FROM t1;
hex(a) STRCMP(a,'a') STRCMP(a,'a ')
61 0 0
6100 -1 -1
6109 -1 -1
61 0 0
DROP TABLE t1;
select insert('txs',2,1,'hi'),insert('is ',4,0,'a'),insert('txxxxt',2,4,'es'); select insert('txs',2,1,'hi'),insert('is ',4,0,'a'),insert('txxxxt',2,4,'es');
insert('txs',2,1,'hi') insert('is ',4,0,'a') insert('txxxxt',2,4,'es') insert('txs',2,1,'hi') insert('is ',4,0,'a') insert('txxxxt',2,4,'es')
this is a test this is a test
......
...@@ -13,3 +13,20 @@ select * from t1 where id=000000000001; ...@@ -13,3 +13,20 @@ select * from t1 where id=000000000001;
delete from t1 where id=000000000002; delete from t1 where id=000000000002;
select * from t1; select * from t1;
drop table t1; drop table t1;
#
# Check that a space is handled as the PAD character in comparisons,
# i.e. that the following hold:
# "a" == "a "
# "a\0" < "a"
# "a\0" < "a "
# "a\t" < "a"
# "a\t" < "a "
SELECT 'a' = 'a ';
SELECT 'a\0' < 'a';
SELECT 'a\0' < 'a ';
SELECT 'a\t' < 'a';
SELECT 'a\t' < 'a ';
# Repeat the comparisons with STRCMP() against values stored in a CHAR column
CREATE TABLE t1 (a char(10) not null);
INSERT INTO t1 VALUES ('a'),('a\0'),('a\t'),('a ');
SELECT hex(a),STRCMP(a,'a'), STRCMP(a,'a ') FROM t1;
DROP TABLE t1;
...@@ -33,6 +33,23 @@ select 'A' like 'a'; ...@@ -33,6 +33,23 @@ select 'A' like 'a';
select 'A' like 'a' collate utf8_bin; select 'A' like 'a' collate utf8_bin;
select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%'); select _utf8 0xD0B0D0B1D0B2 like concat(_utf8'%',_utf8 0xD0B1,_utf8 '%');
#
# Check that a space is handled as the PAD character in comparisons,
# i.e. that the following hold:
# "a" == "a "
# "a\0" < "a"
# "a\0" < "a "
# "a\t" < "a"
# "a\t" < "a "
SELECT 'a' = 'a ';
SELECT 'a\0' < 'a';
SELECT 'a\0' < 'a ';
SELECT 'a\t' < 'a';
SELECT 'a\t' < 'a ';
# Repeat the comparisons with STRCMP() against values stored in a utf8 CHAR column
CREATE TABLE t1 (a char(10) character set utf8 not null);
INSERT INTO t1 VALUES ('a'),('a\0'),('a\t'),('a ');
SELECT hex(a),STRCMP(a,'a'), STRCMP(a,'a ') FROM t1;
DROP TABLE t1;
# #
# Fix this, it should return 1: # Fix this, it should return 1:
# #
......
...@@ -1837,18 +1837,98 @@ static int my_strnncoll_utf8(CHARSET_INFO *cs, ...@@ -1837,18 +1837,98 @@ static int my_strnncoll_utf8(CHARSET_INFO *cs,
} }
/* /*
TODO: Has to be fixed as strnncollsp in ctype-simple Compare strings, discarding end space
SYNOPSIS
my_strnncollsp_utf8()
cs character set handler
a First string to compare
a_length Length of 'a'
b Second string to compare
b_length Length of 'b'
IMPLEMENTATION
If one string is shorter as the other, then we space extend the other
so that the strings have equal length.
This will ensure that the following things hold:
"a" == "a "
"a\0" < "a"
"a\0" < "a "
RETURN
< 0 a < b
= 0 a == b
> 0 a > b
*/ */
static static int my_strnncollsp_utf8(CHARSET_INFO *cs,
int my_strnncollsp_utf8(CHARSET_INFO * cs,
const uchar *s, uint slen, const uchar *s, uint slen,
const uchar *t, uint tlen) const uchar *t, uint tlen)
{ {
for ( ; slen && s[slen-1] == ' ' ; slen--); int s_res,t_res;
for ( ; tlen && t[tlen-1] == ' ' ; tlen--); my_wc_t s_wc,t_wc;
return my_strnncoll_utf8(cs,s,slen,t,tlen); const uchar *se= s+slen;
const uchar *te= t+tlen;
while ( s < se && t < te )
{
int plane;
s_res=my_utf8_uni(cs,&s_wc, s, se);
t_res=my_utf8_uni(cs,&t_wc, t, te);
if ( s_res <= 0 || t_res <= 0 )
{
/* Incorrect string, compare by char value */
return ((int)s[0]-(int)t[0]);
}
plane=(s_wc>>8) & 0xFF;
s_wc = uni_plane[plane] ? uni_plane[plane][s_wc & 0xFF].sort : s_wc;
plane=(t_wc>>8) & 0xFF;
t_wc = uni_plane[plane] ? uni_plane[plane][t_wc & 0xFF].sort : t_wc;
if ( s_wc != t_wc )
{
return ((int) s_wc) - ((int) t_wc);
}
s+=s_res;
t+=t_res;
}
slen= se-s;
tlen= te-t;
if (slen != tlen)
{
int swap= 0;
if (slen < tlen)
{
slen= tlen;
s= t;
se= te;
swap= -1;
}
/*
This following loop uses the fact that in UTF-8
all multibyte characters are greater than space,
and all multibyte head characters are greater than
space. It means if we meet a character greater
than space, it always means that the longer string
is greater. So we can reuse the same loop from the
8bit version, without having to process full multibute
sequences.
*/
for ( ; s < se; s++)
{
if (*s != ' ')
return ((int)*s - (int) ' ') ^ swap;
}
}
return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment