Commit df2d425a authored by bar@mysql.com's avatar bar@mysql.com

Bug#16233: XML: ExtractValue() fails with special characters

ExtractValue() didn't understand tag and attribute names
containing "tricky" national letters (e.g. accented Latin letters).
This happened because the XPath lexer recognized only the basic
Latin letters a..z as part of an identifier.

Fixed to recognize all letters by means of the new "full ctype"
interface, which was added recently.
parent ba5d08f3
...@@ -615,3 +615,26 @@ select extractValue('<e>1</e>','last()'); ...@@ -615,3 +615,26 @@ select extractValue('<e>1</e>','last()');
ERROR HY000: XPATH syntax error: '' ERROR HY000: XPATH syntax error: ''
select extractValue('<e><a>1</a></e>','/e/'); select extractValue('<e><a>1</a></e>','/e/');
ERROR HY000: XPATH syntax error: '' ERROR HY000: XPATH syntax error: ''
set names utf8;
select extractValue('<Ñ><r>r</r></Ñ>','/Ñ/r');
extractValue('<Ñ><r>r</r></Ñ>','/Ñ/r')
r
select extractValue('<r><Ñ>Ñ</Ñ></r>','/r/Ñ');
extractValue('<r><Ñ>Ñ</Ñ></r>','/r/Ñ')
Ñ
select extractValue('<Ñ r="r"/>','/Ñ/@r');
extractValue('<Ñ r="r"/>','/Ñ/@r')
r
select extractValue('<r Ñ="Ñ"/>','/r/@Ñ');
extractValue('<r Ñ="Ñ"/>','/r/@Ñ')
Ñ
DROP PROCEDURE IF EXISTS p2;
CREATE PROCEDURE p2 ()
BEGIN
DECLARE p LONGTEXT CHARACTER SET UTF8 DEFAULT '<Ñ><r>A</r></Ñ>';
SELECT EXTRACTVALUE(p,'/Ñ/r');
END//
CALL p2();
EXTRACTVALUE(p,'/Ñ/r')
A
DROP PROCEDURE p2;
...@@ -295,3 +295,23 @@ select extractValue('<e>1</e>','last()'); ...@@ -295,3 +295,23 @@ select extractValue('<e>1</e>','last()');
--error 1105 --error 1105
select extractValue('<e><a>1</a></e>','/e/'); select extractValue('<e><a>1</a></e>','/e/');
#
# Bug#16233: XML: ExtractValue() fails with special characters
#
set names utf8;
select extractValue('<Ñ><r>r</r></Ñ>','/Ñ/r');
select extractValue('<r><Ñ>Ñ</Ñ></r>','/r/Ñ');
select extractValue('<Ñ r="r"/>','/Ñ/@r');
select extractValue('<r Ñ="Ñ"/>','/r/@Ñ');
--disable_warnings
DROP PROCEDURE IF EXISTS p2;
--enable_warnings
DELIMITER //;
CREATE PROCEDURE p2 ()
BEGIN
DECLARE p LONGTEXT CHARACTER SET UTF8 DEFAULT '<Ñ><r>A</r></Ñ>';
SELECT EXTRACTVALUE(p,'/Ñ/r');
END//
DELIMITER ;//
CALL p2();
DROP PROCEDURE p2;
...@@ -1304,30 +1304,6 @@ my_xpath_init(MY_XPATH *xpath) ...@@ -1304,30 +1304,6 @@ my_xpath_init(MY_XPATH *xpath)
} }
/*
Some ctype-alike helper functions. Note, we cannot
reuse cs->ident_map[], because in Xpath, unlike in SQL,
dash character is a valid identifier part.
*/
/*
  Check whether a character may begin an XPath identifier.

  Accepted characters are the basic Latin letters 'a'..'z' and
  'A'..'Z' and the underscore. Digits and the dash are rejected here.

  Returns 1 if the character is accepted, 0 otherwise.
*/
static int
my_xident_beg(int c)
{
  if (c == '_')
    return 1;
  if (c >= 'A' && c <= 'Z')
    return 1;
  if (c >= 'a' && c <= 'z')
    return 1;
  return 0;
}
/*
  Check whether a character may continue an XPath identifier.

  Accepted characters are the basic Latin letters 'a'..'z' and
  'A'..'Z', the decimal digits '0'..'9', the dash and the underscore.

  Returns 1 if the character is accepted, 0 otherwise.
*/
static int
my_xident_body(int c)
{
  switch (c)
  {
  case '-':
  case '_':
    return 1;
  default:
    break;
  }
  if (c >= '0' && c <= '9')
    return 1;
  if (c >= 'A' && c <= 'Z')
    return 1;
  if (c >= 'a' && c <= 'z')
    return 1;
  return 0;
}
static int static int
my_xdigit(int c) my_xdigit(int c)
{ {
...@@ -1350,7 +1326,7 @@ static void ...@@ -1350,7 +1326,7 @@ static void
my_xpath_lex_scan(MY_XPATH *xpath, my_xpath_lex_scan(MY_XPATH *xpath,
MY_XPATH_LEX *lex, const char *beg, const char *end) MY_XPATH_LEX *lex, const char *beg, const char *end)
{ {
int ch; int ch, ctype, length;
for ( ; beg < end && *beg == ' ' ; beg++); // skip leading spaces for ( ; beg < end && *beg == ' ' ; beg++); // skip leading spaces
lex->beg= beg; lex->beg= beg;
...@@ -1360,20 +1336,20 @@ my_xpath_lex_scan(MY_XPATH *xpath, ...@@ -1360,20 +1336,20 @@ my_xpath_lex_scan(MY_XPATH *xpath,
lex->term= MY_XPATH_LEX_EOF; // end of line reached lex->term= MY_XPATH_LEX_EOF; // end of line reached
return; return;
} }
ch= *beg++;
// Check ident, or a function call, or a keyword
if (ch > 0 && ch < 128 && simpletok[ch]) if ((length= xpath->cs->cset->ctype(xpath->cs, &ctype,
{ (const uchar*) beg,
// a token consisting of one character found (const uchar*) end)) > 0 &&
lex->end= beg; ((ctype & (_MY_L | _MY_U)) || *beg == '_'))
lex->term= ch;
return;
}
if (my_xident_beg(ch)) // ident, or a function call, or a keyword
{ {
// scan until the end of the identifier // scan untill the end of the idenfitier
for ( ; beg < end && my_xident_body(*beg); beg++); for (beg+= length;
(length= xpath->cs->cset->ctype(xpath->cs, &ctype,
(const uchar*) beg,
(const uchar*) end)) > 0 &&
((ctype & (_MY_L | _MY_U | _MY_NMR)) || *beg == '_' || *beg == '-') ;
beg+= length) /* no op */;
lex->end= beg; lex->end= beg;
// check if a function call // check if a function call
...@@ -1388,6 +1364,18 @@ my_xpath_lex_scan(MY_XPATH *xpath, ...@@ -1388,6 +1364,18 @@ my_xpath_lex_scan(MY_XPATH *xpath,
return; return;
} }
ch= *beg++;
if (ch > 0 && ch < 128 && simpletok[ch])
{
// a token consisting of one character found
lex->end= beg;
lex->term= ch;
return;
}
if (my_xdigit(ch)) // a sequence of digits if (my_xdigit(ch)) // a sequence of digits
{ {
for ( ; beg < end && my_xdigit(*beg) ; beg++); for ( ; beg < end && my_xdigit(*beg) ; beg++);
......
...@@ -1362,7 +1362,7 @@ int my_mb_ctype_8bit(CHARSET_INFO *cs, int *ctype, ...@@ -1362,7 +1362,7 @@ int my_mb_ctype_8bit(CHARSET_INFO *cs, int *ctype,
*ctype= 0; *ctype= 0;
return MY_CS_TOOSMALL; return MY_CS_TOOSMALL;
} }
*ctype= cs->ctype[*s]; *ctype= cs->ctype[*s + 1];
return 1; return 1;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment