Commit 00917fae authored by Alexander Barkov's avatar Alexander Barkov

MDEV-9874 LOAD XML INFILE does not handle well broken multi-byte characters

- Moving the new my_charlen()-based code handling multi-byte characters
  from READ_INFO::field_field() to a new method READ_INFO::read_mbtail()
- Reusing read_mbtail() in READ_INFO::read_value(), instead of the old
  my_mbcharlen()-based code which did not catch broken byte sequences
parent d516a2ae
......@@ -10448,5 +10448,16 @@ HEX(a)
3F3F3F
DROP TABLE t1;
#
# MDEV-9874 LOAD XML INFILE does not handle well broken multi-byte characters
#
CREATE TABLE t1 (a TEXT CHARACTER SET utf8);
LOAD XML INFILE '../../std_data/loaddata/mdev9874.xml' INTO TABLE t1 CHARACTER SET utf8 ROWS IDENTIFIED BY '<row>';
Warnings:
Warning 1366 Incorrect string value: '\xD0' for column 'a' at row 1
SELECT HEX(a) FROM t1;
HEX(a)
613F
DROP TABLE t1;
#
# End of 10.2 tests
#
<table><row><a></a></row></table>
\ No newline at end of file
......@@ -1974,6 +1974,14 @@ LOAD DATA INFILE '../../std_data/loaddata/mdev9823.utf8mb4.txt' INTO TABLE t1 CH
SELECT HEX(a) FROM t1;
DROP TABLE t1;
--echo #
--echo # MDEV-9874 LOAD XML INFILE does not handle well broken multi-byte characters
--echo #
CREATE TABLE t1 (a TEXT CHARACTER SET utf8);
LOAD XML INFILE '../../std_data/loaddata/mdev9874.xml' INTO TABLE t1 CHARACTER SET utf8 ROWS IDENTIFIED BY '<row>';
SELECT HEX(a) FROM t1;
DROP TABLE t1;
--echo #
--echo # End of 10.2 tests
--echo #
......@@ -119,6 +119,70 @@ class READ_INFO {
*to= chr;
return false;
}
/**
Read a tail of a multi-byte character.
The first byte of the character is assumed to be already
read from the file and appended to "str".
@returns true - if EOF happened unexpectedly
@returns false - no EOF happened: found a good multi-byte character,
or a bad byte sequence
Note:
The return value depends only on EOF:
- read_mbtail() returns "false" is a good character was read, but also
- read_mbtail() returns "false" if an incomplete byte sequence was found
and no EOF happened.
For example, suppose we have an ujis file with bytes 0x8FA10A, where:
- 0x8FA1 is an incomplete prefix of a 3-byte character
(it should be [8F][A1-FE][A1-FE] to make a full 3-byte character)
- 0x0A is a line demiliter
This file has some broken data, the trailing [A1-FE] is missing.
In this example it works as follows:
- 0x8F is read from the file and put into "data" before the call
for read_mbtail()
- 0xA1 is read from the file and put into "data" by read_mbtail()
- 0x0A is kept in the read queue, so the next read iteration after
the current read_mbtail() call will normally find it and recognize as
a line delimiter
- the current call for read_mbtail() returns "false",
because no EOF happened
*/
bool read_mbtail(String *str)
{
int chlen;
if ((chlen= my_charlen(read_charset, str->end() - 1, str->end())) == 1)
return false; // Single byte character found
for (uint32 length0= str->length() - 1 ; MY_CS_IS_TOOSMALL(chlen); )
{
int chr= GET;
if (chr == my_b_EOF)
{
DBUG_PRINT("info", ("read_mbtail: chlen=%d; unexpected EOF", chlen));
return true; // EOF
}
str->append(chr);
chlen= my_charlen(read_charset, str->ptr() + length0, str->end());
if (chlen == MY_CS_ILSEQ)
{
/**
It has been an incomplete (but a valid) sequence so far,
but the last byte turned it into a bad byte sequence.
Unget the very last byte.
*/
str->length(str->length() - 1);
PUSH(chr);
DBUG_PRINT("info", ("read_mbtail: ILSEQ"));
return false; // Bad byte sequence
}
}
DBUG_PRINT("info", ("read_mbtail: chlen=%d", chlen));
return false; // Good multi-byte character
}
public:
bool error,line_cuted,found_null,enclosed;
uchar *row_start, /* Found row starts here */
......@@ -1590,33 +1654,8 @@ int READ_INFO::read_field()
}
}
data.append(chr);
if (use_mb(read_charset))
{
int chlen;
if ((chlen= my_charlen(read_charset, data.end() - 1,
data.end())) != 1)
{
for (uint32 length0= data.length() - 1 ; MY_CS_IS_TOOSMALL(chlen); )
{
chr= GET;
if (chr == my_b_EOF)
goto found_eof;
data.append(chr);
chlen= my_charlen(read_charset, data.ptr() + length0, data.end());
if (chlen == MY_CS_ILSEQ)
{
/**
It has been an incomplete (but a valid) sequence so far,
but the last byte turned it into a bad byte sequence.
Unget the very last byte.
*/
data.length(data.length() - 1);
PUSH(chr);
break;
}
}
}
}
if (use_mb(read_charset) && read_mbtail(&data))
goto found_eof;
}
/*
** We come here if buffer is too small. Enlarge it and continue
......@@ -1872,26 +1911,8 @@ int READ_INFO::read_value(int delim, String *val)
int chr;
String tmp;
for (chr= GET; my_tospace(chr) != delim && chr != my_b_EOF;)
for (chr= GET; my_tospace(chr) != delim && chr != my_b_EOF; chr= GET)
{
#ifdef USE_MB
if (my_mbcharlen(read_charset, chr) > 1)
{
DBUG_PRINT("read_xml",("multi byte"));
int i, ml= my_mbcharlen(read_charset, chr);
for (i= 1; i < ml; i++)
{
val->append(chr);
/*
Don't use my_tospace() in the middle of a multi-byte character
TODO: check that the multi-byte sequence is valid.
*/
chr= GET;
if (chr == my_b_EOF)
return chr;
}
}
#endif
if(chr == '&')
{
tmp.length(0);
......@@ -1911,8 +1932,11 @@ int READ_INFO::read_value(int delim, String *val)
}
}
else
{
val->append(chr);
chr= GET;
if (use_mb(read_charset) && read_mbtail(val))
return my_b_EOF;
}
}
return my_tospace(chr);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment