Commit dd0ff302 authored by Alexander Barkov's avatar Alexander Barkov

MDEV-11343 LOAD DATA INFILE fails to load data with an escape character...

MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character

Partially backporting MDEV-9874 from 10.2 to 10.0

READ_INFO::read_field() raised the ER_INVALID_CHARACTER_STRING error
when reading an escape character followed by a multi-byte character.

Raising wellformedness errors in READ_INFO::read_field() was wrong,
because the main goal of READ_INFO::read_field() is to *unescape* the
data which was presumably escaped using mysql_real_escape_string(),
using the same character set as the one specified in
"LOAD DATA INFILE ... CHARACTER SET ..." (or assumed by default).

During LOAD DATA, multi-byte characters are not always scanned as a single
entity! In case of escaped data, parts of a multi-byte character can be
scanned on different loop iterations. So the old code erroneously tested
wellformedness in the middle of a multi-byte character.

Moreover, the data after unescaping can go into a BLOB field, not a text field.
Wellformedness tests are meaningless in this case.

After this patch, wellformedness is only checked later, during
Field::store(str,length,cs) time. The loop that scans bytes only
makes sure to revert the changes made by mysql_real_escape_string().

Note, in some cases users can supply data which did not really go through
mysql_real_escape_string() and was escaped by some other means,
or was not escaped at all. The file reported in this MDEV contains
the string "\ä", which is an example of such improperly escaped data, as
- either there should be two backslashes:   "\\ä"
- or there should be no backslashes at all: "ä"
mysql_real_escape_string() could not generate "\ä".
parent 099ce1dd
......@@ -180,6 +180,9 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
/* A helper macros for "need at least n bytes" */
#define MY_CS_TOOSMALLN(n) (-100-(n))
#define MY_CS_IS_TOOSMALL(rc) ((rc) >= MY_CS_TOOSMALL6 && (rc) <= MY_CS_TOOSMALL)
#define MY_SEQ_INTTAIL 1
#define MY_SEQ_SPACES 2
......
......@@ -3356,5 +3356,28 @@ DFFFFFDFFFFF9CFFFF9DFFFF9EFFFF
# End of 5.6 tests
#
#
# Start of 10.0 tests
#
#
# MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character
#
CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4);
LOAD DATA INFILE '../../std_data/loaddata/mdev-11343.txt' INTO TABLE t1 CHARACTER SET utf8mb4;
SELECT HEX(a) FROM t1;
HEX(a)
C3A4
C3A478
78C3A4
78C3A478
EA99A0
EA99A078
78EA99A0
78EA99A078
F09F988E
F09F988E78
78F09F988E
78F09F988E78
DROP TABLE t1;
#
# End of tests
#
......@@ -552,7 +552,8 @@ CREATE DATABASE d2 CHARSET utf8;
USE d2;
CREATE TABLE t1 (val TEXT);
LOAD DATA INFILE '../../std_data/bug20683959loaddata.txt' INTO TABLE t1;
ERROR HY000: Invalid utf8 character string: '"RT @niouzechun: \9058\221A'
Warnings:
Warning 1366 Incorrect string value: '\xF5\x80\x81\xAE\xE7\xB9...' for column 'val' at row 1
DROP TABLE d1.t1, d2.t1;
DROP DATABASE d1;
DROP DATABASE d2;
\äx
x\ä
x\äx
\Ꙡ
\Ꙡx
x\Ꙡ
x\Ꙡx
\😎
\😎x
x\😎
x\😎x
......@@ -1864,6 +1864,17 @@ set @@collation_connection=utf8mb4_bin;
--echo # End of 5.6 tests
--echo #
--echo #
--echo # Start of 10.0 tests
--echo #
--echo #
--echo # MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character
--echo #
CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4);
LOAD DATA INFILE '../../std_data/loaddata/mdev-11343.txt' INTO TABLE t1 CHARACTER SET utf8mb4;
SELECT HEX(a) FROM t1;
DROP TABLE t1;
--echo #
--echo # End of tests
......
......@@ -675,7 +675,6 @@ SELECT HEX(val) FROM t1;
CREATE DATABASE d2 CHARSET utf8;
USE d2;
CREATE TABLE t1 (val TEXT);
--error ER_INVALID_CHARACTER_STRING
LOAD DATA INFILE '../../std_data/bug20683959loaddata.txt' INTO TABLE t1;
DROP TABLE d1.t1, d2.t1;
......
......@@ -79,6 +79,81 @@ class READ_INFO {
NET *io_net;
int level; /* for load xml */
#if MYSQL_VERSION_ID >= 100200
#error This 10.0 and 10.1 specific fix should be removed in 10.2.
#error Fix read_mbtail() to use my_charlen() instead of my_charlen_tmp()
#else
/*
  Probe the byte range [str, end) with the character set's mb_wc() handler
  and hand back its return code unchanged. The decoded code point itself
  is thrown away; callers only look at the return code.
*/
int my_charlen_tmp(CHARSET_INFO *cs, const char *str, const char *end)
{
  const uchar *from= (const uchar *) str;
  const uchar *to= (const uchar *) end;
  my_wc_t unused_wc;
  return cs->cset->mb_wc(cs, &unused_wc, from, to);
}
/**
Read a tail of a multi-byte character.
The first byte of the character is assumed to be already
read from the file and appended to "str".
@returns true - if EOF happened unexpectedly
@returns false - no EOF happened: found a good multi-byte character,
or a bad byte sequence
Note:
The return value depends only on EOF:
- read_mbtail() returns "false" if a good character was read, but also
- read_mbtail() returns "false" if an incomplete byte sequence was found
and no EOF happened.
For example, suppose we have an ujis file with bytes 0x8FA10A, where:
- 0x8FA1 is an incomplete prefix of a 3-byte character
(it should be [8F][A1-FE][A1-FE] to make a full 3-byte character)
- 0x0A is a line delimiter
This file has some broken data, the trailing [A1-FE] is missing.
In this example it works as follows:
- 0x8F is read from the file and put into "data" before the call
for read_mbtail()
- 0xA1 is read from the file and put into "data" by read_mbtail()
- 0x0A is kept in the read queue, so the next read iteration after
the current read_mbtail() call will normally find it and recognize as
a line delimiter
- the current call for read_mbtail() returns "false",
because no EOF happened
*/
bool read_mbtail(String *str)
{
  /* Probe the single head byte that is already the last byte of "str". */
  int rc= my_charlen_tmp(read_charset, str->end() - 1, str->end());
  if (rc == 1)
    return false; // Single byte character found

  /*
    Remember where the head byte sits as an offset, and re-derive the
    pointer from str->ptr() on every pass.
    NOTE(review): presumably because append() may move the buffer - confirm.
  */
  const uint32 head_offset= str->length() - 1;

  /* Keep pulling bytes while the sequence is a valid but incomplete prefix. */
  while (MY_CS_IS_TOOSMALL(rc))
  {
    int next_byte= GET;
    if (next_byte == my_b_EOF)
    {
      DBUG_PRINT("info", ("read_mbtail: chlen=%d; unexpected EOF", rc));
      return true; // EOF
    }

    str->append(next_byte);
    rc= my_charlen_tmp(read_charset, str->ptr() + head_offset, str->end());
    if (rc != MY_CS_ILSEQ)
      continue;

    /*
      The prefix was incomplete but valid until this byte arrived;
      the new byte made the sequence illegal. Drop it from "str" and
      push it back so the next read sees it again.
    */
    str->length(str->length() - 1);
    PUSH(next_byte);
    DBUG_PRINT("info", ("read_mbtail: ILSEQ"));
    return false; // Bad byte sequence
  }

  DBUG_PRINT("info", ("read_mbtail: chlen=%d", rc));
  return false; // Good multi-byte character
}
#endif
public:
bool error,line_cuted,found_null,enclosed;
uchar *row_start, /* Found row starts here */
......@@ -1474,6 +1549,54 @@ inline int READ_INFO::terminator(const uchar *ptr,uint length)
}
/**
Read a field.
The data in the loaded file was presumably escaped using
- either select_export::send_data() OUTFILE
- or mysql_real_escape_string()
using the same character set as the one specified in the current
"LOAD DATA INFILE ... CHARACTER SET ..." (or the default LOAD character set).
Note, non-escaped multi-byte characters are scanned as a single entity.
This is needed to correctly distinguish between:
- 0x5C as an escape character versus
- 0x5C as the second byte in a multi-byte sequence (big5, cp932, gbk, sjis)
Parts of escaped multi-byte characters are scanned on different loop
iterations. See the comment about 0x5C handling in select_export::send_data()
in sql_class.cc.
READ_INFO::read_field() does not check wellformedness.
Raising wellformedness errors or warnings in READ_INFO::read_field()
would be wrong, as the data after unescaping can go into a BLOB field,
or into a TEXT/VARCHAR field of a different character set.
The loop below only makes sure to revert escaping made by
select_export::send_data() or mysql_real_escape_string().
Wellformedness is checked later, during Field::store(str,length,cs) time.
Note, in some cases users can supply data which did not go through
escaping properly. For example, utf8 "\<C3><A4>"
(backslash followed by LATIN SMALL LETTER A WITH DIAERESIS)
is improperly escaped data that could not be generated by
select_export::send_data() / mysql_real_escape_string():
- either there should be two backslashes: "\\<C3><A4>"
- or there should be no backslashes at all: "<C3><A4>"
"\<C3>" and "<A4>" are scanned on two different loop iterations and
store "<C3><A4>" into the field.
Note, adding useless escapes before multi-byte characters like in the
example above is safe in case of utf8, but is not safe in case of
character sets that have escape_with_backslash_is_dangerous==TRUE,
such as big5, cp932, gbk, sjis. This can lead to mis-interpretation of the
data. Suppose we have a big5 character "<EE><5C>" followed by <30> (digit 0).
If we add an extra escape before this sequence, then we'll get
<5C><EE><5C><30>. The first loop iteration will turn <5C><EE> into <EE>.
The second loop iteration will turn <5C><30> into <30>.
So the program that generates a dump file for further use with LOAD DATA
must make sure to use escapes properly.
*/
int READ_INFO::read_field()
{
int chr,found_enclosed_char;
......@@ -1510,7 +1633,8 @@ int READ_INFO::read_field()
for (;;)
{
while ( to < end_of_buff)
// Make sure we have enough space for the longest multi-byte character.
while ( to + read_charset->mbmaxlen < end_of_buff)
{
chr = GET;
if (chr == my_b_EOF)
......@@ -1598,52 +1722,27 @@ int READ_INFO::read_field()
}
}
#ifdef USE_MB
uint ml= my_mbcharlen(read_charset, chr);
if (ml == 0)
{
*to= '\0';
my_error(ER_INVALID_CHARACTER_STRING, MYF(0),
read_charset->csname, buffer);
error= true;
return 1;
}
if (ml > 1 &&
to + ml <= end_of_buff)
{
uchar* p= to;
*to++ = chr;
for (uint i= 1; i < ml; i++)
{
chr= GET;
if (chr == my_b_EOF)
{
/*
Need to back up the bytes already ready from illformed
multi-byte char
*/
to-= i;
goto found_eof;
}
*to++ = chr;
}
if (my_ismbchar(read_charset,
(const char *)p,
(const char *)to))
continue;
for (uint i= 0; i < ml; i++)
PUSH(*--to);
chr= GET;
}
else if (ml > 1)
{
// Buffer is too small, exit while loop, and reallocate.
PUSH(chr);
break;
}
#endif
*to++ = (uchar) chr;
#if MYSQL_VERSION_ID >= 100200
#error This 10.0 and 10.1 specific fix should be removed in 10.2
#else
if (my_mbcharlen(read_charset, (uchar) chr) > 1)
{
/*
A known MBHEAD found. Try to scan the full multi-byte character.
Otherwise, a possible following second byte 0x5C would be
mis-interpreted as an escape on the next iteration.
(Important for big5, gbk, sjis, cp932).
*/
String tmp((char *) to - 1, read_charset->mbmaxlen, read_charset);
tmp.length(1);
bool eof= read_mbtail(&tmp);
to+= tmp.length() - 1;
if (eof)
goto found_eof;
}
#endif
}
/*
** We come here if buffer is too small. Enlarge it and continue
......
......@@ -136,6 +136,7 @@ class String
/* True when the string currently holds zero bytes (str_length == 0). */
inline bool is_empty() const { return (str_length == 0); }
/*
  Resets Alloced_length to 0.
  NOTE(review): presumably this flags the buffer as one that must not be
  written to or freed in place - confirm against the rest of the class.
*/
inline void mark_as_const() { Alloced_length= 0;}
/* Pointer to the first byte of the string data. */
inline const char *ptr() const { return Ptr; }
/* Pointer one past the last byte of the string data (Ptr + str_length). */
inline const char *end() const { return Ptr + str_length; }
inline char *c_ptr()
{
DBUG_ASSERT(!alloced || !Ptr || !Alloced_length ||
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment