MDEV-11343 LOAD DATA INFILE fails to load data with an escape character...

MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character Partially backporting MDEV-9874 from 10.2 to 10.0 READ_INFO::read_field() raised the ER_INVALID_CHARACTER_STRING error when reading an escape character followed by a multi-byte character. Raising wellformedness errors in READ_INFO::read_field() was wrong, because the main goal of READ_INFO::read_field() is to *unescape* the data which was presumably escaped using mysql_real_escape_string(), using the same character set with the one specified in "LOAD DATA INFILE ... CHARACTER SET ..." (or assumed by default). During LOAD DATA, multi-byte characters are not always scanned as a single entity! In case of escaped data, parts of a multi-byte character can be scanned on different loop iterations. So the old code erroneously tested welformedness in the middle of a multi-byte character. Moreover, the data after unescaping can go into a BLOB field, not a text field. Wellformedness tests are meaningless in this case. Ater this patch, wellformedness is only checked later, during Field::store(str,length,cs) time. The loop that scans bytes only makes sure to revert the changes made by mysql_real_escape_string(). Note, in some cases users can supply data which did not really go through mysql_real_escape_string() and was escaped by some other means, or was not escaped at all. The file reported in this MDEV contains the string "\ä", which is an example of such improperly escaped data, as - either there should be two backslashes: "\\ä" - or there should be no backslashes at all: "ä" mysql_real_escape_string() could not generate "\ä".

MDEV-11343 LOAD DATA INFILE fails to load data with an escape character...
MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character Partially backporting MDEV-9874 from 10.2 to 10.0 READ_INFO::read_field() raised the ER_INVALID_CHARACTER_STRING error when reading an escape character followed by a multi-byte character. Raising wellformedness errors in READ_INFO::read_field() was wrong, because the main goal of READ_INFO::read_field() is to *unescape* the data which was presumably escaped using mysql_real_escape_string(), using the same character set with the one specified in "LOAD DATA INFILE ... CHARACTER SET ..." (or assumed by default). During LOAD DATA, multi-byte characters are not always scanned as a single entity! In case of escaped data, parts of a multi-byte character can be scanned on different loop iterations. So the old code erroneously tested welformedness in the middle of a multi-byte character. Moreover, the data after unescaping can go into a BLOB field, not a text field. Wellformedness tests are meaningless in this case. Ater this patch, wellformedness is only checked later, during Field::store(str,length,cs) time. The loop that scans bytes only makes sure to revert the changes made by mysql_real_escape_string(). Note, in some cases users can supply data which did not really go through mysql_real_escape_string() and was escaped by some other means, or was not escaped at all. The file reported in this MDEV contains the string "\ä", which is an example of such improperly escaped data, as - either there should be two backslashes: "\\ä" - or there should be no backslashes at all: "ä" mysql_real_escape_string() could not generate "\ä".
dd0ff302 · Alexander Barkov · 099ce1dd · dd0ff302 · dd0ff302 · dd0ff302
Commit dd0ff302 authored Nov 29, 2016 by Alexander Barkov
8 changed files
--- a/include/m_ctype.h
+++ b/include/m_ctype.h
@@ -180,6 +180,9 @@ extern MY_UNI_CTYPE my_uni_ctype[256];
 /* A helper macros for "need at least n bytes" */
 #define MY_CS_TOOSMALLN(n)    (-100-(n))

+#define MY_CS_IS_TOOSMALL(rc) ((rc) >= MY_CS_TOOSMALL6 && (rc) <= MY_CS_TOOSMALL)
+
+
 #define MY_SEQ_INTTAIL	1
 #define MY_SEQ_SPACES	2


--- a/mysql-test/r/ctype_utf8mb4.result
+++ b/mysql-test/r/ctype_utf8mb4.result
@@ -3356,5 +3356,28 @@ DFFFFFDFFFFF9CFFFF9DFFFF9EFFFF
 # End of 5.6 tests
 #
 #
+# Start of 10.0 tests
+#
+#
+# MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character
+#
+CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4);
+LOAD DATA INFILE '../../std_data/loaddata/mdev-11343.txt' INTO TABLE t1 CHARACTER SET utf8mb4;
+SELECT HEX(a) FROM t1;
+HEX(a)
+C3A4
+C3A478
+78C3A4
+78C3A478
+EA99A0
+EA99A078
+78EA99A0
+78EA99A078
+F09F988E
+F09F988E78
+78F09F988E
+78F09F988E78
+DROP TABLE t1;
+#
 # End of tests
 #
--- a/mysql-test/r/loaddata.result
+++ b/mysql-test/r/loaddata.result
@@ -552,7 +552,8 @@ CREATE DATABASE d2 CHARSET utf8;
 USE d2;
 CREATE TABLE t1 (val TEXT);
 LOAD DATA INFILE '../../std_data/bug20683959loaddata.txt' INTO TABLE t1;
-ERROR HY000: Invalid utf8 character string: '"RT @niouzechun: \9058\221A'
+Warnings:
+Warning	1366	Incorrect string value: '\xF5\x80\x81\xAE\xE7\xB9...' for column 'val' at row 1
 DROP TABLE d1.t1, d2.t1;
 DROP DATABASE d1;
 DROP DATABASE d2;
--- a/mysql-test/std_data/loaddata/mdev-11343.txt
+++ b/mysql-test/std_data/loaddata/mdev-11343.txt
+\ä
+\äx
+x\ä
+x\äx
+\Ꙡ
+\Ꙡx
+x\Ꙡ
+x\Ꙡx
+\😎
+\😎x
+x\😎
+x\😎x
--- a/mysql-test/t/ctype_utf8mb4.test
+++ b/mysql-test/t/ctype_utf8mb4.test
@@ -1864,6 +1864,17 @@ set @@collation_connection=utf8mb4_bin;
 --echo # End of 5.6 tests
 --echo #

+--echo #
+--echo # Start of 10.0 tests
+--echo #
+
+--echo #
+--echo # MDEV-11343 LOAD DATA INFILE fails to load data with an escape character followed by a multi-byte character
+--echo #
+CREATE TABLE t1 (a TEXT CHARACTER SET utf8mb4);
+LOAD DATA INFILE '../../std_data/loaddata/mdev-11343.txt' INTO TABLE t1 CHARACTER SET utf8mb4;
+SELECT HEX(a) FROM t1;
+DROP TABLE t1;

 --echo #
 --echo # End of tests

--- a/mysql-test/t/loaddata.test
+++ b/mysql-test/t/loaddata.test
@@ -675,7 +675,6 @@ SELECT HEX(val) FROM t1;
 CREATE DATABASE d2 CHARSET utf8;
 USE d2;
 CREATE TABLE t1 (val TEXT);
--error ER_INVALID_CHARACTER_STRING
 LOAD DATA INFILE '../../std_data/bug20683959loaddata.txt' INTO TABLE t1;

 DROP TABLE d1.t1, d2.t1;

--- a/sql/sql_load.cc
+++ b/sql/sql_load.cc
@@ -79,6 +79,81 @@ class READ_INFO {
  NET *io_net;
  int level; /* for load xml */

+
+#if MYSQL_VERSION_ID >= 100200
+#error This 10.0 and 10.1 specific fix should be removed in 10.2.
+#error Fix read_mbtail() to use my_charlen() instead of my_charlen_tmp()
+#else
+  int my_charlen_tmp(CHARSET_INFO *cs, const char *str, const char *end)
+  {
+    my_wc_t wc;
+    return cs->cset->mb_wc(cs, &wc, (const uchar *) str, (const uchar *) end);
+  }
+
+  /**
+    Read a tail of a multi-byte character.
+    The first byte of the character is assumed to be already
+    read from the file and appended to "str".
+
+    @returns  true  - if EOF happened unexpectedly
+    @returns  false - no EOF happened: found a good multi-byte character,
+                                       or a bad byte sequence
+
+    Note:
+    The return value depends only on EOF:
+    - read_mbtail() returns "false" is a good character was read, but also
+    - read_mbtail() returns "false" if an incomplete byte sequence was found
+      and no EOF happened.
+
+    For example, suppose we have an ujis file with bytes 0x8FA10A, where:
+    - 0x8FA1 is an incomplete prefix of a 3-byte character
+      (it should be [8F][A1-FE][A1-FE] to make a full 3-byte character)
+    - 0x0A is a line demiliter
+    This file has some broken data, the trailing [A1-FE] is missing.
+
+    In this example it works as follows:
+    - 0x8F is read from the file and put into "data" before the call
+      for read_mbtail()
+    - 0xA1 is read from the file and put into "data" by read_mbtail()
+    - 0x0A is kept in the read queue, so the next read iteration after
+      the current read_mbtail() call will normally find it and recognize as
+      a line delimiter
+    - the current call for read_mbtail() returns "false",
+      because no EOF happened
+  */
+  bool read_mbtail(String *str)
+  {
+    int chlen;
+    if ((chlen= my_charlen_tmp(read_charset, str->end() - 1, str->end())) == 1)
+      return false; // Single byte character found
+    for (uint32 length0= str->length() - 1 ; MY_CS_IS_TOOSMALL(chlen); )
+    {
+      int chr= GET;
+      if (chr == my_b_EOF)
+      {
+        DBUG_PRINT("info", ("read_mbtail: chlen=%d; unexpected EOF", chlen));
+        return true; // EOF
+      }
+      str->append(chr);
+      chlen= my_charlen_tmp(read_charset, str->ptr() + length0, str->end());
+      if (chlen == MY_CS_ILSEQ)
+      {
+        /**
+          It has been an incomplete (but a valid) sequence so far,
+          but the last byte turned it into a bad byte sequence.
+          Unget the very last byte.
+        */
+        str->length(str->length() - 1);
+        PUSH(chr);
+        DBUG_PRINT("info", ("read_mbtail: ILSEQ"));
+        return false; // Bad byte sequence
+      }
+    }
+    DBUG_PRINT("info", ("read_mbtail: chlen=%d", chlen));
+    return false; // Good multi-byte character
+  }
+#endif
+
 public:
  bool error,line_cuted,found_null,enclosed;
  uchar	*row_start,			/* Found row starts here */
@@ -1474,6 +1549,54 @@ inline int READ_INFO::terminator(const uchar *ptr,uint length)
 }


+/**
+  Read a field.
+
+  The data in the loaded file was presumably escaped using
+  - either select_export::send_data() OUTFILE
+  - or mysql_real_escape_string()
+  using the same character set with the one specified in the current
+  "LOAD DATA INFILE ... CHARACTER SET ..." (or the default LOAD character set).
+
+  Note, non-escaped multi-byte characters are scanned as a single entity.
+  This is needed to correctly distinguish between:
+  - 0x5C as an escape character versus
+  - 0x5C as the second byte in a multi-byte sequence (big5, cp932, gbk, sjis)
+
+  Parts of escaped multi-byte characters are scanned on different loop
+  iterations. See the comment about 0x5C handling in select_export::send_data()
+  in sql_class.cc.
+
+  READ_INFO::read_field() does not check wellformedness.
+  Raising wellformedness errors or warnings in READ_INFO::read_field()
+  would be wrong, as the data after unescaping can go into a BLOB field,
+  or into a TEXT/VARCHAR field of a different character set.
+  The loop below only makes sure to revert escaping made by
+  select_export::send_data() or mysql_real_escape_string().
+  Wellformedness is checked later, during Field::store(str,length,cs) time.
+
+  Note, in some cases users can supply data which did not go through
+  escaping properly. For example, utf8 "\<C3><A4>"
+  (backslash followed by LATIN SMALL LETTER A WITH DIAERESIS)
+  is improperly escaped data that could not be generated by
+  select_export::send_data() / mysql_real_escape_string():
+  - either there should be two backslashes:   "\\<C3><A4>"
+  - or there should be no backslashes at all: "<C3><A4>"
+  "\<C3>" and "<A4> are scanned on two different loop iterations and
+  store "<C3><A4>" into the field.
+
+  Note, adding useless escapes before multi-byte characters like in the
+  example above is safe in case of utf8, but is not safe in case of
+  character sets that have escape_with_backslash_is_dangerous==TRUE,
+  such as big5, cp932, gbk, sjis. This can lead to mis-interpretation of the
+  data. Suppose we have a big5 character "<EE><5C>" followed by <30> (digit 0).
+  If we add an extra escape before this sequence, then we'll get
+  <5C><EE><5C><30>. The first loop iteration will turn <5C><EE> into <EE>.
+  The second loop iteration will turn <5C><30> into <30>.
+  So the program that generates a dump file for further use with LOAD DATA
+  must make sure to use escapes properly.
+*/
+
 int READ_INFO::read_field()
 {
  int chr,found_enclosed_char;
@@ -1510,7 +1633,8 @@ int READ_INFO::read_field()

  for (;;)
  {
-    while ( to < end_of_buff)
+    // Make sure we have enough space for the longest multi-byte character.
+    while ( to + read_charset->mbmaxlen < end_of_buff)
    {
      chr = GET;
      if (chr == my_b_EOF)
@@ -1598,52 +1722,27 @@ int READ_INFO::read_field()
 	}
      }
 #ifdef USE_MB
-        uint ml= my_mbcharlen(read_charset, chr);
-        if (ml == 0)
-        {
-          *to= '\0';
-          my_error(ER_INVALID_CHARACTER_STRING, MYF(0),
-                   read_charset->csname, buffer);
-          error= true;
-          return 1;
-        }
-
-        if (ml > 1 &&
-            to + ml <= end_of_buff)
-        {
-          uchar* p= to;
-          *to++ = chr;
-
-          for (uint i= 1; i < ml; i++)
-          {
-            chr= GET;
-            if (chr == my_b_EOF)
-            {
-              /*
-                Need to back up the bytes already ready from illformed
-                multi-byte char 
-              */
-              to-= i;
-              goto found_eof;
-            }
-            *to++ = chr;
-          }
-          if (my_ismbchar(read_charset,
-                        (const char *)p,
-                        (const char *)to))
-            continue;
-          for (uint i= 0; i < ml; i++)
-            PUSH(*--to);
-          chr= GET;
-        }
-        else if (ml > 1)
-        {
-          // Buffer is too small, exit while loop, and reallocate.
-          PUSH(chr);
-          break;
-        }
 #endif
      *to++ = (uchar) chr;
+#if MYSQL_VERSION_ID >= 100200
+#error This 10.0 and 10.1 specific fix should be removed in 10.2
+#else
+      if (my_mbcharlen(read_charset, (uchar) chr) > 1)
+      {
+        /*
+          A known MBHEAD found. Try to scan the full multi-byte character.
+          Otherwise, a possible following second byte 0x5C would be
+          mis-interpreted as an escape on the next iteration.
+          (Important for big5, gbk, sjis, cp932).
+        */
+        String tmp((char *) to - 1, read_charset->mbmaxlen, read_charset);
+        tmp.length(1);
+        bool eof= read_mbtail(&tmp);
+        to+= tmp.length() - 1;
+        if (eof)
+          goto found_eof;
+      }
+#endif
    }
    /*
    ** We come here if buffer is too small. Enlarge it and continue

--- a/sql/sql_string.h
+++ b/sql/sql_string.h
@@ -136,6 +136,7 @@ class String
  inline bool is_empty() const { return (str_length == 0); }
  inline void mark_as_const() { Alloced_length= 0;}
  inline const char *ptr() const { return Ptr; }
+  inline const char *end() const { return Ptr + str_length; }
  inline char *c_ptr()
  {
    DBUG_ASSERT(!alloced || !Ptr || !Alloced_length ||