Commit a8efe7ab authored by Alexander Barkov's avatar Alexander Barkov

MDEV-17502 MDEV-17474 Change Unicode xxx_general_ci and xxx_bin collation...

MDEV-17502 MDEV-17474 Change Unicode xxx_general_ci and xxx_bin collation implementation to "inline" style
parent 1bb90411
......@@ -871,14 +871,6 @@ size_t my_strnxfrm_mb_nopad(CHARSET_INFO *,
uchar *dst, size_t dstlen, uint nweights,
const uchar *src, size_t srclen, uint flags);
size_t my_strnxfrm_unicode(CHARSET_INFO *,
uchar *dst, size_t dstlen, uint nweights,
const uchar *src, size_t srclen, uint flags);
size_t my_strnxfrm_unicode_nopad(CHARSET_INFO *,
uchar *dst, size_t dstlen, uint nweights,
const uchar *src, size_t srclen, uint flags);
size_t my_strnxfrmlen_unicode(CHARSET_INFO *, size_t);
size_t my_strnxfrm_unicode_full_bin(CHARSET_INFO *,
......
......@@ -23,6 +23,8 @@
#include <my_sys.h>
#include <stdarg.h>
#include "ctype-unidata.h"
#if defined(HAVE_CHARSET_utf16) || defined(HAVE_CHARSET_ucs2)
#define HAVE_CHARSET_mb2
......@@ -1192,10 +1194,17 @@ my_lengthsp_mb2(CHARSET_INFO *cs __attribute__((unused)),
/*
  Return the general_ci sort weight of a character given as two bytes.

  The bytes are combined into a code value with MY_UTF16_WC2(b0, b1).
  The high 8 bits of that value select a page of the default case/sort
  table and the low 8 bits select the entry inside the page.

  @param b0  First byte passed to MY_UTF16_WC2
  @param b1  Second byte passed to MY_UTF16_WC2
  @return    The "sort" member of the table entry, or the code value
             itself when the table has no page for it.
*/
static inline int my_weight_mb2_utf16mb2_general_ci(uchar b0, uchar b1)
{
my_wc_t wc= MY_UTF16_WC2(b0, b1);
MY_UNICASE_CHARACTER *page= my_unicase_default.page[wc >> 8];
MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8];
/* A NULL page means no mapping is stored: the code value is its own weight */
return (int) (page ? page[wc & 0xFF].sort : wc);
}
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16_general_ci
#define DEFINE_STRNXFRM_UNICODE
#define DEFINE_STRNXFRM_UNICODE_NOPAD
#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf16_quick(pwc, s, e)
#define OPTIMIZE_ASCII 0
#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR
#define UNICASE_PAGE0 my_unicase_default_page00
#define UNICASE_PAGES my_unicase_default_pages
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b0,b1)
#define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER
......@@ -1493,7 +1502,7 @@ static MY_COLLATION_HANDLER my_collation_utf16_general_ci_handler =
NULL, /* init */
my_strnncoll_utf16_general_ci,
my_strnncollsp_utf16_general_ci,
my_strnxfrm_unicode,
my_strnxfrm_utf16_general_ci,
my_strnxfrmlen_unicode,
my_like_range_generic,
my_wildcmp_utf16_ci,
......@@ -1525,7 +1534,7 @@ static MY_COLLATION_HANDLER my_collation_utf16_general_nopad_ci_handler =
NULL, /* init */
my_strnncoll_utf16_general_ci,
my_strnncollsp_utf16_general_nopad_ci,
my_strnxfrm_unicode_nopad,
my_strnxfrm_nopad_utf16_general_ci,
my_strnxfrmlen_unicode,
my_like_range_generic,
my_wildcmp_utf16_ci,
......@@ -1722,6 +1731,13 @@ struct charset_info_st my_charset_utf16_nopad_bin=
#define IS_MB4_CHAR(b0,b1,b2,b3) (MY_UTF16_HIGH_HEAD(b1) && MY_UTF16_LOW_HEAD(b3))
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf16le_general_ci
#define DEFINE_STRNXFRM_UNICODE
#define DEFINE_STRNXFRM_UNICODE_NOPAD
#define MY_MB_WC(cs, pwc, s, e) (cs->cset->mb_wc(cs, pwc, s, e))
#define OPTIMIZE_ASCII 0
#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR
#define UNICASE_PAGE0 my_unicase_default_page00
#define UNICASE_PAGES my_unicase_default_pages
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) my_weight_mb2_utf16mb2_general_ci(b1,b0)
#define WEIGHT_MB4(b0,b1,b2,b3) MY_CS_REPLACEMENT_CHARACTER
......@@ -1826,7 +1842,7 @@ static MY_COLLATION_HANDLER my_collation_utf16le_general_ci_handler =
NULL, /* init */
my_strnncoll_utf16le_general_ci,
my_strnncollsp_utf16le_general_ci,
my_strnxfrm_unicode,
my_strnxfrm_utf16le_general_ci,
my_strnxfrmlen_unicode,
my_like_range_generic,
my_wildcmp_utf16_ci,
......@@ -1858,7 +1874,7 @@ static MY_COLLATION_HANDLER my_collation_utf16le_general_nopad_ci_handler =
NULL, /* init */
my_strnncoll_utf16le_general_ci,
my_strnncollsp_utf16le_general_nopad_ci,
my_strnxfrm_unicode_nopad,
my_strnxfrm_nopad_utf16le_general_ci,
my_strnxfrmlen_unicode,
my_like_range_generic,
my_wildcmp_utf16_ci,
......@@ -2073,12 +2089,19 @@ static inline int my_weight_utf32_general_ci(uchar b0, uchar b1,
my_wc_t wc= MY_UTF32_WC4(b0, b1, b2, b3);
if (wc <= 0xFFFF)
{
MY_UNICASE_CHARACTER *page= my_unicase_default.page[wc >> 8];
MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8];
return (int) (page ? page[wc & 0xFF].sort : wc);
}
return MY_CS_REPLACEMENT_CHARACTER;
}
#define MY_FUNCTION_NAME(x) my_ ## x ## _utf32_general_ci
#define DEFINE_STRNXFRM_UNICODE
#define DEFINE_STRNXFRM_UNICODE_NOPAD
#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_utf32_quick(pwc, s, e)
#define OPTIMIZE_ASCII 0
#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR
#define UNICASE_PAGE0 my_unicase_default_page00
#define UNICASE_PAGES my_unicase_default_pages
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB4(b0,b1,b2,b3) my_weight_utf32_general_ci(b0, b1, b2, b3)
#include "strcoll.ic"
......@@ -2642,7 +2665,7 @@ static MY_COLLATION_HANDLER my_collation_utf32_general_ci_handler =
NULL, /* init */
my_strnncoll_utf32_general_ci,
my_strnncollsp_utf32_general_ci,
my_strnxfrm_unicode,
my_strnxfrm_utf32_general_ci,
my_strnxfrmlen_unicode,
my_like_range_generic,
my_wildcmp_utf32_ci,
......@@ -2674,7 +2697,7 @@ static MY_COLLATION_HANDLER my_collation_utf32_general_nopad_ci_handler =
NULL, /* init */
my_strnncoll_utf32_general_ci,
my_strnncollsp_utf32_general_nopad_ci,
my_strnxfrm_unicode_nopad,
my_strnxfrm_nopad_utf32_general_ci,
my_strnxfrmlen_unicode,
my_like_range_generic,
my_wildcmp_utf32_ci,
......@@ -2941,20 +2964,30 @@ static const uchar to_upper_ucs2[] = {
/*
  Return the ucs2_general_ci sort weight of a character given as two bytes.

  The bytes are combined into a code value with UCS2_CODE(b0, b1).
  The high 8 bits of that value select a page of the default case/sort
  table and the low 8 bits select the entry inside the page.

  @param b0  First byte passed to UCS2_CODE
  @param b1  Second byte passed to UCS2_CODE
  @return    The "sort" member of the table entry, or the code value
             itself when the table has no page for it.
*/
static inline int my_weight_mb2_ucs2_general_ci(uchar b0, uchar b1)
{
my_wc_t wc= UCS2_CODE(b0, b1);
MY_UNICASE_CHARACTER *page= my_unicase_default.page[wc >> 8];
MY_UNICASE_CHARACTER *page= my_unicase_default_pages[wc >> 8];
/* A NULL page means no mapping is stored: the code value is its own weight */
return (int) (page ? page[wc & 0xFF].sort : wc);
}
#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_general_ci
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) my_weight_mb2_ucs2_general_ci(b0,b1)
#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_general_ci
#define DEFINE_STRNXFRM_UNICODE
#define DEFINE_STRNXFRM_UNICODE_NOPAD
#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_ucs2_quick(pwc, s, e)
#define OPTIMIZE_ASCII 0
#define UNICASE_MAXCHAR MY_UNICASE_INFO_DEFAULT_MAXCHAR
#define UNICASE_PAGE0 my_unicase_default_page00
#define UNICASE_PAGES my_unicase_default_pages
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) my_weight_mb2_ucs2_general_ci(b0,b1)
#include "strcoll.ic"
#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_bin
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) UCS2_CODE(b0,b1)
#define MY_FUNCTION_NAME(x) my_ ## x ## _ucs2_bin
#define DEFINE_STRNXFRM_UNICODE_BIN2
#define MY_MB_WC(cs, pwc, s, e) my_mb_wc_ucs2_quick(pwc, s, e)
#define OPTIMIZE_ASCII 0
#define WEIGHT_ILSEQ(x) (0xFF0000 + (uchar) (x))
#define WEIGHT_MB2(b0,b1) UCS2_CODE(b0,b1)
#include "strcoll.ic"
......@@ -3222,7 +3255,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_general_ci_handler =
NULL, /* init */
my_strnncoll_ucs2_general_ci,
my_strnncollsp_ucs2_general_ci,
my_strnxfrm_unicode,
my_strnxfrm_ucs2_general_ci,
my_strnxfrmlen_unicode,
my_like_range_generic,
my_wildcmp_ucs2_ci,
......@@ -3238,7 +3271,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_bin_handler =
NULL, /* init */
my_strnncoll_ucs2_bin,
my_strnncollsp_ucs2_bin,
my_strnxfrm_unicode,
my_strnxfrm_ucs2_bin,
my_strnxfrmlen_unicode,
my_like_range_generic,
my_wildcmp_ucs2_bin,
......@@ -3254,7 +3287,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_general_nopad_ci_handler =
NULL, /* init */
my_strnncoll_ucs2_general_ci,
my_strnncollsp_ucs2_general_nopad_ci,
my_strnxfrm_unicode_nopad,
my_strnxfrm_nopad_ucs2_general_ci,
my_strnxfrmlen_unicode,
my_like_range_generic,
my_wildcmp_ucs2_ci,
......@@ -3270,7 +3303,7 @@ static MY_COLLATION_HANDLER my_collation_ucs2_nopad_bin_handler =
NULL, /* init */
my_strnncoll_ucs2_bin,
my_strnncollsp_ucs2_nopad_bin,
my_strnxfrm_unicode_nopad,
my_strnxfrm_nopad_ucs2_bin,
my_strnxfrmlen_unicode,
my_like_range_generic,
my_wildcmp_ucs2_bin,
......
#ifndef CTYPE_UNIDATA_H_INCLUDED
#define CTYPE_UNIDATA_H_INCLUDED
/*
Copyright (c) 2018 MariaDB Corporation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
  Upper bound used as UNICASE_MAXCHAR by the general_ci strnxfrm templates:
  characters above this value are not looked up in the tables below and
  get the weight MY_CS_REPLACEMENT_CHARACTER instead.
*/
#define MY_UNICASE_INFO_DEFAULT_MAXCHAR 0xFFFF
/*
  Table used as UNICASE_PAGE0 by the templates: indexed by a single byte
  value, its "sort" member supplies the weight on the ASCII fast path.
  NOTE(review): presumably the same data as my_unicase_default_pages[0] -
  confirm against the definition.
*/
extern MY_UNICASE_CHARACTER my_unicase_default_page00[256];
/*
  Table of pages used as UNICASE_PAGES: indexed by (wc >> 8); each non-NULL
  page is then indexed by (wc & 0xFF). Entries may be NULL, in which case
  callers use the character code itself as the weight.
*/
extern MY_UNICASE_CHARACTER *my_unicase_default_pages[256];
/*
  Called by the strnxfrm templates when MY_STRXFRM_PAD_WITH_SPACE is set and
  both output room and unused weights remain. The return value is added to
  the output pointer, i.e. it is the number of bytes written to [str, strend).
  NOTE(review): the padding pattern itself is defined elsewhere - presumably
  at most "nweights" two-byte space weights.
*/
size_t my_strxfrm_pad_nweights_unicode(uchar *str, uchar *strend, size_t nweights);
/*
  Called by the strnxfrm templates when MY_STRXFRM_PAD_TO_MAXLEN is set and
  output room remains. The return value is added to the output pointer,
  i.e. it is the number of bytes written to [str, strend).
*/
size_t my_strxfrm_pad_unicode(uchar *str, uchar *strend);
/*
  Store the 16-bit weight "wc" at "dst" in big-endian byte order and
  advance "dst" past the bytes that were written.

  The caller must guarantee at least one free byte (dst < de): the high
  byte is stored unconditionally, the low byte only if there is still room.
  At the very end of the buffer a weight can therefore be cut down to its
  high byte.

  All parameters are parenthesized in the expansion, so "wc" and "de" may be
  arbitrary expressions (e.g. "a | b"). "dst" must be a modifiable pointer
  lvalue; "dst" and "wc" are evaluated more than once, so they must not have
  side effects.
*/
#define PUT_WC_BE2_HAVE_1BYTE(dst, de, wc) \
  do { *(dst)++= (uchar) ((wc) >> 8); if ((dst) < (de)) *(dst)++= (uchar) ((wc) & 0xFF); } while(0)
#endif /* CTYPE_UNIDATA_H_INCLUDED */
This diff is collapsed.
......@@ -15,11 +15,18 @@
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#ifndef MY_FUNCTION_NAME
#error MY_FUNCTION_NAME is not defined
#endif
/*
Define strnncoll() and strnncollsp() by default,
unless "#define DEFINE_STRNNCOLL 0" is specified.
*/
#ifndef DEFINE_STRNNCOLL
#define DEFINE_STRNNCOLL 1
#endif
/*
The weight for automatically padded spaces when comparing strings with
......@@ -54,6 +61,8 @@
#endif
#if DEFINE_STRNNCOLL
/**
Scan a valid character, or a bad byte, or an auto-padded space
from a string and calculate the weight of the scanned sequence.
......@@ -278,6 +287,8 @@ MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
}
#endif
#endif /* DEFINE_STRNNCOLL */
#ifdef DEFINE_STRNXFRM
#ifndef WEIGHT_MB2_FRM
......@@ -322,11 +333,261 @@ MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs,
#endif /* DEFINE_STRNXFRM */
#if defined(DEFINE_STRNXFRM_UNICODE) || defined(DEFINE_STRNXFRM_UNICODE_NOPAD)
/*
Store sorting weights using 2 bytes per character.
This function is shared between
- utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin
which support BMP only (U+0000..U+FFFF).
- utf8mb4_general_ci, utf16_general_ci, utf32_general_ci,
which map all supplementary characters to weight 0xFFFD.
*/
#ifndef MY_MB_WC
#error MY_MB_WC must be defined for DEFINE_STRNXFRM_UNICODE
#endif
#ifndef OPTIMIZE_ASCII
#error OPTIMIZE_ASCII must be defined for DEFINE_STRNXFRM_UNICODE
#endif
#ifndef UNICASE_MAXCHAR
#error UNICASE_MAXCHAR must be defined for DEFINE_STRNXFRM_UNICODE
#endif
#ifndef UNICASE_PAGE0
#error UNICASE_PAGE0 must be defined for DEFINE_STRNXFRM_UNICODE
#endif
#ifndef UNICASE_PAGES
#error UNICASE_PAGES must be defined for DEFINE_STRNXFRM_UNICODE
#endif
/*
  Convert characters from [src, se) into two-byte big-endian sort weights
  stored in [dst, de).

  One weight is produced per scanned character and *nweights is decremented
  for each of them. Scanning stops when the output is full, when *nweights
  reaches zero, or when MY_MB_WC() reports a non-positive result (end of
  input or a byte sequence it does not accept).

  Weight selection:
  - a character not above UNICASE_MAXCHAR uses the "sort" member of its
    UNICASE_PAGES entry, or its own code when the page pointer is NULL;
  - any other character gets MY_CS_REPLACEMENT_CHARACTER.

  @param cs        Character set, forwarded to MY_MB_WC()
  @param dst       Start of the output buffer
  @param de        End of the output buffer
  @param nweights  IN/OUT: number of weights that may still be produced
  @param src       Start of the source string
  @param se        End of the source string
  @return          Number of bytes written to dst
*/
static size_t
MY_FUNCTION_NAME(strnxfrm_internal)(CHARSET_INFO *cs,
                                    uchar *dst, uchar *de,
                                    uint *nweights,
                                    const uchar *src, const uchar *se)
{
  my_wc_t UNINIT_VAR(wc);
  uchar *const out_begin= dst;

  DBUG_ASSERT(src || !se);
  DBUG_ASSERT((cs->state & MY_CS_LOWER_SORT) == 0);
  DBUG_ASSERT(0x7F <= UNICASE_MAXCHAR);

  while (dst < de && *nweights)
  {
    int chlen;
#if OPTIMIZE_ASCII
    if (src >= se)
      break;
    if (src[0] <= 0x7F)
    {
      /* Fast path: a 7-bit byte is a complete character, use page 0 directly */
      wc= UNICASE_PAGE0[*src].sort;
      src++;
      PUT_WC_BE2_HAVE_1BYTE(dst, de, wc);
      (*nweights)--;
      continue;
    }
#endif
    chlen= MY_MB_WC(cs, &wc, src, se);
    if (chlen <= 0)
      break;
    src+= chlen;

    if (wc > UNICASE_MAXCHAR)
      wc= MY_CS_REPLACEMENT_CHARACTER;
    else
    {
      MY_UNICASE_CHARACTER *page= UNICASE_PAGES[wc >> 8];
      if (page)
        wc= page[wc & 0xFF].sort;
    }

    /* The loop condition guarantees room for at least the high byte */
    PUT_WC_BE2_HAVE_1BYTE(dst, de, wc);
    (*nweights)--;
  }
  return (size_t) (dst - out_begin);
}
/*
  Build the sort key of a string for a PAD SPACE collation.

  Steps:
  1. strnxfrm_internal() stores the weights of the source characters and
     lowers "nweights" by the number of weights produced.
  2. With MY_STRXFRM_PAD_WITH_SPACE, the weights that are still unused are
     filled in by my_strxfrm_pad_nweights_unicode().
  3. my_strxfrm_desc_and_reverse() post-processes the key produced so far
     according to "flags".
  4. With MY_STRXFRM_PAD_TO_MAXLEN, the rest of the buffer is filled in by
     my_strxfrm_pad_unicode().

  @param cs        Character set
  @param dst       Output buffer
  @param dstlen    Size of the output buffer in bytes
  @param nweights  Maximum number of weights to produce
  @param src       Source string
  @param srclen    Length of the source string in bytes
  @param flags     MY_STRXFRM_xxx flags
  @return          Number of bytes written to dst
*/
static size_t
MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs,
                           uchar *dst, size_t dstlen, uint nweights,
                           const uchar *src, size_t srclen, uint flags)
{
  uchar *const key_begin= dst;
  uchar *const key_end= dst + dstlen;
  uchar *pos= dst;

  pos+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, pos, key_end, &nweights,
                                            src, src + srclen);
  DBUG_ASSERT(pos <= key_end); /* Safety */

  if ((flags & MY_STRXFRM_PAD_WITH_SPACE) && nweights && pos < key_end)
    pos+= my_strxfrm_pad_nweights_unicode(pos, key_end, nweights);

  my_strxfrm_desc_and_reverse(key_begin, pos, flags, 0);

  if (pos < key_end && (flags & MY_STRXFRM_PAD_TO_MAXLEN))
    pos+= my_strxfrm_pad_unicode(pos, key_end);

  return (size_t) (pos - key_begin);
}
#ifdef DEFINE_STRNXFRM_UNICODE_NOPAD
/*
  Build the sort key of a string for a NO PAD collation.

  Works like the PAD SPACE variant, except that padding is done with 0x00
  bytes instead of space weights:
  1. strnxfrm_internal() stores the weights of the source characters and
     lowers "nweights" by the number of weights produced.
  2. With MY_STRXFRM_PAD_WITH_SPACE, each weight that is still unused is
     written as two 0x00 bytes, limited by the room left in the buffer.
  3. my_strxfrm_desc_and_reverse() post-processes the key produced so far
     according to "flags".
  4. With MY_STRXFRM_PAD_TO_MAXLEN, the rest of the buffer is zero-filled.

  @param cs        Character set
  @param dst       Output buffer
  @param dstlen    Size of the output buffer in bytes
  @param nweights  Maximum number of weights to produce
  @param src       Source string
  @param srclen    Length of the source string in bytes
  @param flags     MY_STRXFRM_xxx flags
  @return          Number of bytes written to dst
*/
static size_t
MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs,
                                 uchar *dst, size_t dstlen,
                                 uint nweights,
                                 const uchar *src, size_t srclen, uint flags)
{
  uchar *dst0= dst;
  uchar *de= dst + dstlen;
  dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights,
                                            src, src + srclen);
  DBUG_ASSERT(dst <= de); /* Safety */
  if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
  {
    /*
      len= MIN(room, nweights * 2), computed without ever evaluating
      "nweights * 2" in uint arithmetic: that product wraps around for
      nweights >= 0x80000000 and would silently shorten the padding.
      When nweights <= room / 2 the product cannot exceed "room",
      so it is safe to compute it in size_t.
    */
    size_t len= de - dst;
    if (nweights <= len / 2)
      len= (size_t) nweights * 2;
    memset(dst, 0x00, len);
    dst+= len;
  }
  my_strxfrm_desc_and_reverse(dst0, dst, flags, 0);
  if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
  {
    memset(dst, 0x00, de - dst);
    dst= de;
  }
  return dst - dst0;
}
#endif
#endif /* DEFINE_STRNXFRM_UNICODE || DEFINE_STRNXFRM_UNICODE_NOPAD */
#ifdef DEFINE_STRNXFRM_UNICODE_BIN2
/*
Store sorting weights using 2 bytes per character.
These functions are shared between
- utf8mb3_general_ci, utf8_bin, ucs2_general_ci, ucs2_bin
which support BMP only (U+0000..U+FFFF).
- utf8mb4_general_ci, utf16_general_ci, utf32_general_ci,
which map all supplementary characters to weight 0xFFFD.
*/
#ifndef MY_MB_WC
#error MY_MB_WC must be defined for DEFINE_STRNXFRM_UNICODE_BIN2
#endif
#ifndef OPTIMIZE_ASCII
#error OPTIMIZE_ASCII must be defined for DEFINE_STRNXFRM_UNICODE_BIN2
#endif
/*
  Convert characters from [src, se) into two-byte big-endian binary weights
  stored in [dst, de).

  The weight of a character is its own code; codes above 0xFFFF are replaced
  with MY_CS_REPLACEMENT_CHARACTER. One weight is produced per scanned
  character and *nweights is decremented for each of them. Scanning stops
  when the output is full, when *nweights reaches zero, or when MY_MB_WC()
  reports a non-positive result.

  @param cs        Character set, forwarded to MY_MB_WC()
  @param dst       Start of the output buffer
  @param de        End of the output buffer
  @param nweights  IN/OUT: number of weights that may still be produced
  @param src       Start of the source string
  @param se        End of the source string
  @return          Number of bytes written to dst
*/
static size_t
MY_FUNCTION_NAME(strnxfrm_internal)(CHARSET_INFO *cs,
                                    uchar *dst, uchar *de,
                                    uint *nweights,
                                    const uchar *src,
                                    const uchar *se)
{
  my_wc_t UNINIT_VAR(wc);
  uchar *const out_begin= dst;

  DBUG_ASSERT(src || !se);

  while (dst < de && *nweights)
  {
    int chlen;
#if OPTIMIZE_ASCII
    if (src >= se)
      break;
    if (src[0] <= 0x7F)
    {
      /* Fast path: a 7-bit byte is its own weight */
      wc= *src;
      src++;
      PUT_WC_BE2_HAVE_1BYTE(dst, de, wc);
      (*nweights)--;
      continue;
    }
#endif
    chlen= MY_MB_WC(cs, &wc, src, se);
    if (chlen <= 0)
      break;
    src+= chlen;

    /* Only 16 bits are stored per weight */
    if (wc > 0xFFFF)
      wc= MY_CS_REPLACEMENT_CHARACTER;

    /* The loop condition guarantees room for at least the high byte */
    PUT_WC_BE2_HAVE_1BYTE(dst, de, wc);
    (*nweights)--;
  }
  return (size_t) (dst - out_begin);
}
/*
  Build the binary sort key of a string for a PAD SPACE collation.

  Steps:
  1. strnxfrm_internal() stores the weights of the source characters and
     lowers "nweights" by the number of weights produced.
  2. With MY_STRXFRM_PAD_WITH_SPACE, the weights that are still unused are
     filled in by my_strxfrm_pad_nweights_unicode().
  3. my_strxfrm_desc_and_reverse() post-processes the key produced so far
     according to "flags".
  4. With MY_STRXFRM_PAD_TO_MAXLEN, the rest of the buffer is filled in by
     my_strxfrm_pad_unicode().

  @param cs        Character set
  @param dst       Output buffer
  @param dstlen    Size of the output buffer in bytes
  @param nweights  Maximum number of weights to produce
  @param src       Source string
  @param srclen    Length of the source string in bytes
  @param flags     MY_STRXFRM_xxx flags
  @return          Number of bytes written to dst
*/
static size_t
MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs,
                           uchar *dst, size_t dstlen, uint nweights,
                           const uchar *src, size_t srclen, uint flags)
{
  uchar *const key_begin= dst;
  uchar *const key_end= dst + dstlen;
  uchar *pos= dst;

  pos+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, pos, key_end, &nweights,
                                            src, src + srclen);
  DBUG_ASSERT(pos <= key_end); /* Safety */

  if ((flags & MY_STRXFRM_PAD_WITH_SPACE) && nweights && pos < key_end)
    pos+= my_strxfrm_pad_nweights_unicode(pos, key_end, nweights);

  my_strxfrm_desc_and_reverse(key_begin, pos, flags, 0);

  if (pos < key_end && (flags & MY_STRXFRM_PAD_TO_MAXLEN))
    pos+= my_strxfrm_pad_unicode(pos, key_end);

  return (size_t) (pos - key_begin);
}
/*
  Build the binary sort key of a string for a NO PAD collation.

  Works like the PAD SPACE variant, except that padding is done with 0x00
  bytes instead of space weights:
  1. strnxfrm_internal() stores the weights of the source characters and
     lowers "nweights" by the number of weights produced.
  2. With MY_STRXFRM_PAD_WITH_SPACE, each weight that is still unused is
     written as two 0x00 bytes, limited by the room left in the buffer.
  3. my_strxfrm_desc_and_reverse() post-processes the key produced so far
     according to "flags".
  4. With MY_STRXFRM_PAD_TO_MAXLEN, the rest of the buffer is zero-filled.

  @param cs        Character set
  @param dst       Output buffer
  @param dstlen    Size of the output buffer in bytes
  @param nweights  Maximum number of weights to produce
  @param src       Source string
  @param srclen    Length of the source string in bytes
  @param flags     MY_STRXFRM_xxx flags
  @return          Number of bytes written to dst
*/
static size_t
MY_FUNCTION_NAME(strnxfrm_nopad)(CHARSET_INFO *cs,
                                 uchar *dst, size_t dstlen, uint nweights,
                                 const uchar *src, size_t srclen, uint flags)
{
  uchar *dst0= dst;
  uchar *de= dst + dstlen;
  dst+= MY_FUNCTION_NAME(strnxfrm_internal)(cs, dst, de, &nweights,
                                            src, src + srclen);
  DBUG_ASSERT(dst <= de); /* Safety */
  if (dst < de && nweights && (flags & MY_STRXFRM_PAD_WITH_SPACE))
  {
    /*
      len= MIN(room, nweights * 2), computed without ever evaluating
      "nweights * 2" in uint arithmetic: that product wraps around for
      nweights >= 0x80000000 and would silently shorten the padding.
      When nweights <= room / 2 the product cannot exceed "room",
      so it is safe to compute it in size_t.
    */
    size_t len= de - dst;
    if (nweights <= len / 2)
      len= (size_t) nweights * 2;
    memset(dst, 0x00, len);
    dst+= len;
  }
  my_strxfrm_desc_and_reverse(dst0, dst, flags, 0);
  if ((flags & MY_STRXFRM_PAD_TO_MAXLEN) && dst < de)
  {
    memset(dst, 0x00, de - dst);
    dst= de;
  }
  return dst - dst0;
}
#endif /* DEFINE_STRNXFRM_UNICODE_BIN2 */
/*
We usually include this file at least two times from the same source file,
for the _ci and the _bin collations. Prepare for the second inclusion.
*/
#undef MY_FUNCTION_NAME
#undef MY_MB_WC
#undef OPTIMIZE_ASCII
#undef UNICASE_MAXCHAR
#undef UNICASE_PAGE0
#undef UNICASE_PAGES
#undef WEIGHT_ILSEQ
#undef WEIGHT_MB1
#undef WEIGHT_MB2
......@@ -335,4 +596,8 @@ MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs,
#undef WEIGHT_PAD_SPACE
#undef WEIGHT_MB2_FRM
#undef DEFINE_STRNXFRM
#undef DEFINE_STRNXFRM_UNICODE
#undef DEFINE_STRNXFRM_UNICODE_NOPAD
#undef DEFINE_STRNXFRM_UNICODE_BIN2
#undef DEFINE_STRNNCOLL
#undef DEFINE_STRNNCOLLSP_NOPAD
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment