Commit 475c6ec5 authored by Alexander Barkov

MDEV-17474 Change Unicode collation implementation from "handler" to "inline" style (part#2)

Additional changes:

1. Adding a fast path for ASCII characters
2. Adding dedicated MY_COLLATION_HANDLERs for collations with no contractions
   (for utf8 and for utf8mb4 character sets). The choice between
   the full-featured handler and the "no contraction" handler is
   made at the collation initialization time.
parent d88c136b
Branches unavailable
Tags unavailable
No related merge requests found
......@@ -31409,6 +31409,28 @@ my_uca_can_be_previous_context_tail(const MY_CONTRACTIONS *list, my_wc_t wc)
}
/**
  Tell whether a character requires previous/next context handling,
  i.e. whether it:
  - can be a previous context tail, or
  - can be a contraction start.

  @param level  Pointer to UCA weight level data
  @param wc     Code point
  @return
  @retval FALSE - the character does not need context handling
  @retval TRUE  - the character needs context handling
*/
static inline my_bool
my_uca_needs_context_handling(const MY_UCA_WEIGHT_LEVEL *level, my_wc_t wc)
{
  /* A level without any contractions never needs context handling */
  if (!(level->contractions.nitems > 0))
    return FALSE;

  /*
    The flags array is indexed by the code point masked with
    MY_UCA_CNT_FLAG_MASK. Either of the two flags being set is enough.
  */
  return (level->contractions.flags[wc & MY_UCA_CNT_FLAG_MASK] &
          (MY_UCA_PREVIOUS_CONTEXT_TAIL | MY_UCA_CNT_HEAD)) != 0;
}
/**
Compare two wide character strings, wide analog to strncmp().
......@@ -31543,6 +31565,60 @@ my_uca_previous_context_find(my_uca_scanner *scanner,
return NULL;
}
/*
  Resolve a context dependent weight of a character.

  @param scanner - UCA weight scanner. The caller should set
                   its members "page" and "code" to the previous character
                   (or to zeros if there is no previous character).
  @param wc      - an array of wide characters which has at least
                   MY_UCA_MAX_CONTRACTION elements, where wc[0] is set
                   to the current character (whose weight is being resolved).
                   The values of wc[i>0] are not important, but if wc[0]
                   appears to be a known contraction head, the function
                   will collect further contraction parts into wc[i>0].
                   If wc[0] and the previous character make a previous
                   context pair, then wc[1] is set to the previous character.
  @retval NULL if could not find any contextual weights for wc[0]
  @retval non null pointer to a zero-terminated weight string otherwise
*/
static inline uint16 *
my_uca_context_weight_find(my_uca_scanner *scanner, my_wc_t *wc)
{
  uint16 *found;
  DBUG_ASSERT(scanner->level->contractions.nitems);

  /*
    Step 1: previous context.
    If the current character can have previous context and it is not
    the very first character, reconstruct the code point of the previous
    character from "page" and "code" into wc[1], then verify that
    {wc[1], wc[0]} together form a real previous context pair.
    Note, only 2-character long sequences with previous context are
    supported at the moment. CLDR does not have longer sequences.
  */
  if (my_uca_can_be_previous_context_tail(&scanner->level->contractions,
                                          wc[0]) &&
      scanner->wbeg != nochar)
  {
    wc[1]= ((scanner->page << 8) + scanner->code);
    if (my_uca_can_be_previous_context_head(&scanner->level->contractions,
                                            wc[1]))
    {
      found= my_uca_previous_context_find(scanner, wc[1], wc[0]);
      if (found)
      {
        /* Clear for the next character */
        scanner->page= scanner->code= 0;
        return found;
      }
    }
  }

  /*
    Step 2: contraction.
    Reached whenever step 1 did not produce a weight.
    A character that cannot start a contraction has no contextual weight.
  */
  if (!my_uca_can_be_contraction_head(&scanner->level->contractions, wc[0]))
    return NULL;

  /* Check if wc[0] starts a contraction; NULL means it does not */
  found= my_uca_scanner_contraction_find(scanner, wc);
  return found;
}
/****************************************************************/
/**
......@@ -31934,6 +32010,23 @@ int my_wildcmp_uca(CHARSET_INFO *cs,
}
/*
  Check whether the optimized "no contraction" handler is applicable
  to the given collation.

  Walks the first cs->levels_for_order weight levels of cs->uca.

  @param cs  collation to examine
  @retval FALSE - at least one of these levels has contractions
  @retval TRUE  - none of these levels has contractions
*/
static my_bool
my_uca_collation_can_optimize_no_contractions(CHARSET_INFO *cs)
{
  uint level_no= 0;
  while (level_no < cs->levels_for_order)
  {
    /* A single level with contractions rules the optimization out */
    if (my_uca_have_contractions_quick(&cs->uca->level[level_no]))
      return FALSE;
    level_no++;
  }
  return TRUE;
}
/*
Collation language is implemented according to
subset of ICU Collation Customization (tailorings):
......@@ -33644,6 +33737,31 @@ static size_t my_strnxfrmlen_any_uca_multilevel(CHARSET_INFO *cs, size_t len)
}
/*
This structure is used at the collation initialization time, to switch
from a full-featured collation handler to a "no contraction" collation
handler if the collation is known not to have any contractions.

One package groups the four collation handlers generated by a single
inclusion of ctype-uca.ic. The member order matters: ctype-uca.ic fills
the package with a positional initializer.
*/
typedef struct
{
MY_COLLATION_HANDLER *pad; /* collation_handler */
MY_COLLATION_HANDLER *nopad; /* collation_handler_nopad */
MY_COLLATION_HANDLER *multilevel_pad; /* collation_handler_multilevel */
MY_COLLATION_HANDLER *multilevel_nopad; /* collation_handler_nopad_multilevel */
} MY_COLLATION_HANDLER_PACKAGE;
/*
  Replace the collation handler of "cs" by its counterpart
  from another handler package.

  If cs->coll points to one of the four handlers of "from",
  it is redirected to the handler in the same slot of "to".
  If cs->coll matches none of them, "cs" is left untouched.

  @param cs    collation whose handler is to be switched
  @param from  package the current handler is looked up in
  @param to    package the replacement handler is taken from
*/
static void my_uca_handler_map(struct charset_info_st *cs,
                               const MY_COLLATION_HANDLER_PACKAGE *from,
                               const MY_COLLATION_HANDLER_PACKAGE *to)
{
  if (cs->coll == from->pad)
  {
    cs->coll= to->pad;
    return;
  }
  if (cs->coll == from->nopad)
  {
    cs->coll= to->nopad;
    return;
  }
  if (cs->coll == from->multilevel_pad)
  {
    cs->coll= to->multilevel_pad;
    return;
  }
  if (cs->coll == from->multilevel_nopad)
    cs->coll= to->multilevel_nopad;
}
/*
Define generic collation handlers for multi-level collations with tailoring:
......@@ -33656,6 +33774,9 @@ static size_t my_strnxfrmlen_any_uca_multilevel(CHARSET_INFO *cs, size_t len)
/*
  Include ctype-uca.ic with the "generic" set of parameters:
  - generated names have the form my_uca_<x>_generic
  - characters are decoded through the mb_wc() method of the
    scanner's character set
  - the ASCII fast path in scanner_next() is not compiled
  - context (contraction) handling is compiled
  - my_coll_init_uca goes into the first slot of every generated
    MY_COLLATION_HANDLER
  ctype-uca.ic undefines all these macros at its end.
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _generic
#define MY_MB_WC(scanner, wc, beg, end) (scanner->cs->cset->mb_wc(scanner->cs, wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
#define MY_UCA_ASCII_OPTIMIZE 0
#define MY_UCA_COMPILE_CONTRACTIONS 1
#define MY_UCA_COLL_INIT my_coll_init_uca
#include "ctype-uca.ic"
......@@ -33758,6 +33879,9 @@ create_tailoring(struct charset_info_st *cs,
/*
  Include ctype-uca.ic with the ucs2 set of parameters:
  - generated names have the form my_uca_<x>_ucs2
  - characters are decoded with my_mb_wc_ucs2_quick()
  - the ASCII fast path in scanner_next() is not compiled
  - context (contraction) handling is compiled
  - my_coll_init_uca goes into the first slot of every generated
    MY_COLLATION_HANDLER
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _ucs2
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_ucs2_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
#define MY_UCA_ASCII_OPTIMIZE 0
#define MY_UCA_COMPILE_CONTRACTIONS 1
#define MY_UCA_COLL_INIT my_coll_init_uca
#include "ctype-uca.ic"
......@@ -34711,13 +34835,39 @@ struct charset_info_st my_charset_ucs2_unicode_520_nopad_ci=
#ifdef HAVE_CHARSET_utf8
/*
  Forward declaration: the initializer is defined after both utf8mb3
  inclusions of ctype-uca.ic, but the handlers generated by these
  inclusions already refer to it through MY_UCA_COLL_INIT.
*/
static my_bool
my_uca_coll_init_utf8mb3(struct charset_info_st *cs, MY_CHARSET_LOADER *loader);
/* NOTE(review): presumably provides my_mb_wc_utf8mb3_quick() - confirm */
#include "ctype-utf8.h"
/*
  Full-featured utf8mb3 handlers:
  - generated names have the form my_uca_<x>_utf8mb3
  - the ASCII fast path in scanner_next() is compiled
  - context (contraction) handling is compiled
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf8mb3
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb3_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_mb
#define MY_UCA_ASCII_OPTIMIZE 1
#define MY_UCA_COMPILE_CONTRACTIONS 1
#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb3
#include "ctype-uca.ic"
/*
  "No contractions" utf8mb3 handlers:
  - generated names have the form my_uca_<x>_no_contractions_utf8mb3
  - the ASCII fast path in scanner_next() is compiled
  - context (contraction) handling is NOT compiled
  my_uca_coll_init_utf8mb3() switches a collation to these handlers
  when none of its levels used for ordering has contractions.
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _no_contractions_utf8mb3
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb3_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_mb
#define MY_UCA_ASCII_OPTIMIZE 1
#define MY_UCA_COMPILE_CONTRACTIONS 0
#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb3
#include "ctype-uca.ic"
/*
  Initialize a utf8mb3 UCA collation.

  Runs the common my_coll_init_uca() initialization. On success, if
  none of the levels used for ordering has contractions, cs->coll is
  switched from a handler of my_uca_package_utf8mb3 to the matching
  handler of my_uca_package_no_contractions_utf8mb3.

  @param cs      collation being initialized
  @param loader  character set loader, passed on to my_coll_init_uca()
  @retval TRUE   my_coll_init_uca() returned non-zero
  @retval FALSE  otherwise
*/
static my_bool
my_uca_coll_init_utf8mb3(struct charset_info_st *cs, MY_CHARSET_LOADER *loader)
{
  if (!my_coll_init_uca(cs, loader))
  {
    if (my_uca_collation_can_optimize_no_contractions(cs))
    {
      my_uca_handler_map(cs, &my_uca_package_utf8mb3,
                         &my_uca_package_no_contractions_utf8mb3);
    }
    return FALSE;
  }
  return TRUE;
}
/*
We consider bytes with code more than 127 as a letter.
This guarantees that word boundaries work fine with regular
......@@ -35690,11 +35840,38 @@ struct charset_info_st my_charset_utf8_unicode_520_nopad_ci=
#ifdef HAVE_CHARSET_utf8mb4
/*
  Forward declaration: the initializer is defined after both utf8mb4
  inclusions of ctype-uca.ic, but the handlers generated by these
  inclusions already refer to it through MY_UCA_COLL_INIT.
*/
static my_bool
my_uca_coll_init_utf8mb4(struct charset_info_st *cs, MY_CHARSET_LOADER *loader);
/*
  Full-featured utf8mb4 handlers:
  - generated names have the form my_uca_<x>_utf8mb4
  - the ASCII fast path in scanner_next() is compiled
  - context (contraction) handling is compiled
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf8mb4
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb4_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_mb
#define MY_UCA_ASCII_OPTIMIZE 1
#define MY_UCA_COMPILE_CONTRACTIONS 1
#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb4
#include "ctype-uca.ic"
/*
  "No contractions" utf8mb4 handlers:
  - generated names have the form my_uca_<x>_no_contractions_utf8mb4
  - the ASCII fast path in scanner_next() is compiled
  - context (contraction) handling is NOT compiled
  my_uca_coll_init_utf8mb4() switches a collation to these handlers
  when none of its levels used for ordering has contractions.
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _no_contractions_utf8mb4
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf8mb4_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_mb
#define MY_UCA_ASCII_OPTIMIZE 1
#define MY_UCA_COMPILE_CONTRACTIONS 0
#define MY_UCA_COLL_INIT my_uca_coll_init_utf8mb4
#include "ctype-uca.ic"
/*
  Initialize a utf8mb4 UCA collation.

  Runs the common my_coll_init_uca() initialization. On success, if
  none of the levels used for ordering has contractions, cs->coll is
  switched from a handler of my_uca_package_utf8mb4 to the matching
  handler of my_uca_package_no_contractions_utf8mb4.

  @param cs      collation being initialized
  @param loader  character set loader, passed on to my_coll_init_uca()
  @retval TRUE   my_coll_init_uca() returned non-zero
  @retval FALSE  otherwise
*/
static my_bool
my_uca_coll_init_utf8mb4(struct charset_info_st *cs, MY_CHARSET_LOADER *loader)
{
  if (!my_coll_init_uca(cs, loader))
  {
    if (my_uca_collation_can_optimize_no_contractions(cs))
    {
      my_uca_handler_map(cs, &my_uca_package_utf8mb4,
                         &my_uca_package_no_contractions_utf8mb4);
    }
    return FALSE;
  }
  return TRUE;
}
extern MY_CHARSET_HANDLER my_charset_utf8mb4_handler;
......@@ -36646,6 +36823,9 @@ struct charset_info_st my_charset_utf8mb4_unicode_520_nopad_ci=
/*
  Include ctype-uca.ic with the utf32 set of parameters:
  - generated names have the form my_uca_<x>_utf32
  - characters are decoded with my_mb_wc_utf32_quick()
  - the ASCII fast path in scanner_next() is not compiled
  - context (contraction) handling is compiled
  - my_coll_init_uca goes into the first slot of every generated
    MY_COLLATION_HANDLER
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf32
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf32_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
#define MY_UCA_ASCII_OPTIMIZE 0
#define MY_UCA_COMPILE_CONTRACTIONS 1
#define MY_UCA_COLL_INIT my_coll_init_uca
#include "ctype-uca.ic"
......@@ -37601,6 +37781,9 @@ struct charset_info_st my_charset_utf32_unicode_520_nopad_ci=
/*
  Include ctype-uca.ic with the utf16 set of parameters:
  - generated names have the form my_uca_<x>_utf16
  - characters are decoded with my_mb_wc_utf16_quick()
  - the ASCII fast path in scanner_next() is not compiled
  - context (contraction) handling is compiled
  - my_coll_init_uca goes into the first slot of every generated
    MY_COLLATION_HANDLER
*/
#define MY_FUNCTION_NAME(x) my_uca_ ## x ## _utf16
#define MY_MB_WC(scanner, wc, beg, end) (my_mb_wc_utf16_quick(wc, beg, end))
#define MY_LIKE_RANGE my_like_range_generic
#define MY_UCA_ASCII_OPTIMIZE 0
#define MY_UCA_COMPILE_CONTRACTIONS 1
#define MY_UCA_COLL_INIT my_coll_init_uca
#include "ctype-uca.ic"
......@@ -25,6 +25,15 @@
/*
  Sanity checks for the include parameters: the including file must
  define every one of these macros before including this file.
  Each diagnostic names exactly the macro that is missing.
*/
#ifndef MY_LIKE_RANGE
#error MY_LIKE_RANGE is not defined
#endif

/* 1 compiles the ASCII fast path into scanner_next(), 0 leaves it out */
#ifndef MY_UCA_ASCII_OPTIMIZE
#error MY_UCA_ASCII_OPTIMIZE is not defined
#endif

/* 1 compiles previous context/contraction handling, 0 leaves it out */
#ifndef MY_UCA_COMPILE_CONTRACTIONS
#error MY_UCA_COMPILE_CONTRACTIONS is not defined
#endif

/* Function placed into the first slot of the generated handlers */
#ifndef MY_UCA_COLL_INIT
#error MY_UCA_COLL_INIT is not defined
#endif
static inline int
......@@ -46,6 +55,32 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
int mblen;
/* Get next character */
#if MY_UCA_ASCII_OPTIMIZE
/* Get next ASCII character */
if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80)
{
wc[0]= scanner->sbeg[0];
scanner->sbeg+= 1;
#if MY_UCA_COMPILE_CONTRACTIONS
if (my_uca_needs_context_handling(scanner->level, wc[0]))
{
uint16 *cweight= my_uca_context_weight_find(scanner, wc);
if (cweight)
return *cweight;
}
#endif
scanner->page= 0;
scanner->code= (int) wc[0];
scanner->wbeg= scanner->level->weights[0] + scanner->code * scanner->level->lengths[0];
if (scanner->wbeg[0])
return *scanner->wbeg++;
continue;
}
else
#endif
/* Get next MB character */
if (((mblen= MY_MB_WC(scanner, wc, scanner->sbeg,
scanner->send)) <= 0))
{
......@@ -76,37 +111,14 @@ MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner)
return 0xFFFD;
}
if (my_uca_have_contractions_quick(scanner->level))
#if MY_UCA_COMPILE_CONTRACTIONS
if (my_uca_needs_context_handling(scanner->level, wc[0]))
{
uint16 *cweight;
/*
If we have scanned a character which can have previous context,
and there were some more characters already before,
then reconstruct codepoint of the previous character
from "page" and "code" into w[1], and verify that {wc[1], wc[0]}
together form a real previous context pair.
Note, we support only 2-character long sequences with previous
context at the moment. CLDR does not have longer sequences.
*/
if (my_uca_can_be_previous_context_tail(&scanner->level->contractions,
wc[0]) &&
scanner->wbeg != nochar && /* if not the very first character */
my_uca_can_be_previous_context_head(&scanner->level->contractions,
(wc[1]= ((scanner->page << 8) +
scanner->code))) &&
(cweight= my_uca_previous_context_find(scanner, wc[1], wc[0])))
{
scanner->page= scanner->code= 0; /* Clear for the next character */
return *cweight;
}
else if (my_uca_can_be_contraction_head(&scanner->level->contractions,
wc[0]))
{
/* Check if w[0] starts a contraction */
if ((cweight= my_uca_scanner_contraction_find(scanner, wc)))
uint16 *cweight= my_uca_context_weight_find(scanner, wc);
if (cweight)
return *cweight;
}
}
#endif
/* Process single character */
scanner->page= wc[0] >> 8;
......@@ -685,7 +697,7 @@ MY_FUNCTION_NAME(strnxfrm_multilevel)(CHARSET_INFO *cs,
*/
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler)=
{
my_coll_init_uca,
MY_UCA_COLL_INIT,
MY_FUNCTION_NAME(strnncoll),
MY_FUNCTION_NAME(strnncollsp),
MY_FUNCTION_NAME(strnxfrm),
......@@ -706,7 +718,7 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler)=
*/
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad)=
{
my_coll_init_uca,
MY_UCA_COLL_INIT,
MY_FUNCTION_NAME(strnncoll),
MY_FUNCTION_NAME(strnncollsp_nopad),
MY_FUNCTION_NAME(strnxfrm_nopad),
......@@ -725,7 +737,7 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad)=
*/
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_multilevel)=
{
my_coll_init_uca,
MY_UCA_COLL_INIT,
MY_FUNCTION_NAME(strnncoll_multilevel),
MY_FUNCTION_NAME(strnncollsp_multilevel),
MY_FUNCTION_NAME(strnxfrm_multilevel),
......@@ -744,7 +756,7 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_multilevel)=
*/
MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad_multilevel)=
{
my_coll_init_uca,
MY_UCA_COLL_INIT,
MY_FUNCTION_NAME(strnncoll_multilevel),
MY_FUNCTION_NAME(strnncollsp_nopad_multilevel),
MY_FUNCTION_NAME(strnxfrm_multilevel),
......@@ -758,6 +770,18 @@ MY_COLLATION_HANDLER MY_FUNCTION_NAME(collation_handler_nopad_multilevel)=
};
/*
  The four collation handlers generated by this inclusion, grouped
  into one package. The initializer is positional: the order must match
  the members of MY_COLLATION_HANDLER_PACKAGE
  (pad, nopad, multilevel_pad, multilevel_nopad).
*/
MY_COLLATION_HANDLER_PACKAGE MY_FUNCTION_NAME(package)=
{
&MY_FUNCTION_NAME(collation_handler),
&MY_FUNCTION_NAME(collation_handler_nopad),
&MY_FUNCTION_NAME(collation_handler_multilevel),
&MY_FUNCTION_NAME(collation_handler_nopad_multilevel)
};
/*
  Undefine all include parameters, so that this file can be included
  again with a different set of values.
*/
#undef MY_FUNCTION_NAME
#undef MY_MB_WC
#undef MY_LIKE_RANGE
#undef MY_UCA_ASCII_OPTIMIZE
#undef MY_UCA_COMPILE_CONTRACTIONS
#undef MY_UCA_COLL_INIT
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment