Commit 3c3b11c3 authored by Magne Mahre's avatar Magne Mahre

Bug #14637: trim trailing spaces processes data only byte wise

(From: gkodinov)
Use and int * where possible to scan for trailing space in a
string instead of always iterating char-by-char.
Using the attached benchmark file on a 32 bit Intel Core 2 
Duo CPU I've got 43485 ms run with the fix compared to 44373 
without it.

Backported to 5.6.0 (next-mr-runtime)
6.0-codebase revid: 2476.1362.1


include/m_string.h:
  scan for space through ints
strings/ctype-bin.c:
  scan for space through ints
strings/ctype-latin1.c:
  scan for space through ints
strings/ctype-mb.c:
  scan for space through ints
strings/ctype-simple.c:
  scan for space through ints
parent 1c3c72ce
......@@ -266,4 +266,74 @@ typedef struct st_mysql_lex_string LEX_STRING;
#define USTRING_WITH_LEN(X) ((uchar*) X), ((size_t) (sizeof(X) - 1))
#define C_STRING_WITH_LEN(X) ((char *) (X)), ((size_t) (sizeof(X) - 1))
/* SPACE_INT is a word that contains only spaces */
#if SIZEOF_INT == 4
#define SPACE_INT 0x20202020
#elif SIZEOF_INT == 8
#define SPACE_INT 0x2020202020202020
#else
#error define the appropriate constant for a word full of spaces
#endif
/**
Skip trailing space.
On most systems reading memory in larger chunks (ideally equal to the size of
the chinks that the machine physically reads from memory) causes fewer memory
access loops and hence increased performance.
This is why the 'int' type is used : it's closest to that (according to how
it's defined in C).
So when we determine the amount of whitespace at the end of a string we do
the following :
1. We divide the string into 3 zones :
a) from the start of the string (__start) to the first multiple
of sizeof(int) (__start_words)
b) from the end of the string (__end) to the last multiple of sizeof(int)
(__end_words)
c) a zone that is aligned to sizeof(int) and can be safely accessed
through an int *
2. We start comparing backwards from (c) char-by-char. If all we find is
space then we continue
3. If there are elements in zone (b) we compare them as unsigned ints to a
int mask (SPACE_INT) consisting of all spaces
4. Finally we compare the remaining part (a) of the string char by char.
This covers for the last non-space unsigned int from 3. (if any)
This algorithm works well for relatively larger strings, but it will slow
the things down for smaller strings (because of the additional calculations
and checks compared to the naive method). Thus the barrier of length 20
is added.
@param ptr pointer to the input string
@param len the length of the string
@return the last non-space character
*/
static inline const uchar *skip_trailing_space(const uchar *ptr,size_t len)
{
const uchar *start= ptr;
const uchar *end= ptr + len;
if (len > 20)
{
const uchar *end_words= (const uchar *)
(((intptr)end) / SIZEOF_INT * SIZEOF_INT);
const uchar *start_words= (const uchar *)
((((intptr)start) + SIZEOF_INT - 1) / SIZEOF_INT * SIZEOF_INT);
DBUG_ASSERT(((intptr)start) >= SIZEOF_INT);
if (end_words > start)
{
while (end > end_words && end[-1] == 0x20)
end--;
if (end[-1] == 0x20 && start_words < end_words)
while (end > start_words && ((unsigned *)end)[-1] == SPACE_INT)
end -= SIZEOF_INT;
}
}
while (end > start && end[-1] == 0x20)
end--;
return (end);
}
#endif
......@@ -278,14 +278,11 @@ void my_hash_sort_8bit_bin(CHARSET_INFO *cs __attribute__((unused)),
{
const uchar *pos = key;
key+= len;
/*
Remove trailing spaces. We have to do this to be able to compare
'A ' and 'A' as identical
*/
while (key > pos && key[-1] == ' ')
key--;
key= skip_trailing_space(key, len);
for (; pos < (uchar*) key ; pos++)
{
......
......@@ -678,13 +678,12 @@ void my_hash_sort_latin1_de(CHARSET_INFO *cs __attribute__((unused)),
const uchar *key, size_t len,
ulong *nr1, ulong *nr2)
{
const uchar *end= key+len;
const uchar *end;
/*
Remove end space. We have to do this to be able to compare
'AE' and '' as identical
*/
while (end > key && end[-1] == ' ')
end--;
end= skip_trailing_space(key, len);
for (; key < end ; key++)
{
......
......@@ -469,14 +469,11 @@ static void my_hash_sort_mb_bin(CHARSET_INFO *cs __attribute__((unused)),
{
const uchar *pos = key;
key+= len;
/*
Remove trailing spaces. We have to do this to be able to compare
'A ' and 'A' as identical
*/
while (key > pos && key[-1] == ' ')
key--;
key= skip_trailing_space(key, len);
for (; pos < (uchar*) key ; pos++)
{
......
......@@ -304,14 +304,13 @@ void my_hash_sort_simple(CHARSET_INFO *cs,
ulong *nr1, ulong *nr2)
{
register uchar *sort_order=cs->sort_order;
const uchar *end= key + len;
const uchar *end;
/*
Remove end space. We have to do this to be able to compare
'A ' and 'A' as identical
*/
while (end > key && end[-1] == ' ')
end--;
end= skip_trailing_space(key, len);
for (; key < (uchar*) end ; key++)
{
......@@ -1165,9 +1164,8 @@ size_t my_well_formed_len_8bit(CHARSET_INFO *cs __attribute__((unused)),
size_t my_lengthsp_8bit(CHARSET_INFO *cs __attribute__((unused)),
const char *ptr, size_t length)
{
const char *end= ptr+length;
while (end > ptr && end[-1] == ' ')
end--;
const char *end;
end= (const char *) skip_trailing_space((const uchar *)ptr, length);
return (size_t) (end-ptr);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment