Commit 6c5c1fd5 authored by Vladislav Vaintroub's avatar Vladislav Vaintroub

MDEV-4742 - remove leading zero handling, and cleanups.

Leading zeros added a single byte overhead per numeric string,
even when they were. Sorting leading zeros offers only for little value
(except determinism in sort). I decided to drop it for now, we can be
like ICU, which drops leading zeros, in numeric sorting,
even with IDENTICAL collation strength.


Also, disabled virtual stored columns (thus also indexes), on Serg's request
Hopefully it is temporarily, and will be reenabled soon, when everyone is
as happy with key generation algorithm as I am.
parent 167d2509
......@@ -2,14 +2,6 @@ SET NAMES utf8mb4;
SELECT NATURAL_SORT_KEY(NULL);
NATURAL_SORT_KEY(NULL)
NULL
SELECT '' c WHERE 0 UNION VALUES('a10'),('a9'),('a1000'), ('a0'),('b'),('b0') ORDER BY NATURAL_SORT_KEY(c);
c
a0
a9
a10
a1000
b
b0
SELECT NATURAL_SORT_KEY(repeat('a1',@@max_allowed_packet/2-1));
NATURAL_SORT_KEY(repeat('a1',@@max_allowed_packet/2-1))
NULL
......@@ -22,14 +14,10 @@ Warnings:
Warning 1301 Result of natural_sort_key() was larger than max_allowed_packet (16777216) - truncated
CREATE TABLE t1(
c VARCHAR(30) CHARACTER SET latin1 COLLATE latin1_bin,
k VARCHAR(45) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci AS (NATURAL_SORT_KEY(CONVERT(c USING utf8mb4))) INVISIBLE,
KEY(k,c)) ENGINE=InnoDB;
k VARCHAR(60) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci AS (NATURAL_SORT_KEY(CONVERT(c USING utf8mb4))) VIRTUAL INVISIBLE);
INSERT INTO t1 values
('A1'),('a1'),('A100'),('a100'),('A2'),('ä2'),('a2'),('A99'),
('äb'),('B1'),('B100'),('B9'),('C'),('100');
EXPLAIN SELECT c FROM t1 ORDER BY k,c;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 index NULL k 216 NULL 14 Using index
#Natural sort order.
SELECT c FROM t1 ORDER BY k,c;
c
......@@ -69,55 +57,136 @@ SHOW CREATE TABLE t2;
Table Create Table
t2 CREATE TABLE `t2` (
`c` varchar(30) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL,
`NATURAL_SORT_KEY(c)` varchar(60) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL
`NATURAL_SORT_KEY(c)` varchar(45) CHARACTER SET latin1 COLLATE latin1_bin DEFAULT NULL
) ENGINE=MyISAM DEFAULT CHARSET=latin1
DROP TABLE t1,t2;
SELECT RPAD(val,28,' ') value , RPAD(NATURAL_SORT_KEY(val),35,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead
CREATE TABLE t1(c VARCHAR(1), k VARCHAR(2) AS (NATURAL_SORT_KEY(c)) STORED);
ERROR HY000: Function or expression 'natural_sort_key()' cannot be used in the GENERATED ALWAYS AS clause of `k`
SELECT RPAD(val,28,' ') value , RPAD(NATURAL_SORT_KEY(val),36,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead
FROM
(
SELECT 0 val
UNION VALUES ('1'),('01'),('0001')
SELECT '0' val
UNION SELECT seq FROM seq_1_to_9
UNION SELECT CONCAT('1',repeat('0',seq)) FROM seq_1_to_27
) AS numbers ORDER BY sortkey;
value sortkey encoding_overhead
0 000 2
1 010 2
01 011 1
0001 013 -1
10 1100 2
100 21000 2
1000 310000 2
10000 4100000 2
100000 51000000 2
1000000 610000000 2
10000000 7100000000 2
100000000 81000000000 2
1000000000 9010000000000 3
10000000000 91100000000000 3
100000000000 921000000000000 3
1000000000000 9310000000000000 3
10000000000000 94100000000000000 3
100000000000000 951000000000000000 3
1000000000000000 9610000000000000000 3
10000000000000000 97100000000000000000 3
100000000000000000 981000000000000000000 3
1000000000000000000 99010000000000000000000 4
10000000000000000000 991100000000000000000000 4
100000000000000000000 9921000000000000000000000 4
1000000000000000000000 99310000000000000000000000 4
10000000000000000000000 994100000000000000000000000 4
100000000000000000000000 9951000000000000000000000000 4
1000000000000000000000000 99610000000000000000000000000 4
10000000000000000000000000 997100000000000000000000000000 4
100000000000000000000000000 9981000000000000000000000000000 4
1000000000000000000000000000 99901271000000000000000000000000000 8
SELECT val
FROM
(
SELECT 0 val WHERE 0
UNION VALUES ('1/'),('01'),('1a')
) AS strings ORDER BY NATURAL_SORT_KEY(val);
val
1/
1a
01
0 00 1
1 01 1
2 02 1
3 03 1
4 04 1
5 05 1
6 06 1
7 07 1
8 08 1
9 09 1
10 110 1
100 2100 1
1000 31000 1
10000 410000 1
100000 5100000 1
1000000 61000000 1
10000000 710000000 1
100000000 8100000000 1
1000000000 901000000000 2
10000000000 9110000000000 2
100000000000 92100000000000 2
1000000000000 931000000000000 2
10000000000000 9410000000000000 2
100000000000000 95100000000000000 2
1000000000000000 961000000000000000 2
10000000000000000 9710000000000000000 2
100000000000000000 98100000000000000000 2
1000000000000000000 9901000000000000000000 3
10000000000000000000 99110000000000000000000 3
100000000000000000000 992100000000000000000000 3
1000000000000000000000 9931000000000000000000000 3
10000000000000000000000 99410000000000000000000000 3
100000000000000000000000 995100000000000000000000000 3
1000000000000000000000000 9961000000000000000000000000 3
10000000000000000000000000 99710000000000000000000000000 3
100000000000000000000000000 998100000000000000000000000000 3
1000000000000000000000000000 99901271000000000000000000000000000 7
SELECT natural_sort_key('1') = natural_sort_key('0001');
natural_sort_key('1') = natural_sort_key('0001')
1
SELECT natural_sort_key('1.1') = natural_sort_key('1.00001');
natural_sort_key('1.1') = natural_sort_key('1.00001')
1
SELECT RPAD(val,20,' ') value, NATURAL_SORT_KEY(val) FROM
(SELECT '' val WHERE 0 UNION VALUES
('fred'),
('pic2'),
('pic100a'),
('pic120'),
('pic121'),
('jane'),
('tom'),
('pic02a'),
('pic3'),
('pic4'),
('1-20'),
('pic100'),
('pic02000'),
('10-20'),
('1-02'),
('1-2'),
('pic01'),
('pic02'),
('pic 6'),
('pic 7'),
('pic 5'),
('pic05'),
('pic 5 '),
('pic 5 something'),
('pic 4 else'),
('2000-1-10'),
('1999-12-25'),
('1999-3-3'),
('2000-3-23'),
('2000-1-2'),
('100.200.300.400'),
('100.50.60.70'),
('100.8.9.0'),
('a1b1'),
('a01b2'),
('a1b2'),
('a01b3')
)AS data ORDER BY 2,1;
value NATURAL_SORT_KEY(val)
1-02 01-02
1-2 01-02
1-20 01-120
10-20 110-120
100.8.9.0 2100.08.09.00
100.50.60.70 2100.150.160.170
100.200.300.400 2100.2200.2300.2400
1999-3-3 31999-03-03
1999-12-25 31999-112-125
2000-1-2 32000-01-02
2000-1-10 32000-01-110
2000-3-23 32000-03-123
a1b1 a01b01
a01b2 a01b02
a1b2 a01b02
a01b3 a01b03
fred fred
jane jane
pic 7 pic 07
pic 4 else pic 04 else
pic 5 pic 05
pic 5 something pic 05 something
pic 6 pic 06
pic01 pic01
pic02 pic02
pic2 pic02
pic02a pic02a
pic3 pic03
pic4 pic04
pic05 pic05
pic100 pic2100
pic100a pic2100a
pic120 pic2120
pic121 pic2121
pic02000 pic32000
tom tom
--source include/have_innodb.inc
--source include/have_sequence.inc
SET NAMES utf8mb4;
SELECT NATURAL_SORT_KEY(NULL);
# Sort without tables
SELECT '' c WHERE 0 UNION VALUES('a10'),('a9'),('a1000'), ('a0'),('b'),('b0') ORDER BY NATURAL_SORT_KEY(c);
#Test that max packet overflow produces NULL plus warning
SELECT NATURAL_SORT_KEY(repeat('a1',@@max_allowed_packet/2-1));
SELECT NATURAL_SORT_KEY(repeat('1',@@max_allowed_packet-1));
#Test with virtual(index only) key
#Test with virtual
CREATE TABLE t1(
c VARCHAR(30) CHARACTER SET latin1 COLLATE latin1_bin,
k VARCHAR(45) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci AS (NATURAL_SORT_KEY(CONVERT(c USING utf8mb4))) INVISIBLE,
KEY(k,c)) ENGINE=InnoDB;
k VARCHAR(60) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci AS (NATURAL_SORT_KEY(CONVERT(c USING utf8mb4))) VIRTUAL INVISIBLE);
INSERT INTO t1 values
('A1'),('a1'),('A100'),('a100'),('A2'),('ä2'),('a2'),('A99'),
('äb'),('B1'),('B100'),('B9'),('C'),('100');
EXPLAIN SELECT c FROM t1 ORDER BY k,c;
-- echo #Natural sort order.
# We sort by 2 colums, for stable sort,as we do not currenly have a case and accent insensitive Unicode collation.
SELECT c FROM t1 ORDER BY k,c;
......@@ -30,21 +26,63 @@ CREATE TABLE t2 AS SELECT c, NATURAL_SORT_KEY(c) FROM t1 WHERE 0;
SHOW CREATE TABLE t2;
DROP TABLE t1,t2;
#Show encoding of numbers, with some leading whitespace.
SELECT RPAD(val,28,' ') value , RPAD(NATURAL_SORT_KEY(val),35,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead
#Virtual STORED is temporarily disabled
--error ER_GENERATED_COLUMN_FUNCTION_IS_NOT_ALLOWED
CREATE TABLE t1(c VARCHAR(1), k VARCHAR(2) AS (NATURAL_SORT_KEY(c)) STORED);
#Show encoding of numbers.
SELECT RPAD(val,28,' ') value , RPAD(NATURAL_SORT_KEY(val),36,' ') sortkey , LENGTH(NATURAL_SORT_KEY(val)) - LENGTH(val) encoding_overhead
FROM
(
SELECT 0 val
UNION VALUES ('1'),('01'),('0001')
SELECT '0' val
UNION SELECT seq FROM seq_1_to_9
UNION SELECT CONCAT('1',repeat('0',seq)) FROM seq_1_to_27
) AS numbers ORDER BY sortkey;
# leading zeros ignored
SELECT natural_sort_key('1') = natural_sort_key('0001');
SELECT natural_sort_key('1.1') = natural_sort_key('1.00001');
#Test that sort order with leading zeros (lead zeros sort larger)
SELECT val
FROM
(
SELECT 0 val WHERE 0
UNION VALUES ('1/'),('01'),('1a')
) AS strings ORDER BY NATURAL_SORT_KEY(val);
# Some examples from https://github.com/sourcefrog/natsort/
# words
SELECT RPAD(val,20,' ') value, NATURAL_SORT_KEY(val) FROM
(SELECT '' val WHERE 0 UNION VALUES
('fred'),
('pic2'),
('pic100a'),
('pic120'),
('pic121'),
('jane'),
('tom'),
('pic02a'),
('pic3'),
('pic4'),
('1-20'),
('pic100'),
('pic02000'),
('10-20'),
('1-02'),
('1-2'),
('pic01'),
('pic02'),
('pic 6'),
('pic 7'),
('pic 5'),
('pic05'),
('pic 5 '),
('pic 5 something'),
('pic 4 else'),
('2000-1-10'),
('1999-12-25'),
('1999-3-3'),
('2000-3-23'),
('2000-1-2'),
('100.200.300.400'),
('100.50.60.70'),
('100.8.9.0'),
('a1b1'),
('a01b2'),
('a1b2'),
('a01b3')
)AS data ORDER BY 2,1;
......@@ -5450,7 +5450,7 @@ String *Item_temptable_rowid::val_str(String *str)
small, as it only computes variable *length prefixes*.
@param[in] n - the number
@param[in] s - output buffer (should be at least 26 bytes large)
@param[in] s - output string
@return - length of encoding
......@@ -5467,14 +5467,14 @@ String *Item_temptable_rowid::val_str(String *str)
Output calculated as concat('99', '0' + n -18)'
Output range '990'-'998'
- n is from 28 to SIZE_T_MAX
- n is from 27 to SIZE_T_MAX
Output starts with '999',
then log10(n) is encoded as 2-digit decimal number
then the number itself is added.
Example : for 28 key is concat('999', '01' , '28')
i.e '9990128'
Key legth is 5 + ceil(log10(n))
Key length is 5 + ceil(log10(n))
Output range is
(64bit)'9990128' - '9991918446744073709551615'
......@@ -5482,26 +5482,28 @@ String *Item_temptable_rowid::val_str(String *str)
*/
/* Largest length of encoded string.*/
#define NATSORT_BUFSIZE 26
static size_t natsort_encode_length(size_t n, char *s)
static size_t natsort_encode_length_max(size_t n)
{
return (n < 27) ? n/9+1 : 26;
}
static void natsort_encode_length(size_t n, String* out)
{
if (n < 27)
{
size_t n_nines= n / 9;
if (n_nines)
memset(s, '9', n_nines);
s[n_nines]= char(n % 9 + '0');
s[n_nines + 1]= 0;
return n_nines + 1;
if (n >= 9)
out->fill(out->length() + n/9,'9');
out->append(char(n % 9 + '0'));
return;
}
memset(s, '9', 3);
size_t log10n= 0;
for (size_t tmp= n / 10; tmp; tmp/= 10)
log10n++;
s[3]= '0' + (char) (log10n / 10);
s[4]= '0' + (char) (log10n % 10);
return longlong10_to_str(n, s + 5, 10) - s;
out->fill(out->length() + 3, '9');
out->append('0' + (char) (log10n / 10));
out->append('0' + (char) (log10n % 10));
out->append_ulonglong(n);
}
enum class NATSORT_ERR
......@@ -5520,8 +5522,6 @@ enum class NATSORT_ERR
@param[in] n_digits - length of the string,
in characters, not counting leading zeros.
@param[in] n_lead_zeros - leading zeros count.
@param[out] out - String to write to. The string should
have enough preallocated space to fit the encoded key.
......@@ -5530,58 +5530,43 @@ enum class NATSORT_ERR
NATSORT_ERR::KEY_TOO_LARGE - out string does not have enough
space left to accomodate the key.
Note:
Special case, where there are only leading zeros
n_digits == 0 and n_leading_zeros > 0
will treated as-if in = "0", n_digits = 1, n_leading_zeros
is decremented.
The resulting encoding of the numeric string is then
CONCAT(natsort_encode_length(n_digits), in,
natsort_encode_length(n_leading_zeros))
CONCAT(natsort_encode_length(n_digits), in)
*/
static NATSORT_ERR natsort_encode_numeric_string(const char *in,
size_t n_digits,
size_t n_leading_zeros,
String *out)
{
char buf[NATSORT_BUFSIZE];
DBUG_ASSERT(in);
DBUG_ASSERT(n_digits);
size_t len;
len= natsort_encode_length(n_digits - 1, buf);
if (out->length() + len + n_digits > out->alloced_length())
if (out->length() + natsort_encode_length_max(n_digits - 1) + n_digits >
out->alloced_length())
return NATSORT_ERR::KEY_TOO_LARGE;
out->append(buf, len);
natsort_encode_length(n_digits - 1, out);
out->append(in, n_digits);
len= natsort_encode_length(n_leading_zeros, buf);
if (out->length() + len > out->alloced_length())
return NATSORT_ERR::KEY_TOO_LARGE;
out->append(buf, len);
return NATSORT_ERR::SUCCESS;
}
/*
Calculate max size of the natsort key.
A digit expands to 3 chars - length_prefix, the digit, lead_zero_cnt(0)
A digit in string expands to 2 chars length_prefix , and the digit
With even length L=2N, the largest key corresponds to input string
in form REPEAT('<letter><digit>',N) and the length of a key is
3N + N = 4*N = 2*L
in form REPEAT(<digit><letter>,N) and the length of a key is
2N + N = 3N
With odd input length L=2N+1, largest key is built by appending
a digit at the end, with key length 3N+2
With odd input length L=2*N+1, largest key corresponds to
CONCAT(<digit>,REPEAT('<letter><digit>', N)
with key length is 3 + 4*N = 2*L+1
*/
static size_t natsort_max_key_size(size_t input_size)
{
return input_size * 2 + input_size % 2;
return input_size + (input_size + 1)/2 ;
}
/**
......@@ -5600,8 +5585,8 @@ static NATSORT_ERR to_natsort_key(const String *in, String *out,
size_t n_digits= 0;
size_t n_lead_zeros= 0;
size_t num_start;
size_t reserve_length= std::min(natsort_max_key_size(in->length()),
max_key_size);
size_t reserve_length= std::min(
natsort_max_key_size(in->length()) + MAX_BIGINT_WIDTH + 2, max_key_size);
out->length(0);
out->set_charset(in->charset());
......@@ -5624,7 +5609,7 @@ static NATSORT_ERR to_natsort_key(const String *in, String *out,
n_digits= 1;
}
NATSORT_ERR err= natsort_encode_numeric_string(
in->ptr() + num_start, n_digits, n_lead_zeros, out);
in->ptr() + num_start, n_digits, out);
if (err != NATSORT_ERR::SUCCESS)
return err;
......@@ -5720,6 +5705,16 @@ bool Item_func_natural_sort_key::fix_length_and_dec(void)
return false;
}
/**
Disable use in stored virtual functions. Temporarily(?), until
the encoding is stable.
*/
bool Item_func_natural_sort_key::check_vcol_func_processor(void *arg)
{
return mark_unsupported_function(func_name(), "()", arg,
VCOL_NON_DETERMINISTIC);
}
#ifdef WITH_WSREP
#include "wsrep_mysqld.h"
......
......@@ -287,6 +287,8 @@ class Item_func_natural_sort_key : public Item_str_func
{
return get_item_copy<Item_func_natural_sort_key>(thd, this);
}
bool check_vcol_func_processor(void *arg) override;
};
class Item_func_concat :public Item_str_func
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment