Commit c870a43f authored by Alexander Barkov

MDEV-4425 Regexp enhancements

Do not pass the PCRE_UCP flag for binary data.
As a result, bytes 0x80..0xFF no longer belong to the
generic character classes \d (digit) and \w (word character).

SELECT 0xFF RLIKE '\\w';
 -> 0

Note, this change does not affect non-binary data,
which is still examined with the PCRE_UCP flag by default.
parent b801d553
......@@ -234,6 +234,12 @@ class ch ch RLIKE class
\p{Tamil} 㐗 0
\p{Tamil} 갷 0
DROP TABLE t1, t2;
SELECT 0xFF RLIKE '\\w';
0xFF RLIKE '\\w'
0
SELECT 0xFF RLIKE '(*UCP)\\w';
0xFF RLIKE '(*UCP)\\w'
1
SELECT '\n' RLIKE '(*CR)';
'\n' RLIKE '(*CR)'
1
......
......@@ -46,6 +46,10 @@ INSERT INTO t2 VALUES ('[[:alpha:]]'),('[[:digit:]]');
SELECT class, ch, ch RLIKE class FROM t1, t2 ORDER BY class, BINARY ch;
DROP TABLE t1, t2;
# Checking that UCP is disabled by default for binary data
SELECT 0xFF RLIKE '\\w';
SELECT 0xFF RLIKE '(*UCP)\\w';
# newline character
SELECT '\n' RLIKE '(*CR)';
SELECT '\n' RLIKE '(*LF)';
......
......@@ -1511,8 +1511,9 @@ class Regexp_processor_pcre
{}
void init(CHARSET_INFO *data_charset, int extra_flags, uint nsubpatterns)
{
m_library_flags= PCRE_UCP | extra_flags |
(data_charset != &my_charset_bin ? PCRE_UTF8 : 0) |
m_library_flags= extra_flags |
(data_charset != &my_charset_bin ?
(PCRE_UTF8 | PCRE_UCP) : 0) |
((data_charset->state &
(MY_CS_BINSORT | MY_CS_CSSORT)) ? 0 : PCRE_CASELESS);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment