Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
M
MariaDB
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
nexedi
MariaDB
Commits
5a2b1ba6
Commit
5a2b1ba6
authored
Jun 03, 2004
by
unknown
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Unicode collations: WL#916
XML and "collation customization" language parsers.
parent
2a32bb2b
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
547 additions
and
8 deletions
+547
-8
mysys/charset.c
mysys/charset.c
+496
-6
strings/ctype.c
strings/ctype.c
+51
-2
No files found.
mysys/charset.c
View file @
5a2b1ba6
...
...
@@ -21,6 +21,344 @@
#include <my_dir.h>
#include <my_xml.h>
/*
Collation language is implemented according to
subset of ICU Collation Customization (tailorings):
http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
Collation language elements:
Delimiters:
space - skipped
<char> := A-Z | a-z | \uXXXX
Shift command:
<shift> := & - reset at this letter.
Diff command:
<d1> := < - Identifies a primary difference.
<d2> := << - Identifies a secondary difference.
<d3> := <<< - Identifies a tertiary difference.
Collation rules:
<ruleset> := <rule> { <ruleset> }
<rule> := <d1> <string>
| <d2> <string>
| <d3> <string>
| <shift> <char>
<string> := <char> [ <string> ]
An example, Polish collation:
&A < \u0105 <<< \u0104
&C < \u0107 <<< \u0106
&E < \u0119 <<< \u0118
&L < \u0142 <<< \u0141
&N < \u0144 <<< \u0143
&O < \u00F3 <<< \u00D3
&S < \u015B <<< \u015A
&Z < \u017A <<< \u017B
*/
/* Token kinds returned by the collation rule lexer (my_coll_lexem_next). */
typedef enum my_coll_lexem_num_en
{
  MY_COLL_LEXEM_EOF   = 0,  /* End of input */
  MY_COLL_LEXEM_DIFF  = 1,  /* '<', '<<' or '<<<' */
  MY_COLL_LEXEM_SHIFT = 4,  /* '&' */
  MY_COLL_LEXEM_CHAR  = 5,  /* A-Z, a-z or a \u hex escape */
  MY_COLL_LEXEM_ERROR = 6   /* Input the lexer does not recognize */
} my_coll_lexem_num;
/* Collation rule lexer state together with the attributes of the last token. */
typedef struct my_coll_lexem_st
{
  const char *beg;   /* Next character to be scanned */
  const char *end;   /* End of the input; scanning stops when beg reaches it */
  const char *prev;  /* Last position examined; used as error context */
  int diff;          /* Number of '<' characters (1..3) in the last diff token */
  int code;          /* Character code of the last character token */
} MY_COLL_LEXEM;
/*
  Initialize the collation rule lexical analyzer.

  SYNOPSIS
    my_coll_lexem_init
    lexem   Lexical analyzer to initialize
    str     Start of the constant string to parse
    strend  End of the string

  RETURN VALUES
    N/A
*/
static void my_coll_lexem_init(MY_COLL_LEXEM *lexem,
                               const char *str, const char *strend)
{
  /* No token has been scanned yet, so its attributes are cleared. */
  lexem->code= 0;
  lexem->diff= 0;

  /* The read cursor and the error-context pointer both start at str. */
  lexem->prev= lexem->beg= str;
  lexem->end= strend;
}
/*
  Print a collation customization expression parse error, with context.

  SYNOPSIS
    my_coll_lexem_print_error
    lexem    Lexical analyzer to take the context from
    errstr   Buffer to write the error message to
    errsize  Size of errstr in bytes
    txt      Error message

  USAGE
    The message has the form "<txt> at '<context>'", where the context
    is taken from the input starting at lexem->prev and is limited to
    sizeof(tail)-1 characters.
    Nothing is written when errstr is NULL or errsize is 0.

  RETURN VALUES
    N/A
*/
static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem,
                                      char *errstr, size_t errsize,
                                      const char *txt)
{
  char tail[30];
  size_t len;

  /*
    errsize is unsigned: with an empty buffer "errsize - 1" would wrap
    around and the terminator below would be written out of bounds.
  */
  if (!errstr || !errsize)
    return;

  len= lexem->end - lexem->prev;
  strmake(tail, lexem->prev, min(len, sizeof(tail) - 1));
  errstr[errsize - 1]= '\0';
  my_snprintf(errstr, errsize - 1, "%s at '%s'", txt, tail);
}
/*
  Convert a hex digit into its numeric value.

  SYNOPSIS
    ch2x
    ch  hex digit to convert: '0'-'9', 'a'-'f' or 'A'-'F'

  RETURN VALUES
    an integer value in the range 0..15
    -1 on error (ch is not a hex digit)
*/
static int ch2x(int ch)
{
  if (ch >= '0' && ch <= '9')
    return ch - '0';

  if (ch >= 'a' && ch <= 'f')
    return 10 + ch - 'a';

  /* Only A-F are hex digits; G-Z must be rejected like any other letter. */
  if (ch >= 'A' && ch <= 'F')
    return 10 + ch - 'A';

  return -1;
}
/*
  Collation language lexical parser:
  Scans the next lexem.

  SYNOPSIS
    my_coll_lexem_next
    lexem  Lexical analyzer, previously initialized by
           my_coll_lexem_init.

  USAGE
    Call this function in a loop until it returns MY_COLL_LEXEM_EOF
    or MY_COLL_LEXEM_ERROR.
    Space, tab, CR and LF are skipped.
    For a diff lexem, lexem->diff holds the number of '<' characters (1..3).
    For a char lexem, lexem->code holds the character code.
    lexem->prev points at the start of the lexem, for error reporting.

  RETURN VALUES
    Lexem number: eof, diff, shift, char or error.
    Error is returned for an unknown character, for a \u escape
    that is not followed by at least one hex digit, and for a \u
    escape whose value exceeds 0x10FFFF.
*/
static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem)
{
  for ( ; lexem->beg < lexem->end ; lexem->beg++)
  {
    lexem->prev= lexem->beg;

    /* Delimiters are skipped by the loop increment. */
    if (lexem->beg[0] == ' '  || lexem->beg[0] == '\t' ||
        lexem->beg[0] == '\r' || lexem->beg[0] == '\n')
      continue;

    if (lexem->beg[0] == '&')
    {
      lexem->beg++;
      return MY_COLL_LEXEM_SHIFT;
    }

    if (lexem->beg[0] == '<')
    {
      /* Count up to three consecutive '<'; a fourth starts a new lexem. */
      for (lexem->beg++, lexem->diff= 1;
           (lexem->beg < lexem->end) &&
           (lexem->beg[0] == '<') && (lexem->diff < 3);
           lexem->beg++, lexem->diff++);
      return MY_COLL_LEXEM_DIFF;
    }

    if ((lexem->beg[0] >= 'a' && lexem->beg[0] <= 'z') ||
        (lexem->beg[0] >= 'A' && lexem->beg[0] <= 'Z'))
    {
      lexem->code= lexem->beg[0];
      lexem->beg++;
      return MY_COLL_LEXEM_CHAR;
    }

    if ((lexem->beg[0] == '\\') &&
        (lexem->beg + 2 < lexem->end) &&
        (lexem->beg[1] == 'u'))
    {
      int ch;
      const char *digits= lexem->beg + 2;

      lexem->code= 0;
      for (lexem->beg= digits;
           (lexem->beg < lexem->end) && ((ch= ch2x(lexem->beg[0])) >= 0);
           lexem->beg++)
      {
        lexem->code= (lexem->code << 4) + ch;
        /*
          Stop before the signed shift can overflow: nothing above
          the last Unicode code point is a valid character anyway.
        */
        if (lexem->code > 0x10FFFF)
          return MY_COLL_LEXEM_ERROR;
      }

      /* "\u" without any hex digit is not a character. */
      if (lexem->beg == digits)
        return MY_COLL_LEXEM_ERROR;

      return MY_COLL_LEXEM_CHAR;
    }

    return MY_COLL_LEXEM_ERROR;
  }
  return MY_COLL_LEXEM_EOF;
}
/*
  Collation rule item: one character together with the character
  it was reset to and the differences accumulated since that reset.
*/
typedef struct my_coll_rule_item_st
{
  uint base;    /* Base character */
  uint curr;    /* Current character */
  int diff[3];  /* Primary, Secondary and Tertiary difference */
} MY_COLL_RULE;
/*
Collation language syntax parser.
Uses lexical parser.
SYNOPSIS
my_coll_rule_parse
rule Collation rule list to load to.
str A string containin collation language expression.
strend End of the string.
USAGE
RETURN VALUES
0 - OK
1 - ERROR, e.g. too many items.
*/
static
int
my_coll_rule_parse
(
MY_COLL_RULE
*
rule
,
size_t
mitems
,
const
char
*
str
,
const
char
*
strend
,
char
*
errstr
,
size_t
errsize
)
{
MY_COLL_LEXEM
lexem
;
my_coll_lexem_num
lexnum
;
my_coll_lexem_num
prevlexnum
=
MY_COLL_LEXEM_ERROR
;
MY_COLL_RULE
item
;
int
state
=
0
;
size_t
nitems
=
0
;
/* Init all variables */
errstr
[
0
]
=
'\0'
;
bzero
(
&
item
,
sizeof
(
item
));
my_coll_lexem_init
(
&
lexem
,
str
,
strend
);
while
((
lexnum
=
my_coll_lexem_next
(
&
lexem
)))
{
if
(
lexnum
==
MY_COLL_LEXEM_ERROR
)
{
my_coll_lexem_print_error
(
&
lexem
,
errstr
,
errsize
-
1
,
"Unknown character"
);
return
-
1
;
}
switch
(
state
)
{
case
0
:
if
(
lexnum
!=
MY_COLL_LEXEM_SHIFT
)
{
my_coll_lexem_print_error
(
&
lexem
,
errstr
,
errsize
-
1
,
"& expected"
);
return
-
1
;
}
prevlexnum
=
lexnum
;
state
=
2
;
continue
;
case
1
:
if
(
lexnum
!=
MY_COLL_LEXEM_SHIFT
&&
lexnum
!=
MY_COLL_LEXEM_DIFF
)
{
my_coll_lexem_print_error
(
&
lexem
,
errstr
,
errsize
-
1
,
"& or < expected"
);
return
-
1
;
}
prevlexnum
=
lexnum
;
state
=
2
;
continue
;
case
2
:
if
(
lexnum
!=
MY_COLL_LEXEM_CHAR
)
{
my_coll_lexem_print_error
(
&
lexem
,
errstr
,
errsize
-
1
,
"character expected"
);
return
-
1
;
}
if
(
prevlexnum
==
MY_COLL_LEXEM_SHIFT
)
{
item
.
base
=
lexem
.
code
;
item
.
diff
[
0
]
=
0
;
item
.
diff
[
1
]
=
0
;
item
.
diff
[
2
]
=
0
;
}
else
if
(
prevlexnum
==
MY_COLL_LEXEM_DIFF
)
{
item
.
curr
=
lexem
.
code
;
if
(
lexem
.
diff
==
3
)
{
item
.
diff
[
2
]
++
;
}
else
if
(
lexem
.
diff
==
2
)
{
item
.
diff
[
1
]
++
;
item
.
diff
[
2
]
=
0
;
}
else
if
(
lexem
.
diff
==
1
)
{
item
.
diff
[
0
]
++
;
item
.
diff
[
1
]
=
0
;
item
.
diff
[
2
]
=
0
;
}
if
(
nitems
>=
mitems
)
{
my_coll_lexem_print_error
(
&
lexem
,
errstr
,
errsize
-
1
,
"Too many rules"
);
return
-
1
;
}
rule
[
nitems
++
]
=
item
;
}
else
{
my_coll_lexem_print_error
(
&
lexem
,
errstr
,
errsize
-
1
,
"Should never happen"
);
return
-
1
;
}
state
=
1
;
continue
;
}
}
return
(
size_t
)
nitems
;
}
typedef
struct
{
int
nchars
;
...
...
@@ -284,6 +622,144 @@ static int simple_cs_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
}
#ifdef HAVE_CHARSET_ucs2
#define MY_MAX_COLL_RULE 64
/*
  This function copies an UCS2 collation from
  the default Unicode Collation Algorithm (UCA)
  weights applying tailorings, i.e. a set of
  alternative weights for some characters.

  The default UCA weights are stored in my_charset_ucs2_general_uca.
  They consist of 256 pages, 256 characters each.

  If a page is not overwritten by tailoring rules,
  it is shared as is with the default UCA weights.

  If a page contains some overwritten characters, it is
  allocated. Untouched characters are copied from the
  default weights.

  The tailoring rules are read from from->sort_order, which is
  parsed as a collation customization expression.
  Only the primary difference of a rule is applied to the weights.

  RETURN VALUES
    0 - OK
    1 - ERROR: out of memory, a parse error, or no rules found.
*/
static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
{
  MY_COLL_RULE rule[MY_MAX_COLL_RULE];
  char errstr[128];
  uchar *newlengths;
  uint16 **newweights;
  const uchar *deflengths= my_charset_ucs2_general_uca.sort_order;
  uint16 **defweights= my_charset_ucs2_general_uca.sort_order_big;
  int nrules, idx;

  /* Identification: keep the target's number unless the source has one. */
  if (from->number)
    to->number= from->number;

  if (from->csname &&
      !(to->csname= my_once_strdup(from->csname, MYF(MY_WME))))
    return 1;

  if (from->name &&
      !(to->name= my_once_strdup(from->name, MYF(MY_WME))))
    return 1;

  if (from->comment &&
      !(to->comment= my_once_strdup(from->comment, MYF(MY_WME))))
    return 1;

  to->strxfrm_multiply= my_charset_ucs2_general_uca.strxfrm_multiply;
  to->min_sort_char= my_charset_ucs2_general_uca.min_sort_char;
  to->max_sort_char= my_charset_ucs2_general_uca.max_sort_char;
  to->mbminlen= 2;
  to->mbmaxlen= 2;

  /* Parse ICU Collation Customization expression */
  nrules= my_coll_rule_parse(rule, MY_MAX_COLL_RULE,
                             from->sort_order,
                             from->sort_order + strlen(from->sort_order),
                             errstr, sizeof(errstr));
  if (nrules <= 0)
  {
    /*
      TODO: add error message reporting.
      printf("Error: %d '%s'\n", rc, errstr);
    */
    return 1;
  }

  /* Page table of the new collation: all pages unset to begin with. */
  newweights= (uint16**) my_once_alloc(256 * sizeof(uint16*), MYF(MY_WME));
  if (!newweights)
    return 1;
  bzero(newweights, 256 * sizeof(uint16*));

  newlengths= (uchar*) my_once_memdup(deflengths, 256, MYF(MY_WME));
  if (!newlengths)
    return 1;

  /*
    Calculate maximum lengths for the pages
    which will be overwritten.
    This has to be complete before any page is allocated below.
  */
  for (idx= 0; idx < nrules; idx++)
  {
    uint pageb= (rule[idx].base >> 8) & 0xFF;
    uint pagec= (rule[idx].curr >> 8) & 0xFF;

    if (newlengths[pagec] < deflengths[pageb])
      newlengths[pagec]= deflengths[pageb];
  }

  for (idx= 0; idx < nrules; idx++)
  {
    uint pageb= (rule[idx].base >> 8) & 0xFF;
    uint pagec= (rule[idx].curr >> 8) & 0xFF;
    uint chb, chc;
    uint16 *dst;

    if (!newweights[pagec])
    {
      /* Alloc new page and copy the default UCA weights */
      uint size= 256 * newlengths[pagec] * sizeof(uint16);

      newweights[pagec]= (uint16*) my_once_alloc(size, MYF(MY_WME));
      if (!newweights[pagec])
        return 1;
      bzero((void*) newweights[pagec], size);

      /* The new page may be wider than the default one: copy per character. */
      for (chc= 0; chc < 256; chc++)
      {
        memcpy(newweights[pagec] + chc * newlengths[pagec],
               defweights[pagec] + chc * deflengths[pagec],
               deflengths[pagec] * sizeof(uint16));
      }
    }

    /*
      Apply the alternative rule:
      shift to the base character and primary difference.
    */
    chc= rule[idx].curr & 0xFF;
    chb= rule[idx].base & 0xFF;
    dst= newweights[pagec] + chc * newlengths[pagec];
    memcpy(dst,
           defweights[pageb] + chb * deflengths[pageb],
           deflengths[pageb] * sizeof(uint16));

    /* Apply primary difference */
    dst[0]+= rule[idx].diff[0];
  }

  /* Copy non-overwritten pages from the default UCA weights */
  for (idx= 0; idx < 256; idx++)
  {
    if (!newweights[idx])
      newweights[idx]= defweights[idx];
  }

  to->sort_order= newlengths;
  to->sort_order_big= newweights;

  return 0;
}
#endif
static
my_bool
simple_cs_is_full
(
CHARSET_INFO
*
cs
)
{
return
((
cs
->
csname
&&
cs
->
tab_to_uni
&&
cs
->
ctype
&&
cs
->
to_upper
&&
...
...
@@ -314,6 +790,19 @@ static int add_collation(CHARSET_INFO *cs)
all_charsets
[
cs
->
number
]
->
state
|=
cs
->
state
;
if
(
!
(
all_charsets
[
cs
->
number
]
->
state
&
MY_CS_COMPILED
))
{
if
(
!
strcmp
(
cs
->
csname
,
"ucs2"
)
)
{
#ifdef HAVE_CHARSET_ucs2
CHARSET_INFO
*
new
=
all_charsets
[
cs
->
number
];
new
->
cset
=
my_charset_ucs2_general_uca
.
cset
;
new
->
coll
=
my_charset_ucs2_general_uca
.
coll
;
if
(
ucs2_copy_data
(
new
,
cs
))
return
MY_XML_ERROR
;
new
->
state
|=
MY_CS_AVAILABLE
|
MY_CS_LOADED
;
#endif
}
else
{
simple_cs_init_functions
(
all_charsets
[
cs
->
number
]);
if
(
simple_cs_copy_data
(
all_charsets
[
cs
->
number
],
cs
))
...
...
@@ -324,6 +813,7 @@ static int add_collation(CHARSET_INFO *cs)
}
all_charsets
[
cs
->
number
]
->
state
|=
MY_CS_AVAILABLE
;
}
}
else
{
/*
...
...
strings/ctype.c
View file @
5a2b1ba6
...
...
@@ -22,6 +22,23 @@
#endif
/*
This file implements routines which parse XML-based
character set and collation description files.
Unicode collations are encoded according to
Unicode Technical Standard #35
Locale Data Markup Language (LDML)
http://www.unicode.org/reports/tr35/
and converted into ICU string according to
Collation Customization
http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
*/
static
char
*
mstr
(
char
*
str
,
const
char
*
src
,
uint
l1
,
uint
l2
)
{
...
...
@@ -54,6 +71,11 @@ struct my_cs_file_section_st
#define _CS_PRIMARY_ID 15
#define _CS_BINARY_ID 16
#define _CS_CSDESCRIPT 17
#define _CS_RESET 18
#define _CS_DIFF1 19
#define _CS_DIFF2 20
#define _CS_DIFF3 21
static
struct
my_cs_file_section_st
sec
[]
=
{
...
...
@@ -83,6 +105,10 @@ static struct my_cs_file_section_st sec[] =
{
_CS_ORDER
,
"charsets.charset.collation.order"
},
{
_CS_FLAG
,
"charsets.charset.collation.flag"
},
{
_CS_COLLMAP
,
"charsets.charset.collation.map"
},
{
_CS_RESET
,
"charsets.charset.collation.rules.reset"
},
{
_CS_DIFF1
,
"charsets.charset.collation.rules.p"
},
{
_CS_DIFF2
,
"charsets.charset.collation.rules.s"
},
{
_CS_DIFF3
,
"charsets.charset.collation.rules.t"
},
{
0
,
NULL
}
};
...
...
@@ -109,6 +135,7 @@ typedef struct my_cs_file_info
uchar
sort_order
[
MY_CS_SORT_ORDER_TABLE_SIZE
];
uint16
tab_to_uni
[
MY_CS_TO_UNI_TABLE_SIZE
];
char
comment
[
MY_CS_CSDESCR_SIZE
];
size_t
sort_order_length
;
CHARSET_INFO
cs
;
int
(
*
add_collation
)(
CHARSET_INFO
*
cs
);
}
MY_CHARSET_LOADER
;
...
...
@@ -156,9 +183,11 @@ static int cs_enter(MY_XML_PARSER *st,const char *attr, uint len)
struct
my_cs_file_section_st
*
s
=
cs_file_sec
(
attr
,
len
);
if
(
s
&&
(
s
->
state
==
_CS_CHARSET
))
{
bzero
(
&
i
->
cs
,
sizeof
(
i
->
cs
));
}
if
(
s
&&
(
s
->
state
==
_CS_COLLATION
))
i
->
sort_order_length
=
0
;
return
MY_XML_OK
;
}
...
...
@@ -242,6 +271,26 @@ static int cs_value(MY_XML_PARSER *st,const char *attr, uint len)
fill_uchar
(
i
->
ctype
,
MY_CS_CTYPE_TABLE_SIZE
,
attr
,
len
);
i
->
cs
.
ctype
=
i
->
ctype
;
break
;
case
_CS_RESET
:
case
_CS_DIFF1
:
case
_CS_DIFF2
:
case
_CS_DIFF3
:
{
/*
Convert collation description from
Locale Data Markup Language (LDML)
into ICU Collation Customization expression.
*/
char
arg
[
16
];
const
char
*
cmd
[]
=
{
"&"
,
"<"
,
"<<"
,
"<<<"
};
i
->
cs
.
sort_order
=
i
->
sort_order
;
mstr
(
arg
,
attr
,
len
,
sizeof
(
arg
)
-
1
);
if
(
i
->
sort_order_length
+
20
<
sizeof
(
i
->
sort_order
))
{
char
*
dst
=
i
->
sort_order_length
+
i
->
sort_order
;
i
->
sort_order_length
+=
sprintf
(
dst
,
" %s %s"
,
cmd
[
state
-
_CS_RESET
],
arg
);
}
}
}
return
MY_XML_OK
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment