New UTF8 charset

parent 1b54e7c1
......@@ -1872,7 +1872,7 @@ CHARSETS_AVAILABLE="armscii8 big5 cp1251 cp1257
latin1 latin1_de latin2 latin5 sjis swe7 tis620 ujis
usa7 utf8 win1250 win1251ukr"
CHARSETS_DEPRECATED="win1251"
CHARSETS_COMPLEX="big5 czech euc_kr gb2312 gbk latin1_de sjis tis620 ujis"
CHARSETS_COMPLEX="big5 czech euc_kr gb2312 gbk latin1_de sjis tis620 ujis utf8"
DEFAULT_CHARSET=latin1
AC_DIVERT_POP
......
......@@ -29,6 +29,22 @@ extern "C" {
#define CHARSET_DIR "charsets/"
#define my_wc_t ulong
/*
  One record of three 16-bit values: toupper, tolower and sort.
  NOTE(review): presumably the upper-case mapping, lower-case mapping and
  sort weight of a single character -- confirm against the tables that
  use this type.
*/
typedef struct unicase_info_st {
uint16 toupper;
uint16 tolower;
uint16 sort;
} MY_UNICASE_INFO;
/*
  Numeric result codes.
  NOTE(review): presumably returned by the byte-sequence <-> my_wc_t
  converters such as my_utf8_uni() and my_uni_utf8() -- confirm in
  ctype-utf8.c.

  MY_CS_ILSEQ and MY_CS_ILUNI are both 0, MY_CS_TOOSMALL is -1 and
  MY_CS_TOOFEW(n) is -1-n, so MY_CS_TOOFEW(0) has the same value as
  MY_CS_TOOSMALL.

  Negative expansions are fully parenthesized so that each macro stays a
  single operand wherever it is substituted into an expression.
*/
#define MY_CS_ILSEQ 0
#define MY_CS_ILUNI 0
#define MY_CS_TOOSMALL (-1)
#define MY_CS_TOOFEW(n) (-1-(n))
typedef struct charset_info_st
{
uint number;
......@@ -48,9 +64,9 @@ typedef struct charset_info_st
char *, char *, uint *, uint *);
uint mbmaxlen;
int (*ismbchar)(const char *, const char *);
my_bool (*ismbhead)(uint);
int (*mbcharlen)(uint);
int (*ismbchar)(struct charset_info_st *, const char *, const char *);
my_bool (*ismbhead)(struct charset_info_st *, uint);
int (*mbcharlen)(struct charset_info_st *, uint);
/* Functions for case conversion */
void (*caseup_str)(struct charset_info_st *, char *);
......@@ -107,9 +123,9 @@ extern int my_strnncoll_big5(CHARSET_INFO *,const uchar *, uint, const uchar
extern int my_strnxfrm_big5(CHARSET_INFO *,uchar *, uint, const uchar *, uint);
extern my_bool my_like_range_big5(CHARSET_INFO *,const char *, uint, pchar, uint,
char *, char *, uint *, uint *);
extern int ismbchar_big5(const char *, const char *);
extern my_bool ismbhead_big5(uint);
extern int mbcharlen_big5(uint);
extern int ismbchar_big5(CHARSET_INFO *, const char *, const char *);
extern my_bool ismbhead_big5(CHARSET_INFO *, uint);
extern int mbcharlen_big5(CHARSET_INFO *, uint);
#endif
#ifdef HAVE_CHARSET_czech
......@@ -125,17 +141,17 @@ extern my_bool my_like_range_czech(CHARSET_INFO *,
#ifdef HAVE_CHARSET_euc_kr
/* declarations for the euc_kr character set */
extern uchar ctype_euc_kr[], to_lower_euc_kr[], to_upper_euc_kr[], sort_order_euc_kr[];
extern int ismbchar_euc_kr(const char *, const char *);
extern my_bool ismbhead_euc_kr(uint);
extern int mbcharlen_euc_kr(uint);
extern int ismbchar_euc_kr(CHARSET_INFO *, const char *, const char *);
extern my_bool ismbhead_euc_kr(CHARSET_INFO *, uint);
extern int mbcharlen_euc_kr(CHARSET_INFO *, uint);
#endif
#ifdef HAVE_CHARSET_gb2312
/* declarations for the gb2312 character set */
extern uchar ctype_gb2312[], to_lower_gb2312[], to_upper_gb2312[], sort_order_gb2312[];
extern int ismbchar_gb2312(const char *, const char *);
extern my_bool ismbhead_gb2312(uint);
extern int mbcharlen_gb2312(uint);
extern int ismbchar_gb2312(CHARSET_INFO *, const char *, const char *);
extern my_bool ismbhead_gb2312(CHARSET_INFO *, uint);
extern int mbcharlen_gb2312(CHARSET_INFO *, uint);
#endif
#ifdef HAVE_CHARSET_gbk
......@@ -145,9 +161,9 @@ extern int my_strnncoll_gbk(CHARSET_INFO *, const uchar *, uint, const uchar
extern int my_strnxfrm_gbk(CHARSET_INFO *, uchar *, uint, const uchar *, uint);
extern my_bool my_like_range_gbk(CHARSET_INFO *, const char *, uint, pchar, uint,
char *, char *, uint *, uint *);
extern int ismbchar_gbk(const char *, const char *);
extern my_bool ismbhead_gbk(uint);
extern int mbcharlen_gbk(uint);
extern int ismbchar_gbk(CHARSET_INFO *, const char *, const char *);
extern my_bool ismbhead_gbk(CHARSET_INFO *, uint);
extern int mbcharlen_gbk(CHARSET_INFO *, uint);
#endif
#ifdef HAVE_CHARSET_latin1_de
......@@ -166,9 +182,9 @@ extern int my_strnncoll_sjis(CHARSET_INFO *, const uchar *, uint, const ucha
extern int my_strnxfrm_sjis(CHARSET_INFO *, uchar *, uint, const uchar *, uint);
extern my_bool my_like_range_sjis(CHARSET_INFO *, const char *, uint, pchar, uint,
char *, char *, uint *, uint *);
extern int ismbchar_sjis(const char *, const char *);
extern my_bool ismbhead_sjis(uint);
extern int mbcharlen_sjis(uint);
extern int ismbchar_sjis(CHARSET_INFO *, const char *, const char *);
extern my_bool ismbhead_sjis(CHARSET_INFO *, uint);
extern int mbcharlen_sjis(CHARSET_INFO *, uint);
#endif
#ifdef HAVE_CHARSET_tis620
......@@ -183,11 +199,38 @@ extern my_bool my_like_range_tis620(CHARSET_INFO *, const char *, uint, pchar, u
#ifdef HAVE_CHARSET_ujis
/* declarations for the ujis character set */
extern uchar ctype_ujis[], to_lower_ujis[], to_upper_ujis[], sort_order_ujis[];
extern int ismbchar_ujis(const char *, const char *);
extern my_bool ismbhead_ujis(uint);
extern int mbcharlen_ujis(uint);
extern int ismbchar_ujis(CHARSET_INFO *, const char *, const char *);
extern my_bool ismbhead_ujis(CHARSET_INFO *, uint);
extern int mbcharlen_ujis(CHARSET_INFO *, uint);
#endif
#ifdef HAVE_CHARSET_utf8
/* declarations for the utf8 character set */
extern uchar ctype_utf8[];
extern uchar to_lower_utf8[];
extern uchar to_upper_utf8[];
/* Comparison and sort-key transformation over length-delimited buffers */
int my_strnncoll_utf8(CHARSET_INFO *cs,
const uchar *s, uint s_len, const uchar *t, uint t_len);
int my_strnxfrm_utf8(CHARSET_INFO *cs,
uchar *dest, uint destlen, const uchar *src, uint srclen);
/* Multi-byte character tests; same signatures as the other charsets' ismbchar/ismbhead/mbcharlen */
int my_ismbchar_utf8(CHARSET_INFO *cs, const char *b, const char *e);
my_bool my_ismbhead_utf8(CHARSET_INFO * cs, uint ch);
int my_mbcharlen_utf8(CHARSET_INFO *cs, uint c);
/* Case conversion, on NUL-terminated (_str) and length-delimited buffers */
void my_caseup_str_utf8(CHARSET_INFO * cs, char * s);
void my_casedn_str_utf8(CHARSET_INFO *cs, char * s);
void my_caseup_utf8(CHARSET_INFO *cs, char *s, uint len);
void my_casedn_utf8(CHARSET_INFO *cs, char *s, uint len);
/* Case-insensitive comparison */
int my_strcasecmp_utf8(CHARSET_INFO *cs, const char *s, const char *t);
int my_strncasecmp_utf8(CHARSET_INFO *cs, const char *s,const char *t,uint l);
/*
  Converters between a byte range [s,e) / [b,e) and a single my_wc_t value.
  NOTE(review): presumably return a byte count on success or one of the
  MY_CS_* codes on failure -- confirm in ctype-utf8.c.
*/
int my_utf8_uni (CHARSET_INFO *cs, my_wc_t *p, const uchar *s, const uchar *e);
int my_uni_utf8 (CHARSET_INFO *cs, my_wc_t pwc , uchar *b, uchar *e);
#endif
#define _U 01 /* Upper case */
#define _L 02 /* Lower case */
......@@ -229,9 +272,9 @@ extern int mbcharlen_ujis(uint);
((s)->like_range((s), (a), (b), (c), (d), (e), (f), (g), (h)))
#define use_mb(s) ((s)->ismbchar != NULL)
#define my_ismbchar(s, a, b) ((s)->ismbchar((a), (b)))
#define my_ismbhead(s, a) ((s)->ismbhead((a)))
#define my_mbcharlen(s, a) ((s)->mbcharlen((a)))
#define my_ismbchar(s, a, b) ((s)->ismbchar((s), (a), (b)))
#define my_ismbhead(s, a) ((s)->ismbhead((s), (a)))
#define my_mbcharlen(s, a) ((s)->mbcharlen((s),(a)))
#define my_caseup(s, a, l) ((s)->caseup((s), (a), (l)))
#define my_casedn(s, a, l) ((s)->casedn((s), (a), (l)))
......
......@@ -41,7 +41,7 @@ mystringsobjects = strmov.lo strxmov.lo strxnmov.lo strnmov.lo \
ctype.lo ctype-simple.lo ctype-mb.lo \
ctype-big5.lo ctype-czech.lo ctype-euc_kr.lo \
ctype-gb2312.lo ctype-gbk.lo ctype-latin1_de.lo \
ctype-sjis.lo ctype-tis620.lo ctype-ujis.lo
ctype-sjis.lo ctype-tis620.lo ctype-ujis.lo ctype-utf8.lo
mystringsextra= strto.c
dbugobjects = dbug.lo # IT IS IN SAFEMALLOC.C sanity.lo
......
......@@ -57,7 +57,10 @@ void unireg_init(ulong options)
for (cs=compiled_charsets; cs->number; cs++)
{
uchar max_char=cs->sort_order[(uchar) cs->max_sort_char];
uchar max_char;
if (!cs->sort_order)
continue;
cs->sort_order[(uchar) cs->max_sort_char];
for (i = 0; i < 256; i++)
{
if ((uchar) cs->sort_order[i] > max_char)
......
......@@ -37,3 +37,4 @@ cp1257 29
latin5 30
latin1_de 31
armscii8 32
utf8 33
......@@ -22,19 +22,19 @@ pkglib_LIBRARIES = libmystrings.a
# Exact one of ASSEMBLER_X
if ASSEMBLER_x86
ASRCS = strings-x86.s longlong2str-x86.s
CSRCS = bfill.c bmove.c bmove512.c bchange.c strxnmov.c int2str.c str2int.c r_strinstr.c atof.c bcmp.c strtol.c strtoul.c strtoll.c strtoull.c llstr.c strnlen.c ctype.c ctype-simple.c ctype-mb.c ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-gb2312.c ctype-gbk.c ctype-latin1_de.c ctype-sjis.c ctype-tis620.c ctype-ujis.c
CSRCS = bfill.c bmove.c bmove512.c bchange.c strxnmov.c int2str.c str2int.c r_strinstr.c atof.c bcmp.c strtol.c strtoul.c strtoll.c strtoull.c llstr.c strnlen.c ctype.c ctype-simple.c ctype-mb.c ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-gb2312.c ctype-gbk.c ctype-latin1_de.c ctype-sjis.c ctype-tis620.c ctype-ujis.c ctype-utf8.c
else
if ASSEMBLER_sparc
# These files MUST all be on the same line!! Otherwise automake
# generates a very broken makefile
ASRCS = bmove_upp-sparc.s strappend-sparc.s strend-sparc.s strinstr-sparc.s strmake-sparc.s strmov-sparc.s strnmov-sparc.s strstr-sparc.s strxmov-sparc.s
CSRCS = strcont.c strfill.c strcend.c is_prefix.c longlong2str.c bfill.c bmove.c bmove512.c bchange.c strxnmov.c int2str.c str2int.c r_strinstr.c atof.c bcmp.c strtol.c strtoul.c strtoll.c strtoull.c llstr.c strnlen.c ctype.c ctype-simple.c ctype-mb.c ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-gb2312.c ctype-gbk.c ctype-latin1_de.c ctype-sjis.c ctype-tis620.c ctype-ujis.c
CSRCS = strcont.c strfill.c strcend.c is_prefix.c longlong2str.c bfill.c bmove.c bmove512.c bchange.c strxnmov.c int2str.c str2int.c r_strinstr.c atof.c bcmp.c strtol.c strtoul.c strtoll.c strtoull.c llstr.c strnlen.c ctype.c ctype-simple.c ctype-mb.c ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-gb2312.c ctype-gbk.c ctype-latin1_de.c ctype-sjis.c ctype-tis620.c ctype-ujis.c ctype-utf8.c
else
#no assembler
ASRCS =
# These files MUST all be on the same line!! Otherwise automake
# generates a very broken makefile
CSRCS = strxmov.c bmove_upp.c strappend.c strcont.c strend.c strfill.c strcend.c is_prefix.c strstr.c strinstr.c strmake.c strnmov.c strmov.c longlong2str.c bfill.c bmove.c bmove512.c bchange.c strxnmov.c int2str.c str2int.c r_strinstr.c atof.c bcmp.c strtol.c strtoul.c strtoll.c strtoull.c llstr.c strnlen.c ctype.c ctype-simple.c ctype-mb.c ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-gb2312.c ctype-gbk.c ctype-latin1_de.c ctype-sjis.c ctype-tis620.c ctype-ujis.c
CSRCS = strxmov.c bmove_upp.c strappend.c strcont.c strend.c strfill.c strcend.c is_prefix.c strstr.c strinstr.c strmake.c strnmov.c strmov.c longlong2str.c bfill.c bmove.c bmove512.c bchange.c strxnmov.c int2str.c str2int.c r_strinstr.c atof.c bcmp.c strtol.c strtoul.c strtoll.c strtoull.c llstr.c strnlen.c ctype.c ctype-simple.c ctype-mb.c ctype-big5.c ctype-czech.c ctype-euc_kr.c ctype-gb2312.c ctype-gbk.c ctype-latin1_de.c ctype-sjis.c ctype-tis620.c ctype-ujis.c ctype-utf8.c
endif
endif
......@@ -42,7 +42,7 @@ libmystrings_a_SOURCES = $(ASRCS) $(CSRCS)
noinst_PROGRAMS = conf_to_src
# Default charset definitions
EXTRA_DIST = ctype-big5.c ctype-czech.c ctype-euc_kr.c \
ctype-gb2312.c ctype-gbk.c ctype-sjis.c \
ctype-gb2312.c ctype-gbk.c ctype-sjis.c ctype-utf8.c \
ctype-tis620.c ctype-ujis.c ctype-latin1_de.c \
strto.c strings-x86.s longlong2str-x86.s \
strxmov.c bmove_upp.c strappend.c strcont.c strend.c \
......
......@@ -378,17 +378,17 @@ my_bool my_like_range_big5(CHARSET_INFO *cs,
return 0;
}
/*
  Test whether a big5 multi-byte character starts at p.
  Returns 2 when *p passes isbig5head(), more than one byte lies between
  p and e, and p[1] passes isbig5tail(); returns 0 otherwise.
  cs is accepted but not referenced in the body.
*/
int ismbchar_big5(const char* p, const char *e)
int ismbchar_big5(CHARSET_INFO *cs,const char* p, const char *e)
{
return (isbig5head(*(p)) && (e)-(p)>1 && isbig5tail(*((p)+1))? 2: 0);
}
/*
  Returns the result of isbig5head(c), i.e. whether c qualifies as the
  first byte of a big5 multi-byte character.
  cs is accepted but not referenced in the body.
*/
my_bool ismbhead_big5(uint c)
my_bool ismbhead_big5(CHARSET_INFO *cs, uint c)
{
return isbig5head(c);
}
/*
  Returns 2 when c passes isbig5head(), otherwise 0.
  cs is accepted but not referenced in the body.
*/
int mbcharlen_big5(uint c)
int mbcharlen_big5(CHARSET_INFO *cs, uint c)
{
return (isbig5head(c)? 2: 0);
}
......
......@@ -183,19 +183,19 @@ uchar NEAR sort_order_euc_kr[]=
#define iseuc_kr(c) ((0xa1<=(uchar)(c) && (uchar)(c)<=0xfe))
/*
  Test whether an euc_kr multi-byte character starts at p.
  Returns 0 immediately when the first byte is below 0x80.  Otherwise
  returns 2 when the first byte passes iseuc_kr(), more than one byte lies
  between p and e, and the second byte also passes iseuc_kr(); returns 0
  in every other case.
  cs is accepted but not referenced in the body.
*/
int ismbchar_euc_kr(const char* p, const char *e)
int ismbchar_euc_kr(CHARSET_INFO *cs,const char* p, const char *e)
{
return ((*(uchar*)(p)<0x80)? 0:\
iseuc_kr(*(p)) && (e)-(p)>1 && iseuc_kr(*((p)+1))? 2:\
0);
}
/*
  Returns the result of iseuc_kr(c), i.e. whether the low byte of c lies
  in the range 0xa1..0xfe.
  cs is accepted but not referenced in the body.
*/
my_bool ismbhead_euc_kr(uint c)
my_bool ismbhead_euc_kr(CHARSET_INFO *cs,uint c)
{
return (iseuc_kr(c));
}
/*
  Returns 2 when c passes iseuc_kr(), otherwise 0.
  cs is accepted but not referenced in the body.
*/
int mbcharlen_euc_kr(uint c)
int mbcharlen_euc_kr(CHARSET_INFO *cs,uint c)
{
return (iseuc_kr(c) ? 2 : 0);
}
......
......@@ -166,17 +166,17 @@ uchar NEAR sort_order_gb2312[]=
#define isgb2312tail(c) (0xa1<=(uchar)(c) && (uchar)(c)<=0xfe)
/*
  Test whether a gb2312 multi-byte character starts at p.
  Returns 2 when *p passes isgb2312head(), more than one byte lies between
  p and e, and p[1] passes isgb2312tail(); returns 0 otherwise.
  cs is accepted but not referenced in the body.
*/
int ismbchar_gb2312(const char* p, const char *e)
int ismbchar_gb2312(CHARSET_INFO *cs,const char* p, const char *e)
{
return (isgb2312head(*(p)) && (e)-(p)>1 && isgb2312tail(*((p)+1))? 2: 0);
}
/*
  Returns the result of isgb2312head(c), i.e. whether c qualifies as the
  first byte of a gb2312 multi-byte character.
  cs is accepted but not referenced in the body.
*/
my_bool ismbhead_gb2312(uint c)
my_bool ismbhead_gb2312(CHARSET_INFO *cs,uint c)
{
return isgb2312head(c);
}
/*
  Returns 2 when c passes isgb2312head(), otherwise 0.
  cs is accepted but not referenced in the body.
*/
int mbcharlen_gb2312(uint c)
int mbcharlen_gb2312(CHARSET_INFO *cs,uint c)
{
return (isgb2312head(c)? 2:0);
}
......
......@@ -2704,17 +2704,17 @@ extern my_bool my_like_range_gbk(CHARSET_INFO *cs,
}
/*
  Test whether a gbk multi-byte character starts at p.
  Returns 2 when *p passes isgbkhead(), more than one byte lies between
  p and e, and p[1] passes isgbktail(); returns 0 otherwise.
  cs is accepted but not referenced in the body.
*/
int ismbchar_gbk(const char* p, const char *e)
int ismbchar_gbk(CHARSET_INFO *cs,const char* p, const char *e)
{
return (isgbkhead(*(p)) && (e)-(p)>1 && isgbktail(*((p)+1))? 2: 0);
}
/*
  Returns the result of isgbkhead(c), i.e. whether c qualifies as the
  first byte of a gbk multi-byte character.
  cs is accepted but not referenced in the body.
*/
my_bool ismbhead_gbk(uint c)
my_bool ismbhead_gbk(CHARSET_INFO *cs,uint c)
{
return isgbkhead(c);
}
/*
  Returns 2 when c passes isgbkhead(), otherwise 0.
  cs is accepted but not referenced in the body.
*/
int mbcharlen_gbk(uint c)
int mbcharlen_gbk(CHARSET_INFO *cs,uint c)
{
return (isgbkhead(c)? 2:0);
}
......
......@@ -183,17 +183,17 @@ uchar NEAR sort_order_sjis[]=
(0x80<=(c) && (c)<=0xfc))
/*
  Test whether an sjis multi-byte character starts at p.
  Returns 2 when the first byte (taken as uchar) passes issjishead(), more
  than one byte lies between p and e, and the second byte passes
  issjistail(); returns 0 otherwise.
  cs is accepted but not referenced in the body.
*/
int ismbchar_sjis(const char* p, const char *e)
int ismbchar_sjis(CHARSET_INFO *cs,const char* p, const char *e)
{
return (issjishead((uchar) *p) && (e-p)>1 && issjistail((uchar)p[1]) ? 2: 0);
}
/*
  Returns the result of issjishead() applied to c truncated to uchar.
  cs is accepted but not referenced in the body.
*/
my_bool ismbhead_sjis(uint c)
my_bool ismbhead_sjis(CHARSET_INFO *cs,uint c)
{
return issjishead((uchar) c);
}
/*
  Returns 2 when c, truncated to uchar, passes issjishead(); otherwise 0.
  cs is accepted but not referenced in the body.
*/
int mbcharlen_sjis(uint c)
int mbcharlen_sjis(CHARSET_INFO *cs,uint c)
{
return (issjishead((uchar) c) ? 2: 0);
}
......@@ -208,8 +208,8 @@ int my_strnncoll_sjis(CHARSET_INFO *cs,
const uchar *e1 = s1 + len1;
const uchar *e2 = s2 + len2;
while (s1 < e1 && s2 < e2) {
if (ismbchar_sjis((char*) s1, (char*) e1) &&
ismbchar_sjis((char*) s2, (char*) e2)) {
if (ismbchar_sjis(cs,(char*) s1, (char*) e1) &&
ismbchar_sjis(cs,(char*) s2, (char*) e2)) {
uint c1 = sjiscode(*s1, *(s1+1));
uint c2 = sjiscode(*s2, *(s2+1));
if (c1 != c2)
......@@ -233,7 +233,7 @@ int my_strnxfrm_sjis(CHARSET_INFO *cs,
uchar *d_end = dest + len;
uchar *s_end = (uchar*) src + srclen;
while (dest < d_end && src < s_end) {
if (ismbchar_sjis((char*) src, (char*) s_end)) {
if (ismbchar_sjis(cs,(char*) src, (char*) s_end)) {
*dest++ = *src++;
if (dest < d_end && src < s_end)
*dest++ = *src++;
......@@ -275,7 +275,7 @@ my_bool my_like_range_sjis(CHARSET_INFO *cs,
char *min_end=min_str+res_length;
while (ptr < end && min_str < min_end) {
if (ismbchar_sjis(ptr, end)) {
if (ismbchar_sjis(cs, ptr, end)) {
*min_str++ = *max_str++ = *ptr++;
if (min_str < min_end)
*min_str++ = *max_str++ = *ptr++;
......@@ -283,7 +283,7 @@ my_bool my_like_range_sjis(CHARSET_INFO *cs,
}
if (*ptr == escape && ptr+1 < end) {
ptr++; /* Skip escape */
if (ismbchar_sjis(ptr, end))
if (ismbchar_sjis(cs, ptr, end))
*min_str++ = *max_str++ = *ptr++;
if (min_str < min_end)
*min_str++ = *max_str++ = *ptr++;
......
......@@ -183,7 +183,7 @@ uchar NEAR sort_order_ujis[]=
#define isujis_ss3(c) (((c)&0xff) == 0x8f)
int ismbchar_ujis(const char* p, const char *e)
int ismbchar_ujis(CHARSET_INFO *cs,const char* p, const char *e)
{
return ((*(uchar*)(p)<0x80)? 0:\
isujis(*(p)) && (e)-(p)>1 && isujis(*((p)+1))? 2:\
......@@ -192,12 +192,12 @@ int ismbchar_ujis(const char* p, const char *e)
0);
}
/*
  Returns true when c passes any of isujis(), isujis_ss2() or
  isujis_ss3() (the last matches a low byte of 0x8f).
  cs is accepted but not referenced in the body.
*/
my_bool ismbhead_ujis(uint c)
my_bool ismbhead_ujis(CHARSET_INFO *cs,uint c)
{
return (isujis(c) || isujis_ss2(c) || isujis_ss3(c));
}
/*
  Returns a length chosen by the first test that c passes, checked in
  this order: isujis() -> 2, isujis_ss2() -> 2, isujis_ss3() -> 3.
  Returns 0 when none of them matches.
  cs is accepted but not referenced in the body.
*/
int mbcharlen_ujis(uint c)
int mbcharlen_ujis(CHARSET_INFO *cs,uint c)
{
return (isujis(c)? 2: isujis_ss2(c)? 2: isujis_ss3(c)? 3: 0);
}
......
This diff is collapsed.
......@@ -2660,6 +2660,32 @@ CHARSET_INFO compiled_charsets[] = {
},
#endif
#ifdef HAVE_CHARSET_utf8
/*
  utf8 entry, charset number 33.  The sort_order slot reuses the
  to_upper_utf8 table, and like_range is left NULL.
*/
{
33, /* number */
"utf8", /* name */
ctype_utf8, /* ctype */
to_lower_utf8, /* to_lower */
to_upper_utf8, /* to_upper */
to_upper_utf8, /* sort_order */
1, /* strxfrm_multiply */
my_strnncoll_utf8, /* strnncoll */
my_strnxfrm_utf8, /* strnxfrm */
NULL, /* like_range */
6, /* mbmaxlen */
my_ismbchar_utf8, /* ismbchar */
my_ismbhead_utf8, /* ismbhead */
my_mbcharlen_utf8, /* mbcharlen */
/*
  NOTE(review): the slots below are unlabelled in the original; going by
  the function names they are presumably caseup_str, casedn_str, caseup,
  casedn, strcasecmp and strncasecmp, followed by one trailing member
  set to 0 -- confirm against the full charset_info_st definition.
*/
my_caseup_str_utf8,
my_casedn_str_utf8,
my_caseup_utf8,
my_casedn_utf8,
my_strcasecmp_utf8,
my_strncasecmp_utf8,
0
},
#endif
#ifdef HAVE_CHARSET_usa7
{
11, /* number */
......@@ -2790,6 +2816,7 @@ CHARSET_INFO compiled_charsets[] = {
},
#endif
{
0, /* end-of-list marker */
NullS,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment