Unicode collations: WL#916

XML and "collation customization" language parsers.

Unicode collations: WL#916
XML and "collation customization" language parsers.
5a2b1ba6 · unknown · 2a32bb2b · 5a2b1ba6 · 5a2b1ba6
Commit 5a2b1ba6 authored Jun 03, 2004 by unknown
Hide whitespace changes
Inline Side-by-side

Showing with 547 additions and 8 deletions

mysys/charset.c mysys/charset.c +496 -6

strings/ctype.c strings/ctype.c +51 -2

No files found.
--- a/mysys/charset.c
+++ b/mysys/charset.c
@@ -21,6 +21,344 @@
 #include <my_dir.h>
 #include <my_xml.h>

+
+/*
+  Collation language is implemented according to
+  subset of ICU Collation Customization (tailorings):
+  http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
+  
+  Collation language elements:
+  Delimiters:
+    space   - skipped
+  
+  <char> :=  A-Z | a-z | \uXXXX
+  
+  Shift command:
+    <shift>  := &       - reset at this letter. 
+  
+  Diff command:
+    <d1> :=  <     - Identifies a primary difference.
+    <d2> :=  <<    - Identifies a secondary difference.
+    <d3> := <<<    - Identifies a tertiary difference.
+  
+  
+  Collation rules:
+    <ruleset> :=  <rule>  { <ruleset> }
+    
+    <rule> :=   <d1>    <string>
+              | <d2>    <string>
+              | <d3>    <string>
+              | <shift> <char>
+    
+    <string> := <char> [ <string> ]
+
+  An example, Polish collation:
+  
+    &A < \u0105 <<< \u0104
+    &C < \u0107 <<< \u0106
+    &E < \u0119 <<< \u0118
+    &L < \u0142 <<< \u0141
+    &N < \u0144 <<< \u0143
+    &O < \u00F3 <<< \u00D3
+    &S < \u015B <<< \u015A
+    &Z < \u017A <<< \u017B    
+*/
+
+
+/* Lexem kinds returned by the collation rule lexical parser. */
+enum my_coll_lexem_num_en
+{
+  MY_COLL_LEXEM_EOF= 0,      /* end of the rule text reached       */
+  MY_COLL_LEXEM_DIFF= 1,     /* difference command: <, << or <<<   */
+  MY_COLL_LEXEM_SHIFT= 4,    /* shift (reset) command: &           */
+  MY_COLL_LEXEM_CHAR= 5,     /* a letter or a \uXXXX escape        */
+  MY_COLL_LEXEM_ERROR= 6     /* input that is none of the above    */
+};
+
+typedef enum my_coll_lexem_num_en my_coll_lexem_num;
+
+
+/*
+  Lexical parser state together with the attributes
+  of the most recently scanned lexem.
+*/
+struct my_coll_lexem_st
+{
+  const char *beg;   /* Current scan position                          */
+  const char *end;   /* End of the text being scanned                  */
+  const char *prev;  /* Where the last scan step started; this is the  */
+                     /* context shown in parse error messages          */
+  int diff;          /* Level (1..3) of the last difference command    */
+  int code;          /* Character code of the last character lexem     */
+};
+
+typedef struct my_coll_lexem_st MY_COLL_LEXEM;
+
+
+/*
+  Initialize the collation rule lexical analyzer
+
+  SYNOPSIS
+    my_coll_lexem_init
+    lexem                Lexical analyzer to initialize
+    str                  Start of the constant string to parse
+    strend               End of the string; scanning stops when
+                         the current position reaches it
+  USAGE
+    Call once before the first call to my_coll_lexem_next.
+
+  RETURN VALUES
+    N/A
+*/
+
+static void my_coll_lexem_init(MY_COLL_LEXEM *lexem,
+                               const char *str, const char *strend)
+{
+  /* No lexem has been scanned yet, so there are no attributes */
+  lexem->code= 0;
+  lexem->diff= 0;
+
+  /* Both the scan position and the error context start at str */
+  lexem->end= strend;
+  lexem->prev= lexem->beg= str;
+}
+
+
+/*
+  Print collation customization expression parse error, with context.
+
+  SYNOPSIS
+    my_coll_lexem_print_error
+    lexem                Lexical analyzer to take context from
+    errstr               String to write the error message to
+    errsize              errstr size
+    txt                  Error message
+  USAGE
+    The message has the form "<txt> at '<context>'", where the context
+    is at most 29 bytes of the rule text starting at lexem->prev.
+
+  RETURN VALUES
+    N/A
+*/
+
+static void my_coll_lexem_print_error(MY_COLL_LEXEM *lexem,
+                                      char *errstr, size_t errsize,
+                                      const char *txt)
+{
+  char context[30];
+  size_t remaining= lexem->end - lexem->prev;
+  size_t shown= sizeof(context) - 1;
+
+  /* Show no more text than is left, and no more than the buffer holds */
+  if (remaining < shown)
+    shown= remaining;
+
+  /* NOTE(review): strmake is presumed to NUL-terminate context - confirm */
+  strmake(context, lexem->prev, shown);
+
+  /* Terminate explicitly in case the formatted message gets truncated */
+  errstr[errsize-1]= '\0';
+  my_snprintf(errstr, errsize-1, "%s at '%s'", txt, context);
+}
+
+
+/*
+  Convert a hex digit into its numeric value
+
+  SYNOPSIS
+    ch2x
+    ch                   hex digit to convert: 0-9, a-f or A-F
+  USAGE
+    A negative result tells the caller that ch is not a hex digit,
+    e.g. that a sequence of hex digits has ended.
+
+  RETURN VALUES
+    an integer value in the range 0..15
+    -1 on error, i.e. when ch is not a hex digit
+*/
+
+static int ch2x(int ch)
+{
+  if (ch >= '0' && ch <= '9')
+    return ch - '0';
+
+  if (ch >= 'a' && ch <= 'f')
+    return 10 + ch - 'a';
+
+  /*
+    The upper bound must be 'F': with 'Z' the letters 'G'..'Z' would
+    be accepted and mapped to 16..35, outside the documented range.
+  */
+  if (ch >= 'A' && ch <= 'F')
+    return 10 + ch - 'A';
+
+  return -1;
+}
+
+
+/*
+  Collation language lexical parser:
+  Scans the next lexem.
+
+  SYNOPSIS
+    my_coll_lexem_next
+    lexem                Lexical analyzer, previously initialized by
+                         my_coll_lexem_init.
+  USAGE
+    Call this function in a loop.
+    After a diff lexem, lexem->diff holds its level (1..3).
+    After a char lexem, lexem->code holds the character code.
+    lexem->prev points to where the last scan step started.
+
+  RETURN VALUES
+    Lexem number: eof, diff, shift, char or error.
+*/
+
+static my_coll_lexem_num my_coll_lexem_next(MY_COLL_LEXEM *lexem)
+{
+  while (lexem->beg < lexem->end)
+  {
+    const char first= lexem->beg[0];
+
+    /* Remember where this step started, for error reporting */
+    lexem->prev= lexem->beg;
+
+    switch (first) {
+    case ' ':
+    case '\t':
+    case '\r':
+    case '\n':
+      /* Delimiters are skipped */
+      lexem->beg++;
+      continue;
+
+    case '&':
+      lexem->beg++;
+      return MY_COLL_LEXEM_SHIFT;
+
+    case '<':
+      /* Count up to three consecutive '<' to get the difference level */
+      lexem->diff= 1;
+      lexem->beg++;
+      while ((lexem->beg < lexem->end) &&
+             (lexem->beg[0] == '<') && (lexem->diff < 3))
+      {
+        lexem->beg++;
+        lexem->diff++;
+      }
+      return MY_COLL_LEXEM_DIFF;
+
+    case '\\':
+      /* Only "\u" with at least one more byte after it is an escape */
+      if ((lexem->beg + 2 < lexem->end) && (lexem->beg[1] == 'u'))
+      {
+        int digit;
+
+        /*
+          Accumulate hex digits for as long as they last;
+          the number of digits is not limited to four.
+        */
+        lexem->code= 0;
+        lexem->beg+= 2;
+        while ((lexem->beg < lexem->end) &&
+               ((digit= ch2x(lexem->beg[0])) >= 0))
+        {
+          lexem->code= (lexem->code << 4) + digit;
+          lexem->beg++;
+        }
+        return MY_COLL_LEXEM_CHAR;
+      }
+      return MY_COLL_LEXEM_ERROR;
+
+    default:
+      if ((first >= 'a' && first <= 'z') ||
+          (first >= 'A' && first <= 'Z'))
+      {
+        lexem->code= first;
+        lexem->beg++;
+        return MY_COLL_LEXEM_CHAR;
+      }
+      /* The scan position is left on the offending byte */
+      return MY_COLL_LEXEM_ERROR;
+    }
+  }
+  return MY_COLL_LEXEM_EOF;
+}
+
+
+/*
+  Collation rule item: one character placed
+  relative to a base character.
+*/
+
+struct my_coll_rule_item_st
+{
+  uint base;     /* Base character, set by the shift command      */
+  uint curr;     /* Current character, set by a diff command      */
+  int diff[3];   /* Primary, Secondary and Tertiary difference    */
+};
+
+typedef struct my_coll_rule_item_st MY_COLL_RULE;
+
+
+/*
+  Collation language syntax parser.
+  Uses lexical parser.
+
+  SYNOPSIS
+    my_coll_rule_parse
+    rule                 Collation rule list to load to.
+    mitems               Maximum number of items the rule list can hold.
+    str                  A string containing collation language expression.
+    strend               End of the string.
+    errstr               Buffer to write an error message to.
+    errsize              Size of the errstr buffer.
+  USAGE
+    Every "<", "<<" or "<<<" followed by a character stores one item
+    into the rule list. "&" followed by a character stores nothing:
+    it sets the base character and clears the differences for the
+    items that follow.
+
+    NOTE(review): errsize-1 is passed down to my_coll_lexem_print_error,
+    which subtracts one more, so errsize is presumably expected to be
+    at least 2 - confirm against callers.
+
+  RETURN VALUES
+    >= 0 - OK, the number of items stored into the rule list.
+    -1   - ERROR, e.g. too many items; errstr holds the message.
+*/
+
+static int my_coll_rule_parse(MY_COLL_RULE *rule, size_t mitems,
+                              const char *str, const char *strend,
+                              char *errstr, size_t errsize)
+{
+  MY_COLL_LEXEM lexem;
+  my_coll_lexem_num lexnum;
+  my_coll_lexem_num prevlexnum= MY_COLL_LEXEM_ERROR;
+  MY_COLL_RULE item; 
+  int state= 0;
+  size_t nitems= 0;
+  
+  /* Init all variables */
+  errstr[0]= '\0';
+  bzero(&item, sizeof(item));
+  my_coll_lexem_init(&lexem, str, strend);
+  
+  /* MY_COLL_LEXEM_EOF is 0, so the loop ends at the end of the text */
+  while ((lexnum= my_coll_lexem_next(&lexem)))
+  {
+    if (lexnum == MY_COLL_LEXEM_ERROR)
+    {
+      my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Unknown character");
+      return -1;
+    }
+    
+    switch (state) {
+    case 0:
+      /* Start of the expression: the first command must be a shift */
+      if (lexnum != MY_COLL_LEXEM_SHIFT)
+      {
+        my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& expected");
+        return -1;
+      }
+      prevlexnum= lexnum;
+      state= 2;
+      continue;
+      
+    case 1:
+      /* After a character: either command may follow */
+      if (lexnum != MY_COLL_LEXEM_SHIFT && lexnum != MY_COLL_LEXEM_DIFF)
+      {
+        my_coll_lexem_print_error(&lexem,errstr,errsize-1,"& or < expected");
+        return -1;
+      }
+      prevlexnum= lexnum;
+      state= 2;
+      continue;
+      
+    case 2:
+      /* After a command: its character argument must follow */
+      if (lexnum != MY_COLL_LEXEM_CHAR)
+      {
+        my_coll_lexem_print_error(&lexem,errstr,errsize-1,"character expected");
+        return -1;
+      }
+      
+      if (prevlexnum == MY_COLL_LEXEM_SHIFT)
+      {
+        /* New base character: differences are counted from scratch */
+        item.base= lexem.code;
+        item.diff[0]= 0;
+        item.diff[1]= 0;
+        item.diff[2]= 0;
+      }
+      else if (prevlexnum == MY_COLL_LEXEM_DIFF)
+      {
+        /*
+          A difference at some level increments that level
+          and clears all the weaker levels.
+        */
+        item.curr= lexem.code;
+        if (lexem.diff == 3)
+        {
+          item.diff[2]++;
+        }
+        else if (lexem.diff == 2)
+        {
+          item.diff[1]++;
+          item.diff[2]= 0;
+        }
+        else if (lexem.diff == 1)
+        {
+          item.diff[0]++;
+          item.diff[1]= 0;
+          item.diff[2]= 0;
+        }
+        if (nitems >= mitems)
+        {
+          my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Too many rules");
+          return -1;
+        }
+        rule[nitems++]= item;
+      }
+      else
+      {
+        my_coll_lexem_print_error(&lexem,errstr,errsize-1,"Should never happen");
+        return -1;
+      }
+      state= 1;
+      continue;
+    }
+  }
+  /* The function returns int; nitems never exceeds mitems */
+  return (int) nitems;
+}
+
+
 typedef struct
 {
  int		nchars;
@@ -284,6 +622,144 @@ err:
 }


+#ifdef HAVE_CHARSET_ucs2
+
+#define MY_MAX_COLL_RULE 64
+
+/*
+  This function builds an UCS2 collation from
+  the default Unicode Collation Algorithm (UCA)
+  weights applying tailorings, i.e. a set of
+  alternative weights for some characters. 
+  
+  The default UCA weights are stored in my_charset_ucs2_general_uca.
+  They consist of 256 pages, 256 characters each.
+  
+  If a page is not overwritten by tailoring rules,
+  the default UCA page is reused as is.
+  
+  If a page contains some overwritten characters, it is
+  allocated. Untouched characters are copied from the
+  default weights.
+  
+  SYNOPSIS
+    ucs2_copy_data
+    to                   Collation to fill in.
+    from                 Loaded collation description; from->sort_order
+                         holds the tailoring rules as a 0-terminated
+                         collation customization expression.
+  
+  RETURN VALUES
+    0 - OK
+    1 - ERROR: out of memory, no rules given,
+        or the rules could not be parsed.
+*/
+
+static int ucs2_copy_data(CHARSET_INFO *to, CHARSET_INFO *from)
+{
+  MY_COLL_RULE rule[MY_MAX_COLL_RULE];
+  char errstr[128];
+  uchar   *newlengths;
+  uint16 **newweights;
+  const uchar *deflengths= my_charset_ucs2_general_uca.sort_order;
+  uint16     **defweights= my_charset_ucs2_general_uca.sort_order_big;
+  const char *rules= (const char*) from->sort_order;
+  int rc, i;
+  
+  to->number= from->number ? from->number : to->number;
+  
+  if (from->csname)
+    if (!(to->csname= my_once_strdup(from->csname,MYF(MY_WME))))
+      goto err;
+  
+  if (from->name)
+    if (!(to->name= my_once_strdup(from->name,MYF(MY_WME))))
+      goto err;
+  
+  if (from->comment)
+    if (!(to->comment= my_once_strdup(from->comment,MYF(MY_WME))))
+      goto err;
+  
+  to->strxfrm_multiply= my_charset_ucs2_general_uca.strxfrm_multiply;
+  to->min_sort_char= my_charset_ucs2_general_uca.min_sort_char;
+  to->max_sort_char= my_charset_ucs2_general_uca.max_sort_char;
+  to->mbminlen= 2;
+  to->mbmaxlen= 2;
+  
+  
+  /*
+    A description without rules leaves sort_order unset.
+    Fail here instead of passing a NULL pointer to strlen().
+  */
+  if (!rules)
+    goto err;
+  
+  /* Parse ICU Collation Customization expression */
+  if ((rc= my_coll_rule_parse(rule, MY_MAX_COLL_RULE,
+                              rules,
+                              rules + strlen(rules),
+                              errstr, sizeof(errstr))) <= 0)
+  {
+    /* 
+      TODO: add error message reporting.
+      printf("Error: %d '%s'\n", rc, errstr);
+    */
+    return 1;
+  }
+  
+  
+  if (!(newweights= (uint16**) my_once_alloc(256*sizeof(uint16*),MYF(MY_WME))))
+    goto err;
+  bzero(newweights, 256*sizeof(uint16*));
+  
+  if (!(newlengths= (uchar*) my_once_memdup(deflengths,256,MYF(MY_WME))))
+    goto err;
+  
+  /*
+    Calculate maximum lengths for the pages
+    which will be overwritten: a page must have room
+    for the weights of any base character copied into it.
+  */
+  for (i=0; i < rc; i++)
+  {
+    uint pageb= (rule[i].base >> 8) & 0xFF;
+    uint pagec= (rule[i].curr >> 8) & 0xFF;
+    
+    if (newlengths[pagec] < deflengths[pageb])
+      newlengths[pagec]= deflengths[pageb];
+  }
+  
+  for (i=0; i < rc;  i++)
+  {
+    uint pageb= (rule[i].base >> 8) & 0xFF;
+    uint pagec= (rule[i].curr >> 8) & 0xFF;
+    uint chb, chc;
+    
+    if (!newweights[pagec])
+    {
+      /* Alloc new page and copy the default UCA weights */
+      uint size= 256*newlengths[pagec]*sizeof(uint16);
+      
+      if (!(newweights[pagec]= (uint16*) my_once_alloc(size,MYF(MY_WME))))
+        goto err;
+      bzero((void*) newweights[pagec], size);
+      
+      /*
+        The new page may be wider than the default one,
+        so the characters are copied one by one.
+      */
+      for (chc=0 ; chc < 256; chc++)
+      {
+        memcpy(newweights[pagec] + chc*newlengths[pagec],
+               defweights[pagec] + chc*deflengths[pagec],
+               deflengths[pagec]*sizeof(uint16));
+      }
+    }
+    
+    /* 
+      Apply the alternative rule:
+      shift to the base character and primary difference.
+    */
+    chc= rule[i].curr & 0xFF;
+    chb= rule[i].base & 0xFF;
+    memcpy(newweights[pagec] + chc*newlengths[pagec],
+           defweights[pageb] + chb*deflengths[pageb],
+           deflengths[pageb]*sizeof(uint16));
+    /*
+      Apply primary difference. Secondary and tertiary
+      differences (diff[1], diff[2]) are not applied here.
+    */
+    newweights[pagec][chc*newlengths[pagec]]+= rule[i].diff[0];
+  }
+  
+  /* Reuse the default UCA pages for all non-overwritten pages */
+  for (i= 0; i < 256 ; i++)
+    if (!newweights[i])
+      newweights[i]= defweights[i];
+  
+  to->sort_order= newlengths;
+  to->sort_order_big= newweights;
+  
+  return 0;
+  
+err:
+  return 1;
+}
+#endif
+
+
 static my_bool simple_cs_is_full(CHARSET_INFO *cs)
 {
  return ((cs->csname && cs->tab_to_uni && cs->ctype && cs->to_upper &&
@@ -315,14 +791,28 @@ static int add_collation(CHARSET_INFO *cs)
    
    if (!(all_charsets[cs->number]->state & MY_CS_COMPILED))
    {
-      simple_cs_init_functions(all_charsets[cs->number]);
-      if (simple_cs_copy_data(all_charsets[cs->number],cs))
-	return MY_XML_ERROR;
-      if (simple_cs_is_full(all_charsets[cs->number]))
+      if (!strcmp(cs->csname,"ucs2") )
      {
-        all_charsets[cs->number]->state |= MY_CS_LOADED;
+#ifdef HAVE_CHARSET_ucs2
+        CHARSET_INFO *new= all_charsets[cs->number];
+        new->cset= my_charset_ucs2_general_uca.cset;
+        new->coll= my_charset_ucs2_general_uca.coll;
+        if (ucs2_copy_data(new, cs))
+          return MY_XML_ERROR;
+        new->state |= MY_CS_AVAILABLE | MY_CS_LOADED;
+#endif        
+      }
+      else
+      {
+        simple_cs_init_functions(all_charsets[cs->number]);
+        if (simple_cs_copy_data(all_charsets[cs->number],cs))
+	  return MY_XML_ERROR;
+        if (simple_cs_is_full(all_charsets[cs->number]))
+        {
+          all_charsets[cs->number]->state |= MY_CS_LOADED;
+        }
+        all_charsets[cs->number]->state|= MY_CS_AVAILABLE;
      }
-      all_charsets[cs->number]->state|= MY_CS_AVAILABLE;
    }
    else
    {

--- a/strings/ctype.c
+++ b/strings/ctype.c
@@ -22,6 +22,23 @@
 #endif


+/*
+
+  This file implements routines which parse XML based
+  character set and collation description files.
+  
+  Unicode collations are encoded according to
+  
+    Unicode Technical Standard #35
+    Locale Data Markup Language (LDML)
+    http://www.unicode.org/reports/tr35/
+  
+  and converted into ICU string according to
+  
+    Collation Customization
+    http://oss.software.ibm.com/icu/userguide/Collate_Customization.html
+  
+*/

 static char *mstr(char *str,const char *src,uint l1,uint l2)
 {
@@ -54,6 +71,11 @@ struct my_cs_file_section_st
 #define _CS_PRIMARY_ID	15
 #define _CS_BINARY_ID	16
 #define _CS_CSDESCRIPT	17
+#define _CS_RESET	18
+#define	_CS_DIFF1	19
+#define	_CS_DIFF2	20
+#define	_CS_DIFF3	21
+

 static struct my_cs_file_section_st sec[] =
 {
@@ -83,6 +105,10 @@ static struct my_cs_file_section_st sec[] =
  {_CS_ORDER,		"charsets.charset.collation.order"},
  {_CS_FLAG,		"charsets.charset.collation.flag"},
  {_CS_COLLMAP,		"charsets.charset.collation.map"},
+  {_CS_RESET,		"charsets.charset.collation.rules.reset"},
+  {_CS_DIFF1,		"charsets.charset.collation.rules.p"},
+  {_CS_DIFF2,		"charsets.charset.collation.rules.s"},
+  {_CS_DIFF3,		"charsets.charset.collation.rules.t"},
  {0,	NULL}
 };

@@ -109,6 +135,7 @@ typedef struct my_cs_file_info
  uchar  sort_order[MY_CS_SORT_ORDER_TABLE_SIZE];
  uint16 tab_to_uni[MY_CS_TO_UNI_TABLE_SIZE];
  char   comment[MY_CS_CSDESCR_SIZE];
+  size_t sort_order_length;
  CHARSET_INFO cs;
  int (*add_collation)(CHARSET_INFO *cs);
 } MY_CHARSET_LOADER;
@@ -156,9 +183,11 @@ static int cs_enter(MY_XML_PARSER *st,const char *attr, uint len)
  struct my_cs_file_section_st *s= cs_file_sec(attr,len);
  
  if ( s && (s->state == _CS_CHARSET))
-  {
    bzero(&i->cs,sizeof(i->cs));
-  }
+  
+  if (s && (s->state == _CS_COLLATION))
+    i->sort_order_length= 0;
+
  return MY_XML_OK;
 }

@@ -242,6 +271,26 @@ static int cs_value(MY_XML_PARSER *st,const char *attr, uint len)
    fill_uchar(i->ctype,MY_CS_CTYPE_TABLE_SIZE,attr,len);
    i->cs.ctype=i->ctype;
    break;
+  case _CS_RESET:
+  case _CS_DIFF1:
+  case _CS_DIFF2:
+  case _CS_DIFF3:
+    {
+      /*
+        Convert collation description from
+        Locale Data Markup Language (LDML)
+        into ICU Collation Customization expression.
+      */
+      char arg[16];
+      const char *cmd[]= {"&","<","<<","<<<"};
+      i->cs.sort_order= i->sort_order;
+      mstr(arg,attr,len,sizeof(arg)-1);
+      if (i->sort_order_length + 20 < sizeof(i->sort_order))
+      {
+        char *dst= i->sort_order_length + i->sort_order;
+        i->sort_order_length+= sprintf(dst," %s %s",cmd[state-_CS_RESET],arg);
+      }
+    }
  }
  return MY_XML_OK;
 }