Commit a8384c68 authored by Olaf Weber's avatar Olaf Weber Committed by Theodore Ts'o

unicode: reduce the size of utf8data[]

Remove the Hangul decompositions from the utf8data trie, and do
algorithmic decomposition to calculate them on the fly. To store the
decomposition the caller of utf8lookup()/utf8nlookup() must provide a
12-byte buffer, which is used to synthesize a leaf with the
decomposition. This significantly reduces the size of the utf8data[]
array.

Changes made by Gabriel:
  Rebase to mainline
  Fix checkpatch errors
  Extract robustness fixes and merge back to original mkutf8data.c patch
  Regenerate utf8data.h
Signed-off-by: default avatarOlaf Weber <olaf@sgi.com>
Signed-off-by: default avatarGabriel Krisman Bertazi <krisman@collabora.co.uk>
Signed-off-by: default avatarTheodore Ts'o <tytso@mit.edu>
parent 44594c2f
...@@ -46,8 +46,8 @@ cd to this directory (fs/unicode) and run this command: ...@@ -46,8 +46,8 @@ cd to this directory (fs/unicode) and run this command:
make C=../.. objdir=../.. utf8data.h.new make C=../.. objdir=../.. utf8data.h.new
After sanity checking the newly generated utf8data.h.new file (the After sanity checking the newly generated utf8data.h.new file (the
version generated from the 11.0.0 UCD should be 13,834 lines long, and version generated from the 11.0.0 UCD should be 4,061 lines long, and
have a total size of 1104k) and/or comparing it with the older version have a total size of 320k) and/or comparing it with the older version
of utf8data.h, rename it to utf8data.h. of utf8data.h, rename it to utf8data.h.
If you are a kernel developer updating to a newer version of the If you are a kernel developer updating to a newer version of the
......
...@@ -98,6 +98,38 @@ static inline int utf8clen(const char *s) ...@@ -98,6 +98,38 @@ static inline int utf8clen(const char *s)
return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
} }
/*
* Decode a 3-byte UTF-8 sequence.
*/
static unsigned int
utf8decode3(const char *str)
{
unsigned int uc;
uc = *str++ & 0x0F;
uc <<= 6;
uc |= *str++ & 0x3F;
uc <<= 6;
uc |= *str++ & 0x3F;
return uc;
}
/*
* Encode a 3-byte UTF-8 sequence.
*/
static int
utf8encode3(char *str, unsigned int val)
{
str[2] = (val & 0x3F) | 0x80;
val >>= 6;
str[1] = (val & 0x3F) | 0x80;
val >>= 6;
str[0] = val | 0xE0;
return 3;
}
/* /*
* utf8trie_t * utf8trie_t
* *
...@@ -159,7 +191,8 @@ typedef const unsigned char utf8trie_t; ...@@ -159,7 +191,8 @@ typedef const unsigned char utf8trie_t;
* characters with the Default_Ignorable_Code_Point property. * characters with the Default_Ignorable_Code_Point property.
* These do affect normalization, as they all have CCC 0. * These do affect normalization, as they all have CCC 0.
* *
* The decompositions in the trie have been fully expanded. * The decompositions in the trie have been fully expanded, with the
* exception of Hangul syllables, which are decomposed algorithmically.
* *
* Casefolding, if applicable, is also done using decompositions. * Casefolding, if applicable, is also done using decompositions.
* *
...@@ -179,6 +212,105 @@ typedef const unsigned char utf8leaf_t; ...@@ -179,6 +212,105 @@ typedef const unsigned char utf8leaf_t;
#define STOPPER (0) #define STOPPER (0)
#define DECOMPOSE (255) #define DECOMPOSE (255)
/* Marker for hangul syllable decomposition. */
#define HANGUL ((char)(255))
/* Size of the synthesized leaf used for Hangul syllable decomposition. */
#define UTF8HANGULLEAF (12)
/*
* Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
*
* AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
* D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
*
* SBase = 0xAC00
* LBase = 0x1100
* VBase = 0x1161
* TBase = 0x11A7
* LCount = 19
* VCount = 21
* TCount = 28
* NCount = 588 (VCount * TCount)
* SCount = 11172 (LCount * NCount)
*
* Decomposition:
* SIndex = s - SBase
*
* LV (Canonical/Full)
* LIndex = SIndex / NCount
* VIndex = (Sindex % NCount) / TCount
* LPart = LBase + LIndex
* VPart = VBase + VIndex
*
* LVT (Canonical)
* LVIndex = (SIndex / TCount) * TCount
* TIndex = (Sindex % TCount)
* LVPart = SBase + LVIndex
* TPart = TBase + TIndex
*
* LVT (Full)
* LIndex = SIndex / NCount
* VIndex = (Sindex % NCount) / TCount
* TIndex = (Sindex % TCount)
* LPart = LBase + LIndex
* VPart = VBase + VIndex
* if (TIndex == 0) {
* d = <LPart, VPart>
* } else {
* TPart = TBase + TIndex
* d = <LPart, TPart, VPart>
* }
*/
/* Constants */
#define SB (0xAC00)
#define LB (0x1100)
#define VB (0x1161)
#define TB (0x11A7)
#define LC (19)
#define VC (21)
#define TC (28)
#define NC (VC * TC)
#define SC (LC * NC)
/* Algorithmic decomposition of hangul syllable. */
static utf8leaf_t *
utf8hangul(const char *str, unsigned char *hangul)
{
unsigned int si;
unsigned int li;
unsigned int vi;
unsigned int ti;
unsigned char *h;
/* Calculate the SI, LI, VI, and TI values. */
si = utf8decode3(str) - SB;
li = si / NC;
vi = (si % NC) / TC;
ti = si % TC;
/* Fill in base of leaf. */
h = hangul;
LEAF_GEN(h) = 2;
LEAF_CCC(h) = DECOMPOSE;
h += 2;
/* Add LPart, a 3-byte UTF-8 sequence. */
h += utf8encode3((char *)h, li + LB);
/* Add VPart, a 3-byte UTF-8 sequence. */
h += utf8encode3((char *)h, vi + VB);
/* Add TPart if required, also a 3-byte UTF-8 sequence. */
if (ti)
h += utf8encode3((char *)h, ti + TB);
/* Terminate string. */
h[0] = '\0';
return hangul;
}
/* /*
* Use trie to scan s, touching at most len bytes. * Use trie to scan s, touching at most len bytes.
* Returns the leaf if one exists, NULL otherwise. * Returns the leaf if one exists, NULL otherwise.
...@@ -187,8 +319,8 @@ typedef const unsigned char utf8leaf_t; ...@@ -187,8 +319,8 @@ typedef const unsigned char utf8leaf_t;
* is well-formed and corresponds to a known unicode code point. The * is well-formed and corresponds to a known unicode code point. The
* shorthand for this will be "is valid UTF-8 unicode". * shorthand for this will be "is valid UTF-8 unicode".
*/ */
static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s, static utf8leaf_t *utf8nlookup(const struct utf8data *data,
size_t len) unsigned char *hangul, const char *s, size_t len)
{ {
utf8trie_t *trie = NULL; utf8trie_t *trie = NULL;
int offlen; int offlen;
...@@ -228,8 +360,7 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s, ...@@ -228,8 +360,7 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s,
trie++; trie++;
} else { } else {
/* No right node. */ /* No right node. */
node = 0; return NULL;
trie = NULL;
} }
} else { } else {
/* Left leg */ /* Left leg */
...@@ -239,8 +370,7 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s, ...@@ -239,8 +370,7 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s,
trie += offlen + 1; trie += offlen + 1;
} else if (*trie & RIGHTPATH) { } else if (*trie & RIGHTPATH) {
/* No left node. */ /* No left node. */
node = 0; return NULL;
trie = NULL;
} else { } else {
/* Left node after this node */ /* Left node after this node */
node = (*trie & TRIENODE); node = (*trie & TRIENODE);
...@@ -248,6 +378,14 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s, ...@@ -248,6 +378,14 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s,
} }
} }
} }
/*
* Hangul decomposition is done algorithmically. These are the
* codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
* always 3 bytes long, so s has been advanced twice, and the
* start of the sequence is at s-2.
*/
if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
trie = utf8hangul(s - 2, hangul);
return trie; return trie;
} }
...@@ -257,9 +395,10 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s, ...@@ -257,9 +395,10 @@ static utf8leaf_t *utf8nlookup(const struct utf8data *data, const char *s,
* *
* Forwards to utf8nlookup(). * Forwards to utf8nlookup().
*/ */
static utf8leaf_t *utf8lookup(const struct utf8data *data, const char *s) static utf8leaf_t *utf8lookup(const struct utf8data *data,
unsigned char *hangul, const char *s)
{ {
return utf8nlookup(data, s, (size_t)-1); return utf8nlookup(data, hangul, s, (size_t)-1);
} }
/* /*
...@@ -272,11 +411,13 @@ int utf8agemax(const struct utf8data *data, const char *s) ...@@ -272,11 +411,13 @@ int utf8agemax(const struct utf8data *data, const char *s)
utf8leaf_t *leaf; utf8leaf_t *leaf;
int age = 0; int age = 0;
int leaf_age; int leaf_age;
unsigned char hangul[UTF8HANGULLEAF];
if (!data) if (!data)
return -1; return -1;
while (*s) { while (*s) {
leaf = utf8lookup(data, s); leaf = utf8lookup(data, hangul, s);
if (!leaf) if (!leaf)
return -1; return -1;
...@@ -299,12 +440,13 @@ int utf8agemin(const struct utf8data *data, const char *s) ...@@ -299,12 +440,13 @@ int utf8agemin(const struct utf8data *data, const char *s)
utf8leaf_t *leaf; utf8leaf_t *leaf;
int age; int age;
int leaf_age; int leaf_age;
unsigned char hangul[UTF8HANGULLEAF];
if (!data) if (!data)
return -1; return -1;
age = data->maxage; age = data->maxage;
while (*s) { while (*s) {
leaf = utf8lookup(data, s); leaf = utf8lookup(data, hangul, s);
if (!leaf) if (!leaf)
return -1; return -1;
leaf_age = utf8agetab[LEAF_GEN(leaf)]; leaf_age = utf8agetab[LEAF_GEN(leaf)];
...@@ -325,11 +467,13 @@ int utf8nagemax(const struct utf8data *data, const char *s, size_t len) ...@@ -325,11 +467,13 @@ int utf8nagemax(const struct utf8data *data, const char *s, size_t len)
utf8leaf_t *leaf; utf8leaf_t *leaf;
int age = 0; int age = 0;
int leaf_age; int leaf_age;
unsigned char hangul[UTF8HANGULLEAF];
if (!data) if (!data)
return -1; return -1;
while (len && *s) { while (len && *s) {
leaf = utf8nlookup(data, s, len); leaf = utf8nlookup(data, hangul, s, len);
if (!leaf) if (!leaf)
return -1; return -1;
leaf_age = utf8agetab[LEAF_GEN(leaf)]; leaf_age = utf8agetab[LEAF_GEN(leaf)];
...@@ -351,12 +495,13 @@ int utf8nagemin(const struct utf8data *data, const char *s, size_t len) ...@@ -351,12 +495,13 @@ int utf8nagemin(const struct utf8data *data, const char *s, size_t len)
utf8leaf_t *leaf; utf8leaf_t *leaf;
int leaf_age; int leaf_age;
int age; int age;
unsigned char hangul[UTF8HANGULLEAF];
if (!data) if (!data)
return -1; return -1;
age = data->maxage; age = data->maxage;
while (len && *s) { while (len && *s) {
leaf = utf8nlookup(data, s, len); leaf = utf8nlookup(data, hangul, s, len);
if (!leaf) if (!leaf)
return -1; return -1;
leaf_age = utf8agetab[LEAF_GEN(leaf)]; leaf_age = utf8agetab[LEAF_GEN(leaf)];
...@@ -379,11 +524,12 @@ ssize_t utf8len(const struct utf8data *data, const char *s) ...@@ -379,11 +524,12 @@ ssize_t utf8len(const struct utf8data *data, const char *s)
{ {
utf8leaf_t *leaf; utf8leaf_t *leaf;
size_t ret = 0; size_t ret = 0;
unsigned char hangul[UTF8HANGULLEAF];
if (!data) if (!data)
return -1; return -1;
while (*s) { while (*s) {
leaf = utf8lookup(data, s); leaf = utf8lookup(data, hangul, s);
if (!leaf) if (!leaf)
return -1; return -1;
if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
...@@ -406,11 +552,12 @@ ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len) ...@@ -406,11 +552,12 @@ ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len)
{ {
utf8leaf_t *leaf; utf8leaf_t *leaf;
size_t ret = 0; size_t ret = 0;
unsigned char hangul[UTF8HANGULLEAF];
if (!data) if (!data)
return -1; return -1;
while (len && *s) { while (len && *s) {
leaf = utf8nlookup(data, s, len); leaf = utf8nlookup(data, hangul, s, len);
if (!leaf) if (!leaf)
return -1; return -1;
if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) if (utf8agetab[LEAF_GEN(leaf)] > data->maxage)
...@@ -533,10 +680,12 @@ int utf8byte(struct utf8cursor *u8c) ...@@ -533,10 +680,12 @@ int utf8byte(struct utf8cursor *u8c)
} }
/* Look up the data for the current character. */ /* Look up the data for the current character. */
if (u8c->p) if (u8c->p) {
leaf = utf8lookup(u8c->data, u8c->s); leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
else } else {
leaf = utf8nlookup(u8c->data, u8c->s, u8c->len); leaf = utf8nlookup(u8c->data, u8c->hangul,
u8c->s, u8c->len);
}
/* No leaf found implies that the input is a binary blob. */ /* No leaf found implies that the input is a binary blob. */
if (!leaf) if (!leaf)
...@@ -557,7 +706,9 @@ int utf8byte(struct utf8cursor *u8c) ...@@ -557,7 +706,9 @@ int utf8byte(struct utf8cursor *u8c)
ccc = STOPPER; ccc = STOPPER;
goto ccc_mismatch; goto ccc_mismatch;
} }
leaf = utf8lookup(u8c->data, u8c->s);
leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s);
ccc = LEAF_CCC(leaf);
} }
/* /*
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -76,6 +76,9 @@ extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len); ...@@ -76,6 +76,9 @@ extern int utf8nagemin(const struct utf8data *data, const char *s, size_t len);
extern ssize_t utf8len(const struct utf8data *data, const char *s); extern ssize_t utf8len(const struct utf8data *data, const char *s);
extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len); extern ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len);
/* Needed in struct utf8cursor below. */
#define UTF8HANGULLEAF (12)
/* /*
* Cursor structure used by the normalizer. * Cursor structure used by the normalizer.
*/ */
...@@ -89,6 +92,7 @@ struct utf8cursor { ...@@ -89,6 +92,7 @@ struct utf8cursor {
unsigned int slen; unsigned int slen;
short int ccc; short int ccc;
short int nccc; short int nccc;
unsigned char hangul[UTF8HANGULLEAF];
}; };
/* /*
......
...@@ -182,10 +182,14 @@ typedef unsigned char utf8leaf_t; ...@@ -182,10 +182,14 @@ typedef unsigned char utf8leaf_t;
#define MAXCCC (254) #define MAXCCC (254)
#define STOPPER (0) #define STOPPER (0)
#define DECOMPOSE (255) #define DECOMPOSE (255)
#define HANGUL ((char)(255))
#define UTF8HANGULLEAF (12)
struct tree; struct tree;
static utf8leaf_t *utf8nlookup(struct tree *, const char *, size_t); static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
static utf8leaf_t *utf8lookup(struct tree *, const char *); const char *, size_t);
static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
unsigned char *utf8data; unsigned char *utf8data;
size_t utf8data_size; size_t utf8data_size;
...@@ -333,6 +337,8 @@ static int utf32valid(unsigned int unichar) ...@@ -333,6 +337,8 @@ static int utf32valid(unsigned int unichar)
return unichar < 0x110000; return unichar < 0x110000;
} }
#define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
#define NODE 1 #define NODE 1
#define LEAF 0 #define LEAF 0
...@@ -463,7 +469,7 @@ static void tree_walk(struct tree *tree) ...@@ -463,7 +469,7 @@ static void tree_walk(struct tree *tree)
indent+1); indent+1);
leaves += 1; leaves += 1;
} else if (node->right) { } else if (node->right) {
assert(node->rightnode==NODE); assert(node->rightnode == NODE);
indent += 1; indent += 1;
node = node->right; node = node->right;
break; break;
...@@ -857,7 +863,7 @@ static void mark_nodes(struct tree *tree) ...@@ -857,7 +863,7 @@ static void mark_nodes(struct tree *tree)
} }
} }
} else if (node->right) { } else if (node->right) {
assert(node->rightnode==NODE); assert(node->rightnode == NODE);
node = node->right; node = node->right;
continue; continue;
} }
...@@ -909,7 +915,7 @@ static void mark_nodes(struct tree *tree) ...@@ -909,7 +915,7 @@ static void mark_nodes(struct tree *tree)
} }
} }
} else if (node->right) { } else if (node->right) {
assert(node->rightnode==NODE); assert(node->rightnode == NODE);
node = node->right; node = node->right;
if (!node->mark && node->parent->mark && if (!node->mark && node->parent->mark &&
!node->parent->left) { !node->parent->left) {
...@@ -992,7 +998,7 @@ static int index_nodes(struct tree *tree, int index) ...@@ -992,7 +998,7 @@ static int index_nodes(struct tree *tree, int index)
index += tree->leaf_size(node->right); index += tree->leaf_size(node->right);
count++; count++;
} else if (node->right) { } else if (node->right) {
assert(node->rightnode==NODE); assert(node->rightnode == NODE);
indent += 1; indent += 1;
node = node->right; node = node->right;
break; break;
...@@ -1013,6 +1019,25 @@ static int index_nodes(struct tree *tree, int index) ...@@ -1013,6 +1019,25 @@ static int index_nodes(struct tree *tree, int index)
return index; return index;
} }
/*
* Mark the nodes in a subtree, helper for size_nodes().
*/
static int mark_subtree(struct node *node)
{
int changed;
if (!node || node->mark)
return 0;
node->mark = 1;
node->index = node->parent->index;
changed = 1;
if (node->leftnode == NODE)
changed += mark_subtree(node->left);
if (node->rightnode == NODE)
changed += mark_subtree(node->right);
return changed;
}
/* /*
* Compute the size of nodes and leaves. We start by assuming that * Compute the size of nodes and leaves. We start by assuming that
* each node needs to store a three-byte offset. The indexes of the * each node needs to store a three-byte offset. The indexes of the
...@@ -1031,6 +1056,7 @@ static int size_nodes(struct tree *tree) ...@@ -1031,6 +1056,7 @@ static int size_nodes(struct tree *tree)
unsigned int bitmask; unsigned int bitmask;
unsigned int pathbits; unsigned int pathbits;
unsigned int pathmask; unsigned int pathmask;
unsigned int nbit;
int changed; int changed;
int offset; int offset;
int size; int size;
...@@ -1058,22 +1084,40 @@ static int size_nodes(struct tree *tree) ...@@ -1058,22 +1084,40 @@ static int size_nodes(struct tree *tree)
size = 1; size = 1;
} else { } else {
if (node->rightnode == NODE) { if (node->rightnode == NODE) {
/*
* If the right node is not marked,
* look for a corresponding node in
* the next tree. Such a node need
* not exist.
*/
right = node->right; right = node->right;
next = tree->next; next = tree->next;
while (!right->mark) { while (!right->mark) {
assert(next); assert(next);
n = next->root; n = next->root;
while (n->bitnum != node->bitnum) { while (n->bitnum != node->bitnum) {
if (pathbits & (1<<n->bitnum)) nbit = 1 << n->bitnum;
if (!(pathmask & nbit))
break;
if (pathbits & nbit) {
if (n->rightnode == LEAF)
break;
n = n->right; n = n->right;
else } else {
if (n->leftnode == LEAF)
break;
n = n->left; n = n->left;
} }
}
if (n->bitnum != node->bitnum)
break;
n = n->right; n = n->right;
assert(right->bitnum == n->bitnum);
right = n; right = n;
next = next->next; next = next->next;
} }
/* Make sure the right node is marked. */
if (!right->mark)
changed += mark_subtree(right);
offset = right->index - node->index; offset = right->index - node->index;
} else { } else {
offset = *tree->leaf_index(tree, node->right); offset = *tree->leaf_index(tree, node->right);
...@@ -1115,7 +1159,7 @@ static int size_nodes(struct tree *tree) ...@@ -1115,7 +1159,7 @@ static int size_nodes(struct tree *tree)
if (node->rightnode == LEAF) { if (node->rightnode == LEAF) {
assert(node->right); assert(node->right);
} else if (node->right) { } else if (node->right) {
assert(node->rightnode==NODE); assert(node->rightnode == NODE);
indent += 1; indent += 1;
node = node->right; node = node->right;
break; break;
...@@ -1148,8 +1192,15 @@ static void emit(struct tree *tree, unsigned char *data) ...@@ -1148,8 +1192,15 @@ static void emit(struct tree *tree, unsigned char *data)
int offset; int offset;
int index; int index;
int indent; int indent;
int size;
int bytes;
int leaves;
int nodes[4];
unsigned char byte; unsigned char byte;
nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
leaves = 0;
bytes = 0;
index = tree->index; index = tree->index;
data += index; data += index;
indent = 1; indent = 1;
...@@ -1158,7 +1209,10 @@ static void emit(struct tree *tree, unsigned char *data) ...@@ -1158,7 +1209,10 @@ static void emit(struct tree *tree, unsigned char *data)
if (tree->childnode == LEAF) { if (tree->childnode == LEAF) {
assert(tree->root); assert(tree->root);
tree->leaf_emit(tree->root, data); tree->leaf_emit(tree->root, data);
return; size = tree->leaf_size(tree->root);
index += size;
leaves++;
goto done;
} }
assert(tree->childnode == NODE); assert(tree->childnode == NODE);
...@@ -1185,6 +1239,7 @@ static void emit(struct tree *tree, unsigned char *data) ...@@ -1185,6 +1239,7 @@ static void emit(struct tree *tree, unsigned char *data)
offlen = 2; offlen = 2;
else else
offlen = 3; offlen = 3;
nodes[offlen]++;
offset = node->offset; offset = node->offset;
byte |= offlen << OFFLEN_SHIFT; byte |= offlen << OFFLEN_SHIFT;
*data++ = byte; *data++ = byte;
...@@ -1197,12 +1252,14 @@ static void emit(struct tree *tree, unsigned char *data) ...@@ -1197,12 +1252,14 @@ static void emit(struct tree *tree, unsigned char *data)
} else if (node->left) { } else if (node->left) {
if (node->leftnode == NODE) if (node->leftnode == NODE)
byte |= TRIENODE; byte |= TRIENODE;
nodes[0]++;
*data++ = byte; *data++ = byte;
index++; index++;
} else if (node->right) { } else if (node->right) {
byte |= RIGHTNODE; byte |= RIGHTNODE;
if (node->rightnode == NODE) if (node->rightnode == NODE)
byte |= TRIENODE; byte |= TRIENODE;
nodes[0]++;
*data++ = byte; *data++ = byte;
index++; index++;
} else { } else {
...@@ -1217,7 +1274,10 @@ static void emit(struct tree *tree, unsigned char *data) ...@@ -1217,7 +1274,10 @@ static void emit(struct tree *tree, unsigned char *data)
assert(node->left); assert(node->left);
data = tree->leaf_emit(node->left, data = tree->leaf_emit(node->left,
data); data);
index += tree->leaf_size(node->left); size = tree->leaf_size(node->left);
index += size;
bytes += size;
leaves++;
} else if (node->left) { } else if (node->left) {
assert(node->leftnode == NODE); assert(node->leftnode == NODE);
indent += 1; indent += 1;
...@@ -1231,9 +1291,12 @@ static void emit(struct tree *tree, unsigned char *data) ...@@ -1231,9 +1291,12 @@ static void emit(struct tree *tree, unsigned char *data)
assert(node->right); assert(node->right);
data = tree->leaf_emit(node->right, data = tree->leaf_emit(node->right,
data); data);
index += tree->leaf_size(node->right); size = tree->leaf_size(node->right);
index += size;
bytes += size;
leaves++;
} else if (node->right) { } else if (node->right) {
assert(node->rightnode==NODE); assert(node->rightnode == NODE);
indent += 1; indent += 1;
node = node->right; node = node->right;
break; break;
...@@ -1245,6 +1308,15 @@ static void emit(struct tree *tree, unsigned char *data) ...@@ -1245,6 +1308,15 @@ static void emit(struct tree *tree, unsigned char *data)
indent -= 1; indent -= 1;
} }
} }
done:
if (verbose > 0) {
printf("Emitted %d (%d) leaves",
leaves, bytes);
printf(" %d (%d+%d+%d+%d) nodes",
nodes[0] + nodes[1] + nodes[2] + nodes[3],
nodes[0], nodes[1], nodes[2], nodes[3]);
printf(" %d total\n", index - tree->index);
}
} }
/* ------------------------------------------------------------------ */ /* ------------------------------------------------------------------ */
...@@ -1346,8 +1418,12 @@ static void nfdi_print(void *l, int indent) ...@@ -1346,8 +1418,12 @@ static void nfdi_print(void *l, int indent)
printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
leaf->code, leaf->ccc, leaf->gen); leaf->code, leaf->ccc, leaf->gen);
if (leaf->utf8nfdi)
if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
else if (leaf->utf8nfdi)
printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
printf("\n"); printf("\n");
} }
...@@ -1357,8 +1433,11 @@ static void nfdicf_print(void *l, int indent) ...@@ -1357,8 +1433,11 @@ static void nfdicf_print(void *l, int indent)
printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
leaf->code, leaf->ccc, leaf->gen); leaf->code, leaf->ccc, leaf->gen);
if (leaf->utf8nfdicf) if (leaf->utf8nfdicf)
printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf); printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
else if (leaf->utf8nfdi) else if (leaf->utf8nfdi)
printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
printf("\n"); printf("\n");
...@@ -1388,9 +1467,11 @@ static int correction_mark(void *l) ...@@ -1388,9 +1467,11 @@ static int correction_mark(void *l)
static int nfdi_size(void *l) static int nfdi_size(void *l)
{ {
struct unicode_data *leaf = l; struct unicode_data *leaf = l;
int size = 2; int size = 2;
if (leaf->utf8nfdi)
if (HANGUL_SYLLABLE(leaf->code))
size += 1;
else if (leaf->utf8nfdi)
size += strlen(leaf->utf8nfdi) + 1; size += strlen(leaf->utf8nfdi) + 1;
return size; return size;
} }
...@@ -1398,9 +1479,11 @@ static int nfdi_size(void *l) ...@@ -1398,9 +1479,11 @@ static int nfdi_size(void *l)
static int nfdicf_size(void *l) static int nfdicf_size(void *l)
{ {
struct unicode_data *leaf = l; struct unicode_data *leaf = l;
int size = 2; int size = 2;
if (leaf->utf8nfdicf)
if (HANGUL_SYLLABLE(leaf->code))
size += 1;
else if (leaf->utf8nfdicf)
size += strlen(leaf->utf8nfdicf) + 1; size += strlen(leaf->utf8nfdicf) + 1;
else if (leaf->utf8nfdi) else if (leaf->utf8nfdi)
size += strlen(leaf->utf8nfdi) + 1; size += strlen(leaf->utf8nfdi) + 1;
...@@ -1427,7 +1510,11 @@ static unsigned char *nfdi_emit(void *l, unsigned char *data) ...@@ -1427,7 +1510,11 @@ static unsigned char *nfdi_emit(void *l, unsigned char *data)
unsigned char *s; unsigned char *s;
*data++ = leaf->gen; *data++ = leaf->gen;
if (leaf->utf8nfdi) {
if (HANGUL_SYLLABLE(leaf->code)) {
*data++ = DECOMPOSE;
*data++ = HANGUL;
} else if (leaf->utf8nfdi) {
*data++ = DECOMPOSE; *data++ = DECOMPOSE;
s = (unsigned char*)leaf->utf8nfdi; s = (unsigned char*)leaf->utf8nfdi;
while ((*data++ = *s++) != 0) while ((*data++ = *s++) != 0)
...@@ -1444,7 +1531,11 @@ static unsigned char *nfdicf_emit(void *l, unsigned char *data) ...@@ -1444,7 +1531,11 @@ static unsigned char *nfdicf_emit(void *l, unsigned char *data)
unsigned char *s; unsigned char *s;
*data++ = leaf->gen; *data++ = leaf->gen;
if (leaf->utf8nfdicf) {
if (HANGUL_SYLLABLE(leaf->code)) {
*data++ = DECOMPOSE;
*data++ = HANGUL;
} else if (leaf->utf8nfdicf) {
*data++ = DECOMPOSE; *data++ = DECOMPOSE;
s = (unsigned char*)leaf->utf8nfdicf; s = (unsigned char*)leaf->utf8nfdicf;
while ((*data++ = *s++) != 0) while ((*data++ = *s++) != 0)
...@@ -1467,6 +1558,11 @@ static void utf8_create(struct unicode_data *data) ...@@ -1467,6 +1558,11 @@ static void utf8_create(struct unicode_data *data)
unsigned int *um; unsigned int *um;
int i; int i;
if (data->utf8nfdi) {
assert(data->utf8nfdi[0] == HANGUL);
return;
}
u = utf; u = utf;
um = data->utf32nfdi; um = data->utf32nfdi;
if (um) { if (um) {
...@@ -1652,6 +1748,7 @@ static void verify(struct tree *tree) ...@@ -1652,6 +1748,7 @@ static void verify(struct tree *tree)
utf8leaf_t *leaf; utf8leaf_t *leaf;
unsigned int unichar; unsigned int unichar;
char key[4]; char key[4];
unsigned char hangul[UTF8HANGULLEAF];
int report; int report;
int nocf; int nocf;
...@@ -1665,7 +1762,8 @@ static void verify(struct tree *tree) ...@@ -1665,7 +1762,8 @@ static void verify(struct tree *tree)
if (data->correction <= tree->maxage) if (data->correction <= tree->maxage)
data = &unicode_data[unichar]; data = &unicode_data[unichar];
utf8encode(key,unichar); utf8encode(key,unichar);
leaf = utf8lookup(tree, key); leaf = utf8lookup(tree, hangul, key);
if (!leaf) { if (!leaf) {
if (data->gen != -1) if (data->gen != -1)
report++; report++;
...@@ -1679,7 +1777,10 @@ static void verify(struct tree *tree) ...@@ -1679,7 +1777,10 @@ static void verify(struct tree *tree)
if (data->gen != LEAF_GEN(leaf)) if (data->gen != LEAF_GEN(leaf))
report++; report++;
if (LEAF_CCC(leaf) == DECOMPOSE) { if (LEAF_CCC(leaf) == DECOMPOSE) {
if (nocf) { if (HANGUL_SYLLABLE(data->code)) {
if (data->utf8nfdi[0] != HANGUL)
report++;
} else if (nocf) {
if (!data->utf8nfdi) { if (!data->utf8nfdi) {
report++; report++;
} else if (strcmp(data->utf8nfdi, } else if (strcmp(data->utf8nfdi,
...@@ -2323,8 +2424,7 @@ static void corrections_init(void) ...@@ -2323,8 +2424,7 @@ static void corrections_init(void)
* *
*/ */
static void static void hangul_decompose(void)
hangul_decompose(void)
{ {
unsigned int sb = 0xAC00; unsigned int sb = 0xAC00;
unsigned int lb = 0x1100; unsigned int lb = 0x1100;
...@@ -2368,6 +2468,15 @@ hangul_decompose(void) ...@@ -2368,6 +2468,15 @@ hangul_decompose(void)
memcpy(um, mapping, i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdicf = um; unicode_data[unichar].utf32nfdicf = um;
/*
* Add a cookie as a reminder that the hangul syllable
* decompositions must not be stored in the generated
* trie.
*/
unicode_data[unichar].utf8nfdi = malloc(2);
unicode_data[unichar].utf8nfdi[0] = HANGUL;
unicode_data[unichar].utf8nfdi[1] = '\0';
if (verbose > 1) if (verbose > 1)
print_utf32nfdi(unichar); print_utf32nfdi(unichar);
...@@ -2493,6 +2602,99 @@ int utf8cursor(struct utf8cursor *, struct tree *, const char *); ...@@ -2493,6 +2602,99 @@ int utf8cursor(struct utf8cursor *, struct tree *, const char *);
int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t); int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
int utf8byte(struct utf8cursor *); int utf8byte(struct utf8cursor *);
/*
* Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
*
* AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
* D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
*
* SBase = 0xAC00
* LBase = 0x1100
* VBase = 0x1161
* TBase = 0x11A7
* LCount = 19
* VCount = 21
* TCount = 28
* NCount = 588 (VCount * TCount)
* SCount = 11172 (LCount * NCount)
*
* Decomposition:
* SIndex = s - SBase
*
* LV (Canonical/Full)
* LIndex = SIndex / NCount
* VIndex = (Sindex % NCount) / TCount
* LPart = LBase + LIndex
* VPart = VBase + VIndex
*
* LVT (Canonical)
* LVIndex = (SIndex / TCount) * TCount
* TIndex = (Sindex % TCount)
* LVPart = SBase + LVIndex
* TPart = TBase + TIndex
*
* LVT (Full)
* LIndex = SIndex / NCount
* VIndex = (Sindex % NCount) / TCount
* TIndex = (Sindex % TCount)
* LPart = LBase + LIndex
* VPart = VBase + VIndex
* if (TIndex == 0) {
* d = <LPart, VPart>
* } else {
* TPart = TBase + TIndex
* d = <LPart, VPart, TPart>
* }
*/
/* Constants */
#define SB (0xAC00)
#define LB (0x1100)
#define VB (0x1161)
#define TB (0x11A7)
#define LC (19)
#define VC (21)
#define TC (28)
#define NC (VC * TC)
#define SC (LC * NC)
/* Algorithmic decomposition of hangul syllable. */
static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
{
unsigned int si;
unsigned int li;
unsigned int vi;
unsigned int ti;
unsigned char *h;
/* Calculate the SI, LI, VI, and TI values. */
si = utf8decode(str) - SB;
li = si / NC;
vi = (si % NC) / TC;
ti = si % TC;
/* Fill in base of leaf. */
h = hangul;
LEAF_GEN(h) = 2;
LEAF_CCC(h) = DECOMPOSE;
h += 2;
/* Add LPart, a 3-byte UTF-8 sequence. */
h += utf8encode((char *)h, li + LB);
/* Add VPart, a 3-byte UTF-8 sequence. */
h += utf8encode((char *)h, vi + VB);
/* Add TPart if required, also a 3-byte UTF-8 sequence. */
if (ti)
h += utf8encode((char *)h, ti + TB);
/* Terminate string. */
h[0] = '\0';
return hangul;
}
/* /*
* Use trie to scan s, touching at most len bytes. * Use trie to scan s, touching at most len bytes.
* Returns the leaf if one exists, NULL otherwise. * Returns the leaf if one exists, NULL otherwise.
...@@ -2501,7 +2703,8 @@ int utf8byte(struct utf8cursor *); ...@@ -2501,7 +2703,8 @@ int utf8byte(struct utf8cursor *);
* is well-formed and corresponds to a known unicode code point. The * is well-formed and corresponds to a known unicode code point. The
* shorthand for this will be "is valid UTF-8 unicode". * shorthand for this will be "is valid UTF-8 unicode".
*/ */
static utf8leaf_t *utf8nlookup(struct tree *tree, const char *s, size_t len) static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
const char *s, size_t len)
{ {
utf8trie_t *trie; utf8trie_t *trie;
int offlen; int offlen;
...@@ -2558,6 +2761,14 @@ static utf8leaf_t *utf8nlookup(struct tree *tree, const char *s, size_t len) ...@@ -2558,6 +2761,14 @@ static utf8leaf_t *utf8nlookup(struct tree *tree, const char *s, size_t len)
} }
} }
} }
/*
* Hangul decomposition is done algorithmically. These are the
* codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
* always 3 bytes long, so s has been advanced twice, and the
* start of the sequence is at s-2.
*/
if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
trie = utf8hangul(s - 2, hangul);
return trie; return trie;
} }
...@@ -2567,9 +2778,10 @@ static utf8leaf_t *utf8nlookup(struct tree *tree, const char *s, size_t len) ...@@ -2567,9 +2778,10 @@ static utf8leaf_t *utf8nlookup(struct tree *tree, const char *s, size_t len)
* *
* Forwards to trie_nlookup(). * Forwards to trie_nlookup().
*/ */
static utf8leaf_t *utf8lookup(struct tree *tree, const char *s) static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
const char *s)
{ {
return utf8nlookup(tree, s, (size_t)-1); return utf8nlookup(tree, hangul, s, (size_t)-1);
} }
/* /*
...@@ -2593,11 +2805,14 @@ int utf8agemax(struct tree *tree, const char *s) ...@@ -2593,11 +2805,14 @@ int utf8agemax(struct tree *tree, const char *s)
utf8leaf_t *leaf; utf8leaf_t *leaf;
int age = 0; int age = 0;
int leaf_age; int leaf_age;
unsigned char hangul[UTF8HANGULLEAF];
if (!tree) if (!tree)
return -1; return -1;
while (*s) { while (*s) {
if (!(leaf = utf8lookup(tree, s))) leaf = utf8lookup(tree, hangul, s);
if (!leaf)
return -1; return -1;
leaf_age = ages[LEAF_GEN(leaf)]; leaf_age = ages[LEAF_GEN(leaf)];
if (leaf_age <= tree->maxage && leaf_age > age) if (leaf_age <= tree->maxage && leaf_age > age)
...@@ -2617,12 +2832,14 @@ int utf8agemin(struct tree *tree, const char *s) ...@@ -2617,12 +2832,14 @@ int utf8agemin(struct tree *tree, const char *s)
utf8leaf_t *leaf; utf8leaf_t *leaf;
int age; int age;
int leaf_age; int leaf_age;
unsigned char hangul[UTF8HANGULLEAF];
if (!tree) if (!tree)
return -1; return -1;
age = tree->maxage; age = tree->maxage;
while (*s) { while (*s) {
if (!(leaf = utf8lookup(tree, s))) leaf = utf8lookup(tree, hangul, s);
if (!leaf)
return -1; return -1;
leaf_age = ages[LEAF_GEN(leaf)]; leaf_age = ages[LEAF_GEN(leaf)];
if (leaf_age <= tree->maxage && leaf_age < age) if (leaf_age <= tree->maxage && leaf_age < age)
...@@ -2641,11 +2858,14 @@ int utf8nagemax(struct tree *tree, const char *s, size_t len) ...@@ -2641,11 +2858,14 @@ int utf8nagemax(struct tree *tree, const char *s, size_t len)
utf8leaf_t *leaf; utf8leaf_t *leaf;
int age = 0; int age = 0;
int leaf_age; int leaf_age;
unsigned char hangul[UTF8HANGULLEAF];
if (!tree) if (!tree)
return -1; return -1;
while (len && *s) { while (len && *s) {
if (!(leaf = utf8nlookup(tree, s, len))) leaf = utf8nlookup(tree, hangul, s, len);
if (!leaf)
return -1; return -1;
leaf_age = ages[LEAF_GEN(leaf)]; leaf_age = ages[LEAF_GEN(leaf)];
if (leaf_age <= tree->maxage && leaf_age > age) if (leaf_age <= tree->maxage && leaf_age > age)
...@@ -2665,12 +2885,14 @@ int utf8nagemin(struct tree *tree, const char *s, size_t len) ...@@ -2665,12 +2885,14 @@ int utf8nagemin(struct tree *tree, const char *s, size_t len)
utf8leaf_t *leaf; utf8leaf_t *leaf;
int leaf_age; int leaf_age;
int age; int age;
unsigned char hangul[UTF8HANGULLEAF];
if (!tree) if (!tree)
return -1; return -1;
age = tree->maxage; age = tree->maxage;
while (len && *s) { while (len && *s) {
if (!(leaf = utf8nlookup(tree, s, len))) leaf = utf8nlookup(tree, hangul, s, len);
if (!leaf)
return -1; return -1;
leaf_age = ages[LEAF_GEN(leaf)]; leaf_age = ages[LEAF_GEN(leaf)];
if (leaf_age <= tree->maxage && leaf_age < age) if (leaf_age <= tree->maxage && leaf_age < age)
...@@ -2691,11 +2913,13 @@ ssize_t utf8len(struct tree *tree, const char *s) ...@@ -2691,11 +2913,13 @@ ssize_t utf8len(struct tree *tree, const char *s)
{ {
utf8leaf_t *leaf; utf8leaf_t *leaf;
size_t ret = 0; size_t ret = 0;
unsigned char hangul[UTF8HANGULLEAF];
if (!tree) if (!tree)
return -1; return -1;
while (*s) { while (*s) {
if (!(leaf = utf8lookup(tree, s))) leaf = utf8lookup(tree, hangul, s);
if (!leaf)
return -1; return -1;
if (ages[LEAF_GEN(leaf)] > tree->maxage) if (ages[LEAF_GEN(leaf)] > tree->maxage)
ret += utf8clen(s); ret += utf8clen(s);
...@@ -2716,11 +2940,13 @@ ssize_t utf8nlen(struct tree *tree, const char *s, size_t len) ...@@ -2716,11 +2940,13 @@ ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
{ {
utf8leaf_t *leaf; utf8leaf_t *leaf;
size_t ret = 0; size_t ret = 0;
unsigned char hangul[UTF8HANGULLEAF];
if (!tree) if (!tree)
return -1; return -1;
while (len && *s) { while (len && *s) {
if (!(leaf = utf8nlookup(tree, s, len))) leaf = utf8nlookup(tree, hangul, s, len);
if (!leaf)
return -1; return -1;
if (ages[LEAF_GEN(leaf)] > tree->maxage) if (ages[LEAF_GEN(leaf)] > tree->maxage)
ret += utf8clen(s); ret += utf8clen(s);
...@@ -2748,6 +2974,7 @@ struct utf8cursor { ...@@ -2748,6 +2974,7 @@ struct utf8cursor {
short int ccc; short int ccc;
short int nccc; short int nccc;
unsigned int unichar; unsigned int unichar;
unsigned char hangul[UTF8HANGULLEAF];
}; };
/* /*
...@@ -2855,10 +3082,12 @@ int utf8byte(struct utf8cursor *u8c) ...@@ -2855,10 +3082,12 @@ int utf8byte(struct utf8cursor *u8c)
} }
/* Look up the data for the current character. */ /* Look up the data for the current character. */
if (u8c->p) if (u8c->p) {
leaf = utf8lookup(u8c->tree, u8c->s); leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
else } else {
leaf = utf8nlookup(u8c->tree, u8c->s, u8c->len); leaf = utf8nlookup(u8c->tree, u8c->hangul,
u8c->s, u8c->len);
}
/* No leaf found implies that the input is a binary blob. */ /* No leaf found implies that the input is a binary blob. */
if (!leaf) if (!leaf)
...@@ -2878,7 +3107,7 @@ int utf8byte(struct utf8cursor *u8c) ...@@ -2878,7 +3107,7 @@ int utf8byte(struct utf8cursor *u8c)
ccc = STOPPER; ccc = STOPPER;
goto ccc_mismatch; goto ccc_mismatch;
} }
leaf = utf8lookup(u8c->tree, u8c->s); leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
ccc = LEAF_CCC(leaf); ccc = LEAF_CCC(leaf);
} }
u8c->unichar = utf8decode(u8c->s); u8c->unichar = utf8decode(u8c->s);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment