Commit 955405d1 authored by Gabriel Krisman Bertazi's avatar Gabriel Krisman Bertazi Committed by Theodore Ts'o

unicode: introduce UTF-8 character database

The decomposition and casefolding of UTF-8 characters are described in a
prefix tree in utf8data.h, which is a generate from the Unicode
Character Database (UCD), published by the Unicode Consortium, and
should not be edited by hand.  The structures in utf8data.h are meant to
be used for lookup operations by the unicode subsystem, when decoding a
utf-8 string.

mkutf8data.c is the source for a program that generates utf8data.h. It
was written by Olaf Weber from SGI and originally proposed to be merged
into Linux in 2014.  The original proposal performed the compatibility
decomposition, NFKD, but the current version was modified by me to do
canonical decomposition, NFD, as suggested by the community.  The
changes from the original submission are:

  * Rebase to mainline.
  * Fix out-of-tree-build.
  * Update makefile to build 11.0.0 ucd files.
  * drop references to xfs.
  * Convert NFKD to NFD.
  * Merge back robustness fixes from original patch. Requested by
    Dave Chinner.

The original submission is archived at:

<https://linux-xfs.oss.sgi.narkive.com/Xx10wjVY/rfc-unicode-utf-8-support-for-xfs>

The utf8data.h file can be regenerated using the instructions in
fs/unicode/README.utf8data.

- Notes on the update from 8.0.0 to 11.0:

The structure of the ucd files and special cases have not experienced
any changes between versions 8.0.0 and 11.0.0.  8.0.0 saw the addition
of Cherokee LC characters, which is an interesting case for
case-folding.  The update is accompanied by new tests on the test_ucd
module to catch specific cases.  No changes to mkutf8data script were
required for the updates.
Signed-off-by: default avatarGabriel Krisman Bertazi <krisman@collabora.co.uk>
Signed-off-by: default avatarTheodore Ts'o <tytso@mit.edu>
parent 310a997f
......@@ -317,5 +317,6 @@ endif # NETWORK_FILESYSTEMS
source "fs/nls/Kconfig"
source "fs/dlm/Kconfig"
source "fs/unicode/Kconfig"
endmenu
......@@ -92,6 +92,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs/
obj-$(CONFIG_NFSD) += nfsd/
obj-$(CONFIG_LOCKD) += lockd/
obj-$(CONFIG_NLS) += nls/
obj-$(CONFIG_UNICODE) += unicode/
obj-$(CONFIG_SYSV_FS) += sysv/
obj-$(CONFIG_CIFS) += cifs/
obj-$(CONFIG_HPFS_FS) += hpfs/
......
#
# UTF-8 normalization
#
config UNICODE
bool "UTF-8 normalization and casefolding support"
help
Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
support.
# SPDX-License-Identifier: GPL-2.0
# This rule is not invoked during the kernel compilation. It is used to
# regenerate the utf8data.h header file.
utf8data.h.new: *.txt $(objdir)/scripts/mkutf8data
$(objdir)/scripts/mkutf8data \
-a DerivedAge.txt \
-c DerivedCombiningClass.txt \
-p DerivedCoreProperties.txt \
-d UnicodeData.txt \
-f CaseFolding.txt \
-n NormalizationCorrections.txt \
-t NormalizationTest.txt \
-o $@
The utf8data.h file in this directory is generated from the Unicode
Character Database for version 11.0.0 of the Unicode standard.
The full set of files can be found here:
http://www.unicode.org/Public/11.0.0/ucd/
Individual source links:
http://www.unicode.org/Public/11.0.0/ucd/CaseFolding.txt
http://www.unicode.org/Public/11.0.0/ucd/DerivedAge.txt
http://www.unicode.org/Public/11.0.0/ucd/extracted/DerivedCombiningClass.txt
http://www.unicode.org/Public/11.0.0/ucd/DerivedCoreProperties.txt
http://www.unicode.org/Public/11.0.0/ucd/NormalizationCorrections.txt
http://www.unicode.org/Public/11.0.0/ucd/NormalizationTest.txt
http://www.unicode.org/Public/11.0.0/ucd/UnicodeData.txt
md5sums (verify by running "md5sum -c README.utf8data"):
414436796cf097df55f798e1585448ee CaseFolding.txt
6032a595fbb782694456491d86eecfac DerivedAge.txt
3240997d671297ac754ab0d27577acf7 DerivedCombiningClass.txt
2a4fe257d9d8184518e036194d2248ec DerivedCoreProperties.txt
4e7d383fa0dd3cd9d49d64e5b7b7c9e0 NormalizationCorrections.txt
c9500c5b8b88e584469f056023ecc3f2 NormalizationTest.txt
acc291106c3758d2025f8d7bd5518bee UnicodeData.txt
sha1sums (verify by running "sha1sum -c README.utf8data"):
9184727adf7bd20e36312a68581d12ba3ffb9854 CaseFolding.txt
86c55b3eb89de61704da16af9c3f22854f61b57d DerivedAge.txt
b615703f62b1dbc5110e91acc3ff8b3789a067cf DerivedCombiningClass.txt
f8b07ef116d7dc21a94f26e70178ed2acf8713e9 DerivedCoreProperties.txt
a5fafb8998c0b8153a2a58430b8a35c811db0abc NormalizationCorrections.txt
070cdcb00cd4f0860e476750e404c59c2ebe9b25 NormalizationTest.txt
0e060fafb08d6722fbec56d9f9ebe8509f01d0ee UnicodeData.txt
To update to the newer version of the Unicode standard, the latest
released version of the UCD can be found here:
http://www.unicode.org/Public/UCD/latest/
To build the utf8data.h file, from a kernel tree that has been built,
cd to this directory (fs/unicode) and run this command:
make C=../.. objdir=../.. utf8data.h.new
After sanity checking the newly generated utf8data.h.new file (the
version generated from the 11.0.0 UCD should be 13,834 lines long, and
have a total size of 1104k) and/or comparing it with the older version
of utf8data.h, rename it to utf8data.h.
If you are a kernel developer updating to a newer version of the
Unicode Character Database, please update this README.utf8data file
with the version of the UCD that was used, the md5sum and sha1sums of
the *.txt files, before checking in the new versions of the utf8data.h
and README.utf8data files.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -20,6 +20,7 @@ hostprogs-$(CONFIG_ASN1) += asn1_compiler
hostprogs-$(CONFIG_MODULE_SIG) += sign-file
hostprogs-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += extract-cert
hostprogs-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert
hostprogs-$(CONFIG_UNICODE) += mkutf8data
HOSTCFLAGS_sortextable.o = -I$(srctree)/tools/include
HOSTCFLAGS_asn1_compiler.o = -I$(srctree)/include
......
/*
* Copyright (c) 2014 SGI.
* All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it would be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* Generator for a compact trie for unicode normalization */
#include <sys/types.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
/* Default names of the in- and output files. */
#define AGE_NAME "DerivedAge.txt"
#define CCC_NAME "DerivedCombiningClass.txt"
#define PROP_NAME "DerivedCoreProperties.txt"
#define DATA_NAME "UnicodeData.txt"
#define FOLD_NAME "CaseFolding.txt"
#define NORM_NAME "NormalizationCorrections.txt"
#define TEST_NAME "NormalizationTest.txt"
#define UTF8_NAME "utf8data.h"
const char *age_name = AGE_NAME;
const char *ccc_name = CCC_NAME;
const char *prop_name = PROP_NAME;
const char *data_name = DATA_NAME;
const char *fold_name = FOLD_NAME;
const char *norm_name = NORM_NAME;
const char *test_name = TEST_NAME;
const char *utf8_name = UTF8_NAME;
int verbose = 0;
/* An arbitrary line size limit on input lines. */
#define LINESIZE 1024
char line[LINESIZE];
char buf0[LINESIZE];
char buf1[LINESIZE];
char buf2[LINESIZE];
char buf3[LINESIZE];
const char *argv0;
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
/* ------------------------------------------------------------------ */
/*
* Unicode version numbers consist of three parts: major, minor, and a
* revision. These numbers are packed into an unsigned int to obtain
* a single version number.
*
* To save space in the generated trie, the unicode version is not
* stored directly, instead we calculate a generation number from the
* unicode versions seen in the DerivedAge file, and use that as an
* index into a table of unicode versions.
*/
#define UNICODE_MAJ_SHIFT (16)
#define UNICODE_MIN_SHIFT (8)
#define UNICODE_MAJ_MAX ((unsigned short)-1)
#define UNICODE_MIN_MAX ((unsigned char)-1)
#define UNICODE_REV_MAX ((unsigned char)-1)
#define UNICODE_AGE(MAJ,MIN,REV) \
(((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
((unsigned int)(REV)))
unsigned int *ages;
int ages_count;
unsigned int unicode_maxage;
static int age_valid(unsigned int major, unsigned int minor,
unsigned int revision)
{
if (major > UNICODE_MAJ_MAX)
return 0;
if (minor > UNICODE_MIN_MAX)
return 0;
if (revision > UNICODE_REV_MAX)
return 0;
return 1;
}
/* ------------------------------------------------------------------ */
/*
* utf8trie_t
*
* A compact binary tree, used to decode UTF-8 characters.
*
* Internal nodes are one byte for the node itself, and up to three
* bytes for an offset into the tree. The first byte contains the
* following information:
* NEXTBYTE - flag - advance to next byte if set
* BITNUM - 3 bit field - the bit number to tested
* OFFLEN - 2 bit field - number of bytes in the offset
* if offlen == 0 (non-branching node)
* RIGHTPATH - 1 bit field - set if the following node is for the
* right-hand path (tested bit is set)
* TRIENODE - 1 bit field - set if the following node is an internal
* node, otherwise it is a leaf node
* if offlen != 0 (branching node)
* LEFTNODE - 1 bit field - set if the left-hand node is internal
* RIGHTNODE - 1 bit field - set if the right-hand node is internal
*
* Due to the way utf8 works, there cannot be branching nodes with
* NEXTBYTE set, and moreover those nodes always have a righthand
* descendant.
*/
typedef unsigned char utf8trie_t;
#define BITNUM 0x07
#define NEXTBYTE 0x08
#define OFFLEN 0x30
#define OFFLEN_SHIFT 4
#define RIGHTPATH 0x40
#define TRIENODE 0x80
#define RIGHTNODE 0x40
#define LEFTNODE 0x80
/*
* utf8leaf_t
*
* The leaves of the trie are embedded in the trie, and so the same
* underlying datatype, unsigned char.
*
* leaf[0]: The unicode version, stored as a generation number that is
* an index into utf8agetab[]. With this we can filter code
* points based on the unicode version in which they were
* defined. The CCC of a non-defined code point is 0.
* leaf[1]: Canonical Combining Class. During normalization, we need
* to do a stable sort into ascending order of all characters
* with a non-zero CCC that occur between two characters with
* a CCC of 0, or at the begin or end of a string.
* The unicode standard guarantees that all CCC values are
* between 0 and 254 inclusive, which leaves 255 available as
* a special value.
* Code points with CCC 0 are known as stoppers.
* leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
* start of a NUL-terminated string that is the decomposition
* of the character.
* The CCC of a decomposable character is the same as the CCC
* of the first character of its decomposition.
* Some characters decompose as the empty string: these are
* characters with the Default_Ignorable_Code_Point property.
* These do affect normalization, as they all have CCC 0.
*
* The decompositions in the trie have been fully expanded.
*
* Casefolding, if applicable, is also done using decompositions.
*/
typedef unsigned char utf8leaf_t;
#define LEAF_GEN(LEAF) ((LEAF)[0])
#define LEAF_CCC(LEAF) ((LEAF)[1])
#define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
#define MAXGEN (255)
#define MINCCC (0)
#define MAXCCC (254)
#define STOPPER (0)
#define DECOMPOSE (255)
struct tree;
static utf8leaf_t *utf8nlookup(struct tree *, const char *, size_t);
static utf8leaf_t *utf8lookup(struct tree *, const char *);
unsigned char *utf8data;
size_t utf8data_size;
utf8trie_t *nfdi;
utf8trie_t *nfdicf;
/* ------------------------------------------------------------------ */
/*
* UTF8 valid ranges.
*
* The UTF-8 encoding spreads the bits of a 32bit word over several
* bytes. This table gives the ranges that can be held and how they'd
* be represented.
*
* 0x00000000 0x0000007F: 0xxxxxxx
* 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
* 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
* 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*
* There is an additional requirement on UTF-8, in that only the
* shortest representation of a 32bit value is to be used. A decoder
* must not decode sequences that do not satisfy this requirement.
* Thus the allowed ranges have a lower bound.
*
* 0x00000000 0x0000007F: 0xxxxxxx
* 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
* 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
* 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*
* Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
* 17 planes of 65536 values. This limits the sequences actually seen
* even more, to just the following.
*
* 0 - 0x7f: 0 0x7f
* 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
* 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
* 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
*
* Even within those ranges not all values are allowed: the surrogates
* 0xd800 - 0xdfff should never be seen.
*
* Note that the longest sequence seen with valid usage is 4 bytes,
* the same a single UTF-32 character. This makes the UTF-8
* representation of Unicode strictly smaller than UTF-32.
*
* The shortest sequence requirement was introduced by:
* Corrigendum #1: UTF-8 Shortest Form
* It can be found here:
* http://www.unicode.org/versions/corrigendum1.html
*
*/
#define UTF8_2_BITS 0xC0
#define UTF8_3_BITS 0xE0
#define UTF8_4_BITS 0xF0
#define UTF8_N_BITS 0x80
#define UTF8_2_MASK 0xE0
#define UTF8_3_MASK 0xF0
#define UTF8_4_MASK 0xF8
#define UTF8_N_MASK 0xC0
#define UTF8_V_MASK 0x3F
#define UTF8_V_SHIFT 6
static int utf8encode(char *str, unsigned int val)
{
int len;
if (val < 0x80) {
str[0] = val;
len = 1;
} else if (val < 0x800) {
str[1] = val & UTF8_V_MASK;
str[1] |= UTF8_N_BITS;
val >>= UTF8_V_SHIFT;
str[0] = val;
str[0] |= UTF8_2_BITS;
len = 2;
} else if (val < 0x10000) {
str[2] = val & UTF8_V_MASK;
str[2] |= UTF8_N_BITS;
val >>= UTF8_V_SHIFT;
str[1] = val & UTF8_V_MASK;
str[1] |= UTF8_N_BITS;
val >>= UTF8_V_SHIFT;
str[0] = val;
str[0] |= UTF8_3_BITS;
len = 3;
} else if (val < 0x110000) {
str[3] = val & UTF8_V_MASK;
str[3] |= UTF8_N_BITS;
val >>= UTF8_V_SHIFT;
str[2] = val & UTF8_V_MASK;
str[2] |= UTF8_N_BITS;
val >>= UTF8_V_SHIFT;
str[1] = val & UTF8_V_MASK;
str[1] |= UTF8_N_BITS;
val >>= UTF8_V_SHIFT;
str[0] = val;
str[0] |= UTF8_4_BITS;
len = 4;
} else {
printf("%#x: illegal val\n", val);
len = 0;
}
return len;
}
static unsigned int utf8decode(const char *str)
{
const unsigned char *s = (const unsigned char*)str;
unsigned int unichar = 0;
if (*s < 0x80) {
unichar = *s;
} else if (*s < UTF8_3_BITS) {
unichar = *s++ & 0x1F;
unichar <<= UTF8_V_SHIFT;
unichar |= *s & 0x3F;
} else if (*s < UTF8_4_BITS) {
unichar = *s++ & 0x0F;
unichar <<= UTF8_V_SHIFT;
unichar |= *s++ & 0x3F;
unichar <<= UTF8_V_SHIFT;
unichar |= *s & 0x3F;
} else {
unichar = *s++ & 0x0F;
unichar <<= UTF8_V_SHIFT;
unichar |= *s++ & 0x3F;
unichar <<= UTF8_V_SHIFT;
unichar |= *s++ & 0x3F;
unichar <<= UTF8_V_SHIFT;
unichar |= *s & 0x3F;
}
return unichar;
}
static int utf32valid(unsigned int unichar)
{
return unichar < 0x110000;
}
#define NODE 1
#define LEAF 0
struct tree {
void *root;
int childnode;
const char *type;
unsigned int maxage;
struct tree *next;
int (*leaf_equal)(void *, void *);
void (*leaf_print)(void *, int);
int (*leaf_mark)(void *);
int (*leaf_size)(void *);
int *(*leaf_index)(struct tree *, void *);
unsigned char *(*leaf_emit)(void *, unsigned char *);
int leafindex[0x110000];
int index;
};
struct node {
int index;
int offset;
int mark;
int size;
struct node *parent;
void *left;
void *right;
unsigned char bitnum;
unsigned char nextbyte;
unsigned char leftnode;
unsigned char rightnode;
unsigned int keybits;
unsigned int keymask;
};
/*
* Example lookup function for a tree.
*/
static void *lookup(struct tree *tree, const char *key)
{
struct node *node;
void *leaf = NULL;
node = tree->root;
while (!leaf && node) {
if (node->nextbyte)
key++;
if (*key & (1 << (node->bitnum & 7))) {
/* Right leg */
if (node->rightnode == NODE) {
node = node->right;
} else if (node->rightnode == LEAF) {
leaf = node->right;
} else {
node = NULL;
}
} else {
/* Left leg */
if (node->leftnode == NODE) {
node = node->left;
} else if (node->leftnode == LEAF) {
leaf = node->left;
} else {
node = NULL;
}
}
}
return leaf;
}
/*
* A simple non-recursive tree walker: keep track of visits to the
* left and right branches in the leftmask and rightmask.
*/
static void tree_walk(struct tree *tree)
{
struct node *node;
unsigned int leftmask;
unsigned int rightmask;
unsigned int bitmask;
int indent = 1;
int nodes, singletons, leaves;
nodes = singletons = leaves = 0;
printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
if (tree->childnode == LEAF) {
assert(tree->root);
tree->leaf_print(tree->root, indent);
leaves = 1;
} else {
assert(tree->childnode == NODE);
node = tree->root;
leftmask = rightmask = 0;
while (node) {
printf("%*snode @ %p bitnum %d nextbyte %d"
" left %p right %p mask %x bits %x\n",
indent, "", node,
node->bitnum, node->nextbyte,
node->left, node->right,
node->keymask, node->keybits);
nodes += 1;
if (!(node->left && node->right))
singletons += 1;
while (node) {
bitmask = 1 << node->bitnum;
if ((leftmask & bitmask) == 0) {
leftmask |= bitmask;
if (node->leftnode == LEAF) {
assert(node->left);
tree->leaf_print(node->left,
indent+1);
leaves += 1;
} else if (node->left) {
assert(node->leftnode == NODE);
indent += 1;
node = node->left;
break;
}
}
if ((rightmask & bitmask) == 0) {
rightmask |= bitmask;
if (node->rightnode == LEAF) {
assert(node->right);
tree->leaf_print(node->right,
indent+1);
leaves += 1;
} else if (node->right) {
assert(node->rightnode==NODE);
indent += 1;
node = node->right;
break;
}
}
leftmask &= ~bitmask;
rightmask &= ~bitmask;
node = node->parent;
indent -= 1;
}
}
}
printf("nodes %d leaves %d singletons %d\n",
nodes, leaves, singletons);
}
/*
* Allocate an initialize a new internal node.
*/
static struct node *alloc_node(struct node *parent)
{
struct node *node;
int bitnum;
node = malloc(sizeof(*node));
node->left = node->right = NULL;
node->parent = parent;
node->leftnode = NODE;
node->rightnode = NODE;
node->keybits = 0;
node->keymask = 0;
node->mark = 0;
node->index = 0;
node->offset = -1;
node->size = 4;
if (node->parent) {
bitnum = parent->bitnum;
if ((bitnum & 7) == 0) {
node->bitnum = bitnum + 7 + 8;
node->nextbyte = 1;
} else {
node->bitnum = bitnum - 1;
node->nextbyte = 0;
}
} else {
node->bitnum = 7;
node->nextbyte = 0;
}
return node;
}
/*
* Insert a new leaf into the tree, and collapse any subtrees that are
* fully populated and end in identical leaves. A nextbyte tagged
* internal node will not be removed to preserve the tree's integrity.
* Note that due to the structure of utf8, no nextbyte tagged node
* will be a candidate for removal.
*/
static int insert(struct tree *tree, char *key, int keylen, void *leaf)
{
struct node *node;
struct node *parent;
void **cursor;
int keybits;
assert(keylen >= 1 && keylen <= 4);
node = NULL;
cursor = &tree->root;
keybits = 8 * keylen;
/* Insert, creating path along the way. */
while (keybits) {
if (!*cursor)
*cursor = alloc_node(node);
node = *cursor;
if (node->nextbyte)
key++;
if (*key & (1 << (node->bitnum & 7)))
cursor = &node->right;
else
cursor = &node->left;
keybits--;
}
*cursor = leaf;
/* Merge subtrees if possible. */
while (node) {
if (*key & (1 << (node->bitnum & 7)))
node->rightnode = LEAF;
else
node->leftnode = LEAF;
if (node->nextbyte)
break;
if (node->leftnode == NODE || node->rightnode == NODE)
break;
assert(node->left);
assert(node->right);
/* Compare */
if (! tree->leaf_equal(node->left, node->right))
break;
/* Keep left, drop right leaf. */
leaf = node->left;
/* Check in parent */
parent = node->parent;
if (!parent) {
/* root of tree! */
tree->root = leaf;
tree->childnode = LEAF;
} else if (parent->left == node) {
parent->left = leaf;
parent->leftnode = LEAF;
if (parent->right) {
parent->keymask = 0;
parent->keybits = 0;
} else {
parent->keymask |= (1 << node->bitnum);
}
} else if (parent->right == node) {
parent->right = leaf;
parent->rightnode = LEAF;
if (parent->left) {
parent->keymask = 0;
parent->keybits = 0;
} else {
parent->keymask |= (1 << node->bitnum);
parent->keybits |= (1 << node->bitnum);
}
} else {
/* internal tree error */
assert(0);
}
free(node);
node = parent;
}
/* Propagate keymasks up along singleton chains. */
while (node) {
parent = node->parent;
if (!parent)
break;
/* Nix the mask for parents with two children. */
if (node->keymask == 0) {
parent->keymask = 0;
parent->keybits = 0;
} else if (parent->left && parent->right) {
parent->keymask = 0;
parent->keybits = 0;
} else {
assert((parent->keymask & node->keymask) == 0);
parent->keymask |= node->keymask;
parent->keymask |= (1 << parent->bitnum);
parent->keybits |= node->keybits;
if (parent->right)
parent->keybits |= (1 << parent->bitnum);
}
node = parent;
}
return 0;
}
/*
* Prune internal nodes.
*
* Fully populated subtrees that end at the same leaf have already
* been collapsed. There are still internal nodes that have for both
* their left and right branches a sequence of singletons that make
* identical choices and end in identical leaves. The keymask and
* keybits collected in the nodes describe the choices made in these
* singleton chains. When they are identical for the left and right
* branch of a node, and the two leaves comare identical, the node in
* question can be removed.
*
* Note that nodes with the nextbyte tag set will not be removed by
* this to ensure tree integrity. Note as well that the structure of
* utf8 ensures that these nodes would not have been candidates for
* removal in any case.
*/
static void prune(struct tree *tree)
{
struct node *node;
struct node *left;
struct node *right;
struct node *parent;
void *leftleaf;
void *rightleaf;
unsigned int leftmask;
unsigned int rightmask;
unsigned int bitmask;
int count;
if (verbose > 0)
printf("Pruning %s_%x\n", tree->type, tree->maxage);
count = 0;
if (tree->childnode == LEAF)
return;
if (!tree->root)
return;
leftmask = rightmask = 0;
node = tree->root;
while (node) {
if (node->nextbyte)
goto advance;
if (node->leftnode == LEAF)
goto advance;
if (node->rightnode == LEAF)
goto advance;
if (!node->left)
goto advance;
if (!node->right)
goto advance;
left = node->left;
right = node->right;
if (left->keymask == 0)
goto advance;
if (right->keymask == 0)
goto advance;
if (left->keymask != right->keymask)
goto advance;
if (left->keybits != right->keybits)
goto advance;
leftleaf = NULL;
while (!leftleaf) {
assert(left->left || left->right);
if (left->leftnode == LEAF)
leftleaf = left->left;
else if (left->rightnode == LEAF)
leftleaf = left->right;
else if (left->left)
left = left->left;
else if (left->right)
left = left->right;
else
assert(0);
}
rightleaf = NULL;
while (!rightleaf) {
assert(right->left || right->right);
if (right->leftnode == LEAF)
rightleaf = right->left;
else if (right->rightnode == LEAF)
rightleaf = right->right;
else if (right->left)
right = right->left;
else if (right->right)
right = right->right;
else
assert(0);
}
if (! tree->leaf_equal(leftleaf, rightleaf))
goto advance;
/*
* This node has identical singleton-only subtrees.
* Remove it.
*/
parent = node->parent;
left = node->left;
right = node->right;
if (parent->left == node)
parent->left = left;
else if (parent->right == node)
parent->right = left;
else
assert(0);
left->parent = parent;
left->keymask |= (1 << node->bitnum);
node->left = NULL;
while (node) {
bitmask = 1 << node->bitnum;
leftmask &= ~bitmask;
rightmask &= ~bitmask;
if (node->leftnode == NODE && node->left) {
left = node->left;
free(node);
count++;
node = left;
} else if (node->rightnode == NODE && node->right) {
right = node->right;
free(node);
count++;
node = right;
} else {
node = NULL;
}
}
/* Propagate keymasks up along singleton chains. */
node = parent;
/* Force re-check */
bitmask = 1 << node->bitnum;
leftmask &= ~bitmask;
rightmask &= ~bitmask;
for (;;) {
if (node->left && node->right)
break;
if (node->left) {
left = node->left;
node->keymask |= left->keymask;
node->keybits |= left->keybits;
}
if (node->right) {
right = node->right;
node->keymask |= right->keymask;
node->keybits |= right->keybits;
}
node->keymask |= (1 << node->bitnum);
node = node->parent;
/* Force re-check */
bitmask = 1 << node->bitnum;
leftmask &= ~bitmask;
rightmask &= ~bitmask;
}
advance:
bitmask = 1 << node->bitnum;
if ((leftmask & bitmask) == 0 &&
node->leftnode == NODE &&
node->left) {
leftmask |= bitmask;
node = node->left;
} else if ((rightmask & bitmask) == 0 &&
node->rightnode == NODE &&
node->right) {
rightmask |= bitmask;
node = node->right;
} else {
leftmask &= ~bitmask;
rightmask &= ~bitmask;
node = node->parent;
}
}
if (verbose > 0)
printf("Pruned %d nodes\n", count);
}
/*
* Mark the nodes in the tree that lead to leaves that must be
* emitted.
*/
static void mark_nodes(struct tree *tree)
{
struct node *node;
struct node *n;
unsigned int leftmask;
unsigned int rightmask;
unsigned int bitmask;
int marked;
marked = 0;
if (verbose > 0)
printf("Marking %s_%x\n", tree->type, tree->maxage);
if (tree->childnode == LEAF)
goto done;
assert(tree->childnode == NODE);
node = tree->root;
leftmask = rightmask = 0;
while (node) {
bitmask = 1 << node->bitnum;
if ((leftmask & bitmask) == 0) {
leftmask |= bitmask;
if (node->leftnode == LEAF) {
assert(node->left);
if (tree->leaf_mark(node->left)) {
n = node;
while (n && !n->mark) {
marked++;
n->mark = 1;
n = n->parent;
}
}
} else if (node->left) {
assert(node->leftnode == NODE);
node = node->left;
continue;
}
}
if ((rightmask & bitmask) == 0) {
rightmask |= bitmask;
if (node->rightnode == LEAF) {
assert(node->right);
if (tree->leaf_mark(node->right)) {
n = node;
while (n && !n->mark) {
marked++;
n->mark = 1;
n = n->parent;
}
}
} else if (node->right) {
assert(node->rightnode==NODE);
node = node->right;
continue;
}
}
leftmask &= ~bitmask;
rightmask &= ~bitmask;
node = node->parent;
}
/* second pass: left siblings and singletons */
assert(tree->childnode == NODE);
node = tree->root;
leftmask = rightmask = 0;
while (node) {
bitmask = 1 << node->bitnum;
if ((leftmask & bitmask) == 0) {
leftmask |= bitmask;
if (node->leftnode == LEAF) {
assert(node->left);
if (tree->leaf_mark(node->left)) {
n = node;
while (n && !n->mark) {
marked++;
n->mark = 1;
n = n->parent;
}
}
} else if (node->left) {
assert(node->leftnode == NODE);
node = node->left;
if (!node->mark && node->parent->mark) {
marked++;
node->mark = 1;
}
continue;
}
}
if ((rightmask & bitmask) == 0) {
rightmask |= bitmask;
if (node->rightnode == LEAF) {
assert(node->right);
if (tree->leaf_mark(node->right)) {
n = node;
while (n && !n->mark) {
marked++;
n->mark = 1;
n = n->parent;
}
}
} else if (node->right) {
assert(node->rightnode==NODE);
node = node->right;
if (!node->mark && node->parent->mark &&
!node->parent->left) {
marked++;
node->mark = 1;
}
continue;
}
}
leftmask &= ~bitmask;
rightmask &= ~bitmask;
node = node->parent;
}
done:
if (verbose > 0)
printf("Marked %d nodes\n", marked);
}
/*
* Compute the index of each node and leaf, which is the offset in the
* emitted trie. These values must be pre-computed because relative
* offsets between nodes are used to navigate the tree.
*/
static int index_nodes(struct tree *tree, int index)
{
struct node *node;
unsigned int leftmask;
unsigned int rightmask;
unsigned int bitmask;
int count;
int indent;
/* Align to a cache line (or half a cache line?). */
while (index % 64)
index++;
tree->index = index;
indent = 1;
count = 0;
if (verbose > 0)
printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
if (tree->childnode == LEAF) {
index += tree->leaf_size(tree->root);
goto done;
}
assert(tree->childnode == NODE);
node = tree->root;
leftmask = rightmask = 0;
while (node) {
if (!node->mark)
goto skip;
count++;
if (node->index != index)
node->index = index;
index += node->size;
skip:
while (node) {
bitmask = 1 << node->bitnum;
if (node->mark && (leftmask & bitmask) == 0) {
leftmask |= bitmask;
if (node->leftnode == LEAF) {
assert(node->left);
*tree->leaf_index(tree, node->left) =
index;
index += tree->leaf_size(node->left);
count++;
} else if (node->left) {
assert(node->leftnode == NODE);
indent += 1;
node = node->left;
break;
}
}
if (node->mark && (rightmask & bitmask) == 0) {
rightmask |= bitmask;
if (node->rightnode == LEAF) {
assert(node->right);
*tree->leaf_index(tree, node->right) = index;
index += tree->leaf_size(node->right);
count++;
} else if (node->right) {
assert(node->rightnode==NODE);
indent += 1;
node = node->right;
break;
}
}
leftmask &= ~bitmask;
rightmask &= ~bitmask;
node = node->parent;
indent -= 1;
}
}
done:
/* Round up to a multiple of 16 */
while (index % 16)
index++;
if (verbose > 0)
printf("Final index %d\n", index);
return index;
}
/*
* Compute the size of nodes and leaves. We start by assuming that
* each node needs to store a three-byte offset. The indexes of the
* nodes are calculated based on that, and then this function is
* called to see if the sizes of some nodes can be reduced. This is
* repeated until no more changes are seen.
*/
static int size_nodes(struct tree *tree)
{
struct tree *next;
struct node *node;
struct node *right;
struct node *n;
unsigned int leftmask;
unsigned int rightmask;
unsigned int bitmask;
unsigned int pathbits;
unsigned int pathmask;
int changed;
int offset;
int size;
int indent;
indent = 1;
changed = 0;
size = 0;
if (verbose > 0)
printf("Sizing %s_%x\n", tree->type, tree->maxage);
if (tree->childnode == LEAF)
goto done;
assert(tree->childnode == NODE);
pathbits = 0;
pathmask = 0;
node = tree->root;
leftmask = rightmask = 0;
while (node) {
if (!node->mark)
goto skip;
offset = 0;
if (!node->left || !node->right) {
size = 1;
} else {
if (node->rightnode == NODE) {
right = node->right;
next = tree->next;
while (!right->mark) {
assert(next);
n = next->root;
while (n->bitnum != node->bitnum) {
if (pathbits & (1<<n->bitnum))
n = n->right;
else
n = n->left;
}
n = n->right;
assert(right->bitnum == n->bitnum);
right = n;
next = next->next;
}
offset = right->index - node->index;
} else {
offset = *tree->leaf_index(tree, node->right);
offset -= node->index;
}
assert(offset >= 0);
assert(offset <= 0xffffff);
if (offset <= 0xff) {
size = 2;
} else if (offset <= 0xffff) {
size = 3;
} else { /* offset <= 0xffffff */
size = 4;
}
}
if (node->size != size || node->offset != offset) {
node->size = size;
node->offset = offset;
changed++;
}
skip:
while (node) {
bitmask = 1 << node->bitnum;
pathmask |= bitmask;
if (node->mark && (leftmask & bitmask) == 0) {
leftmask |= bitmask;
if (node->leftnode == LEAF) {
assert(node->left);
} else if (node->left) {
assert(node->leftnode == NODE);
indent += 1;
node = node->left;
break;
}
}
if (node->mark && (rightmask & bitmask) == 0) {
rightmask |= bitmask;
pathbits |= bitmask;
if (node->rightnode == LEAF) {
assert(node->right);
} else if (node->right) {
assert(node->rightnode==NODE);
indent += 1;
node = node->right;
break;
}
}
leftmask &= ~bitmask;
rightmask &= ~bitmask;
pathmask &= ~bitmask;
pathbits &= ~bitmask;
node = node->parent;
indent -= 1;
}
}
done:
if (verbose > 0)
printf("Found %d changes\n", changed);
return changed;
}
/*
* Emit a trie for the given tree into the data array.
*/
static void emit(struct tree *tree, unsigned char *data)
{
struct node *node;
unsigned int leftmask;
unsigned int rightmask;
unsigned int bitmask;
int offlen;
int offset;
int index;
int indent;
unsigned char byte;
index = tree->index;
data += index;
indent = 1;
if (verbose > 0)
printf("Emitting %s_%x\n", tree->type, tree->maxage);
if (tree->childnode == LEAF) {
assert(tree->root);
tree->leaf_emit(tree->root, data);
return;
}
assert(tree->childnode == NODE);
node = tree->root;
leftmask = rightmask = 0;
while (node) {
if (!node->mark)
goto skip;
assert(node->offset != -1);
assert(node->index == index);
byte = 0;
if (node->nextbyte)
byte |= NEXTBYTE;
byte |= (node->bitnum & BITNUM);
if (node->left && node->right) {
if (node->leftnode == NODE)
byte |= LEFTNODE;
if (node->rightnode == NODE)
byte |= RIGHTNODE;
if (node->offset <= 0xff)
offlen = 1;
else if (node->offset <= 0xffff)
offlen = 2;
else
offlen = 3;
offset = node->offset;
byte |= offlen << OFFLEN_SHIFT;
*data++ = byte;
index++;
while (offlen--) {
*data++ = offset & 0xff;
index++;
offset >>= 8;
}
} else if (node->left) {
if (node->leftnode == NODE)
byte |= TRIENODE;
*data++ = byte;
index++;
} else if (node->right) {
byte |= RIGHTNODE;
if (node->rightnode == NODE)
byte |= TRIENODE;
*data++ = byte;
index++;
} else {
assert(0);
}
skip:
while (node) {
bitmask = 1 << node->bitnum;
if (node->mark && (leftmask & bitmask) == 0) {
leftmask |= bitmask;
if (node->leftnode == LEAF) {
assert(node->left);
data = tree->leaf_emit(node->left,
data);
index += tree->leaf_size(node->left);
} else if (node->left) {
assert(node->leftnode == NODE);
indent += 1;
node = node->left;
break;
}
}
if (node->mark && (rightmask & bitmask) == 0) {
rightmask |= bitmask;
if (node->rightnode == LEAF) {
assert(node->right);
data = tree->leaf_emit(node->right,
data);
index += tree->leaf_size(node->right);
} else if (node->right) {
assert(node->rightnode==NODE);
indent += 1;
node = node->right;
break;
}
}
leftmask &= ~bitmask;
rightmask &= ~bitmask;
node = node->parent;
indent -= 1;
}
}
}
/* ------------------------------------------------------------------ */
/*
* Unicode data.
*
* We need to keep track of the Canonical Combining Class, the Age,
* and decompositions for a code point.
*
* For the Age, we store the index into the ages table. Effectively
* this is a generation number that the table maps to a unicode
* version.
*
* The correction field is used to indicate that this entry is in the
* corrections array, which contains decompositions that were
* corrected in later revisions. The value of the correction field is
* the Unicode version in which the mapping was corrected.
*/
struct unicode_data {
unsigned int code;
int ccc;
int gen;
int correction;
unsigned int *utf32nfdi;
unsigned int *utf32nfdicf;
char *utf8nfdi;
char *utf8nfdicf;
};
struct unicode_data unicode_data[0x110000];
struct unicode_data *corrections;
int corrections_count;
struct tree *nfdi_tree;
struct tree *nfdicf_tree;
struct tree *trees;
int trees_count;
/*
* Check the corrections array to see if this entry was corrected at
* some point.
*/
static struct unicode_data *corrections_lookup(struct unicode_data *u)
{
int i;
for (i = 0; i != corrections_count; i++)
if (u->code == corrections[i].code)
return &corrections[i];
return u;
}
static int nfdi_equal(void *l, void *r)
{
struct unicode_data *left = l;
struct unicode_data *right = r;
if (left->gen != right->gen)
return 0;
if (left->ccc != right->ccc)
return 0;
if (left->utf8nfdi && right->utf8nfdi &&
strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
return 1;
if (left->utf8nfdi || right->utf8nfdi)
return 0;
return 1;
}
static int nfdicf_equal(void *l, void *r)
{
struct unicode_data *left = l;
struct unicode_data *right = r;
if (left->gen != right->gen)
return 0;
if (left->ccc != right->ccc)
return 0;
if (left->utf8nfdicf && right->utf8nfdicf &&
strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
return 1;
if (left->utf8nfdicf && right->utf8nfdicf)
return 0;
if (left->utf8nfdicf || right->utf8nfdicf)
return 0;
if (left->utf8nfdi && right->utf8nfdi &&
strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
return 1;
if (left->utf8nfdi || right->utf8nfdi)
return 0;
return 1;
}
static void nfdi_print(void *l, int indent)
{
struct unicode_data *leaf = l;
printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
leaf->code, leaf->ccc, leaf->gen);
if (leaf->utf8nfdi)
printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
printf("\n");
}
static void nfdicf_print(void *l, int indent)
{
struct unicode_data *leaf = l;
printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
leaf->code, leaf->ccc, leaf->gen);
if (leaf->utf8nfdicf)
printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
else if (leaf->utf8nfdi)
printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
printf("\n");
}
static int nfdi_mark(void *l)
{
return 1;
}
static int nfdicf_mark(void *l)
{
struct unicode_data *leaf = l;
if (leaf->utf8nfdicf)
return 1;
return 0;
}
static int correction_mark(void *l)
{
struct unicode_data *leaf = l;
return leaf->correction;
}
static int nfdi_size(void *l)
{
struct unicode_data *leaf = l;
int size = 2;
if (leaf->utf8nfdi)
size += strlen(leaf->utf8nfdi) + 1;
return size;
}
static int nfdicf_size(void *l)
{
struct unicode_data *leaf = l;
int size = 2;
if (leaf->utf8nfdicf)
size += strlen(leaf->utf8nfdicf) + 1;
else if (leaf->utf8nfdi)
size += strlen(leaf->utf8nfdi) + 1;
return size;
}
static int *nfdi_index(struct tree *tree, void *l)
{
struct unicode_data *leaf = l;
return &tree->leafindex[leaf->code];
}
static int *nfdicf_index(struct tree *tree, void *l)
{
struct unicode_data *leaf = l;
return &tree->leafindex[leaf->code];
}
static unsigned char *nfdi_emit(void *l, unsigned char *data)
{
struct unicode_data *leaf = l;
unsigned char *s;
*data++ = leaf->gen;
if (leaf->utf8nfdi) {
*data++ = DECOMPOSE;
s = (unsigned char*)leaf->utf8nfdi;
while ((*data++ = *s++) != 0)
;
} else {
*data++ = leaf->ccc;
}
return data;
}
static unsigned char *nfdicf_emit(void *l, unsigned char *data)
{
struct unicode_data *leaf = l;
unsigned char *s;
*data++ = leaf->gen;
if (leaf->utf8nfdicf) {
*data++ = DECOMPOSE;
s = (unsigned char*)leaf->utf8nfdicf;
while ((*data++ = *s++) != 0)
;
} else if (leaf->utf8nfdi) {
*data++ = DECOMPOSE;
s = (unsigned char*)leaf->utf8nfdi;
while ((*data++ = *s++) != 0)
;
} else {
*data++ = leaf->ccc;
}
return data;
}
static void utf8_create(struct unicode_data *data)
{
char utf[18*4+1];
char *u;
unsigned int *um;
int i;
u = utf;
um = data->utf32nfdi;
if (um) {
for (i = 0; um[i]; i++)
u += utf8encode(u, um[i]);
*u = '\0';
data->utf8nfdi = strdup(utf);
}
u = utf;
um = data->utf32nfdicf;
if (um) {
for (i = 0; um[i]; i++)
u += utf8encode(u, um[i]);
*u = '\0';
if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
data->utf8nfdicf = strdup(utf);
}
}
static void utf8_init(void)
{
unsigned int unichar;
int i;
for (unichar = 0; unichar != 0x110000; unichar++)
utf8_create(&unicode_data[unichar]);
for (i = 0; i != corrections_count; i++)
utf8_create(&corrections[i]);
}
static void trees_init(void)
{
struct unicode_data *data;
unsigned int maxage;
unsigned int nextage;
int count;
int i;
int j;
/* Count the number of different ages. */
count = 0;
nextage = (unsigned int)-1;
do {
maxage = nextage;
nextage = 0;
for (i = 0; i <= corrections_count; i++) {
data = &corrections[i];
if (nextage < data->correction &&
data->correction < maxage)
nextage = data->correction;
}
count++;
} while (nextage);
/* Two trees per age: nfdi and nfdicf */
trees_count = count * 2;
trees = calloc(trees_count, sizeof(struct tree));
/* Assign ages to the trees. */
count = trees_count;
nextage = (unsigned int)-1;
do {
maxage = nextage;
trees[--count].maxage = maxage;
trees[--count].maxage = maxage;
nextage = 0;
for (i = 0; i <= corrections_count; i++) {
data = &corrections[i];
if (nextage < data->correction &&
data->correction < maxage)
nextage = data->correction;
}
} while (nextage);
/* The ages assigned above are off by one. */
for (i = 0; i != trees_count; i++) {
j = 0;
while (ages[j] < trees[i].maxage)
j++;
trees[i].maxage = ages[j-1];
}
/* Set up the forwarding between trees. */
trees[trees_count-2].next = &trees[trees_count-1];
trees[trees_count-1].leaf_mark = nfdi_mark;
trees[trees_count-2].leaf_mark = nfdicf_mark;
for (i = 0; i != trees_count-2; i += 2) {
trees[i].next = &trees[trees_count-2];
trees[i].leaf_mark = correction_mark;
trees[i+1].next = &trees[trees_count-1];
trees[i+1].leaf_mark = correction_mark;
}
/* Assign the callouts. */
for (i = 0; i != trees_count; i += 2) {
trees[i].type = "nfdicf";
trees[i].leaf_equal = nfdicf_equal;
trees[i].leaf_print = nfdicf_print;
trees[i].leaf_size = nfdicf_size;
trees[i].leaf_index = nfdicf_index;
trees[i].leaf_emit = nfdicf_emit;
trees[i+1].type = "nfdi";
trees[i+1].leaf_equal = nfdi_equal;
trees[i+1].leaf_print = nfdi_print;
trees[i+1].leaf_size = nfdi_size;
trees[i+1].leaf_index = nfdi_index;
trees[i+1].leaf_emit = nfdi_emit;
}
/* Finish init. */
for (i = 0; i != trees_count; i++)
trees[i].childnode = NODE;
}
static void trees_populate(void)
{
struct unicode_data *data;
unsigned int unichar;
char keyval[4];
int keylen;
int i;
for (i = 0; i != trees_count; i++) {
if (verbose > 0) {
printf("Populating %s_%x\n",
trees[i].type, trees[i].maxage);
}
for (unichar = 0; unichar != 0x110000; unichar++) {
if (unicode_data[unichar].gen < 0)
continue;
keylen = utf8encode(keyval, unichar);
data = corrections_lookup(&unicode_data[unichar]);
if (data->correction <= trees[i].maxage)
data = &unicode_data[unichar];
insert(&trees[i], keyval, keylen, data);
}
}
}
static void trees_reduce(void)
{
int i;
int size;
int changed;
for (i = 0; i != trees_count; i++)
prune(&trees[i]);
for (i = 0; i != trees_count; i++)
mark_nodes(&trees[i]);
do {
size = 0;
for (i = 0; i != trees_count; i++)
size = index_nodes(&trees[i], size);
changed = 0;
for (i = 0; i != trees_count; i++)
changed += size_nodes(&trees[i]);
} while (changed);
utf8data = calloc(size, 1);
utf8data_size = size;
for (i = 0; i != trees_count; i++)
emit(&trees[i], utf8data);
if (verbose > 0) {
for (i = 0; i != trees_count; i++) {
printf("%s_%x idx %d\n",
trees[i].type, trees[i].maxage, trees[i].index);
}
}
nfdi = utf8data + trees[trees_count-1].index;
nfdicf = utf8data + trees[trees_count-2].index;
nfdi_tree = &trees[trees_count-1];
nfdicf_tree = &trees[trees_count-2];
}
static void verify(struct tree *tree)
{
struct unicode_data *data;
utf8leaf_t *leaf;
unsigned int unichar;
char key[4];
int report;
int nocf;
if (verbose > 0)
printf("Verifying %s_%x\n", tree->type, tree->maxage);
nocf = strcmp(tree->type, "nfdicf");
for (unichar = 0; unichar != 0x110000; unichar++) {
report = 0;
data = corrections_lookup(&unicode_data[unichar]);
if (data->correction <= tree->maxage)
data = &unicode_data[unichar];
utf8encode(key,unichar);
leaf = utf8lookup(tree, key);
if (!leaf) {
if (data->gen != -1)
report++;
if (unichar < 0xd800 || unichar > 0xdfff)
report++;
} else {
if (unichar >= 0xd800 && unichar <= 0xdfff)
report++;
if (data->gen == -1)
report++;
if (data->gen != LEAF_GEN(leaf))
report++;
if (LEAF_CCC(leaf) == DECOMPOSE) {
if (nocf) {
if (!data->utf8nfdi) {
report++;
} else if (strcmp(data->utf8nfdi,
LEAF_STR(leaf))) {
report++;
}
} else {
if (!data->utf8nfdicf &&
!data->utf8nfdi) {
report++;
} else if (data->utf8nfdicf) {
if (strcmp(data->utf8nfdicf,
LEAF_STR(leaf)))
report++;
} else if (strcmp(data->utf8nfdi,
LEAF_STR(leaf))) {
report++;
}
}
} else if (data->ccc != LEAF_CCC(leaf)) {
report++;
}
}
if (report) {
printf("%X code %X gen %d ccc %d"
" nfdi -> \"%s\"",
unichar, data->code, data->gen,
data->ccc,
data->utf8nfdi);
if (leaf) {
printf(" gen %d ccc %d"
" nfdi -> \"%s\"",
LEAF_GEN(leaf),
LEAF_CCC(leaf),
LEAF_CCC(leaf) == DECOMPOSE ?
LEAF_STR(leaf) : "");
}
printf("\n");
}
}
}
static void trees_verify(void)
{
int i;
for (i = 0; i != trees_count; i++)
verify(&trees[i]);
}
/* ------------------------------------------------------------------ */
static void help(void)
{
printf("Usage: %s [options]\n", argv0);
printf("\n");
printf("This program creates an a data trie used for parsing and\n");
printf("normalization of UTF-8 strings. The trie is derived from\n");
printf("a set of input files from the Unicode character database\n");
printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
printf("\n");
printf("The generated tree supports two normalization forms:\n");
printf("\n");
printf("\tnfdi:\n");
printf("\t- Apply unicode normalization form NFD.\n");
printf("\t- Remove any Default_Ignorable_Code_Point.\n");
printf("\n");
printf("\tnfdicf:\n");
printf("\t- Apply unicode normalization form NFD.\n");
printf("\t- Remove any Default_Ignorable_Code_Point.\n");
printf("\t- Apply a full casefold (C + F).\n");
printf("\n");
printf("These forms were chosen as being most useful when dealing\n");
printf("with file names: NFD catches most cases where characters\n");
printf("should be considered equivalent. The ignorables are mostly\n");
printf("invisible, making names hard to type.\n");
printf("\n");
printf("The options to specify the files to be used are listed\n");
printf("below with their default values, which are the names used\n");
printf("by version 11.0.0 of the Unicode Character Database.\n");
printf("\n");
printf("The input files:\n");
printf("\t-a %s\n", AGE_NAME);
printf("\t-c %s\n", CCC_NAME);
printf("\t-p %s\n", PROP_NAME);
printf("\t-d %s\n", DATA_NAME);
printf("\t-f %s\n", FOLD_NAME);
printf("\t-n %s\n", NORM_NAME);
printf("\n");
printf("Additionally, the generated tables are tested using:\n");
printf("\t-t %s\n", TEST_NAME);
printf("\n");
printf("Finally, the output file:\n");
printf("\t-o %s\n", UTF8_NAME);
printf("\n");
}
static void usage(void)
{
help();
exit(1);
}
static void open_fail(const char *name, int error)
{
printf("Error %d opening %s: %s\n", error, name, strerror(error));
exit(1);
}
static void file_fail(const char *filename)
{
printf("Error parsing %s\n", filename);
exit(1);
}
static void line_fail(const char *filename, const char *line)
{
printf("Error parsing %s:%s\n", filename, line);
exit(1);
}
/* ------------------------------------------------------------------ */
static void print_utf32(unsigned int *utf32str)
{
int i;
for (i = 0; utf32str[i]; i++)
printf(" %X", utf32str[i]);
}
static void print_utf32nfdi(unsigned int unichar)
{
printf(" %X ->", unichar);
print_utf32(unicode_data[unichar].utf32nfdi);
printf("\n");
}
static void print_utf32nfdicf(unsigned int unichar)
{
printf(" %X ->", unichar);
print_utf32(unicode_data[unichar].utf32nfdicf);
printf("\n");
}
/* ------------------------------------------------------------------ */
static void age_init(void)
{
FILE *file;
unsigned int first;
unsigned int last;
unsigned int unichar;
unsigned int major;
unsigned int minor;
unsigned int revision;
int gen;
int count;
int ret;
if (verbose > 0)
printf("Parsing %s\n", age_name);
file = fopen(age_name, "r");
if (!file)
open_fail(age_name, errno);
count = 0;
gen = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "# Age=V%d_%d_%d",
&major, &minor, &revision);
if (ret == 3) {
ages_count++;
if (verbose > 1)
printf(" Age V%d_%d_%d\n",
major, minor, revision);
if (!age_valid(major, minor, revision))
line_fail(age_name, line);
continue;
}
ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
if (ret == 2) {
ages_count++;
if (verbose > 1)
printf(" Age V%d_%d\n", major, minor);
if (!age_valid(major, minor, 0))
line_fail(age_name, line);
continue;
}
}
/* We must have found something above. */
if (verbose > 1)
printf("%d age entries\n", ages_count);
if (ages_count == 0 || ages_count > MAXGEN)
file_fail(age_name);
/* There is a 0 entry. */
ages_count++;
ages = calloc(ages_count + 1, sizeof(*ages));
/* And a guard entry. */
ages[ages_count] = (unsigned int)-1;
rewind(file);
count = 0;
gen = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "# Age=V%d_%d_%d",
&major, &minor, &revision);
if (ret == 3) {
ages[++gen] =
UNICODE_AGE(major, minor, revision);
if (verbose > 1)
printf(" Age V%d_%d_%d = gen %d\n",
major, minor, revision, gen);
if (!age_valid(major, minor, revision))
line_fail(age_name, line);
continue;
}
ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
if (ret == 2) {
ages[++gen] = UNICODE_AGE(major, minor, 0);
if (verbose > 1)
printf(" Age V%d_%d = %d\n",
major, minor, gen);
if (!age_valid(major, minor, 0))
line_fail(age_name, line);
continue;
}
ret = sscanf(line, "%X..%X ; %d.%d #",
&first, &last, &major, &minor);
if (ret == 4) {
for (unichar = first; unichar <= last; unichar++)
unicode_data[unichar].gen = gen;
count += 1 + last - first;
if (verbose > 1)
printf(" %X..%X gen %d\n", first, last, gen);
if (!utf32valid(first) || !utf32valid(last))
line_fail(age_name, line);
continue;
}
ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
if (ret == 3) {
unicode_data[unichar].gen = gen;
count++;
if (verbose > 1)
printf(" %X gen %d\n", unichar, gen);
if (!utf32valid(unichar))
line_fail(age_name, line);
continue;
}
}
unicode_maxage = ages[gen];
fclose(file);
/* Nix surrogate block */
if (verbose > 1)
printf(" Removing surrogate block D800..DFFF\n");
for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
unicode_data[unichar].gen = -1;
if (verbose > 0)
printf("Found %d entries\n", count);
if (count == 0)
file_fail(age_name);
}
static void ccc_init(void)
{
FILE *file;
unsigned int first;
unsigned int last;
unsigned int unichar;
unsigned int value;
int count;
int ret;
if (verbose > 0)
printf("Parsing %s\n", ccc_name);
file = fopen(ccc_name, "r");
if (!file)
open_fail(ccc_name, errno);
count = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
if (ret == 3) {
for (unichar = first; unichar <= last; unichar++) {
unicode_data[unichar].ccc = value;
count++;
}
if (verbose > 1)
printf(" %X..%X ccc %d\n", first, last, value);
if (!utf32valid(first) || !utf32valid(last))
line_fail(ccc_name, line);
continue;
}
ret = sscanf(line, "%X ; %d #", &unichar, &value);
if (ret == 2) {
unicode_data[unichar].ccc = value;
count++;
if (verbose > 1)
printf(" %X ccc %d\n", unichar, value);
if (!utf32valid(unichar))
line_fail(ccc_name, line);
continue;
}
}
fclose(file);
if (verbose > 0)
printf("Found %d entries\n", count);
if (count == 0)
file_fail(ccc_name);
}
static int ignore_compatibility_form(char *type)
{
int i;
char *ignored_types[] = {"font", "noBreak", "initial", "medial",
"final", "isolated", "circle", "super",
"sub", "vertical", "wide", "narrow",
"small", "square", "fraction", "compat"};
for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
if (strcmp(type, ignored_types[i]) == 0)
return 1;
return 0;
}
static void nfdi_init(void)
{
FILE *file;
unsigned int unichar;
unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
char *s;
char *type;
unsigned int *um;
int count;
int i;
int ret;
if (verbose > 0)
printf("Parsing %s\n", data_name);
file = fopen(data_name, "r");
if (!file)
open_fail(data_name, errno);
count = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
&unichar, buf0);
if (ret != 2)
continue;
if (!utf32valid(unichar))
line_fail(data_name, line);
s = buf0;
/* skip over <tag> */
if (*s == '<') {
type = ++s;
while (*++s != '>');
*s++ = '\0';
if(ignore_compatibility_form(type))
continue;
}
/* decode the decomposition into UTF-32 */
i = 0;
while (*s) {
mapping[i] = strtoul(s, &s, 16);
if (!utf32valid(mapping[i]))
line_fail(data_name, line);
i++;
}
mapping[i++] = 0;
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdi = um;
if (verbose > 1)
print_utf32nfdi(unichar);
count++;
}
fclose(file);
if (verbose > 0)
printf("Found %d entries\n", count);
if (count == 0)
file_fail(data_name);
}
static void nfdicf_init(void)
{
FILE *file;
unsigned int unichar;
unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
char status;
char *s;
unsigned int *um;
int i;
int count;
int ret;
if (verbose > 0)
printf("Parsing %s\n", fold_name);
file = fopen(fold_name, "r");
if (!file)
open_fail(fold_name, errno);
count = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
if (ret != 3)
continue;
if (!utf32valid(unichar))
line_fail(fold_name, line);
/* Use the C+F casefold. */
if (status != 'C' && status != 'F')
continue;
s = buf0;
if (*s == '<')
while (*s++ != ' ')
;
i = 0;
while (*s) {
mapping[i] = strtoul(s, &s, 16);
if (!utf32valid(mapping[i]))
line_fail(fold_name, line);
i++;
}
mapping[i++] = 0;
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdicf = um;
if (verbose > 1)
print_utf32nfdicf(unichar);
count++;
}
fclose(file);
if (verbose > 0)
printf("Found %d entries\n", count);
if (count == 0)
file_fail(fold_name);
}
static void ignore_init(void)
{
FILE *file;
unsigned int unichar;
unsigned int first;
unsigned int last;
unsigned int *um;
int count;
int ret;
if (verbose > 0)
printf("Parsing %s\n", prop_name);
file = fopen(prop_name, "r");
if (!file)
open_fail(prop_name, errno);
assert(file);
count = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
if (ret == 3) {
if (strcmp(buf0, "Default_Ignorable_Code_Point"))
continue;
if (!utf32valid(first) || !utf32valid(last))
line_fail(prop_name, line);
for (unichar = first; unichar <= last; unichar++) {
free(unicode_data[unichar].utf32nfdi);
um = malloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdi = um;
free(unicode_data[unichar].utf32nfdicf);
um = malloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdicf = um;
count++;
}
if (verbose > 1)
printf(" %X..%X Default_Ignorable_Code_Point\n",
first, last);
continue;
}
ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
if (ret == 2) {
if (strcmp(buf0, "Default_Ignorable_Code_Point"))
continue;
if (!utf32valid(unichar))
line_fail(prop_name, line);
free(unicode_data[unichar].utf32nfdi);
um = malloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdi = um;
free(unicode_data[unichar].utf32nfdicf);
um = malloc(sizeof(unsigned int));
*um = 0;
unicode_data[unichar].utf32nfdicf = um;
if (verbose > 1)
printf(" %X Default_Ignorable_Code_Point\n",
unichar);
count++;
continue;
}
}
fclose(file);
if (verbose > 0)
printf("Found %d entries\n", count);
if (count == 0)
file_fail(prop_name);
}
static void corrections_init(void)
{
FILE *file;
unsigned int unichar;
unsigned int major;
unsigned int minor;
unsigned int revision;
unsigned int age;
unsigned int *um;
unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
char *s;
int i;
int count;
int ret;
if (verbose > 0)
printf("Parsing %s\n", norm_name);
file = fopen(norm_name, "r");
if (!file)
open_fail(norm_name, errno);
count = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
&unichar, buf0, buf1,
&major, &minor, &revision);
if (ret != 6)
continue;
if (!utf32valid(unichar) || !age_valid(major, minor, revision))
line_fail(norm_name, line);
count++;
}
corrections = calloc(count, sizeof(struct unicode_data));
corrections_count = count;
rewind(file);
count = 0;
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
&unichar, buf0, buf1,
&major, &minor, &revision);
if (ret != 6)
continue;
if (!utf32valid(unichar) || !age_valid(major, minor, revision))
line_fail(norm_name, line);
corrections[count] = unicode_data[unichar];
assert(corrections[count].code == unichar);
age = UNICODE_AGE(major, minor, revision);
corrections[count].correction = age;
i = 0;
s = buf0;
while (*s) {
mapping[i] = strtoul(s, &s, 16);
if (!utf32valid(mapping[i]))
line_fail(norm_name, line);
i++;
}
mapping[i++] = 0;
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
corrections[count].utf32nfdi = um;
if (verbose > 1)
printf(" %X -> %s -> %s V%d_%d_%d\n",
unichar, buf0, buf1, major, minor, revision);
count++;
}
fclose(file);
if (verbose > 0)
printf("Found %d entries\n", count);
if (count == 0)
file_fail(norm_name);
}
/* ------------------------------------------------------------------ */
/*
* Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
*
* AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
* D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
*
* SBase = 0xAC00
* LBase = 0x1100
* VBase = 0x1161
* TBase = 0x11A7
* LCount = 19
* VCount = 21
* TCount = 28
* NCount = 588 (VCount * TCount)
* SCount = 11172 (LCount * NCount)
*
* Decomposition:
* SIndex = s - SBase
*
* LV (Canonical/Full)
* LIndex = SIndex / NCount
* VIndex = (Sindex % NCount) / TCount
* LPart = LBase + LIndex
* VPart = VBase + VIndex
*
* LVT (Canonical)
* LVIndex = (SIndex / TCount) * TCount
* TIndex = (Sindex % TCount)
* LVPart = SBase + LVIndex
* TPart = TBase + TIndex
*
* LVT (Full)
* LIndex = SIndex / NCount
* VIndex = (Sindex % NCount) / TCount
* TIndex = (Sindex % TCount)
* LPart = LBase + LIndex
* VPart = VBase + VIndex
* if (TIndex == 0) {
* d = <LPart, VPart>
* } else {
* TPart = TBase + TIndex
* d = <LPart, VPart, TPart>
* }
*
*/
static void
hangul_decompose(void)
{
unsigned int sb = 0xAC00;
unsigned int lb = 0x1100;
unsigned int vb = 0x1161;
unsigned int tb = 0x11a7;
/* unsigned int lc = 19; */
unsigned int vc = 21;
unsigned int tc = 28;
unsigned int nc = (vc * tc);
/* unsigned int sc = (lc * nc); */
unsigned int unichar;
unsigned int mapping[4];
unsigned int *um;
int count;
int i;
if (verbose > 0)
printf("Decomposing hangul\n");
/* Hangul */
count = 0;
for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
unsigned int si = unichar - sb;
unsigned int li = si / nc;
unsigned int vi = (si % nc) / tc;
unsigned int ti = si % tc;
i = 0;
mapping[i++] = lb + li;
mapping[i++] = vb + vi;
if (ti)
mapping[i++] = tb + ti;
mapping[i++] = 0;
assert(!unicode_data[unichar].utf32nfdi);
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdi = um;
assert(!unicode_data[unichar].utf32nfdicf);
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdicf = um;
if (verbose > 1)
print_utf32nfdi(unichar);
count++;
}
if (verbose > 0)
printf("Created %d entries\n", count);
}
static void nfdi_decompose(void)
{
unsigned int unichar;
unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
unsigned int *um;
unsigned int *dc;
int count;
int i;
int j;
int ret;
if (verbose > 0)
printf("Decomposing nfdi\n");
count = 0;
for (unichar = 0; unichar != 0x110000; unichar++) {
if (!unicode_data[unichar].utf32nfdi)
continue;
for (;;) {
ret = 1;
i = 0;
um = unicode_data[unichar].utf32nfdi;
while (*um) {
dc = unicode_data[*um].utf32nfdi;
if (dc) {
for (j = 0; dc[j]; j++)
mapping[i++] = dc[j];
ret = 0;
} else {
mapping[i++] = *um;
}
um++;
}
mapping[i++] = 0;
if (ret)
break;
free(unicode_data[unichar].utf32nfdi);
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdi = um;
}
/* Add this decomposition to nfdicf if there is no entry. */
if (!unicode_data[unichar].utf32nfdicf) {
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdicf = um;
}
if (verbose > 1)
print_utf32nfdi(unichar);
count++;
}
if (verbose > 0)
printf("Processed %d entries\n", count);
}
static void nfdicf_decompose(void)
{
unsigned int unichar;
unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
unsigned int *um;
unsigned int *dc;
int count;
int i;
int j;
int ret;
if (verbose > 0)
printf("Decomposing nfdicf\n");
count = 0;
for (unichar = 0; unichar != 0x110000; unichar++) {
if (!unicode_data[unichar].utf32nfdicf)
continue;
for (;;) {
ret = 1;
i = 0;
um = unicode_data[unichar].utf32nfdicf;
while (*um) {
dc = unicode_data[*um].utf32nfdicf;
if (dc) {
for (j = 0; dc[j]; j++)
mapping[i++] = dc[j];
ret = 0;
} else {
mapping[i++] = *um;
}
um++;
}
mapping[i++] = 0;
if (ret)
break;
free(unicode_data[unichar].utf32nfdicf);
um = malloc(i * sizeof(unsigned int));
memcpy(um, mapping, i * sizeof(unsigned int));
unicode_data[unichar].utf32nfdicf = um;
}
if (verbose > 1)
print_utf32nfdicf(unichar);
count++;
}
if (verbose > 0)
printf("Processed %d entries\n", count);
}
/* ------------------------------------------------------------------ */
int utf8agemax(struct tree *, const char *);
int utf8nagemax(struct tree *, const char *, size_t);
int utf8agemin(struct tree *, const char *);
int utf8nagemin(struct tree *, const char *, size_t);
ssize_t utf8len(struct tree *, const char *);
ssize_t utf8nlen(struct tree *, const char *, size_t);
struct utf8cursor;
int utf8cursor(struct utf8cursor *, struct tree *, const char *);
int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
int utf8byte(struct utf8cursor *);
/*
* Use trie to scan s, touching at most len bytes.
* Returns the leaf if one exists, NULL otherwise.
*
* A non-NULL return guarantees that the UTF-8 sequence starting at s
* is well-formed and corresponds to a known unicode code point. The
* shorthand for this will be "is valid UTF-8 unicode".
*/
static utf8leaf_t *utf8nlookup(struct tree *tree, const char *s, size_t len)
{
utf8trie_t *trie;
int offlen;
int offset;
int mask;
int node;
if (!tree)
return NULL;
if (len == 0)
return NULL;
node = 1;
trie = utf8data + tree->index;
while (node) {
offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
if (*trie & NEXTBYTE) {
if (--len == 0)
return NULL;
s++;
}
mask = 1 << (*trie & BITNUM);
if (*s & mask) {
/* Right leg */
if (offlen) {
/* Right node at offset of trie */
node = (*trie & RIGHTNODE);
offset = trie[offlen];
while (--offlen) {
offset <<= 8;
offset |= trie[offlen];
}
trie += offset;
} else if (*trie & RIGHTPATH) {
/* Right node after this node */
node = (*trie & TRIENODE);
trie++;
} else {
/* No right node. */
return NULL;
}
} else {
/* Left leg */
if (offlen) {
/* Left node after this node. */
node = (*trie & LEFTNODE);
trie += offlen + 1;
} else if (*trie & RIGHTPATH) {
/* No left node. */
return NULL;
} else {
/* Left node after this node */
node = (*trie & TRIENODE);
trie++;
}
}
}
return trie;
}
/*
* Use trie to scan s.
* Returns the leaf if one exists, NULL otherwise.
*
* Forwards to trie_nlookup().
*/
static utf8leaf_t *utf8lookup(struct tree *tree, const char *s)
{
return utf8nlookup(tree, s, (size_t)-1);
}
/*
* Return the number of bytes used by the current UTF-8 sequence.
* Assumes the input points to the first byte of a valid UTF-8
* sequence.
*/
static inline int utf8clen(const char *s)
{
unsigned char c = *s;
return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
}
/*
* Maximum age of any character in s.
* Return -1 if s is not valid UTF-8 unicode.
* Return 0 if only non-assigned code points are used.
*/
int utf8agemax(struct tree *tree, const char *s)
{
utf8leaf_t *leaf;
int age = 0;
int leaf_age;
if (!tree)
return -1;
while (*s) {
if (!(leaf = utf8lookup(tree, s)))
return -1;
leaf_age = ages[LEAF_GEN(leaf)];
if (leaf_age <= tree->maxage && leaf_age > age)
age = leaf_age;
s += utf8clen(s);
}
return age;
}
/*
* Minimum age of any character in s.
* Return -1 if s is not valid UTF-8 unicode.
* Return 0 if non-assigned code points are used.
*/
int utf8agemin(struct tree *tree, const char *s)
{
utf8leaf_t *leaf;
int age;
int leaf_age;
if (!tree)
return -1;
age = tree->maxage;
while (*s) {
if (!(leaf = utf8lookup(tree, s)))
return -1;
leaf_age = ages[LEAF_GEN(leaf)];
if (leaf_age <= tree->maxage && leaf_age < age)
age = leaf_age;
s += utf8clen(s);
}
return age;
}
/*
* Maximum age of any character in s, touch at most len bytes.
* Return -1 if s is not valid UTF-8 unicode.
*/
int utf8nagemax(struct tree *tree, const char *s, size_t len)
{
utf8leaf_t *leaf;
int age = 0;
int leaf_age;
if (!tree)
return -1;
while (len && *s) {
if (!(leaf = utf8nlookup(tree, s, len)))
return -1;
leaf_age = ages[LEAF_GEN(leaf)];
if (leaf_age <= tree->maxage && leaf_age > age)
age = leaf_age;
len -= utf8clen(s);
s += utf8clen(s);
}
return age;
}
/*
* Maximum age of any character in s, touch at most len bytes.
* Return -1 if s is not valid UTF-8 unicode.
*/
int utf8nagemin(struct tree *tree, const char *s, size_t len)
{
utf8leaf_t *leaf;
int leaf_age;
int age;
if (!tree)
return -1;
age = tree->maxage;
while (len && *s) {
if (!(leaf = utf8nlookup(tree, s, len)))
return -1;
leaf_age = ages[LEAF_GEN(leaf)];
if (leaf_age <= tree->maxage && leaf_age < age)
age = leaf_age;
len -= utf8clen(s);
s += utf8clen(s);
}
return age;
}
/*
* Length of the normalization of s.
* Return -1 if s is not valid UTF-8 unicode.
*
* A string of Default_Ignorable_Code_Point has length 0.
*/
ssize_t utf8len(struct tree *tree, const char *s)
{
utf8leaf_t *leaf;
size_t ret = 0;
if (!tree)
return -1;
while (*s) {
if (!(leaf = utf8lookup(tree, s)))
return -1;
if (ages[LEAF_GEN(leaf)] > tree->maxage)
ret += utf8clen(s);
else if (LEAF_CCC(leaf) == DECOMPOSE)
ret += strlen(LEAF_STR(leaf));
else
ret += utf8clen(s);
s += utf8clen(s);
}
return ret;
}
/*
* Length of the normalization of s, touch at most len bytes.
* Return -1 if s is not valid UTF-8 unicode.
*/
ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
{
utf8leaf_t *leaf;
size_t ret = 0;
if (!tree)
return -1;
while (len && *s) {
if (!(leaf = utf8nlookup(tree, s, len)))
return -1;
if (ages[LEAF_GEN(leaf)] > tree->maxage)
ret += utf8clen(s);
else if (LEAF_CCC(leaf) == DECOMPOSE)
ret += strlen(LEAF_STR(leaf));
else
ret += utf8clen(s);
len -= utf8clen(s);
s += utf8clen(s);
}
return ret;
}
/*
* Cursor structure used by the normalizer.
*/
struct utf8cursor {
struct tree *tree;
const char *s;
const char *p;
const char *ss;
const char *sp;
unsigned int len;
unsigned int slen;
short int ccc;
short int nccc;
unsigned int unichar;
};
/*
* Set up an utf8cursor for use by utf8byte().
*
* s : string.
* len : length of s.
* u8c : pointer to cursor.
* trie : utf8trie_t to use for normalization.
*
* Returns -1 on error, 0 on success.
*/
int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
size_t len)
{
if (!tree)
return -1;
if (!s)
return -1;
u8c->tree = tree;
u8c->s = s;
u8c->p = NULL;
u8c->ss = NULL;
u8c->sp = NULL;
u8c->len = len;
u8c->slen = 0;
u8c->ccc = STOPPER;
u8c->nccc = STOPPER;
u8c->unichar = 0;
/* Check we didn't clobber the maximum length. */
if (u8c->len != len)
return -1;
/* The first byte of s may not be an utf8 continuation. */
if (len > 0 && (*s & 0xC0) == 0x80)
return -1;
return 0;
}
/*
* Set up an utf8cursor for use by utf8byte().
*
* s : NUL-terminated string.
* u8c : pointer to cursor.
* trie : utf8trie_t to use for normalization.
*
* Returns -1 on error, 0 on success.
*/
int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
{
return utf8ncursor(u8c, tree, s, (unsigned int)-1);
}
/*
* Get one byte from the normalized form of the string described by u8c.
*
* Returns the byte cast to an unsigned char on succes, and -1 on failure.
*
* The cursor keeps track of the location in the string in u8c->s.
* When a character is decomposed, the current location is stored in
* u8c->p, and u8c->s is set to the start of the decomposition. Note
* that bytes from a decomposition do not count against u8c->len.
*
* Characters are emitted if they match the current CCC in u8c->ccc.
* Hitting end-of-string while u8c->ccc == STOPPER means we're done,
* and the function returns 0 in that case.
*
* Sorting by CCC is done by repeatedly scanning the string. The
* values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
* the start of the scan. The first pass finds the lowest CCC to be
* emitted and stores it in u8c->nccc, the second pass emits the
* characters with this CCC and finds the next lowest CCC. This limits
* the number of passes to 1 + the number of different CCCs in the
* sequence being scanned.
*
* Therefore:
* u8c->p != NULL -> a decomposition is being scanned.
* u8c->ss != NULL -> this is a repeating scan.
* u8c->ccc == -1 -> this is the first scan of a repeating scan.
*/
int utf8byte(struct utf8cursor *u8c)
{
utf8leaf_t *leaf;
int ccc;
for (;;) {
/* Check for the end of a decomposed character. */
if (u8c->p && *u8c->s == '\0') {
u8c->s = u8c->p;
u8c->p = NULL;
}
/* Check for end-of-string. */
if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
/* There is no next byte. */
if (u8c->ccc == STOPPER)
return 0;
/* End-of-string during a scan counts as a stopper. */
ccc = STOPPER;
goto ccc_mismatch;
} else if ((*u8c->s & 0xC0) == 0x80) {
/* This is a continuation of the current character. */
if (!u8c->p)
u8c->len--;
return (unsigned char)*u8c->s++;
}
/* Look up the data for the current character. */
if (u8c->p)
leaf = utf8lookup(u8c->tree, u8c->s);
else
leaf = utf8nlookup(u8c->tree, u8c->s, u8c->len);
/* No leaf found implies that the input is a binary blob. */
if (!leaf)
return -1;
/* Characters that are too new have CCC 0. */
if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
ccc = STOPPER;
} else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
u8c->len -= utf8clen(u8c->s);
u8c->p = u8c->s + utf8clen(u8c->s);
u8c->s = LEAF_STR(leaf);
/* Empty decomposition implies CCC 0. */
if (*u8c->s == '\0') {
if (u8c->ccc == STOPPER)
continue;
ccc = STOPPER;
goto ccc_mismatch;
}
leaf = utf8lookup(u8c->tree, u8c->s);
ccc = LEAF_CCC(leaf);
}
u8c->unichar = utf8decode(u8c->s);
/*
* If this is not a stopper, then see if it updates
* the next canonical class to be emitted.
*/
if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
u8c->nccc = ccc;
/*
* Return the current byte if this is the current
* combining class.
*/
if (ccc == u8c->ccc) {
if (!u8c->p)
u8c->len--;
return (unsigned char)*u8c->s++;
}
/* Current combining class mismatch. */
ccc_mismatch:
if (u8c->nccc == STOPPER) {
/*
* Scan forward for the first canonical class
* to be emitted. Save the position from
* which to restart.
*/
assert(u8c->ccc == STOPPER);
u8c->ccc = MINCCC - 1;
u8c->nccc = ccc;
u8c->sp = u8c->p;
u8c->ss = u8c->s;
u8c->slen = u8c->len;
if (!u8c->p)
u8c->len -= utf8clen(u8c->s);
u8c->s += utf8clen(u8c->s);
} else if (ccc != STOPPER) {
/* Not a stopper, and not the ccc we're emitting. */
if (!u8c->p)
u8c->len -= utf8clen(u8c->s);
u8c->s += utf8clen(u8c->s);
} else if (u8c->nccc != MAXCCC + 1) {
/* At a stopper, restart for next ccc. */
u8c->ccc = u8c->nccc;
u8c->nccc = MAXCCC + 1;
u8c->s = u8c->ss;
u8c->p = u8c->sp;
u8c->len = u8c->slen;
} else {
/* All done, proceed from here. */
u8c->ccc = STOPPER;
u8c->nccc = STOPPER;
u8c->sp = NULL;
u8c->ss = NULL;
u8c->slen = 0;
}
}
}
/* ------------------------------------------------------------------ */
static int normalize_line(struct tree *tree)
{
char *s;
char *t;
int c;
struct utf8cursor u8c;
/* First test: null-terminated string. */
s = buf2;
t = buf3;
if (utf8cursor(&u8c, tree, s))
return -1;
while ((c = utf8byte(&u8c)) > 0)
if (c != (unsigned char)*t++)
return -1;
if (c < 0)
return -1;
if (*t != 0)
return -1;
/* Second test: length-limited string. */
s = buf2;
/* Replace NUL with a value that will cause an error if seen. */
s[strlen(s) + 1] = -1;
t = buf3;
if (utf8cursor(&u8c, tree, s))
return -1;
while ((c = utf8byte(&u8c)) > 0)
if (c != (unsigned char)*t++)
return -1;
if (c < 0)
return -1;
if (*t != 0)
return -1;
return 0;
}
static void normalization_test(void)
{
FILE *file;
unsigned int unichar;
struct unicode_data *data;
char *s;
char *t;
int ret;
int ignorables;
int tests = 0;
int failures = 0;
if (verbose > 0)
printf("Parsing %s\n", test_name);
/* Step one, read data from file. */
file = fopen(test_name, "r");
if (!file)
open_fail(test_name, errno);
while (fgets(line, LINESIZE, file)) {
ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
buf0, buf1);
if (ret != 2 || *line == '#')
continue;
s = buf0;
t = buf2;
while (*s) {
unichar = strtoul(s, &s, 16);
t += utf8encode(t, unichar);
}
*t = '\0';
ignorables = 0;
s = buf1;
t = buf3;
while (*s) {
unichar = strtoul(s, &s, 16);
data = &unicode_data[unichar];
if (data->utf8nfdi && !*data->utf8nfdi)
ignorables = 1;
else
t += utf8encode(t, unichar);
}
*t = '\0';
tests++;
if (normalize_line(nfdi_tree) < 0) {
printf("Line %s -> %s", buf0, buf1);
if (ignorables)
printf(" (ignorables removed)");
printf(" failure\n");
failures++;
}
}
fclose(file);
if (verbose > 0)
printf("Ran %d tests with %d failures\n", tests, failures);
if (failures)
file_fail(test_name);
}
/* ------------------------------------------------------------------ */
static void write_file(void)
{
FILE *file;
int i;
int j;
int t;
int gen;
if (verbose > 0)
printf("Writing %s\n", utf8_name);
file = fopen(utf8_name, "w");
if (!file)
open_fail(utf8_name, errno);
fprintf(file, "/* This file is generated code, do not edit. */\n");
fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
fprintf(file, "#endif\n");
fprintf(file, "\n");
fprintf(file, "static const unsigned int utf8vers = %#x;\n",
unicode_maxage);
fprintf(file, "\n");
fprintf(file, "static const unsigned int utf8agetab[] = {\n");
for (i = 0; i != ages_count; i++)
fprintf(file, "\t%#x%s\n", ages[i],
ages[i] == unicode_maxage ? "" : ",");
fprintf(file, "};\n");
fprintf(file, "\n");
fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
t = 0;
for (gen = 0; gen < ages_count; gen++) {
fprintf(file, "\t{ %#x, %d }%s\n",
ages[gen], trees[t].index,
ages[gen] == unicode_maxage ? "" : ",");
if (trees[t].maxage == ages[gen])
t += 2;
}
fprintf(file, "};\n");
fprintf(file, "\n");
fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
t = 1;
for (gen = 0; gen < ages_count; gen++) {
fprintf(file, "\t{ %#x, %d }%s\n",
ages[gen], trees[t].index,
ages[gen] == unicode_maxage ? "" : ",");
if (trees[t].maxage == ages[gen])
t += 2;
}
fprintf(file, "};\n");
fprintf(file, "\n");
fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
utf8data_size);
t = 0;
for (i = 0; i != utf8data_size; i += 16) {
if (i == trees[t].index) {
fprintf(file, "\t/* %s_%x */\n",
trees[t].type, trees[t].maxage);
if (t < trees_count-1)
t++;
}
fprintf(file, "\t");
for (j = i; j != i + 16; j++)
fprintf(file, "0x%.2x%s", utf8data[j],
(j < utf8data_size -1 ? "," : ""));
fprintf(file, "\n");
}
fprintf(file, "};\n");
fclose(file);
}
/* ------------------------------------------------------------------ */
int main(int argc, char *argv[])
{
unsigned int unichar;
int opt;
argv0 = argv[0];
while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
switch (opt) {
case 'a':
age_name = optarg;
break;
case 'c':
ccc_name = optarg;
break;
case 'd':
data_name = optarg;
break;
case 'f':
fold_name = optarg;
break;
case 'n':
norm_name = optarg;
break;
case 'o':
utf8_name = optarg;
break;
case 'p':
prop_name = optarg;
break;
case 't':
test_name = optarg;
break;
case 'v':
verbose++;
break;
case 'h':
help();
exit(0);
default:
usage();
}
}
if (verbose > 1)
help();
for (unichar = 0; unichar != 0x110000; unichar++)
unicode_data[unichar].code = unichar;
age_init();
ccc_init();
nfdi_init();
nfdicf_init();
ignore_init();
corrections_init();
hangul_decompose();
nfdi_decompose();
nfdicf_decompose();
utf8_init();
trees_init();
trees_populate();
trees_reduce();
trees_verify();
/* Prevent "unused function" warning. */
(void)lookup(nfdi_tree, " ");
if (verbose > 2)
tree_walk(nfdi_tree);
if (verbose > 2)
tree_walk(nfdicf_tree);
normalization_test();
write_file();
return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment