Initial import of Dillo
This commit is contained in:
603
dw/hyphenator.cc
Normal file
603
dw/hyphenator.cc
Normal file
@ -0,0 +1,603 @@
|
||||
/*
|
||||
* Dillo Widget
|
||||
*
|
||||
* Copyright 2012-2013 Sebastian Geerken <sgeerken@dillo.org>,
|
||||
* Johannes Hofmann <Johannes.Hofmann@gmx.de>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
#include "hyphenator.hh"
|
||||
|
||||
#include "dlib/dlib.h"
|
||||
#include "../lout/misc.hh"
|
||||
#include "../lout/unicode.hh"
|
||||
#include <limits.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#define LEN 1000
|
||||
|
||||
/*
|
||||
* This is (or at least began as) a direct translation of Ned Batchelder's
|
||||
* public-domain Python implementation at
|
||||
* http://nedbatchelder.com/code/modules/hyphenate.py
|
||||
*/
|
||||
|
||||
using namespace lout::object;
|
||||
using namespace lout::container::typed;
|
||||
using namespace lout::misc;
|
||||
using namespace lout::unicode;
|
||||
|
||||
namespace dw {
|
||||
|
||||
HashTable <String, Hyphenator> *Hyphenator::hyphenators =
|
||||
new HashTable <String, Hyphenator> (true, true);
|
||||
|
||||
Hyphenator::Hyphenator (const char *patFile, const char *excFile, int pack)
|
||||
{
|
||||
trie = NULL; // As long we are not sure whether a pattern file can be read.
|
||||
|
||||
int bufLen = strlen (patFile) + 5 + 1;
|
||||
char *buf = new char[bufLen];
|
||||
snprintf(buf, bufLen, "%s.trie", patFile);
|
||||
FILE *trieF = fopen (buf, "r");
|
||||
delete[] buf;
|
||||
|
||||
if (trieF) {
|
||||
trie = new Trie ();
|
||||
if (trie->load (trieF) != 0) {
|
||||
delete trie;
|
||||
trie = NULL;
|
||||
}
|
||||
fclose (trieF);
|
||||
}
|
||||
|
||||
if (trie == NULL) {
|
||||
TrieBuilder trieBuilder(pack);
|
||||
FILE *patF = fopen (patFile, "r");
|
||||
if (patF) {
|
||||
|
||||
while (!feof (patF)) {
|
||||
char buf[LEN + 1];
|
||||
char *s = fgets (buf, LEN, patF);
|
||||
if (s && s[0] != '%') { // ignore lines starting with '%' as comment
|
||||
// TODO Better exit with an error, when the line is too long.
|
||||
int l = strlen (s);
|
||||
if (s[l - 1] == '\n')
|
||||
s[l - 1] = 0;
|
||||
insertPattern (&trieBuilder, s);
|
||||
}
|
||||
}
|
||||
|
||||
trie = trieBuilder.createTrie ();
|
||||
fclose (patF);
|
||||
}
|
||||
}
|
||||
|
||||
exceptions = NULL; // Again, only instantiated when needed.
|
||||
|
||||
FILE *excF = fopen (excFile, "r");
|
||||
if (excF) {
|
||||
exceptions = new HashTable <ConstString, Vector <Integer> > (true, true);
|
||||
while (!feof (excF)) {
|
||||
char buf[LEN + 1];
|
||||
char *s = fgets (buf, LEN, excF);
|
||||
if (s && s[0] != '%') { // ignore lines starting with '%' as comment
|
||||
// TODO Better exit with an error, when the line is too long.
|
||||
int l = strlen (s);
|
||||
if (s[l - 1] == '\n')
|
||||
s[l - 1] = 0;
|
||||
insertException (s);
|
||||
}
|
||||
}
|
||||
fclose (excF);
|
||||
}
|
||||
}
|
||||
|
||||
Hyphenator::~Hyphenator ()
|
||||
{
|
||||
delete trie;
|
||||
delete exceptions;
|
||||
}
|
||||
|
||||
Hyphenator *Hyphenator::getHyphenator (const char *lang)
|
||||
{
|
||||
String *langString = new String (lang);
|
||||
|
||||
Hyphenator *hyphenator = hyphenators->get (langString);
|
||||
if (hyphenator)
|
||||
delete langString;
|
||||
else {
|
||||
int patFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
|
||||
char *patFile = new char[patFileLen];
|
||||
snprintf (patFile, patFileLen, "%s/hyphenation/%s.pat",
|
||||
DILLO_LIBDIR, lang);
|
||||
int excFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
|
||||
char *excFile = new char[excFileLen];
|
||||
snprintf (excFile, excFileLen, "%s/hyphenation/%s.exc",
|
||||
DILLO_LIBDIR, lang);
|
||||
|
||||
//printf ("Loading hyphenation patterns for language '%s' from '%s' and "
|
||||
// "exceptions from '%s' ...\n", lang, patFile, excFile);
|
||||
|
||||
hyphenator = new Hyphenator (patFile, excFile);
|
||||
hyphenators->put (langString, hyphenator);
|
||||
delete[] patFile;
|
||||
delete[] excFile;
|
||||
}
|
||||
|
||||
//lout::misc::StringBuffer sb;
|
||||
//hyphenators->intoStringBuffer (&sb);
|
||||
//printf ("%s\n", sb.getChars ());
|
||||
|
||||
return hyphenator;
|
||||
}
|
||||
|
||||
void Hyphenator::insertPattern (TrieBuilder *trieBuilder, char *s)
|
||||
{
|
||||
// Convert the a pattern like 'a1bc3d4' into a string of chars 'abcd'
|
||||
// and a list of points [ 0, 1, 0, 3, 4 ].
|
||||
int l = strlen (s);
|
||||
char *chars = new char[l + 1];
|
||||
SimpleVector<char> points (1);
|
||||
|
||||
// TODO numbers consisting of multiple digits?
|
||||
// TODO Encoding: This implementation works exactly like the Python
|
||||
// implementation, based on UTF-8. Does this always work?
|
||||
int numChars = 0;
|
||||
for (int i = 0; s[i]; i++) {
|
||||
if (s[i] >= '0' && s[i] <= '9') {
|
||||
points.setSize(numChars + 1, '0');
|
||||
points.set(numChars, s[i]);
|
||||
} else {
|
||||
chars[numChars++] = s[i];
|
||||
}
|
||||
}
|
||||
chars[numChars] = 0;
|
||||
|
||||
points.setSize(numChars + 2, '0');
|
||||
points.set(numChars + 1, '\0');
|
||||
|
||||
// Insert the pattern into the tree. Each character finds a dict
|
||||
// another level down in the tree, and leaf nodes have the list of
|
||||
// points.
|
||||
|
||||
//printf("insertPattern %s\n", chars);
|
||||
|
||||
trieBuilder->insert (chars, points.getArray ());
|
||||
delete[] chars;
|
||||
}
|
||||
|
||||
void Hyphenator::insertException (char *s)
|
||||
{
|
||||
Vector<Integer> *breaks = new Vector<Integer> (1, true);
|
||||
|
||||
int len = strlen (s);
|
||||
for (int i = 0; i < len - 1; i++)
|
||||
if((unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
|
||||
breaks->put (new Integer (i - 2 * breaks->size()));
|
||||
|
||||
char *noHyphens = new char[len - 2 * breaks->size() + 1];
|
||||
int j = 0;
|
||||
for (int i = 0; i < len; ) {
|
||||
if(i < len - 1 &&
|
||||
(unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
|
||||
i += 2;
|
||||
else
|
||||
noHyphens[j++] = s[i++];
|
||||
}
|
||||
noHyphens[j] = 0;
|
||||
|
||||
exceptions->put (new String (noHyphens), breaks);
|
||||
delete[] noHyphens;
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple test to avoid much costs. Passing it does not mean that the word
|
||||
* can be hyphenated.
|
||||
*/
|
||||
bool Hyphenator::isHyphenationCandidate (const char *word)
|
||||
{
|
||||
// Short words aren't hyphenated.
|
||||
return (strlen (word) > 4); // TODO UTF-8?
|
||||
}
|
||||
|
||||
/**
|
||||
* Test whether the character on which "s" points (UTF-8) is an actual
|
||||
* part of the word. Other characters at the beginning and end are
|
||||
* ignored.
|
||||
*
|
||||
* TODO Currently only suitable for English and German.
|
||||
* TODO Only lowercase. (Uppercase not needed.)
|
||||
*/
|
||||
bool Hyphenator::isCharPartOfActualWord (char *s)
|
||||
{
|
||||
return isAlpha (decodeUtf8 (s));
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a word, returns a list of the possible hyphenation points.
|
||||
*/
|
||||
int *Hyphenator::hyphenateWord(core::Platform *platform,
|
||||
const char *word, int *numBreaks)
|
||||
{
|
||||
if ((trie == NULL && exceptions == NULL) || !isHyphenationCandidate (word)) {
|
||||
*numBreaks = 0;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
char *wordLc = platform->textToLower (word, strlen (word));
|
||||
|
||||
int start = 0;
|
||||
SimpleVector <int> breakPos (1);
|
||||
|
||||
// Split the original word up, ignore anything but characters, and
|
||||
// collect all break points, so that they fit to the original
|
||||
// word. (The latter is what the offset in the call of
|
||||
// hyphenateSingleWord() is for.)
|
||||
while (true) {
|
||||
while (wordLc[start] && !isCharPartOfActualWord (wordLc + start))
|
||||
start = platform->nextGlyph (wordLc, start);
|
||||
|
||||
if (wordLc[start] == 0)
|
||||
break;
|
||||
|
||||
int end = start, i = end;
|
||||
while (wordLc[i]) {
|
||||
if (!isCharPartOfActualWord (wordLc + i))
|
||||
break;
|
||||
else
|
||||
end = i;
|
||||
i = platform->nextGlyph (wordLc, i);
|
||||
}
|
||||
end = platform->nextGlyph (wordLc, end);
|
||||
|
||||
int nextStart;
|
||||
if (wordLc[end]) {
|
||||
nextStart = platform->nextGlyph (wordLc, end);
|
||||
wordLc[end] = 0;
|
||||
} else
|
||||
nextStart = end;
|
||||
|
||||
hyphenateSingleWord (platform, wordLc + start, start, &breakPos);
|
||||
start = nextStart;
|
||||
}
|
||||
|
||||
free (wordLc);
|
||||
|
||||
*numBreaks = breakPos.size ();
|
||||
if (*numBreaks == 0)
|
||||
return NULL;
|
||||
else {
|
||||
return breakPos.detachArray ();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Hyphenate a single word, which only consists of lowercase
|
||||
* characters. Store break positions + "offset" in "breakPos".
|
||||
*/
|
||||
void Hyphenator::hyphenateSingleWord(core::Platform *platform,
|
||||
char *wordLc, int offset,
|
||||
SimpleVector <int> *breakPos)
|
||||
{
|
||||
// If the word is an exception, get the stored points.
|
||||
Vector <Integer> *exceptionalBreaks;
|
||||
ConstString key (wordLc);
|
||||
if (exceptions != NULL && (exceptionalBreaks = exceptions->get (&key))) {
|
||||
for (int i = 0; i < exceptionalBreaks->size(); i++) {
|
||||
breakPos->increase ();
|
||||
breakPos->set (breakPos->size() - 1,
|
||||
exceptionalBreaks->get(i)->getValue() + offset);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// trie == NULL means that there is no pattern file.
|
||||
if (trie == NULL)
|
||||
return;
|
||||
|
||||
char *work = new char[strlen (wordLc) + 3];
|
||||
strcpy (work, ".");
|
||||
strcat (work, wordLc);
|
||||
strcat (work, ".");
|
||||
|
||||
int l = strlen (work);
|
||||
SimpleVector <int> points (l + 1);
|
||||
points.setSize (l + 1, 0);
|
||||
|
||||
for (int i = 0; i < l; i++) {
|
||||
int state = trie->root;
|
||||
|
||||
for (int j = i; j < l && trie->validState (state); j++) {
|
||||
const char *p = trie->getData((unsigned char) work[j], &state);
|
||||
|
||||
if (p) {
|
||||
for (int k = 0; p[k]; k++)
|
||||
points.set(i + k,
|
||||
lout::misc::max (points.get (i + k), p[k] - '0'));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
delete[] work;
|
||||
|
||||
// No hyphens in the first two chars or the last two.
|
||||
// Characters are not bytes, so UTF-8 characters must be counted.
|
||||
const char *s = nextUtf8Char (wordLc);
|
||||
if (s != NULL && (s = nextUtf8Char (s)) != NULL) {
|
||||
// First two characters.
|
||||
int bytesStart = s - wordLc;
|
||||
for (int i = 0; i < bytesStart; i++)
|
||||
points.set (i + 1, 0);
|
||||
|
||||
// Last two characters: instead of iterating back from the end,
|
||||
// we simply iterate from the start to the end and count the
|
||||
// characters.
|
||||
|
||||
int lenBytes = strlen (wordLc);
|
||||
int lenUtf8 = numUtf8Chars (wordLc);
|
||||
int bytesEnd = 0;
|
||||
|
||||
s = wordLc;
|
||||
for (int i = 0; s; s = nextUtf8Char (s), i++) {
|
||||
if (i == lenUtf8 - 2)
|
||||
bytesEnd = lenBytes - (s - wordLc);
|
||||
}
|
||||
|
||||
for (int i = 0; i < bytesEnd; i++)
|
||||
points.set (points.size() - 2 - i, 0);
|
||||
}
|
||||
|
||||
// Examine the points to build the break point list.
|
||||
int n = lout::misc::min ((int)strlen (wordLc), points.size () - 2);
|
||||
for (int i = 0; i < n; i++) {
|
||||
if (points.get(i + 2) % 2) {
|
||||
breakPos->increase ();
|
||||
breakPos->set (breakPos->size() - 1, i + 1 + offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Trie::TrieNode TrieBuilder::trieNodeNull = {'\0', 0, NULL};
|
||||
|
||||
TrieBuilder::TrieBuilder (int pack)
|
||||
{
|
||||
this->pack = pack;
|
||||
dataList = new SimpleVector <DataEntry> (10000);
|
||||
stateStack = new SimpleVector <StackEntry> (10);
|
||||
tree = new SimpleVector <Trie::TrieNode> (20000);
|
||||
dataZone = new ZoneAllocator (1024);
|
||||
stateStackPush(0);
|
||||
}
|
||||
|
||||
TrieBuilder::~TrieBuilder ()
|
||||
{
|
||||
delete dataList;
|
||||
delete stateStack;
|
||||
delete tree;
|
||||
delete dataZone;
|
||||
}
|
||||
|
||||
void TrieBuilder::insert (const char *key, const char *value)
|
||||
{
|
||||
dataList->increase ();
|
||||
dataList->getLastRef ()->key = (unsigned char *) dStrdup(key);
|
||||
dataList->getLastRef ()->value = dataZone->strdup (value);
|
||||
}
|
||||
|
||||
int TrieBuilder::keyCompare (const void *p1, const void *p2)
|
||||
{
|
||||
DataEntry *pd1 = (DataEntry *) p1;
|
||||
DataEntry *pd2 = (DataEntry *) p2;
|
||||
|
||||
return strcmp ((char *) pd1->key, (char *) pd2->key);
|
||||
}
|
||||
|
||||
int TrieBuilder::insertState (StackEntry *state, bool root)
|
||||
{
|
||||
int i, j;
|
||||
|
||||
if (state->count == 0)
|
||||
return 0;
|
||||
|
||||
if (root) {
|
||||
i = 0; // we reserve slot 0 for the root state
|
||||
} else {
|
||||
/* The bigger pack is the more slots we check and the smaller
|
||||
* the trie will be, but CPU consumption also increases.
|
||||
* Reasonable values for pack seemt to be between 256 and 1024.
|
||||
*/
|
||||
i = tree->size () - pack + 2 * state->count;
|
||||
|
||||
if (i < 256) // reserve first 256 entries for the root state
|
||||
i = 256;
|
||||
}
|
||||
|
||||
for (;; i++) {
|
||||
if (i + 256 > tree->size ())
|
||||
tree->setSize (i + 256, trieNodeNull);
|
||||
|
||||
for (j = 1; j < 256; j++) {
|
||||
Trie::TrieNode *tn = tree->getRef(i + j);
|
||||
|
||||
if (tn->c == j || ((state->next[j] || state->data[j]) && tn->c != 0))
|
||||
break;
|
||||
}
|
||||
|
||||
if (j == 256) // found a suitable slot
|
||||
break;
|
||||
}
|
||||
|
||||
for (int j = 1; j < 256; j++) {
|
||||
Trie::TrieNode *tn = tree->getRef(i + j);
|
||||
|
||||
if (state->next[j] || state->data[j]) {
|
||||
tn->c = j;
|
||||
tn->next = state->next[j];
|
||||
tn->data = state->data[j];
|
||||
}
|
||||
}
|
||||
|
||||
assert (root || i >= 256);
|
||||
assert (!root || i == 0);
|
||||
return i;
|
||||
}
|
||||
|
||||
void TrieBuilder::stateStackPush (unsigned char c)
|
||||
{
|
||||
stateStack->increase ();
|
||||
StackEntry *e = stateStack->getLastRef ();
|
||||
memset (e, 0, sizeof (StackEntry));
|
||||
e->c = c;
|
||||
}
|
||||
|
||||
int TrieBuilder::stateStackPop ()
|
||||
{
|
||||
int next = insertState (stateStack->getLastRef (), stateStack->size () == 1);
|
||||
unsigned char c = stateStack->getLastRef ()->c;
|
||||
const char *data = stateStack->getLastRef ()->data1;
|
||||
|
||||
stateStack->setSize (stateStack->size () - 1);
|
||||
|
||||
if (stateStack->size () > 0) {
|
||||
assert (stateStack->getLastRef ()->next[c] == 0);
|
||||
assert (stateStack->getLastRef ()->data[c] == NULL);
|
||||
stateStack->getLastRef ()->next[c] = next;
|
||||
stateStack->getLastRef ()->data[c] = data;
|
||||
stateStack->getLastRef ()->count++;
|
||||
}
|
||||
|
||||
return next;
|
||||
}
|
||||
|
||||
Trie *TrieBuilder::createTrie ()
|
||||
{
|
||||
// we need to sort the patterns as byte strings not as unicode
|
||||
qsort (dataList->getArray (), dataList->size (),
|
||||
sizeof (DataEntry), keyCompare);
|
||||
|
||||
for (int i = 0; i < dataList->size (); i++) {
|
||||
insertSorted (dataList->getRef (i)->key, dataList->getRef (i)->value);
|
||||
free (dataList->getRef (i)->key);
|
||||
}
|
||||
|
||||
while (stateStack->size ())
|
||||
stateStackPop ();
|
||||
|
||||
int size = tree->size ();
|
||||
Trie *trie = new Trie(tree->detachArray(), size, true, dataZone);
|
||||
dataZone = NULL;
|
||||
return trie;
|
||||
}
|
||||
|
||||
void TrieBuilder::insertSorted (unsigned char *s, const char *data)
|
||||
{
|
||||
int len = strlen((char*)s);
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
if (stateStack->size () > i + 1 &&
|
||||
stateStack->getRef (i + 1)->c != s[i]) {
|
||||
for (int j = stateStack->size () - 1; j >= i + 1; j--)
|
||||
stateStackPop();
|
||||
}
|
||||
|
||||
if (i + 1 >= stateStack->size ())
|
||||
stateStackPush(s[i]);
|
||||
}
|
||||
|
||||
while (stateStack->size () > len + 1)
|
||||
stateStackPop();
|
||||
|
||||
assert (stateStack->size () == len + 1);
|
||||
stateStack->getLastRef ()->data1 = data;
|
||||
}
|
||||
|
||||
Trie::Trie (TrieNode *array, int size, bool freeArray, ZoneAllocator *dataZone)
|
||||
{
|
||||
this->array = array;
|
||||
this->size = size;
|
||||
this->freeArray = freeArray;
|
||||
this->dataZone = dataZone;
|
||||
}
|
||||
|
||||
Trie::~Trie ()
|
||||
{
|
||||
delete dataZone;
|
||||
if (freeArray)
|
||||
free(array);
|
||||
}
|
||||
|
||||
void Trie::save (FILE *file)
|
||||
{
|
||||
for (int i = 0; i < size; i++) {
|
||||
Trie::TrieNode *tn = &array[i];
|
||||
|
||||
if (tn->data)
|
||||
fprintf(file, "%u, %u, %s\n", tn->c, tn->next, tn->data);
|
||||
else
|
||||
fprintf(file, "%u, %u\n", tn->c, tn->next);
|
||||
}
|
||||
}
|
||||
|
||||
int Trie::load (FILE *file)
|
||||
{
|
||||
int next, c, maxNext = 0;
|
||||
SimpleVector <TrieNode> tree (100);
|
||||
dataZone = new ZoneAllocator (1024);
|
||||
|
||||
while (!feof (file)) {
|
||||
char buf[LEN + 1];
|
||||
char *s = fgets (buf, LEN, file);
|
||||
|
||||
if (!s)
|
||||
continue;
|
||||
|
||||
char data[LEN + 1];
|
||||
int n = sscanf (s, "%d, %d, %s", &c, &next, data);
|
||||
|
||||
if (n >= 2 && c >= 0 && c < 256 && next >= 0) {
|
||||
tree.increase ();
|
||||
tree.getLastRef ()->c = c;
|
||||
tree.getLastRef ()->next = next;
|
||||
if (n >= 3)
|
||||
tree.getLastRef ()->data = dataZone->strdup (data);
|
||||
else
|
||||
tree.getLastRef ()->data = NULL;
|
||||
|
||||
if (next > maxNext)
|
||||
maxNext = next;
|
||||
} else {
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
if (maxNext >= tree.size ())
|
||||
goto error;
|
||||
|
||||
size = tree.size ();
|
||||
array = tree.detachArray ();
|
||||
freeArray = true;
|
||||
return 0;
|
||||
|
||||
error:
|
||||
delete dataZone;
|
||||
dataZone = NULL;
|
||||
return 1;
|
||||
}
|
||||
|
||||
} // namespace dw
|
Reference in New Issue
Block a user