604 lines
16 KiB
C++
604 lines
16 KiB
C++
/*
|
|
* Dillo Widget
|
|
*
|
|
* Copyright 2012-2013 Sebastian Geerken <sgeerken@dillo.org>,
|
|
* Johannes Hofmann <Johannes.Hofmann@gmx.de>
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 3 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
|
|
#include "hyphenator.hh"
|
|
|
|
#include "dlib/dlib.h"
|
|
#include "../lout/misc.hh"
|
|
#include "../lout/unicode.hh"
|
|
#include <limits.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
|
|
#define LEN 1000
|
|
|
|
/*
|
|
* This is (or at least began as) a direct translation of Ned Batchelder's
|
|
* public-domain Python implementation at
|
|
* http://nedbatchelder.com/code/modules/hyphenate.py
|
|
*/
|
|
|
|
using namespace lout::object;
|
|
using namespace lout::container::typed;
|
|
using namespace lout::misc;
|
|
using namespace lout::unicode;
|
|
|
|
namespace dw {
|
|
|
|
HashTable <String, Hyphenator> *Hyphenator::hyphenators =
|
|
new HashTable <String, Hyphenator> (true, true);
|
|
|
|
Hyphenator::Hyphenator (const char *patFile, const char *excFile, int pack)
|
|
{
|
|
trie = NULL; // As long we are not sure whether a pattern file can be read.
|
|
|
|
int bufLen = strlen (patFile) + 5 + 1;
|
|
char *buf = new char[bufLen];
|
|
snprintf(buf, bufLen, "%s.trie", patFile);
|
|
FILE *trieF = fopen (buf, "r");
|
|
delete[] buf;
|
|
|
|
if (trieF) {
|
|
trie = new Trie ();
|
|
if (trie->load (trieF) != 0) {
|
|
delete trie;
|
|
trie = NULL;
|
|
}
|
|
fclose (trieF);
|
|
}
|
|
|
|
if (trie == NULL) {
|
|
TrieBuilder trieBuilder(pack);
|
|
FILE *patF = fopen (patFile, "r");
|
|
if (patF) {
|
|
|
|
while (!feof (patF)) {
|
|
char buf[LEN + 1];
|
|
char *s = fgets (buf, LEN, patF);
|
|
if (s && s[0] != '%') { // ignore lines starting with '%' as comment
|
|
// TODO Better exit with an error, when the line is too long.
|
|
int l = strlen (s);
|
|
if (s[l - 1] == '\n')
|
|
s[l - 1] = 0;
|
|
insertPattern (&trieBuilder, s);
|
|
}
|
|
}
|
|
|
|
trie = trieBuilder.createTrie ();
|
|
fclose (patF);
|
|
}
|
|
}
|
|
|
|
exceptions = NULL; // Again, only instantiated when needed.
|
|
|
|
FILE *excF = fopen (excFile, "r");
|
|
if (excF) {
|
|
exceptions = new HashTable <ConstString, Vector <Integer> > (true, true);
|
|
while (!feof (excF)) {
|
|
char buf[LEN + 1];
|
|
char *s = fgets (buf, LEN, excF);
|
|
if (s && s[0] != '%') { // ignore lines starting with '%' as comment
|
|
// TODO Better exit with an error, when the line is too long.
|
|
int l = strlen (s);
|
|
if (s[l - 1] == '\n')
|
|
s[l - 1] = 0;
|
|
insertException (s);
|
|
}
|
|
}
|
|
fclose (excF);
|
|
}
|
|
}
|
|
|
|
Hyphenator::~Hyphenator ()
|
|
{
|
|
delete trie;
|
|
delete exceptions;
|
|
}
|
|
|
|
Hyphenator *Hyphenator::getHyphenator (const char *lang)
|
|
{
|
|
String *langString = new String (lang);
|
|
|
|
Hyphenator *hyphenator = hyphenators->get (langString);
|
|
if (hyphenator)
|
|
delete langString;
|
|
else {
|
|
int patFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
|
|
char *patFile = new char[patFileLen];
|
|
snprintf (patFile, patFileLen, "%s/hyphenation/%s.pat",
|
|
DILLO_LIBDIR, lang);
|
|
int excFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
|
|
char *excFile = new char[excFileLen];
|
|
snprintf (excFile, excFileLen, "%s/hyphenation/%s.exc",
|
|
DILLO_LIBDIR, lang);
|
|
|
|
//printf ("Loading hyphenation patterns for language '%s' from '%s' and "
|
|
// "exceptions from '%s' ...\n", lang, patFile, excFile);
|
|
|
|
hyphenator = new Hyphenator (patFile, excFile);
|
|
hyphenators->put (langString, hyphenator);
|
|
delete[] patFile;
|
|
delete[] excFile;
|
|
}
|
|
|
|
//lout::misc::StringBuffer sb;
|
|
//hyphenators->intoStringBuffer (&sb);
|
|
//printf ("%s\n", sb.getChars ());
|
|
|
|
return hyphenator;
|
|
}
|
|
|
|
void Hyphenator::insertPattern (TrieBuilder *trieBuilder, char *s)
|
|
{
|
|
// Convert the a pattern like 'a1bc3d4' into a string of chars 'abcd'
|
|
// and a list of points [ 0, 1, 0, 3, 4 ].
|
|
int l = strlen (s);
|
|
char *chars = new char[l + 1];
|
|
SimpleVector<char> points (1);
|
|
|
|
// TODO numbers consisting of multiple digits?
|
|
// TODO Encoding: This implementation works exactly like the Python
|
|
// implementation, based on UTF-8. Does this always work?
|
|
int numChars = 0;
|
|
for (int i = 0; s[i]; i++) {
|
|
if (s[i] >= '0' && s[i] <= '9') {
|
|
points.setSize(numChars + 1, '0');
|
|
points.set(numChars, s[i]);
|
|
} else {
|
|
chars[numChars++] = s[i];
|
|
}
|
|
}
|
|
chars[numChars] = 0;
|
|
|
|
points.setSize(numChars + 2, '0');
|
|
points.set(numChars + 1, '\0');
|
|
|
|
// Insert the pattern into the tree. Each character finds a dict
|
|
// another level down in the tree, and leaf nodes have the list of
|
|
// points.
|
|
|
|
//printf("insertPattern %s\n", chars);
|
|
|
|
trieBuilder->insert (chars, points.getArray ());
|
|
delete[] chars;
|
|
}
|
|
|
|
void Hyphenator::insertException (char *s)
|
|
{
|
|
Vector<Integer> *breaks = new Vector<Integer> (1, true);
|
|
|
|
int len = strlen (s);
|
|
for (int i = 0; i < len - 1; i++)
|
|
if((unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
|
|
breaks->put (new Integer (i - 2 * breaks->size()));
|
|
|
|
char *noHyphens = new char[len - 2 * breaks->size() + 1];
|
|
int j = 0;
|
|
for (int i = 0; i < len; ) {
|
|
if(i < len - 1 &&
|
|
(unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
|
|
i += 2;
|
|
else
|
|
noHyphens[j++] = s[i++];
|
|
}
|
|
noHyphens[j] = 0;
|
|
|
|
exceptions->put (new String (noHyphens), breaks);
|
|
delete[] noHyphens;
|
|
}
|
|
|
|
/**
|
|
* Simple test to avoid much costs. Passing it does not mean that the word
|
|
* can be hyphenated.
|
|
*/
|
|
bool Hyphenator::isHyphenationCandidate (const char *word)
|
|
{
|
|
// Short words aren't hyphenated.
|
|
return (strlen (word) > 4); // TODO UTF-8?
|
|
}
|
|
|
|
/**
|
|
* Test whether the character on which "s" points (UTF-8) is an actual
|
|
* part of the word. Other characters at the beginning and end are
|
|
* ignored.
|
|
*
|
|
* TODO Currently only suitable for English and German.
|
|
* TODO Only lowercase. (Uppercase not needed.)
|
|
*/
|
|
bool Hyphenator::isCharPartOfActualWord (char *s)
|
|
{
|
|
return isAlpha (decodeUtf8 (s));
|
|
}
|
|
|
|
/**
|
|
* Given a word, returns a list of the possible hyphenation points.
|
|
*/
|
|
int *Hyphenator::hyphenateWord(core::Platform *platform,
|
|
const char *word, int *numBreaks)
|
|
{
|
|
if ((trie == NULL && exceptions == NULL) || !isHyphenationCandidate (word)) {
|
|
*numBreaks = 0;
|
|
return NULL;
|
|
}
|
|
|
|
char *wordLc = platform->textToLower (word, strlen (word));
|
|
|
|
int start = 0;
|
|
SimpleVector <int> breakPos (1);
|
|
|
|
// Split the original word up, ignore anything but characters, and
|
|
// collect all break points, so that they fit to the original
|
|
// word. (The latter is what the offset in the call of
|
|
// hyphenateSingleWord() is for.)
|
|
while (true) {
|
|
while (wordLc[start] && !isCharPartOfActualWord (wordLc + start))
|
|
start = platform->nextGlyph (wordLc, start);
|
|
|
|
if (wordLc[start] == 0)
|
|
break;
|
|
|
|
int end = start, i = end;
|
|
while (wordLc[i]) {
|
|
if (!isCharPartOfActualWord (wordLc + i))
|
|
break;
|
|
else
|
|
end = i;
|
|
i = platform->nextGlyph (wordLc, i);
|
|
}
|
|
end = platform->nextGlyph (wordLc, end);
|
|
|
|
int nextStart;
|
|
if (wordLc[end]) {
|
|
nextStart = platform->nextGlyph (wordLc, end);
|
|
wordLc[end] = 0;
|
|
} else
|
|
nextStart = end;
|
|
|
|
hyphenateSingleWord (platform, wordLc + start, start, &breakPos);
|
|
start = nextStart;
|
|
}
|
|
|
|
free (wordLc);
|
|
|
|
*numBreaks = breakPos.size ();
|
|
if (*numBreaks == 0)
|
|
return NULL;
|
|
else {
|
|
return breakPos.detachArray ();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Hyphenate a single word, which only consists of lowercase
|
|
* characters. Store break positions + "offset" in "breakPos".
|
|
*/
|
|
void Hyphenator::hyphenateSingleWord(core::Platform *platform,
|
|
char *wordLc, int offset,
|
|
SimpleVector <int> *breakPos)
|
|
{
|
|
// If the word is an exception, get the stored points.
|
|
Vector <Integer> *exceptionalBreaks;
|
|
ConstString key (wordLc);
|
|
if (exceptions != NULL && (exceptionalBreaks = exceptions->get (&key))) {
|
|
for (int i = 0; i < exceptionalBreaks->size(); i++) {
|
|
breakPos->increase ();
|
|
breakPos->set (breakPos->size() - 1,
|
|
exceptionalBreaks->get(i)->getValue() + offset);
|
|
}
|
|
return;
|
|
}
|
|
|
|
|
|
// trie == NULL means that there is no pattern file.
|
|
if (trie == NULL)
|
|
return;
|
|
|
|
char *work = new char[strlen (wordLc) + 3];
|
|
strcpy (work, ".");
|
|
strcat (work, wordLc);
|
|
strcat (work, ".");
|
|
|
|
int l = strlen (work);
|
|
SimpleVector <int> points (l + 1);
|
|
points.setSize (l + 1, 0);
|
|
|
|
for (int i = 0; i < l; i++) {
|
|
int state = trie->root;
|
|
|
|
for (int j = i; j < l && trie->validState (state); j++) {
|
|
const char *p = trie->getData((unsigned char) work[j], &state);
|
|
|
|
if (p) {
|
|
for (int k = 0; p[k]; k++)
|
|
points.set(i + k,
|
|
lout::misc::max (points.get (i + k), p[k] - '0'));
|
|
}
|
|
}
|
|
}
|
|
|
|
delete[] work;
|
|
|
|
// No hyphens in the first two chars or the last two.
|
|
// Characters are not bytes, so UTF-8 characters must be counted.
|
|
const char *s = nextUtf8Char (wordLc);
|
|
if (s != NULL && (s = nextUtf8Char (s)) != NULL) {
|
|
// First two characters.
|
|
int bytesStart = s - wordLc;
|
|
for (int i = 0; i < bytesStart; i++)
|
|
points.set (i + 1, 0);
|
|
|
|
// Last two characters: instead of iterating back from the end,
|
|
// we simply iterate from the start to the end and count the
|
|
// characters.
|
|
|
|
int lenBytes = strlen (wordLc);
|
|
int lenUtf8 = numUtf8Chars (wordLc);
|
|
int bytesEnd = 0;
|
|
|
|
s = wordLc;
|
|
for (int i = 0; s; s = nextUtf8Char (s), i++) {
|
|
if (i == lenUtf8 - 2)
|
|
bytesEnd = lenBytes - (s - wordLc);
|
|
}
|
|
|
|
for (int i = 0; i < bytesEnd; i++)
|
|
points.set (points.size() - 2 - i, 0);
|
|
}
|
|
|
|
// Examine the points to build the break point list.
|
|
int n = lout::misc::min ((int)strlen (wordLc), points.size () - 2);
|
|
for (int i = 0; i < n; i++) {
|
|
if (points.get(i + 2) % 2) {
|
|
breakPos->increase ();
|
|
breakPos->set (breakPos->size() - 1, i + 1 + offset);
|
|
}
|
|
}
|
|
}
|
|
|
|
Trie::TrieNode TrieBuilder::trieNodeNull = {'\0', 0, NULL};
|
|
|
|
TrieBuilder::TrieBuilder (int pack)
|
|
{
|
|
this->pack = pack;
|
|
dataList = new SimpleVector <DataEntry> (10000);
|
|
stateStack = new SimpleVector <StackEntry> (10);
|
|
tree = new SimpleVector <Trie::TrieNode> (20000);
|
|
dataZone = new ZoneAllocator (1024);
|
|
stateStackPush(0);
|
|
}
|
|
|
|
TrieBuilder::~TrieBuilder ()
|
|
{
|
|
delete dataList;
|
|
delete stateStack;
|
|
delete tree;
|
|
delete dataZone;
|
|
}
|
|
|
|
void TrieBuilder::insert (const char *key, const char *value)
|
|
{
|
|
dataList->increase ();
|
|
dataList->getLastRef ()->key = (unsigned char *) dStrdup(key);
|
|
dataList->getLastRef ()->value = dataZone->strdup (value);
|
|
}
|
|
|
|
int TrieBuilder::keyCompare (const void *p1, const void *p2)
|
|
{
|
|
DataEntry *pd1 = (DataEntry *) p1;
|
|
DataEntry *pd2 = (DataEntry *) p2;
|
|
|
|
return strcmp ((char *) pd1->key, (char *) pd2->key);
|
|
}
|
|
|
|
int TrieBuilder::insertState (StackEntry *state, bool root)
|
|
{
|
|
int i, j;
|
|
|
|
if (state->count == 0)
|
|
return 0;
|
|
|
|
if (root) {
|
|
i = 0; // we reserve slot 0 for the root state
|
|
} else {
|
|
/* The bigger pack is the more slots we check and the smaller
|
|
* the trie will be, but CPU consumption also increases.
|
|
* Reasonable values for pack seemt to be between 256 and 1024.
|
|
*/
|
|
i = tree->size () - pack + 2 * state->count;
|
|
|
|
if (i < 256) // reserve first 256 entries for the root state
|
|
i = 256;
|
|
}
|
|
|
|
for (;; i++) {
|
|
if (i + 256 > tree->size ())
|
|
tree->setSize (i + 256, trieNodeNull);
|
|
|
|
for (j = 1; j < 256; j++) {
|
|
Trie::TrieNode *tn = tree->getRef(i + j);
|
|
|
|
if (tn->c == j || ((state->next[j] || state->data[j]) && tn->c != 0))
|
|
break;
|
|
}
|
|
|
|
if (j == 256) // found a suitable slot
|
|
break;
|
|
}
|
|
|
|
for (int j = 1; j < 256; j++) {
|
|
Trie::TrieNode *tn = tree->getRef(i + j);
|
|
|
|
if (state->next[j] || state->data[j]) {
|
|
tn->c = j;
|
|
tn->next = state->next[j];
|
|
tn->data = state->data[j];
|
|
}
|
|
}
|
|
|
|
assert (root || i >= 256);
|
|
assert (!root || i == 0);
|
|
return i;
|
|
}
|
|
|
|
void TrieBuilder::stateStackPush (unsigned char c)
|
|
{
|
|
stateStack->increase ();
|
|
StackEntry *e = stateStack->getLastRef ();
|
|
memset (e, 0, sizeof (StackEntry));
|
|
e->c = c;
|
|
}
|
|
|
|
int TrieBuilder::stateStackPop ()
|
|
{
|
|
int next = insertState (stateStack->getLastRef (), stateStack->size () == 1);
|
|
unsigned char c = stateStack->getLastRef ()->c;
|
|
const char *data = stateStack->getLastRef ()->data1;
|
|
|
|
stateStack->setSize (stateStack->size () - 1);
|
|
|
|
if (stateStack->size () > 0) {
|
|
assert (stateStack->getLastRef ()->next[c] == 0);
|
|
assert (stateStack->getLastRef ()->data[c] == NULL);
|
|
stateStack->getLastRef ()->next[c] = next;
|
|
stateStack->getLastRef ()->data[c] = data;
|
|
stateStack->getLastRef ()->count++;
|
|
}
|
|
|
|
return next;
|
|
}
|
|
|
|
Trie *TrieBuilder::createTrie ()
|
|
{
|
|
// we need to sort the patterns as byte strings not as unicode
|
|
qsort (dataList->getArray (), dataList->size (),
|
|
sizeof (DataEntry), keyCompare);
|
|
|
|
for (int i = 0; i < dataList->size (); i++) {
|
|
insertSorted (dataList->getRef (i)->key, dataList->getRef (i)->value);
|
|
free (dataList->getRef (i)->key);
|
|
}
|
|
|
|
while (stateStack->size ())
|
|
stateStackPop ();
|
|
|
|
int size = tree->size ();
|
|
Trie *trie = new Trie(tree->detachArray(), size, true, dataZone);
|
|
dataZone = NULL;
|
|
return trie;
|
|
}
|
|
|
|
void TrieBuilder::insertSorted (unsigned char *s, const char *data)
|
|
{
|
|
int len = strlen((char*)s);
|
|
|
|
for (int i = 0; i < len; i++) {
|
|
if (stateStack->size () > i + 1 &&
|
|
stateStack->getRef (i + 1)->c != s[i]) {
|
|
for (int j = stateStack->size () - 1; j >= i + 1; j--)
|
|
stateStackPop();
|
|
}
|
|
|
|
if (i + 1 >= stateStack->size ())
|
|
stateStackPush(s[i]);
|
|
}
|
|
|
|
while (stateStack->size () > len + 1)
|
|
stateStackPop();
|
|
|
|
assert (stateStack->size () == len + 1);
|
|
stateStack->getLastRef ()->data1 = data;
|
|
}
|
|
|
|
Trie::Trie (TrieNode *array, int size, bool freeArray, ZoneAllocator *dataZone)
|
|
{
|
|
this->array = array;
|
|
this->size = size;
|
|
this->freeArray = freeArray;
|
|
this->dataZone = dataZone;
|
|
}
|
|
|
|
Trie::~Trie ()
|
|
{
|
|
delete dataZone;
|
|
if (freeArray)
|
|
free(array);
|
|
}
|
|
|
|
void Trie::save (FILE *file)
|
|
{
|
|
for (int i = 0; i < size; i++) {
|
|
Trie::TrieNode *tn = &array[i];
|
|
|
|
if (tn->data)
|
|
fprintf(file, "%u, %u, %s\n", tn->c, tn->next, tn->data);
|
|
else
|
|
fprintf(file, "%u, %u\n", tn->c, tn->next);
|
|
}
|
|
}
|
|
|
|
int Trie::load (FILE *file)
|
|
{
|
|
int next, c, maxNext = 0;
|
|
SimpleVector <TrieNode> tree (100);
|
|
dataZone = new ZoneAllocator (1024);
|
|
|
|
while (!feof (file)) {
|
|
char buf[LEN + 1];
|
|
char *s = fgets (buf, LEN, file);
|
|
|
|
if (!s)
|
|
continue;
|
|
|
|
char data[LEN + 1];
|
|
int n = sscanf (s, "%d, %d, %s", &c, &next, data);
|
|
|
|
if (n >= 2 && c >= 0 && c < 256 && next >= 0) {
|
|
tree.increase ();
|
|
tree.getLastRef ()->c = c;
|
|
tree.getLastRef ()->next = next;
|
|
if (n >= 3)
|
|
tree.getLastRef ()->data = dataZone->strdup (data);
|
|
else
|
|
tree.getLastRef ()->data = NULL;
|
|
|
|
if (next > maxNext)
|
|
maxNext = next;
|
|
} else {
|
|
goto error;
|
|
}
|
|
}
|
|
|
|
if (maxNext >= tree.size ())
|
|
goto error;
|
|
|
|
size = tree.size ();
|
|
array = tree.detachArray ();
|
|
freeArray = true;
|
|
return 0;
|
|
|
|
error:
|
|
delete dataZone;
|
|
dataZone = NULL;
|
|
return 1;
|
|
}
|
|
|
|
} // namespace dw
|