Files
flenser/dw/hyphenator.cc
ADAM David Alan Martin 1bf12c858f
Some checks failed
CI / ubuntu-latest-html-tests (push) Has been cancelled
CI / alpine-mbedtls-3_6_0 (push) Has been cancelled
CI / ubuntu-latest-no-tls (push) Has been cancelled
CI / ubuntu-latest-mbedtls2 (push) Has been cancelled
CI / ubuntu-latest-openssl-3 (push) Has been cancelled
CI / ubuntu-latest-with-old-std (push) Has been cancelled
CI / ubuntu-20-04-openssl-1-1 (push) Has been cancelled
CI / macOS-13-openssl-1-1 (push) Has been cancelled
CI / macOS-13-openssl-3 (push) Has been cancelled
CI / freebsd-14-openssl-3 (push) Has been cancelled
CI / windows-mbedtls (push) Has been cancelled
String alloc shouldn't convert to std::string.
As I work through making code use more C++ RAII and such, most
of the work is handling strings, especially temporaries.  As member
variables which manage string memory get turned into `std::string`,
some use cases might wind up leaking memory.  (One was found in
this change.)

By using a non-convertible-to-string result, such accidents should
be avoided.
2025-09-08 13:24:56 -04:00

580 lines
15 KiB
C++

/*
* Dillo Widget
*
* Copyright 2012-2013 Sebastian Geerken <sgeerken@dillo.org>,
* Johannes Hofmann <Johannes.Hofmann@gmx.de>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "hyphenator.hh"
#include "dlib/dlib.hh"
#include "../lout/misc.hh"
#include "../lout/unicode.hh"
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <string>
#define LEN 1000
/*
* This is (or at least began as) a direct translation of Ned Batchelder's
* public-domain Python implementation at
* http://nedbatchelder.com/code/modules/hyphenate.py
*/
using namespace lout::object;
using namespace lout::container::typed;
using namespace lout::misc;
using namespace lout::unicode;
namespace dw {
std::map< std::string, std::unique_ptr< Hyphenator > > Hyphenator::hyphenators;
Hyphenator::Hyphenator (const char *patFile, const char *excFile, int pack)
{
using namespace std::literals::string_literals;
FILE *trieF = fopen ( (patFile + ".trie"s).c_str(), "r");
if (trieF) {
trie = std::make_unique< Trie >();
if (trie->load (trieF) != 0) {
trie= nullptr;
}
fclose (trieF);
}
if (trie == nullptr) {
TrieBuilder trieBuilder(pack);
FILE *patF = fopen (patFile, "r");
if (patF) {
while (!feof (patF)) {
char buf[LEN + 1];
char *s = fgets (buf, LEN, patF);
if (s && s[0] != '%') { // ignore lines starting with '%' as comment
// TODO Better exit with an error, when the line is too long.
int l = strlen (s);
if (s[l - 1] == '\n')
s[l - 1] = 0;
insertPattern (&trieBuilder, s);
}
}
trie = trieBuilder.createTrie ();
fclose (patF);
}
}
FILE *excF = fopen (excFile, "r");
if (excF) {
exceptions.emplace();
while (!feof (excF)) {
char buf[LEN + 1];
char *s = fgets (buf, LEN, excF);
if (s && s[0] != '%') { // ignore lines starting with '%' as comment
// TODO Better exit with an error, when the line is too long.
int l = strlen (s);
if (s[l - 1] == '\n')
s[l - 1] = 0;
insertException (s);
}
}
fclose (excF);
}
}
Hyphenator *Hyphenator::getHyphenator (const char *lang)
{
auto &hyphenator= hyphenators[ lang ];
if( not hyphenator )
{
using namespace std::literals::string_literals;
std::string patFile= DILLO_LIBDIR + "/hyphenation/"s + lang + ".pat";
std::string excFile= DILLO_LIBDIR + "/hyphenation"s + lang + ".exc";
//printf ("Loading hyphenation patterns for language '%s' from '%s' and "
// "exceptions from '%s' ...\n", lang, patFile, excFile);
hyphenator= std::make_unique< Hyphenator >( patFile.c_str(), excFile.c_str() );
}
//lout::misc::StringBuffer sb;
//hyphenators->intoStringBuffer (&sb);
//printf ("%s\n", sb.getChars ());
return hyphenator.get();
}
void Hyphenator::insertPattern (TrieBuilder *trieBuilder, char *s)
{
// Convert the a pattern like 'a1bc3d4' into a string of chars 'abcd'
// and a list of points [ 0, 1, 0, 3, 4 ].
std::string chars;
std::vector< char > points;
// TODO numbers consisting of multiple digits?
// TODO Encoding: This implementation works exactly like the Python
// implementation, based on UTF-8. Does this always work?
int numChars = 0;
for (int i = 0; s[i]; i++) {
if (s[i] >= '0' && s[i] <= '9') {
points.push_back( s[ i ] );
} else {
numChars++;
chars+= s[i];
}
}
chars[numChars] = 0;
points.resize( numChars + 2, '0' );
points.at( numChars + 1 )= '\0';
// Insert the pattern into the tree. Each character finds a dict
// another level down in the tree, and leaf nodes have the list of
// points.
//printf("insertPattern %s\n", chars);
trieBuilder->insert (chars.c_str(), points.data());
}
void Hyphenator::insertException (char *s)
{
std::vector< int > breaks;
int len = strlen (s);
for (int i = 0; i < len - 1; i++)
if((unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
breaks.push_back( i - 2 * breaks.size() );
std::string noHyphens;
noHyphens.resize( len - 2 * breaks.size() + 1 );
int j = 0;
for (int i = 0; i < len; ) {
if(i < len - 1 &&
(unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
i += 2;
else
noHyphens[j++] = s[i++];
}
noHyphens[j] = 0;
exceptions->emplace( noHyphens, std::move( breaks ) );
}
/**
* Simple test to avoid much costs. Passing it does not mean that the word
* can be hyphenated.
*/
bool Hyphenator::isHyphenationCandidate (const char *word)
{
// Short words aren't hyphenated.
return (strlen (word) > 4); // TODO UTF-8?
}
/**
* Test whether the character on which "s" points (UTF-8) is an actual
* part of the word. Other characters at the beginning and end are
* ignored.
*
* TODO Currently only suitable for English and German.
* TODO Only lowercase. (Uppercase not needed.)
*/
bool Hyphenator::isCharPartOfActualWord (char *s)
{
return isAlpha (decodeUtf8 (s));
}
/**
* Given a word, returns a list of the possible hyphenation points.
*/
int *Hyphenator::hyphenateWord(core::Platform *platform,
const char *word, int *numBreaks)
{
if ((trie == NULL && not exceptions.has_value()) || !isHyphenationCandidate (word)) {
*numBreaks = 0;
return NULL;
}
char *wordLc = platform->textToLower (word, strlen (word));
int start = 0;
SimpleVector <int> breakPos (1);
// Split the original word up, ignore anything but characters, and
// collect all break points, so that they fit to the original
// word. (The latter is what the offset in the call of
// hyphenateSingleWord() is for.)
while (true) {
while (wordLc[start] && !isCharPartOfActualWord (wordLc + start))
start = platform->nextGlyph (wordLc, start);
if (wordLc[start] == 0)
break;
int end = start, i = end;
while (wordLc[i]) {
if (!isCharPartOfActualWord (wordLc + i))
break;
else
end = i;
i = platform->nextGlyph (wordLc, i);
}
end = platform->nextGlyph (wordLc, end);
int nextStart;
if (wordLc[end]) {
nextStart = platform->nextGlyph (wordLc, end);
wordLc[end] = 0;
} else
nextStart = end;
hyphenateSingleWord (platform, wordLc + start, start, &breakPos);
start = nextStart;
}
free (wordLc);
*numBreaks = breakPos.size ();
if (*numBreaks == 0)
return NULL;
else {
return breakPos.detachArray ();
}
}
/**
* Hyphenate a single word, which only consists of lowercase
* characters. Store break positions + "offset" in "breakPos".
*/
void Hyphenator::hyphenateSingleWord(core::Platform *platform,
char *wordLc, int offset,
SimpleVector <int> *breakPos)
{
// If the word is an exception, get the stored points.
std::string key= wordLc;
if (exceptions.has_value() && exceptions.value().contains( key ) )
{
auto exceptionalBreaks= exceptions.value().at( key );
for (int i = 0; i < exceptionalBreaks.size(); i++) {
breakPos->increase ();
breakPos->set (breakPos->size() - 1, exceptionalBreaks.at( i ) + offset);
}
return;
}
// trie == NULL means that there is no pattern file.
if (trie == NULL)
return;
char *work = new char[strlen (wordLc) + 3];
strcpy (work, ".");
strcat (work, wordLc);
strcat (work, ".");
int l = strlen (work);
SimpleVector <int> points (l + 1);
points.setSize (l + 1, 0);
for (int i = 0; i < l; i++) {
int state = trie->root;
for (int j = i; j < l && trie->validState (state); j++) {
const char *p = trie->getData((unsigned char) work[j], &state);
if (p) {
for (int k = 0; p[k]; k++)
points.set(i + k,
std::max (points.get (i + k), p[k] - '0'));
}
}
}
delete[] work;
// No hyphens in the first two chars or the last two.
// Characters are not bytes, so UTF-8 characters must be counted.
const char *s = nextUtf8Char (wordLc);
if (s != NULL && (s = nextUtf8Char (s)) != NULL) {
// First two characters.
int bytesStart = s - wordLc;
for (int i = 0; i < bytesStart; i++)
points.set (i + 1, 0);
// Last two characters: instead of iterating back from the end,
// we simply iterate from the start to the end and count the
// characters.
int lenBytes = strlen (wordLc);
int lenUtf8 = numUtf8Chars (wordLc);
int bytesEnd = 0;
s = wordLc;
for (int i = 0; s; s = nextUtf8Char (s), i++) {
if (i == lenUtf8 - 2)
bytesEnd = lenBytes - (s - wordLc);
}
for (int i = 0; i < bytesEnd; i++)
points.set (points.size() - 2 - i, 0);
}
// Examine the points to build the break point list.
int n = std::min ((int)strlen (wordLc), points.size () - 2);
for (int i = 0; i < n; i++) {
if (points.get(i + 2) % 2) {
breakPos->increase ();
breakPos->set (breakPos->size() - 1, i + 1 + offset);
}
}
}
Trie::TrieNode TrieBuilder::trieNodeNull = {'\0', 0, NULL};
TrieBuilder::TrieBuilder (int pack)
{
this->pack = pack;
dataList = new SimpleVector <DataEntry> (10000);
stateStack = new SimpleVector <StackEntry> (10);
tree = new SimpleVector <Trie::TrieNode> (20000);
dataZone = new ZoneAllocator (1024);
stateStackPush(0);
}
TrieBuilder::~TrieBuilder ()
{
delete dataList;
delete stateStack;
delete tree;
delete dataZone;
}
void TrieBuilder::insert (const char *key, const char *value)
{
dataList->increase ();
dataList->getLastRef ()->key = (unsigned char *) dStrdup(key).ptr;
dataList->getLastRef ()->value = dataZone->strdup (value);
}
int TrieBuilder::keyCompare (const void *p1, const void *p2)
{
DataEntry *pd1 = (DataEntry *) p1;
DataEntry *pd2 = (DataEntry *) p2;
return strcmp ((char *) pd1->key, (char *) pd2->key);
}
int TrieBuilder::insertState (StackEntry *state, bool root)
{
int i, j;
if (state->count == 0)
return 0;
if (root) {
i = 0; // we reserve slot 0 for the root state
} else {
/* The bigger pack is the more slots we check and the smaller
* the trie will be, but CPU consumption also increases.
* Reasonable values for pack seemt to be between 256 and 1024.
*/
i = tree->size () - pack + 2 * state->count;
if (i < 256) // reserve first 256 entries for the root state
i = 256;
}
for (;; i++) {
if (i + 256 > tree->size ())
tree->setSize (i + 256, trieNodeNull);
for (j = 1; j < 256; j++) {
Trie::TrieNode *tn = tree->getRef(i + j);
if (tn->c == j || ((state->next[j] || state->data[j]) && tn->c != 0))
break;
}
if (j == 256) // found a suitable slot
break;
}
for (int j = 1; j < 256; j++) {
Trie::TrieNode *tn = tree->getRef(i + j);
if (state->next[j] || state->data[j]) {
tn->c = j;
tn->next = state->next[j];
tn->data = state->data[j];
}
}
assert (root || i >= 256);
assert (!root || i == 0);
return i;
}
void TrieBuilder::stateStackPush (unsigned char c)
{
stateStack->increase ();
StackEntry *e = stateStack->getLastRef ();
memset (e, 0, sizeof (StackEntry));
e->c = c;
}
int TrieBuilder::stateStackPop ()
{
int next = insertState (stateStack->getLastRef (), stateStack->size () == 1);
unsigned char c = stateStack->getLastRef ()->c;
const char *data = stateStack->getLastRef ()->data1;
stateStack->setSize (stateStack->size () - 1);
if (stateStack->size () > 0) {
assert (stateStack->getLastRef ()->next[c] == 0);
assert (stateStack->getLastRef ()->data[c] == NULL);
stateStack->getLastRef ()->next[c] = next;
stateStack->getLastRef ()->data[c] = data;
stateStack->getLastRef ()->count++;
}
return next;
}
std::unique_ptr< Trie >
TrieBuilder::createTrie ()
{
// we need to sort the patterns as byte strings not as unicode
qsort (dataList->getArray (), dataList->size (),
sizeof (DataEntry), keyCompare);
for (int i = 0; i < dataList->size (); i++) {
insertSorted (dataList->getRef (i)->key, dataList->getRef (i)->value);
free (dataList->getRef (i)->key);
}
while (stateStack->size ())
stateStackPop ();
int size = tree->size ();
auto trie = std::make_unique< Trie >( tree->detachArray(), size, true, dataZone );
dataZone = NULL;
return trie;
}
void TrieBuilder::insertSorted (unsigned char *s, const char *data)
{
int len = strlen((char*)s);
for (int i = 0; i < len; i++) {
if (stateStack->size () > i + 1 &&
stateStack->getRef (i + 1)->c != s[i]) {
for (int j = stateStack->size () - 1; j >= i + 1; j--)
stateStackPop();
}
if (i + 1 >= stateStack->size ())
stateStackPush(s[i]);
}
while (stateStack->size () > len + 1)
stateStackPop();
assert (stateStack->size () == len + 1);
stateStack->getLastRef ()->data1 = data;
}
Trie::Trie (TrieNode *array, int size, bool freeArray, ZoneAllocator *dataZone)
{
this->array = array;
this->size = size;
this->freeArray = freeArray;
this->dataZone = dataZone;
}
Trie::~Trie ()
{
delete dataZone;
if (freeArray)
free(array);
}
void Trie::save (FILE *file)
{
for (int i = 0; i < size; i++) {
Trie::TrieNode *tn = &array[i];
if (tn->data)
fprintf(file, "%u, %u, %s\n", tn->c, tn->next, tn->data);
else
fprintf(file, "%u, %u\n", tn->c, tn->next);
}
}
int Trie::load (FILE *file)
{
int next, c, maxNext = 0;
SimpleVector <TrieNode> tree (100);
dataZone = new ZoneAllocator (1024);
while (!feof (file)) {
char buf[LEN + 1];
char *s = fgets (buf, LEN, file);
if (!s)
continue;
char data[LEN + 1];
int n = sscanf (s, "%d, %d, %s", &c, &next, data);
if (n >= 2 && c >= 0 && c < 256 && next >= 0) {
tree.increase ();
tree.getLastRef ()->c = c;
tree.getLastRef ()->next = next;
if (n >= 3)
tree.getLastRef ()->data = dataZone->strdup (data);
else
tree.getLastRef ()->data = NULL;
if (next > maxNext)
maxNext = next;
} else {
goto error;
}
}
if (maxNext >= tree.size ())
goto error;
size = tree.size ();
array = tree.detachArray ();
freeArray = true;
return 0;
error:
delete dataZone;
dataZone = NULL;
return 1;
}
} // namespace dw