Initial import of Dillo

2025-02-28 13:34:30 -05:00
parent bd4e3eebd8
commit 20fea64cb5
496 changed files with 156174 additions and 0 deletions
--- a/dw/hyphenator.cc
+++ b/dw/hyphenator.cc
@ -0,0 +1,603 @@
+/*
+ * Dillo Widget
+ *
+ * Copyright 2012-2013 Sebastian Geerken <sgeerken@dillo.org>,
+ *                     Johannes Hofmann <Johannes.Hofmann@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "hyphenator.hh"
+
+#include "dlib/dlib.h"
+#include "../lout/misc.hh"
+#include "../lout/unicode.hh"
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#define LEN 1000
+
+/*
+ * This is (or at least began as) a direct translation of Ned Batchelder's
+ * public-domain Python implementation at
+ * http://nedbatchelder.com/code/modules/hyphenate.py
+ */
+
+using namespace lout::object;
+using namespace lout::container::typed;
+using namespace lout::misc;
+using namespace lout::unicode;
+
+namespace dw {
+
+HashTable <String, Hyphenator> *Hyphenator::hyphenators =
+   new HashTable <String, Hyphenator> (true, true);
+
+Hyphenator::Hyphenator (const char *patFile, const char *excFile, int pack)
+{
+   trie = NULL; // As long we are not sure whether a pattern file can be read.
+
+   int bufLen = strlen (patFile) + 5 + 1;
+   char *buf = new char[bufLen];
+   snprintf(buf, bufLen, "%s.trie", patFile);
+   FILE *trieF = fopen (buf, "r");
+   delete[] buf;
+
+   if (trieF) {
+      trie = new Trie ();
+      if (trie->load (trieF) != 0) {
+         delete trie;
+         trie = NULL;
+      }
+      fclose (trieF);
+   }
+
+   if (trie == NULL) {
+      TrieBuilder trieBuilder(pack);
+      FILE *patF = fopen (patFile, "r");
+      if (patF) {
+
+         while (!feof (patF)) {
+            char buf[LEN + 1];
+            char *s = fgets (buf, LEN, patF);
+            if (s && s[0] != '%') { // ignore lines starting with '%' as comment
+               // TODO Better exit with an error, when the line is too long.
+               int l = strlen (s);
+               if (s[l - 1] == '\n')
+                  s[l - 1] = 0;
+               insertPattern (&trieBuilder, s);
+            }
+         }
+
+         trie = trieBuilder.createTrie ();
+         fclose (patF);
+      }
+   }
+
+   exceptions = NULL; // Again, only instantiated when needed.
+
+   FILE *excF = fopen (excFile, "r");
+   if (excF) {
+      exceptions = new HashTable <ConstString, Vector <Integer> > (true, true);
+      while (!feof (excF)) {
+         char buf[LEN + 1];
+         char *s = fgets (buf, LEN, excF);
+         if (s && s[0] != '%') { // ignore lines starting with '%' as comment
+             // TODO Better exit with an error, when the line is too long.
+            int l = strlen (s);
+            if (s[l - 1] == '\n')
+               s[l - 1] = 0;
+            insertException (s);
+         }
+      }
+      fclose (excF);
+   }
+}
+
+Hyphenator::~Hyphenator ()
+{
+   delete trie;
+   delete exceptions;
+}
+
+Hyphenator *Hyphenator::getHyphenator (const char *lang)
+{
+   String *langString = new String (lang);
+
+   Hyphenator *hyphenator = hyphenators->get (langString);
+   if (hyphenator)
+      delete langString;
+   else {
+      int patFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
+      char *patFile = new char[patFileLen];
+      snprintf (patFile, patFileLen, "%s/hyphenation/%s.pat",
+                DILLO_LIBDIR, lang);
+      int excFileLen = strlen (DILLO_LIBDIR) + 13 + strlen (lang) + 4 + 1;
+      char *excFile = new char[excFileLen];
+      snprintf (excFile, excFileLen, "%s/hyphenation/%s.exc",
+                DILLO_LIBDIR, lang);
+
+      //printf ("Loading hyphenation patterns for language '%s' from '%s' and "
+      //        "exceptions from '%s' ...\n", lang, patFile, excFile);
+
+      hyphenator = new Hyphenator (patFile, excFile);
+      hyphenators->put (langString, hyphenator);
+      delete[] patFile;
+      delete[] excFile;
+   }
+
+   //lout::misc::StringBuffer sb;
+   //hyphenators->intoStringBuffer (&sb);
+   //printf ("%s\n", sb.getChars ());
+
+   return hyphenator;
+}
+
+void Hyphenator::insertPattern (TrieBuilder *trieBuilder, char *s)
+{
+   // Convert the a pattern like 'a1bc3d4' into a string of chars 'abcd'
+   // and a list of points [ 0, 1, 0, 3, 4 ].
+   int l = strlen (s);
+   char *chars = new char[l + 1];
+   SimpleVector<char> points (1);
+
+   // TODO numbers consisting of multiple digits?
+   // TODO Encoding: This implementation works exactly like the Python
+   // implementation, based on UTF-8. Does this always work?
+   int numChars = 0;
+   for (int i = 0; s[i]; i++) {
+      if (s[i] >= '0' && s[i] <= '9') {
+         points.setSize(numChars + 1, '0');
+         points.set(numChars, s[i]);
+      } else {
+         chars[numChars++] = s[i];
+      }
+   }
+   chars[numChars] = 0;
+
+   points.setSize(numChars + 2, '0');
+   points.set(numChars + 1, '\0');
+
+   // Insert the pattern into the tree.  Each character finds a dict
+   // another level down in the tree, and leaf nodes have the list of
+   // points.
+
+   //printf("insertPattern %s\n", chars);
+
+   trieBuilder->insert (chars, points.getArray ());
+   delete[] chars;
+}
+
+void Hyphenator::insertException (char *s)
+{
+   Vector<Integer> *breaks = new Vector<Integer> (1, true);
+
+   int len = strlen (s);
+   for (int i = 0; i < len - 1; i++)
+      if((unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
+         breaks->put (new Integer (i - 2 * breaks->size()));
+
+   char *noHyphens = new char[len - 2 * breaks->size() + 1];
+   int j = 0;
+   for (int i = 0; i < len; ) {
+      if(i < len - 1 &&
+         (unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
+         i += 2;
+      else
+         noHyphens[j++] = s[i++];
+   }
+   noHyphens[j] = 0;
+
+   exceptions->put (new String (noHyphens), breaks);
+   delete[] noHyphens;
+}
+
+/**
+ * Simple test to avoid much costs. Passing it does not mean that the word
+ * can be hyphenated.
+ */
+bool Hyphenator::isHyphenationCandidate (const char *word)
+{
+   // Short words aren't hyphenated.
+   return (strlen (word) > 4); // TODO UTF-8?
+}
+
+/**
+ * Test whether the character on which "s" points (UTF-8) is an actual
+ * part of the word. Other characters at the beginning and end are
+ * ignored.
+ *
+ * TODO Currently only suitable for English and German.
+ * TODO Only lowercase. (Uppercase not needed.)
+ */
+bool Hyphenator::isCharPartOfActualWord (char *s)
+{
+   return isAlpha (decodeUtf8 (s));
+}
+
+/**
+ * Given a word, returns a list of the possible hyphenation points.
+ */
+int *Hyphenator::hyphenateWord(core::Platform *platform,
+                               const char *word, int *numBreaks)
+{
+   if ((trie == NULL && exceptions == NULL) || !isHyphenationCandidate (word)) {
+      *numBreaks = 0;
+      return NULL;
+   }
+
+   char *wordLc = platform->textToLower (word, strlen (word));
+
+   int start = 0;
+   SimpleVector <int> breakPos (1);
+
+   // Split the original word up, ignore anything but characters, and
+   // collect all break points, so that they fit to the original
+   // word. (The latter is what the offset in the call of
+   // hyphenateSingleWord() is for.)
+   while (true) {
+      while (wordLc[start] && !isCharPartOfActualWord (wordLc + start))
+         start = platform->nextGlyph (wordLc, start);
+
+      if (wordLc[start] == 0)
+         break;
+
+      int end = start, i = end;
+      while (wordLc[i]) {
+         if (!isCharPartOfActualWord (wordLc + i))
+            break;
+         else
+            end = i;
+         i = platform->nextGlyph (wordLc, i);
+      }
+      end = platform->nextGlyph (wordLc, end);
+
+      int nextStart;
+      if (wordLc[end]) {
+         nextStart = platform->nextGlyph (wordLc, end);
+         wordLc[end] = 0;
+      } else
+         nextStart = end;
+
+      hyphenateSingleWord (platform, wordLc + start, start, &breakPos);
+      start = nextStart;
+   }
+
+   free (wordLc);
+
+   *numBreaks = breakPos.size ();
+   if (*numBreaks == 0)
+      return NULL;
+   else {
+      return breakPos.detachArray ();
+   }
+}
+
+/**
+ * Hyphenate a single word, which only consists of lowercase
+ * characters. Store break positions + "offset" in "breakPos".
+ */
+void Hyphenator::hyphenateSingleWord(core::Platform *platform,
+                                     char *wordLc, int offset,
+                                     SimpleVector <int> *breakPos)
+{
+   // If the word is an exception, get the stored points.
+   Vector <Integer> *exceptionalBreaks;
+   ConstString key (wordLc);
+   if (exceptions != NULL && (exceptionalBreaks = exceptions->get (&key))) {
+      for (int i = 0; i < exceptionalBreaks->size(); i++) {
+         breakPos->increase ();
+         breakPos->set (breakPos->size() - 1,
+                        exceptionalBreaks->get(i)->getValue() + offset);
+      }
+      return;
+   }
+
+
+   // trie == NULL means that there is no pattern file.
+   if (trie == NULL)
+      return;
+
+   char *work = new char[strlen (wordLc) + 3];
+   strcpy (work, ".");
+   strcat (work, wordLc);
+   strcat (work, ".");
+
+   int l = strlen (work);
+   SimpleVector <int> points (l + 1);
+   points.setSize (l + 1, 0);
+
+   for (int i = 0; i < l; i++) {
+      int state = trie->root;
+
+      for (int j = i; j < l && trie->validState (state); j++) {
+         const char *p = trie->getData((unsigned char) work[j], &state);
+
+         if (p) {
+            for (int k = 0; p[k]; k++)
+               points.set(i + k,
+                          lout::misc::max (points.get (i + k), p[k] - '0'));
+         }
+      }
+   }
+
+   delete[] work;
+
+   // No hyphens in the first two chars or the last two.
+   // Characters are not bytes, so UTF-8 characters must be counted.
+   const char *s = nextUtf8Char (wordLc);
+   if (s != NULL && (s = nextUtf8Char (s)) != NULL) {
+      // First two characters.
+      int bytesStart = s - wordLc;
+      for (int i = 0; i < bytesStart; i++)
+         points.set (i + 1, 0);
+
+      // Last two characters: instead of iterating back from the end,
+      // we simply iterate from the start to the end and count the
+      // characters.
+
+      int lenBytes = strlen (wordLc);
+      int lenUtf8 = numUtf8Chars (wordLc);
+      int bytesEnd = 0;
+
+      s = wordLc;
+      for (int i = 0; s; s = nextUtf8Char (s), i++) {
+         if (i == lenUtf8 - 2)
+            bytesEnd = lenBytes - (s - wordLc);
+      }
+
+      for (int i = 0; i < bytesEnd; i++)
+         points.set (points.size() - 2 - i, 0);
+   }
+
+   // Examine the points to build the break point list.
+   int n = lout::misc::min ((int)strlen (wordLc), points.size () - 2);
+   for (int i = 0; i < n; i++) {
+      if (points.get(i + 2) % 2) {
+         breakPos->increase ();
+         breakPos->set (breakPos->size() - 1, i + 1 + offset);
+      }
+   }
+}
+
+Trie::TrieNode TrieBuilder::trieNodeNull = {'\0', 0, NULL};
+
+TrieBuilder::TrieBuilder (int pack)
+{
+   this->pack = pack;
+   dataList = new SimpleVector <DataEntry> (10000);
+   stateStack = new SimpleVector <StackEntry> (10);
+   tree = new SimpleVector <Trie::TrieNode> (20000);
+   dataZone = new ZoneAllocator (1024);
+   stateStackPush(0);
+}
+
+TrieBuilder::~TrieBuilder ()
+{
+   delete dataList;
+   delete stateStack;
+   delete tree;
+   delete dataZone;
+}
+
+void TrieBuilder::insert (const char *key, const char *value)
+{
+   dataList->increase ();
+   dataList->getLastRef ()->key = (unsigned char *) dStrdup(key);
+   dataList->getLastRef ()->value = dataZone->strdup (value);
+}
+
+int TrieBuilder::keyCompare (const void *p1, const void *p2)
+{
+   DataEntry *pd1 = (DataEntry *) p1;
+   DataEntry *pd2 = (DataEntry *) p2;
+
+   return strcmp ((char *) pd1->key, (char *) pd2->key);
+}
+
+int TrieBuilder::insertState (StackEntry *state, bool root)
+{
+   int i, j;
+
+   if (state->count == 0)
+      return 0;
+
+   if (root) {
+      i = 0; // we reserve slot 0 for the root state
+   } else {
+      /* The bigger pack is the more slots we check and the smaller
+       * the trie will be, but CPU consumption also increases.
+       * Reasonable values for pack seemt to be between 256 and 1024.
+       */
+      i = tree->size () - pack + 2 * state->count;
+
+      if (i < 256) // reserve first 256 entries for the root state
+         i = 256;
+   }
+
+   for (;; i++) {
+      if (i + 256 > tree->size ())
+         tree->setSize (i + 256, trieNodeNull);
+
+      for (j = 1; j < 256; j++) {
+         Trie::TrieNode *tn = tree->getRef(i + j);
+
+         if (tn->c == j || ((state->next[j] || state->data[j]) && tn->c != 0))
+            break;
+      }
+
+      if (j == 256) // found a suitable slot
+         break;
+   }
+
+   for (int j = 1; j < 256; j++) {
+      Trie::TrieNode *tn = tree->getRef(i + j);
+
+      if (state->next[j] || state->data[j]) {
+         tn->c = j;
+         tn->next = state->next[j];
+         tn->data = state->data[j];
+      }
+   }
+
+   assert (root || i >= 256);
+   assert (!root || i == 0);
+   return i;
+}
+
+void TrieBuilder::stateStackPush (unsigned char c)
+{
+   stateStack->increase ();
+   StackEntry *e = stateStack->getLastRef ();
+   memset (e, 0, sizeof (StackEntry));
+   e->c = c;
+}
+
+int TrieBuilder::stateStackPop ()
+{
+   int next = insertState (stateStack->getLastRef (), stateStack->size () == 1);
+   unsigned char c = stateStack->getLastRef ()->c;
+   const char *data = stateStack->getLastRef ()->data1;
+
+   stateStack->setSize (stateStack->size () - 1);
+
+   if (stateStack->size () > 0) {
+      assert (stateStack->getLastRef ()->next[c] == 0);
+      assert (stateStack->getLastRef ()->data[c] == NULL);
+      stateStack->getLastRef ()->next[c] = next;
+      stateStack->getLastRef ()->data[c] = data;
+      stateStack->getLastRef ()->count++;
+   }
+
+   return next;
+}
+
+Trie *TrieBuilder::createTrie ()
+{
+   // we need to sort the patterns as byte strings not as unicode
+   qsort (dataList->getArray (), dataList->size (),
+      sizeof (DataEntry), keyCompare);
+
+   for (int i = 0; i < dataList->size (); i++) {
+      insertSorted (dataList->getRef (i)->key, dataList->getRef (i)->value);
+      free (dataList->getRef (i)->key);
+   }
+
+   while (stateStack->size ())
+      stateStackPop ();
+
+   int size = tree->size ();
+   Trie *trie = new Trie(tree->detachArray(), size, true, dataZone);
+   dataZone = NULL;
+   return trie;
+}
+
+void TrieBuilder::insertSorted (unsigned char *s, const char *data)
+{
+   int len = strlen((char*)s);
+
+   for (int i = 0; i < len; i++) {
+      if (stateStack->size () > i + 1 &&
+          stateStack->getRef (i + 1)->c != s[i]) {
+         for (int j = stateStack->size () - 1; j >= i + 1; j--)
+            stateStackPop();
+      }
+
+      if (i + 1 >= stateStack->size ())
+         stateStackPush(s[i]);
+   }
+
+   while (stateStack->size () > len + 1)
+      stateStackPop();
+
+   assert (stateStack->size () == len + 1);
+   stateStack->getLastRef ()->data1 = data;
+}
+
+Trie::Trie (TrieNode *array, int size, bool freeArray, ZoneAllocator *dataZone)
+{
+   this->array = array;
+   this->size = size;
+   this->freeArray = freeArray;
+   this->dataZone = dataZone;
+}
+
+Trie::~Trie ()
+{
+   delete dataZone;
+   if (freeArray)
+      free(array);
+}
+
+void Trie::save (FILE *file)
+{
+   for (int i = 0; i < size; i++) {
+      Trie::TrieNode *tn = &array[i];
+
+      if (tn->data)
+         fprintf(file, "%u, %u, %s\n", tn->c, tn->next, tn->data);
+      else
+         fprintf(file, "%u, %u\n", tn->c, tn->next);
+   }
+}
+
+int Trie::load (FILE *file)
+{
+   int next, c, maxNext = 0;
+   SimpleVector <TrieNode> tree (100);
+   dataZone = new ZoneAllocator (1024);
+
+   while (!feof (file)) {
+      char buf[LEN + 1];
+      char *s = fgets (buf, LEN, file);
+
+      if (!s)
+         continue;
+
+      char data[LEN + 1];
+      int n = sscanf (s, "%d, %d, %s", &c, &next, data);
+
+      if (n >= 2 && c >= 0 && c < 256 && next >= 0) {
+         tree.increase ();
+         tree.getLastRef ()->c = c;
+         tree.getLastRef ()->next = next;
+         if (n >= 3)
+            tree.getLastRef ()->data = dataZone->strdup (data);
+         else
+            tree.getLastRef ()->data = NULL;
+
+         if (next > maxNext)
+            maxNext = next;
+      } else {
+         goto error;
+      }
+   }
+
+   if (maxNext >= tree.size ())
+      goto error;
+
+   size = tree.size ();
+   array = tree.detachArray ();
+   freeArray = true;
+   return 0;
+
+error:
+   delete dataZone;
+   dataZone = NULL;
+   return 1;
+}
+
+} // namespace dw