flenser/dw/hyphenator.cc

/*
 * Dillo Widget
 *
 * Copyright 2012-2013 Sebastian Geerken <sgeerken@dillo.org>,
 *                     Johannes Hofmann <Johannes.Hofmann@gmx.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */


#include "hyphenator.hh"

#include "dlib/dlib.hh"
#include "../lout/misc.hh"
#include "../lout/unicode.hh"
#include <limits.h>
#include <stdio.h>
#include <string.h>

#include <string>

#define LEN 1000

/*
 * This is (or at least began as) a direct translation of Ned Batchelder's
 * public-domain Python implementation at
 * http://nedbatchelder.com/code/modules/hyphenate.py
 */

using namespace lout::object;
using namespace lout::container::typed;
using namespace lout::misc;
using namespace lout::unicode;

namespace dw {

std::map< std::string, std::unique_ptr< Hyphenator > > Hyphenator::hyphenators;

Hyphenator::Hyphenator (const char *patFile, const char *excFile, int pack)
{
   using namespace std::literals::string_literals;
   FILE *trieF = fopen ( (patFile + ".trie"s).c_str(), "r");

   if (trieF) {
      trie = std::make_unique< Trie >();
      if (trie->load (trieF) != 0) {
         trie= nullptr;
      }
      fclose (trieF);
   }

   if (trie == nullptr) {
      TrieBuilder trieBuilder(pack);
      FILE *patF = fopen (patFile, "r");
      if (patF) {

         while (!feof (patF)) {
            char buf[LEN + 1];
            char *s = fgets (buf, LEN, patF);
            if (s && s[0] != '%') { // ignore lines starting with '%' as comment
               // TODO Better exit with an error, when the line is too long.
               int l = strlen (s);
               if (s[l - 1] == '\n')
                  s[l - 1] = 0;
               insertPattern (&trieBuilder, s);
            }
         }

         trie = trieBuilder.createTrie ();
         fclose (patF);
      }
   }

   FILE *excF = fopen (excFile, "r");
   if (excF) {
      exceptions.emplace();
      while (!feof (excF)) {
         char buf[LEN + 1];
         char *s = fgets (buf, LEN, excF);
         if (s && s[0] != '%') { // ignore lines starting with '%' as comment
             // TODO Better exit with an error, when the line is too long.
            int l = strlen (s);
            if (s[l - 1] == '\n')
               s[l - 1] = 0;
            insertException (s);
         }
      }
      fclose (excF);
   }
}

Hyphenator *Hyphenator::getHyphenator (const char *lang)
{
   auto &hyphenator= hyphenators[ lang ];

   if( not hyphenator )
   {
      using namespace std::literals::string_literals;
      std::string patFile= DILLO_LIBDIR + "/hyphenation/"s + lang + ".pat";
      std::string excFile= DILLO_LIBDIR + "/hyphenation"s + lang + ".exc";

      //printf ("Loading hyphenation patterns for language '%s' from '%s' and "
      //        "exceptions from '%s' ...\n", lang, patFile, excFile);

      hyphenator= std::make_unique< Hyphenator >( patFile.c_str(), excFile.c_str() );
   }

   //lout::misc::StringBuffer sb;
   //hyphenators->intoStringBuffer (&sb);
   //printf ("%s\n", sb.getChars ());

   return hyphenator.get();
}

void Hyphenator::insertPattern (TrieBuilder *trieBuilder, char *s)
{
   // Convert the a pattern like 'a1bc3d4' into a string of chars 'abcd'
   // and a list of points [ 0, 1, 0, 3, 4 ].
   std::string chars;
   std::vector< char > points;

   // TODO numbers consisting of multiple digits?
   // TODO Encoding: This implementation works exactly like the Python
   // implementation, based on UTF-8. Does this always work?
   int numChars = 0;
   for (int i = 0; s[i]; i++) {
      if (s[i] >= '0' && s[i] <= '9') {
         points.push_back( s[ i ] );
      } else {
         numChars++;
         chars+= s[i];
      }
   }
   chars[numChars] = 0;

   points.resize( numChars + 2, '0' );
   points.at( numChars + 1 )= '\0';

   // Insert the pattern into the tree.  Each character finds a dict
   // another level down in the tree, and leaf nodes have the list of
   // points.

   //printf("insertPattern %s\n", chars);

   trieBuilder->insert (chars.c_str(), points.data());
}

void Hyphenator::insertException (char *s)
{
   std::vector< int > breaks;

   int len = strlen (s);
   for (int i = 0; i < len - 1; i++)
      if((unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
         breaks.push_back( i - 2 * breaks.size() );

   std::string noHyphens;
   noHyphens.resize( len - 2 * breaks.size() + 1 );
   int j = 0;
   for (int i = 0; i < len; ) {
      if(i < len - 1 &&
         (unsigned char)s[i] == 0xc2 && (unsigned char)s[i + 1] == 0xad)
         i += 2;
      else
         noHyphens[j++] = s[i++];
   }
   noHyphens[j] = 0;

   exceptions->emplace( noHyphens, std::move( breaks ) );
}

/**
 * Simple test to avoid much costs. Passing it does not mean that the word
 * can be hyphenated.
 */
bool Hyphenator::isHyphenationCandidate (const char *word)
{
   // Short words aren't hyphenated.
   return (strlen (word) > 4); // TODO UTF-8?
}

/**
 * Test whether the character on which "s" points (UTF-8) is an actual
 * part of the word. Other characters at the beginning and end are
 * ignored.
 *
 * TODO Currently only suitable for English and German.
 * TODO Only lowercase. (Uppercase not needed.)
 */
bool Hyphenator::isCharPartOfActualWord (char *s)
{
   return isAlpha (decodeUtf8 (s));
}

/**
 * Given a word, returns a list of the possible hyphenation points.
 */
int *Hyphenator::hyphenateWord(core::Platform *platform,
                               const char *word, int *numBreaks)
{
   if ((trie == NULL && not exceptions.has_value()) || !isHyphenationCandidate (word)) {
      *numBreaks = 0;
      return NULL;
   }

   char *wordLc = platform->textToLower (word, strlen (word));

   int start = 0;
   SimpleVector <int> breakPos (1);

   // Split the original word up, ignore anything but characters, and
   // collect all break points, so that they fit to the original
   // word. (The latter is what the offset in the call of
   // hyphenateSingleWord() is for.)
   while (true) {
      while (wordLc[start] && !isCharPartOfActualWord (wordLc + start))
         start = platform->nextGlyph (wordLc, start);

      if (wordLc[start] == 0)
         break;

      int end = start, i = end;
      while (wordLc[i]) {
         if (!isCharPartOfActualWord (wordLc + i))
            break;
         else
            end = i;
         i = platform->nextGlyph (wordLc, i);
      }
      end = platform->nextGlyph (wordLc, end);

      int nextStart;
      if (wordLc[end]) {
         nextStart = platform->nextGlyph (wordLc, end);
         wordLc[end] = 0;
      } else
         nextStart = end;

      hyphenateSingleWord (platform, wordLc + start, start, &breakPos);
      start = nextStart;
   }

   free (wordLc);

   *numBreaks = breakPos.size ();
   if (*numBreaks == 0)
      return NULL;
   else {
      return breakPos.detachArray ();
   }
}

/**
 * Hyphenate a single word, which only consists of lowercase
 * characters. Store break positions + "offset" in "breakPos".
 */
void Hyphenator::hyphenateSingleWord(core::Platform *platform,
                                     char *wordLc, int offset,
                                     SimpleVector <int> *breakPos)
{
   // If the word is an exception, get the stored points.
   std::string key= wordLc;
   if (exceptions.has_value() && exceptions.value().contains( key ) )
   {
      auto exceptionalBreaks= exceptions.value().at( key );
      for (int i = 0; i < exceptionalBreaks.size(); i++) {
         breakPos->increase ();
         breakPos->set (breakPos->size() - 1, exceptionalBreaks.at( i ) + offset);
      }
      return;
   }


   // trie == NULL means that there is no pattern file.
   if (trie == NULL)
      return;

   char *work = new char[strlen (wordLc) + 3];
   strcpy (work, ".");
   strcat (work, wordLc);
   strcat (work, ".");

   int l = strlen (work);
   SimpleVector <int> points (l + 1);
   points.setSize (l + 1, 0);

   for (int i = 0; i < l; i++) {
      int state = trie->root;

      for (int j = i; j < l && trie->validState (state); j++) {
         const char *p = trie->getData((unsigned char) work[j], &state);

         if (p) {
            for (int k = 0; p[k]; k++)
               points.set(i + k,
                          std::max (points.get (i + k), p[k] - '0'));
         }
      }
   }

   delete[] work;

   // No hyphens in the first two chars or the last two.
   // Characters are not bytes, so UTF-8 characters must be counted.
   const char *s = nextUtf8Char (wordLc);
   if (s != NULL && (s = nextUtf8Char (s)) != NULL) {
      // First two characters.
      int bytesStart = s - wordLc;
      for (int i = 0; i < bytesStart; i++)
         points.set (i + 1, 0);

      // Last two characters: instead of iterating back from the end,
      // we simply iterate from the start to the end and count the
      // characters.

      int lenBytes = strlen (wordLc);
      int lenUtf8 = numUtf8Chars (wordLc);
      int bytesEnd = 0;

      s = wordLc;
      for (int i = 0; s; s = nextUtf8Char (s), i++) {
         if (i == lenUtf8 - 2)
            bytesEnd = lenBytes - (s - wordLc);
      }

      for (int i = 0; i < bytesEnd; i++)
         points.set (points.size() - 2 - i, 0);
   }

   // Examine the points to build the break point list.
   int n = std::min ((int)strlen (wordLc), points.size () - 2);
   for (int i = 0; i < n; i++) {
      if (points.get(i + 2) % 2) {
         breakPos->increase ();
         breakPos->set (breakPos->size() - 1, i + 1 + offset);
      }
   }
}

Trie::TrieNode TrieBuilder::trieNodeNull = {'\0', 0, NULL};

TrieBuilder::TrieBuilder (int pack)
{
   this->pack = pack;
   dataList = new SimpleVector <DataEntry> (10000);
   stateStack = new SimpleVector <StackEntry> (10);
   tree = new SimpleVector <Trie::TrieNode> (20000);
   dataZone = new ZoneAllocator (1024);
   stateStackPush(0);
}

TrieBuilder::~TrieBuilder ()
{
   delete dataList;
   delete stateStack;
   delete tree;
   delete dataZone;
}

void TrieBuilder::insert (const char *key, const char *value)
{
   dataList->increase ();
   dataList->getLastRef ()->key = (unsigned char *) dStrdup(key).ptr;
   dataList->getLastRef ()->value = dataZone->strdup (value);
}

int TrieBuilder::keyCompare (const void *p1, const void *p2)
{
   DataEntry *pd1 = (DataEntry *) p1;
   DataEntry *pd2 = (DataEntry *) p2;

   return strcmp ((char *) pd1->key, (char *) pd2->key);
}

int TrieBuilder::insertState (StackEntry *state, bool root)
{
   int i, j;

   if (state->count == 0)
      return 0;

   if (root) {
      i = 0; // we reserve slot 0 for the root state
   } else {
      /* The bigger pack is the more slots we check and the smaller
       * the trie will be, but CPU consumption also increases.
       * Reasonable values for pack seemt to be between 256 and 1024.
       */
      i = tree->size () - pack + 2 * state->count;

      if (i < 256) // reserve first 256 entries for the root state
         i = 256;
   }

   for (;; i++) {
      if (i + 256 > tree->size ())
         tree->setSize (i + 256, trieNodeNull);

      for (j = 1; j < 256; j++) {
         Trie::TrieNode *tn = tree->getRef(i + j);

         if (tn->c == j || ((state->next[j] || state->data[j]) && tn->c != 0))
            break;
      }

      if (j == 256) // found a suitable slot
         break;
   }

   for (int j = 1; j < 256; j++) {
      Trie::TrieNode *tn = tree->getRef(i + j);

      if (state->next[j] || state->data[j]) {
         tn->c = j;
         tn->next = state->next[j];
         tn->data = state->data[j];
      }
   }

   assert (root || i >= 256);
   assert (!root || i == 0);
   return i;
}

void TrieBuilder::stateStackPush (unsigned char c)
{
   stateStack->increase ();
   StackEntry *e = stateStack->getLastRef ();
   memset (e, 0, sizeof (StackEntry));
   e->c = c;
}

int TrieBuilder::stateStackPop ()
{
   int next = insertState (stateStack->getLastRef (), stateStack->size () == 1);
   unsigned char c = stateStack->getLastRef ()->c;
   const char *data = stateStack->getLastRef ()->data1;

   stateStack->setSize (stateStack->size () - 1);

   if (stateStack->size () > 0) {
      assert (stateStack->getLastRef ()->next[c] == 0);
      assert (stateStack->getLastRef ()->data[c] == NULL);
      stateStack->getLastRef ()->next[c] = next;
      stateStack->getLastRef ()->data[c] = data;
      stateStack->getLastRef ()->count++;
   }

   return next;
}

std::unique_ptr< Trie >
TrieBuilder::createTrie ()
{
   // we need to sort the patterns as byte strings not as unicode
   qsort (dataList->getArray (), dataList->size (),
      sizeof (DataEntry), keyCompare);

   for (int i = 0; i < dataList->size (); i++) {
      insertSorted (dataList->getRef (i)->key, dataList->getRef (i)->value);
      free (dataList->getRef (i)->key);
   }

   while (stateStack->size ())
      stateStackPop ();

   int size = tree->size ();
   auto trie = std::make_unique< Trie >( tree->detachArray(), size, true, dataZone );
   dataZone = NULL;
   return trie;
}

void TrieBuilder::insertSorted (unsigned char *s, const char *data)
{
   int len = strlen((char*)s);

   for (int i = 0; i < len; i++) {
      if (stateStack->size () > i + 1 &&
          stateStack->getRef (i + 1)->c != s[i]) {
         for (int j = stateStack->size () - 1; j >= i + 1; j--)
            stateStackPop();
      }

      if (i + 1 >= stateStack->size ())
         stateStackPush(s[i]);
   }

   while (stateStack->size () > len + 1)
      stateStackPop();

   assert (stateStack->size () == len + 1);
   stateStack->getLastRef ()->data1 = data;
}

Trie::Trie (TrieNode *array, int size, bool freeArray, ZoneAllocator *dataZone)
{
   this->array = array;
   this->size = size;
   this->freeArray = freeArray;
   this->dataZone = dataZone;
}

Trie::~Trie ()
{
   delete dataZone;
   if (freeArray)
      free(array);
}

void Trie::save (FILE *file)
{
   for (int i = 0; i < size; i++) {
      Trie::TrieNode *tn = &array[i];

      if (tn->data)
         fprintf(file, "%u, %u, %s\n", tn->c, tn->next, tn->data);
      else
         fprintf(file, "%u, %u\n", tn->c, tn->next);
   }
}

int Trie::load (FILE *file)
{
   int next, c, maxNext = 0;
   SimpleVector <TrieNode> tree (100);
   dataZone = new ZoneAllocator (1024);

   while (!feof (file)) {
      char buf[LEN + 1];
      char *s = fgets (buf, LEN, file);

      if (!s)
         continue;

      char data[LEN + 1];
      int n = sscanf (s, "%d, %d, %s", &c, &next, data);

      if (n >= 2 && c >= 0 && c < 256 && next >= 0) {
         tree.increase ();
         tree.getLastRef ()->c = c;
         tree.getLastRef ()->next = next;
         if (n >= 3)
            tree.getLastRef ()->data = dataZone->strdup (data);
         else
            tree.getLastRef ()->data = NULL;

         if (next > maxNext)
            maxNext = next;
      } else {
         goto error;
      }
   }

   if (maxNext >= tree.size ())
      goto error;

   size = tree.size ();
   array = tree.detachArray ();
   freeArray = true;
   return 0;

error:
   delete dataZone;
   dataZone = NULL;
   return 1;
}

} // namespace dw