3rdparty/hunspell/1.6.2/src/hunspell/hashmgr.cxx

   1 /* ***** BEGIN LICENSE BLOCK *****
   2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   3  *
   4  * Copyright (C) 2002-2017 Németh László
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version
   7  * 1.1 (the "License"); you may not use this file except in compliance with
   8  * the License. You may obtain a copy of the License at
   9  * http://www.mozilla.org/MPL/
  10  *
  11  * Software distributed under the License is distributed on an "AS IS" basis,
  12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  13  * for the specific language governing rights and limitations under the
  14  * License.
  15  *
  16  * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
  17  *
  18  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
  19  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
  20  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
  21  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
  22  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
  23  *
  24  * Alternatively, the contents of this file may be used under the terms of
  25  * either the GNU General Public License Version 2 or later (the "GPL"), or
  26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  27  * in which case the provisions of the GPL or the LGPL are applicable instead
  28  * of those above. If you wish to allow use of your version of this file only
  29  * under the terms of either the GPL or the LGPL, and not to allow others to
  30  * use your version of this file under the terms of the MPL, indicate your
  31  * decision by deleting the provisions above and replace them with the notice
  32  * and other provisions required by the GPL or the LGPL. If you do not delete
  33  * the provisions above, a recipient may use your version of this file under
  34  * the terms of any one of the MPL, the GPL or the LGPL.
  35  *
  36  * ***** END LICENSE BLOCK ***** */
  37 /*
  38  * Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
  39  * And Contributors.  All rights reserved.
  40  *
  41  * Redistribution and use in source and binary forms, with or without
  42  * modification, are permitted provided that the following conditions
  43  * are met:
  44  *
  45  * 1. Redistributions of source code must retain the above copyright
  46  *    notice, this list of conditions and the following disclaimer.
  47  *
  48  * 2. Redistributions in binary form must reproduce the above copyright
  49  *    notice, this list of conditions and the following disclaimer in the
  50  *    documentation and/or other materials provided with the distribution.
  51  *
  52  * 3. All modifications to the source code must be clearly marked as
  53  *    such.  Binary redistributions based on modified source code
  54  *    must be clearly marked as modified versions in the documentation
  55  *    and/or other materials provided with the distribution.
  56  *
  57  * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
  58  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  59  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  60  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
  61  * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  62  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  63  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  64  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  65  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  66  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  67  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  68  * SUCH DAMAGE.
  69  */
  70
  71 #include <stdlib.h>
  72 #include <string.h>
  73 #include <stdio.h>
  74 #include <ctype.h>
  75 #include <limits>
  76 #include <sstream>
  77
  78 #include "hashmgr.hxx"
  79 #include "csutil.hxx"
  80 #include "atypes.hxx"
  81
  82 // build a hash table from a munched word list
  83
  84 HashMgr::HashMgr(const char* tpath, const char* apath, const char* key)
  85     : tablesize(0),
  86       tableptr(NULL),
  87       flag_mode(FLAG_CHAR),
  88       complexprefixes(0),
  89       utf8(0),
  90       forbiddenword(FORBIDDENWORD)  // forbidden word signing flag
  91       ,
  92       numaliasf(0),
  93       aliasf(NULL),
  94       aliasflen(0),
  95       numaliasm(0),
  96       aliasm(NULL) {
  97   langnum = 0;
  98   csconv = 0;
  99   load_config(apath, key);
 100   int ec = load_tables(tpath, key);
 101   if (ec) {
 102     /* error condition - what should we do here */
 103     HUNSPELL_WARNING(stderr, "Hash Manager Error : %d\n", ec);
 104     free(tableptr);
 105     //keep tablesize to 1 to fix possible division with zero
 106     tablesize = 1;
 107     tableptr = (struct hentry**)calloc(tablesize, sizeof(struct hentry*));
 108     if (!tableptr) {
 109       tablesize = 0;
 110     }
 111   }
 112 }
 113
 114 HashMgr::~HashMgr() {
 115   if (tableptr) {
 116     // now pass through hash table freeing up everything
 117     // go through column by column of the table
 118     for (int i = 0; i < tablesize; i++) {
 119       struct hentry* pt = tableptr[i];
 120       struct hentry* nt = NULL;
 121       while (pt) {
 122         nt = pt->next;
 123         if (pt->astr &&
 124             (!aliasf || TESTAFF(pt->astr, ONLYUPCASEFLAG, pt->alen)))
 125           free(pt->astr);
 126         free(pt);
 127         pt = nt;
 128       }
 129     }
 130     free(tableptr);
 131   }
 132   tablesize = 0;
 133
 134   if (aliasf) {
 135     for (int j = 0; j < (numaliasf); j++)
 136       free(aliasf[j]);
 137     free(aliasf);
 138     aliasf = NULL;
 139     if (aliasflen) {
 140       free(aliasflen);
 141       aliasflen = NULL;
 142     }
 143   }
 144   if (aliasm) {
 145     for (int j = 0; j < (numaliasm); j++)
 146       free(aliasm[j]);
 147     free(aliasm);
 148     aliasm = NULL;
 149   }
 150
 151 #ifndef OPENOFFICEORG
 152 #ifndef MOZILLA_CLIENT
 153   if (utf8)
 154     free_utf_tbl();
 155 #endif
 156 #endif
 157
 158 #ifdef MOZILLA_CLIENT
 159   delete[] csconv;
 160 #endif
 161 }
 162
 163 // lookup a root word in the hashtable
 164
 165 struct hentry* HashMgr::lookup(const char* word) const {
 166   struct hentry* dp;
 167   if (tableptr) {
 168     dp = tableptr[hash(word)];
 169     if (!dp)
 170       return NULL;
 171     for (; dp != NULL; dp = dp->next) {
 172       if (strcmp(word, dp->word) == 0)
 173         return dp;
 174     }
 175   }
 176   return NULL;
 177 }
 178
 179 // add a word to the hash table (private)
 180 int HashMgr::add_word(const std::string& in_word,
 181                       int wcl,
 182                       unsigned short* aff,
 183                       int al,
 184                       const std::string* in_desc,
 185                       bool onlyupcase) {
 186   const std::string* word = &in_word;
 187   const std::string* desc = in_desc;
 188
 189   std::string *word_copy = NULL;
 190   std::string *desc_copy = NULL;
 191   if (!ignorechars.empty() || complexprefixes) {
 192     word_copy = new std::string(in_word);
 193
 194     if (!ignorechars.empty()) {
 195       if (utf8) {
 196         wcl = remove_ignored_chars_utf(*word_copy, ignorechars_utf16);
 197       } else {
 198         remove_ignored_chars(*word_copy, ignorechars);
 199       }
 200     }
 201
 202     if (complexprefixes) {
 203       if (utf8)
 204         wcl = reverseword_utf(*word_copy);
 205       else
 206         reverseword(*word_copy);
 207
 208       if (in_desc && !aliasm) {
 209         desc_copy = new std::string(*in_desc);
 210
 211         if (complexprefixes) {
 212           if (utf8)
 213             reverseword_utf(*desc_copy);
 214           else
 215             reverseword(*desc_copy);
 216         }
 217         desc = desc_copy;
 218       }
 219     }
 220
 221     word = word_copy;
 222   }
 223
 224   bool upcasehomonym = false;
 225   int descl = desc ? (aliasm ? sizeof(char*) : desc->size() + 1) : 0;
 226   // variable-length hash record with word and optional fields
 227   struct hentry* hp =
 228       (struct hentry*)malloc(sizeof(struct hentry) + word->size() + descl);
 229   if (!hp) {
 230     delete desc_copy;
 231     delete word_copy;
 232     return 1;
 233   }
 234
 235   char* hpw = hp->word;
 236   strcpy(hpw, word->c_str());
 237
 238   int i = hash(hpw);
 239
 240   hp->blen = (unsigned char)word->size();
 241   hp->clen = (unsigned char)wcl;
 242   hp->alen = (short)al;
 243   hp->astr = aff;
 244   hp->next = NULL;
 245   hp->next_homonym = NULL;
 246
 247   // store the description string or its pointer
 248   if (desc) {
 249     hp->var = H_OPT;
 250     if (aliasm) {
 251       hp->var += H_OPT_ALIASM;
 252       store_pointer(hpw + word->size() + 1, get_aliasm(atoi(desc->c_str())));
 253     } else {
 254       strcpy(hpw + word->size() + 1, desc->c_str());
 255     }
 256     if (strstr(HENTRY_DATA(hp), MORPH_PHON))
 257       hp->var += H_OPT_PHON;
 258   } else
 259     hp->var = 0;
 260
 261   struct hentry* dp = tableptr[i];
 262   if (!dp) {
 263     tableptr[i] = hp;
 264     delete desc_copy;
 265     delete word_copy;
 266     return 0;
 267   }
 268   while (dp->next != NULL) {
 269     if ((!dp->next_homonym) && (strcmp(hp->word, dp->word) == 0)) {
 270       // remove hidden onlyupcase homonym
 271       if (!onlyupcase) {
 272         if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
 273           free(dp->astr);
 274           dp->astr = hp->astr;
 275           dp->alen = hp->alen;
 276           free(hp);
 277           delete desc_copy;
 278           delete word_copy;
 279           return 0;
 280         } else {
 281           dp->next_homonym = hp;
 282         }
 283       } else {
 284         upcasehomonym = true;
 285       }
 286     }
 287     dp = dp->next;
 288   }
 289   if (strcmp(hp->word, dp->word) == 0) {
 290     // remove hidden onlyupcase homonym
 291     if (!onlyupcase) {
 292       if ((dp->astr) && TESTAFF(dp->astr, ONLYUPCASEFLAG, dp->alen)) {
 293         free(dp->astr);
 294         dp->astr = hp->astr;
 295         dp->alen = hp->alen;
 296         free(hp);
 297         delete desc_copy;
 298         delete word_copy;
 299         return 0;
 300       } else {
 301         dp->next_homonym = hp;
 302       }
 303     } else {
 304       upcasehomonym = true;
 305     }
 306   }
 307   if (!upcasehomonym) {
 308     dp->next = hp;
 309   } else {
 310     // remove hidden onlyupcase homonym
 311     if (hp->astr)
 312       free(hp->astr);
 313     free(hp);
 314   }
 315
 316   delete desc_copy;
 317   delete word_copy;
 318   return 0;
 319 }
 320
 321 int HashMgr::add_hidden_capitalized_word(const std::string& word,
 322                                          int wcl,
 323                                          unsigned short* flags,
 324                                          int flagslen,
 325                                          const std::string* dp,
 326                                          int captype) {
 327   if (flags == NULL)
 328     flagslen = 0;
 329
 330   // add inner capitalized forms to handle the following allcap forms:
 331   // Mixed caps: OpenOffice.org -> OPENOFFICE.ORG
 332   // Allcaps with suffixes: CIA's -> CIA'S
 333   if (((captype == HUHCAP) || (captype == HUHINITCAP) ||
 334        ((captype == ALLCAP) && (flagslen != 0))) &&
 335       !((flagslen != 0) && TESTAFF(flags, forbiddenword, flagslen))) {
 336     unsigned short* flags2 =
 337         (unsigned short*)malloc(sizeof(unsigned short) * (flagslen + 1));
 338     if (!flags2)
 339       return 1;
 340     if (flagslen)
 341       memcpy(flags2, flags, flagslen * sizeof(unsigned short));
 342     flags2[flagslen] = ONLYUPCASEFLAG;
 343     if (utf8) {
 344       std::string st;
 345       std::vector<w_char> w;
 346       u8_u16(w, word);
 347       mkallsmall_utf(w, langnum);
 348       mkinitcap_utf(w, langnum);
 349       u16_u8(st, w);
 350       return add_word(st, wcl, flags2, flagslen + 1, dp, true);
 351     } else {
 352       std::string new_word(word);
 353       mkallsmall(new_word, csconv);
 354       mkinitcap(new_word, csconv);
 355       int ret = add_word(new_word, wcl, flags2, flagslen + 1, dp, true);
 356       return ret;
 357     }
 358   }
 359   return 0;
 360 }
 361
 362 // detect captype and modify word length for UTF-8 encoding
 363 int HashMgr::get_clen_and_captype(const std::string& word, int* captype, std::vector<w_char> &workbuf) {
 364   int len;
 365   if (utf8) {
 366     len = u8_u16(workbuf, word);
 367     *captype = get_captype_utf8(workbuf, langnum);
 368   } else {
 369     len = word.size();
 370     *captype = get_captype(word, csconv);
 371   }
 372   return len;
 373 }
 374
 375 int HashMgr::get_clen_and_captype(const std::string& word, int* captype) {
 376   std::vector<w_char> workbuf;
 377   return get_clen_and_captype(word, captype, workbuf);
 378 }
 379
 380 // remove word (personal dictionary function for standalone applications)
 381 int HashMgr::remove(const std::string& word) {
 382   struct hentry* dp = lookup(word.c_str());
 383   while (dp) {
 384     if (dp->alen == 0 || !TESTAFF(dp->astr, forbiddenword, dp->alen)) {
 385       unsigned short* flags =
 386           (unsigned short*)malloc(sizeof(unsigned short) * (dp->alen + 1));
 387       if (!flags)
 388         return 1;
 389       for (int i = 0; i < dp->alen; i++)
 390         flags[i] = dp->astr[i];
 391       flags[dp->alen] = forbiddenword;
 392       free(dp->astr);
 393       dp->astr = flags;
 394       dp->alen++;
 395       std::sort(flags, flags + dp->alen);
 396     }
 397     dp = dp->next_homonym;
 398   }
 399   return 0;
 400 }
 401
 402 /* remove forbidden flag to add a personal word to the hash */
 403 int HashMgr::remove_forbidden_flag(const std::string& word) {
 404   struct hentry* dp = lookup(word.c_str());
 405   if (!dp)
 406     return 1;
 407   while (dp) {
 408     if (dp->astr && TESTAFF(dp->astr, forbiddenword, dp->alen)) {
 409       if (dp->alen == 1)
 410         dp->alen = 0;  // XXX forbidden words of personal dic.
 411       else {
 412         unsigned short* flags2 =
 413             (unsigned short*)malloc(sizeof(unsigned short) * (dp->alen - 1));
 414         if (!flags2)
 415           return 1;
 416         int i, j = 0;
 417         for (i = 0; i < dp->alen; i++) {
 418           if (dp->astr[i] != forbiddenword)
 419             flags2[j++] = dp->astr[i];
 420         }
 421         dp->alen--;
 422         free(dp->astr);
 423         dp->astr = flags2;  // XXX allowed forbidden words
 424       }
 425     }
 426     dp = dp->next_homonym;
 427   }
 428   return 0;
 429 }
 430
 431 // add a custom dic. word to the hash table (public)
 432 int HashMgr::add(const std::string& word) {
 433   if (remove_forbidden_flag(word)) {
 434     int captype;
 435     int al = 0;
 436     unsigned short* flags = NULL;
 437     int wcl = get_clen_and_captype(word, &captype);
 438     add_word(word, wcl, flags, al, NULL, false);
 439     return add_hidden_capitalized_word(word, wcl, flags, al, NULL,
 440                                        captype);
 441   }
 442   return 0;
 443 }
 444
 445 int HashMgr::add_with_affix(const std::string& word, const std::string& example) {
 446   // detect captype and modify word length for UTF-8 encoding
 447   struct hentry* dp = lookup(example.c_str());
 448   remove_forbidden_flag(word);
 449   if (dp && dp->astr) {
 450     int captype;
 451     int wcl = get_clen_and_captype(word, &captype);
 452     if (aliasf) {
 453       add_word(word, wcl, dp->astr, dp->alen, NULL, false);
 454     } else {
 455       unsigned short* flags =
 456           (unsigned short*)malloc(dp->alen * sizeof(unsigned short));
 457       if (flags) {
 458         memcpy((void*)flags, (void*)dp->astr,
 459                dp->alen * sizeof(unsigned short));
 460         add_word(word, wcl, flags, dp->alen, NULL, false);
 461       } else
 462         return 1;
 463     }
 464     return add_hidden_capitalized_word(word, wcl, dp->astr,
 465                                        dp->alen, NULL, captype);
 466   }
 467   return 1;
 468 }
 469
 470 // walk the hash table entry by entry - null at end
 471 // initialize: col=-1; hp = NULL; hp = walk_hashtable(&col, hp);
 472 struct hentry* HashMgr::walk_hashtable(int& col, struct hentry* hp) const {
 473   if (hp && hp->next != NULL)
 474     return hp->next;
 475   for (col++; col < tablesize; col++) {
 476     if (tableptr[col])
 477       return tableptr[col];
 478   }
 479   // null at end and reset to start
 480   col = -1;
 481   return NULL;
 482 }
 483
 484 // load a munched word list and build a hash table on the fly
 485 int HashMgr::load_tables(const char* tpath, const char* key) {
 486   // open dictionary file
 487   FileMgr* dict = new FileMgr(tpath, key);
 488   if (dict == NULL)
 489     return 1;
 490
 491   // first read the first line of file to get hash table size */
 492   std::string ts;
 493   if (!dict->getline(ts)) {
 494     HUNSPELL_WARNING(stderr, "error: empty dic file %s\n", tpath);
 495     delete dict;
 496     return 2;
 497   }
 498   mychomp(ts);
 499
 500   /* remove byte order mark */
 501   if (ts.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
 502     ts.erase(0, 3);
 503   }
 504
 505   tablesize = atoi(ts.c_str());
 506
 507   int nExtra = 5 + USERWORD;
 508
 509   if (tablesize <= 0 ||
 510       (tablesize >= (std::numeric_limits<int>::max() - 1 - nExtra) /
 511                         int(sizeof(struct hentry*)))) {
 512     HUNSPELL_WARNING(
 513         stderr, "error: line 1: missing or bad word count in the dic file\n");
 514     delete dict;
 515     return 4;
 516   }
 517   tablesize += nExtra;
 518   if ((tablesize % 2) == 0)
 519     tablesize++;
 520
 521   // allocate the hash table
 522   tableptr = (struct hentry**)calloc(tablesize, sizeof(struct hentry*));
 523   if (!tableptr) {
 524     delete dict;
 525     return 3;
 526   }
 527
 528   // loop through all words on much list and add to hash
 529   // table and create word and affix strings
 530
 531   std::vector<w_char> workbuf;
 532
 533   while (dict->getline(ts)) {
 534     mychomp(ts);
 535     // split each line into word and morphological description
 536     size_t dp_pos = 0;
 537     while ((dp_pos = ts.find(':', dp_pos)) != std::string::npos) {
 538       if ((dp_pos > 3) && (ts[dp_pos - 3] == ' ' || ts[dp_pos - 3] == '\t')) {
 539         for (dp_pos -= 3; dp_pos > 0 && (ts[dp_pos-1] == ' ' || ts[dp_pos-1] == '\t'); --dp_pos)
 540           ;
 541         if (dp_pos == 0) {  // missing word
 542           dp_pos = std::string::npos;
 543         } else {
 544           ++dp_pos;
 545         }
 546         break;
 547       }
 548       ++dp_pos;
 549     }
 550
 551     // tabulator is the old morphological field separator
 552     size_t dp2_pos = ts.find('\t');
 553     if (dp2_pos != std::string::npos && (dp_pos == std::string::npos || dp2_pos < dp_pos)) {
 554       dp_pos = dp2_pos + 1;
 555     }
 556
 557     std::string dp;
 558     if (dp_pos != std::string::npos) {
 559       dp.assign(ts.substr(dp_pos));
 560       ts.resize(dp_pos - 1);
 561     }
 562
 563     // split each line into word and affix char strings
 564     // "\/" signs slash in words (not affix separator)
 565     // "/" at beginning of the line is word character (not affix separator)
 566     size_t ap_pos = ts.find('/');
 567     while (ap_pos != std::string::npos) {
 568       if (ap_pos == 0) {
 569         ++ap_pos;
 570         continue;
 571       } else if (ts[ap_pos - 1] != '\\')
 572         break;
 573       // replace "\/" with "/"
 574       ts.erase(ap_pos - 1, 1);
 575       ap_pos = ts.find('/', ap_pos);
 576     }
 577
 578     unsigned short* flags;
 579     int al;
 580     if (ap_pos != std::string::npos && ap_pos != ts.size()) {
 581       std::string ap(ts.substr(ap_pos + 1));
 582       ts.resize(ap_pos);
 583       if (aliasf) {
 584         int index = atoi(ap.c_str());
 585         al = get_aliasf(index, &flags, dict);
 586         if (!al) {
 587           HUNSPELL_WARNING(stderr, "error: line %d: bad flag vector alias\n",
 588                            dict->getlinenum());
 589         }
 590       } else {
 591         al = decode_flags(&flags, ap.c_str(), dict);
 592         if (al == -1) {
 593           HUNSPELL_WARNING(stderr, "Can't allocate memory.\n");
 594           delete dict;
 595           return 6;
 596         }
 597         std::sort(flags, flags + al);
 598       }
 599     } else {
 600       al = 0;
 601       flags = NULL;
 602     }
 603
 604     int captype;
 605     int wcl = get_clen_and_captype(ts, &captype, workbuf);
 606     const std::string *dp_str = dp.empty() ? NULL : &dp;
 607     // add the word and its index plus its capitalized form optionally
 608     if (add_word(ts, wcl, flags, al, dp_str, false) ||
 609         add_hidden_capitalized_word(ts, wcl, flags, al, dp_str, captype)) {
 610       delete dict;
 611       return 5;
 612     }
 613   }
 614
 615   delete dict;
 616   return 0;
 617 }
 618
 619 // the hash function is a simple load and rotate
 620 // algorithm borrowed
 621 int HashMgr::hash(const char* word) const {
 622   unsigned long hv = 0;
 623   for (int i = 0; i < 4 && *word != 0; i++)
 624     hv = (hv << 8) | (*word++);
 625   while (*word != 0) {
 626     ROTATE(hv, ROTATE_LEN);
 627     hv ^= (*word++);
 628   }
 629   return (unsigned long)hv % tablesize;
 630 }
 631
 632 int HashMgr::decode_flags(unsigned short** result, const std::string& flags, FileMgr* af) const {
 633   int len;
 634   if (flags.empty()) {
 635     *result = NULL;
 636     return 0;
 637   }
 638   switch (flag_mode) {
 639     case FLAG_LONG: {  // two-character flags (1x2yZz -> 1x 2y Zz)
 640       len = flags.size();
 641       if (len % 2 == 1)
 642         HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n",
 643                          af->getlinenum());
 644       len /= 2;
 645       *result = (unsigned short*)malloc(len * sizeof(unsigned short));
 646       if (!*result)
 647         return -1;
 648       for (int i = 0; i < len; i++) {
 649         (*result)[i] = ((unsigned short)((unsigned char)flags[i * 2]) << 8) +
 650                        (unsigned char)flags[i * 2 + 1];
 651       }
 652       break;
 653     }
 654     case FLAG_NUM: {  // decimal numbers separated by comma (4521,23,233 -> 4521
 655                       // 23 233)
 656       len = 1;
 657       unsigned short* dest;
 658       for (size_t i = 0; i < flags.size(); ++i) {
 659         if (flags[i] == ',')
 660           len++;
 661       }
 662       *result = (unsigned short*)malloc(len * sizeof(unsigned short));
 663       if (!*result)
 664         return -1;
 665       dest = *result;
 666       const char* src = flags.c_str();
 667       for (const char* p = src; *p; p++) {
 668         if (*p == ',') {
 669           int i = atoi(src);
 670           if (i >= DEFAULTFLAGS)
 671             HUNSPELL_WARNING(
 672                 stderr, "error: line %d: flag id %d is too large (max: %d)\n",
 673                 af->getlinenum(), i, DEFAULTFLAGS - 1);
 674           *dest = (unsigned short)i;
 675           if (*dest == 0)
 676             HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
 677                              af->getlinenum());
 678           src = p + 1;
 679           dest++;
 680         }
 681       }
 682       int i = atoi(src);
 683       if (i >= DEFAULTFLAGS)
 684         HUNSPELL_WARNING(stderr,
 685                          "error: line %d: flag id %d is too large (max: %d)\n",
 686                          af->getlinenum(), i, DEFAULTFLAGS - 1);
 687       *dest = (unsigned short)i;
 688       if (*dest == 0)
 689         HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
 690                          af->getlinenum());
 691       break;
 692     }
 693     case FLAG_UNI: {  // UTF-8 characters
 694       std::vector<w_char> w;
 695       u8_u16(w, flags);
 696       len = w.size();
 697       *result = (unsigned short*)malloc(len * sizeof(unsigned short));
 698       if (!*result)
 699         return -1;
 700       memcpy(*result, &w[0], len * sizeof(short));
 701       break;
 702     }
 703     default: {  // Ispell's one-character flags (erfg -> e r f g)
 704       unsigned short* dest;
 705       len = flags.size();
 706       *result = (unsigned short*)malloc(len * sizeof(unsigned short));
 707       if (!*result)
 708         return -1;
 709       dest = *result;
 710       for (size_t i = 0; i < flags.size(); ++i) {
 711         *dest = (unsigned char)flags[i];
 712         dest++;
 713       }
 714     }
 715   }
 716   return len;
 717 }
 718
 719 bool HashMgr::decode_flags(std::vector<unsigned short>& result, const std::string& flags, FileMgr* af) const {
 720   if (flags.empty()) {
 721     return false;
 722   }
 723   switch (flag_mode) {
 724     case FLAG_LONG: {  // two-character flags (1x2yZz -> 1x 2y Zz)
 725       size_t len = flags.size();
 726       if (len % 2 == 1)
 727         HUNSPELL_WARNING(stderr, "error: line %d: bad flagvector\n",
 728                          af->getlinenum());
 729       len /= 2;
 730       result.reserve(result.size() + len);
 731       for (size_t i = 0; i < len; ++i) {
 732         result.push_back(((unsigned short)((unsigned char)flags[i * 2]) << 8) +
 733                          (unsigned char)flags[i * 2 + 1]);
 734       }
 735       break;
 736     }
 737     case FLAG_NUM: {  // decimal numbers separated by comma (4521,23,233 -> 4521
 738                       // 23 233)
 739       const char* src = flags.c_str();
 740       for (const char* p = src; *p; p++) {
 741         if (*p == ',') {
 742           int i = atoi(src);
 743           if (i >= DEFAULTFLAGS)
 744             HUNSPELL_WARNING(
 745                 stderr, "error: line %d: flag id %d is too large (max: %d)\n",
 746                 af->getlinenum(), i, DEFAULTFLAGS - 1);
 747           result.push_back((unsigned short)i);
 748           if (result.back() == 0)
 749             HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
 750                              af->getlinenum());
 751           src = p + 1;
 752         }
 753       }
 754       int i = atoi(src);
 755       if (i >= DEFAULTFLAGS)
 756         HUNSPELL_WARNING(stderr,
 757                          "error: line %d: flag id %d is too large (max: %d)\n",
 758                          af->getlinenum(), i, DEFAULTFLAGS - 1);
 759       result.push_back((unsigned short)i);
 760       if (result.back() == 0)
 761         HUNSPELL_WARNING(stderr, "error: line %d: 0 is wrong flag id\n",
 762                          af->getlinenum());
 763       break;
 764     }
 765     case FLAG_UNI: {  // UTF-8 characters
 766       std::vector<w_char> w;
 767       u8_u16(w, flags);
 768       size_t len = w.size();
 769       size_t origsize = result.size();
 770       result.resize(origsize + len);
 771       memcpy(&result[origsize], &w[0], len * sizeof(short));
 772       break;
 773     }
 774     default: {  // Ispell's one-character flags (erfg -> e r f g)
 775       result.reserve(flags.size());
 776       for (size_t i = 0; i < flags.size(); ++i) {
 777         result.push_back((unsigned char)flags[i]);
 778       }
 779     }
 780   }
 781   return true;
 782 }
 783
 784 unsigned short HashMgr::decode_flag(const char* f) const {
 785   unsigned short s = 0;
 786   int i;
 787   switch (flag_mode) {
 788     case FLAG_LONG:
 789       s = ((unsigned short)((unsigned char)f[0]) << 8) + (unsigned char)f[1];
 790       break;
 791     case FLAG_NUM:
 792       i = atoi(f);
 793       if (i >= DEFAULTFLAGS)
 794         HUNSPELL_WARNING(stderr, "error: flag id %d is too large (max: %d)\n",
 795                          i, DEFAULTFLAGS - 1);
 796       s = (unsigned short)i;
 797       break;
 798     case FLAG_UNI: {
 799       std::vector<w_char> w;
 800       u8_u16(w, f);
 801       if (!w.empty())
 802           memcpy(&s, &w[0], 1 * sizeof(short));
 803       break;
 804     }
 805     default:
 806       s = *(unsigned char*)f;
 807   }
 808   if (s == 0)
 809     HUNSPELL_WARNING(stderr, "error: 0 is wrong flag id\n");
 810   return s;
 811 }
 812
 813 char* HashMgr::encode_flag(unsigned short f) const {
 814   if (f == 0)
 815     return mystrdup("(NULL)");
 816   std::string ch;
 817   if (flag_mode == FLAG_LONG) {
 818     ch.push_back((unsigned char)(f >> 8));
 819     ch.push_back((unsigned char)(f - ((f >> 8) << 8)));
 820   } else if (flag_mode == FLAG_NUM) {
 821     std::ostringstream stream;
 822     stream << f;
 823     ch = stream.str();
 824   } else if (flag_mode == FLAG_UNI) {
 825     const w_char* w_c = (const w_char*)&f;
 826     std::vector<w_char> w(w_c, w_c + 1);
 827     u16_u8(ch, w);
 828   } else {
 829     ch.push_back((unsigned char)(f));
 830   }
 831   return mystrdup(ch.c_str());
 832 }
 833
 834 // read in aff file and set flag mode
 835 int HashMgr::load_config(const char* affpath, const char* key) {
 836   int firstline = 1;
 837
 838   // open the affix file
 839   FileMgr* afflst = new FileMgr(affpath, key);
 840   if (!afflst) {
 841     HUNSPELL_WARNING(
 842         stderr, "Error - could not open affix description file %s\n", affpath);
 843     return 1;
 844   }
 845
 846   // read in each line ignoring any that do not
 847   // start with a known line type indicator
 848
 849   std::string line;
 850   while (afflst->getline(line)) {
 851     mychomp(line);
 852
 853     /* remove byte order mark */
 854     if (firstline) {
 855       firstline = 0;
 856       if (line.compare(0, 3, "\xEF\xBB\xBF", 3) == 0) {
 857         line.erase(0, 3);
 858       }
 859     }
 860
 861     /* parse in the try string */
 862     if ((line.compare(0, 4, "FLAG", 4) == 0) && line.size() > 4 && isspace(line[4])) {
 863       if (flag_mode != FLAG_CHAR) {
 864         HUNSPELL_WARNING(stderr,
 865                          "error: line %d: multiple definitions of the FLAG "
 866                          "affix file parameter\n",
 867                          afflst->getlinenum());
 868       }
 869       if (line.find("long") != std::string::npos)
 870         flag_mode = FLAG_LONG;
 871       if (line.find("num") != std::string::npos)
 872         flag_mode = FLAG_NUM;
 873       if (line.find("UTF-8") != std::string::npos)
 874         flag_mode = FLAG_UNI;
 875       if (flag_mode == FLAG_CHAR) {
 876         HUNSPELL_WARNING(
 877             stderr,
 878             "error: line %d: FLAG needs `num', `long' or `UTF-8' parameter\n",
 879             afflst->getlinenum());
 880       }
 881     }
 882
 883     if (line.compare(0, 13, "FORBIDDENWORD", 13) == 0) {
 884       std::string st;
 885       if (!parse_string(line, st, afflst->getlinenum())) {
 886         delete afflst;
 887         return 1;
 888       }
 889       forbiddenword = decode_flag(st.c_str());
 890     }
 891
 892     if (line.compare(0, 3, "SET", 3) == 0) {
 893       if (!parse_string(line, enc, afflst->getlinenum())) {
 894         delete afflst;
 895         return 1;
 896       }
 897       if (enc == "UTF-8") {
 898         utf8 = 1;
 899 #ifndef OPENOFFICEORG
 900 #ifndef MOZILLA_CLIENT
 901         initialize_utf_tbl();
 902 #endif
 903 #endif
 904       } else
 905         csconv = get_current_cs(enc);
 906     }
 907
 908     if (line.compare(0, 4, "LANG", 4) == 0) {
 909       if (!parse_string(line, lang, afflst->getlinenum())) {
 910         delete afflst;
 911         return 1;
 912       }
 913       langnum = get_lang_num(lang);
 914     }
 915
 916     /* parse in the ignored characters (for example, Arabic optional diacritics
 917      * characters */
 918     if (line.compare(0, 6, "IGNORE", 6) == 0) {
 919       if (!parse_array(line, ignorechars, ignorechars_utf16,
 920                        utf8, afflst->getlinenum())) {
 921         delete afflst;
 922         return 1;
 923       }
 924     }
 925
 926     if ((line.compare(0, 2, "AF", 2) == 0) && line.size() > 2 && isspace(line[2])) {
 927       if (!parse_aliasf(line, afflst)) {
 928         delete afflst;
 929         return 1;
 930       }
 931     }
 932
 933     if ((line.compare(0, 2, "AM", 2) == 0) && line.size() > 2 && isspace(line[2])) {
 934       if (!parse_aliasm(line, afflst)) {
 935         delete afflst;
 936         return 1;
 937       }
 938     }
 939
 940     if (line.compare(0, 15, "COMPLEXPREFIXES", 15) == 0)
 941       complexprefixes = 1;
 942
 943     if (((line.compare(0, 3, "SFX", 3) == 0) ||
 944          (line.compare(0, 3, "PFX", 3) == 0)) && line.size() > 3 && isspace(line[3]))
 945       break;
 946   }
 947
 948   if (csconv == NULL)
 949     csconv = get_current_cs(SPELL_ENCODING);
 950   delete afflst;
 951   return 0;
 952 }
 953
 954 /* parse in the ALIAS table */
 955 bool HashMgr::parse_aliasf(const std::string& line, FileMgr* af) {
 956   if (numaliasf != 0) {
 957     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
 958                      af->getlinenum());
 959     return false;
 960   }
 961   int i = 0;
 962   int np = 0;
 963   std::string::const_iterator iter = line.begin();
 964   std::string::const_iterator start_piece = mystrsep(line, iter);
 965   while (start_piece != line.end()) {
 966     switch (i) {
 967       case 0: {
 968         np++;
 969         break;
 970       }
 971       case 1: {
 972         numaliasf = atoi(std::string(start_piece, iter).c_str());
 973         if (numaliasf < 1) {
 974           numaliasf = 0;
 975           aliasf = NULL;
 976           aliasflen = NULL;
 977           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
 978                            af->getlinenum());
 979           return false;
 980         }
 981         aliasf =
 982             (unsigned short**)malloc(numaliasf * sizeof(unsigned short*));
 983         aliasflen =
 984             (unsigned short*)malloc(numaliasf * sizeof(unsigned short));
 985         if (!aliasf || !aliasflen) {
 986           numaliasf = 0;
 987           if (aliasf)
 988             free(aliasf);
 989           if (aliasflen)
 990             free(aliasflen);
 991           aliasf = NULL;
 992           aliasflen = NULL;
 993           return false;
 994         }
 995         np++;
 996         break;
 997       }
 998       default:
 999         break;
1000     }
1001     ++i;
1002     start_piece = mystrsep(line, iter);
1003   }
1004   if (np != 2) {
1005     numaliasf = 0;
1006     free(aliasf);
1007     free(aliasflen);
1008     aliasf = NULL;
1009     aliasflen = NULL;
1010     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
1011                      af->getlinenum());
1012     return false;
1013   }
1014
1015   /* now parse the numaliasf lines to read in the remainder of the table */
1016   for (int j = 0; j < numaliasf; j++) {
1017     std::string nl;
1018     if (!af->getline(nl))
1019       return false;
1020     mychomp(nl);
1021     i = 0;
1022     aliasf[j] = NULL;
1023     aliasflen[j] = 0;
1024     iter = nl.begin();
1025     start_piece = mystrsep(nl, iter);
1026     while (start_piece != nl.end()) {
1027       switch (i) {
1028         case 0: {
1029           if (nl.compare(start_piece - nl.begin(), 2, "AF", 2) != 0) {
1030             numaliasf = 0;
1031             free(aliasf);
1032             free(aliasflen);
1033             aliasf = NULL;
1034             aliasflen = NULL;
1035             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1036                              af->getlinenum());
1037             return false;
1038           }
1039           break;
1040         }
1041         case 1: {
1042           std::string piece(start_piece, iter);
1043           aliasflen[j] =
1044               (unsigned short)decode_flags(&(aliasf[j]), piece, af);
1045           std::sort(aliasf[j], aliasf[j] + aliasflen[j]);
1046           break;
1047         }
1048         default:
1049           break;
1050       }
1051       ++i;
1052       start_piece = mystrsep(nl, iter);
1053     }
1054     if (!aliasf[j]) {
1055       free(aliasf);
1056       free(aliasflen);
1057       aliasf = NULL;
1058       aliasflen = NULL;
1059       numaliasf = 0;
1060       HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1061                        af->getlinenum());
1062       return false;
1063     }
1064   }
1065   return true;
1066 }
1067
1068 int HashMgr::is_aliasf() const {
1069   return (aliasf != NULL);
1070 }
1071
1072 int HashMgr::get_aliasf(int index, unsigned short** fvec, FileMgr* af) const {
1073   if ((index > 0) && (index <= numaliasf)) {
1074     *fvec = aliasf[index - 1];
1075     return aliasflen[index - 1];
1076   }
1077   HUNSPELL_WARNING(stderr, "error: line %d: bad flag alias index: %d\n",
1078                    af->getlinenum(), index);
1079   *fvec = NULL;
1080   return 0;
1081 }
1082
1083 /* parse morph alias definitions */
1084 bool HashMgr::parse_aliasm(const std::string& line, FileMgr* af) {
1085   if (numaliasm != 0) {
1086     HUNSPELL_WARNING(stderr, "error: line %d: multiple table definitions\n",
1087                      af->getlinenum());
1088     return false;
1089   }
1090   int i = 0;
1091   int np = 0;
1092   std::string::const_iterator iter = line.begin();
1093   std::string::const_iterator start_piece = mystrsep(line, iter);
1094   while (start_piece != line.end()) {
1095     switch (i) {
1096       case 0: {
1097         np++;
1098         break;
1099       }
1100       case 1: {
1101         numaliasm = atoi(std::string(start_piece, iter).c_str());
1102         if (numaliasm < 1) {
1103           HUNSPELL_WARNING(stderr, "error: line %d: bad entry number\n",
1104                            af->getlinenum());
1105           return false;
1106         }
1107         aliasm = (char**)malloc(numaliasm * sizeof(char*));
1108         if (!aliasm) {
1109           numaliasm = 0;
1110           return false;
1111         }
1112         np++;
1113         break;
1114       }
1115       default:
1116         break;
1117     }
1118     ++i;
1119     start_piece = mystrsep(line, iter);
1120   }
1121   if (np != 2) {
1122     numaliasm = 0;
1123     free(aliasm);
1124     aliasm = NULL;
1125     HUNSPELL_WARNING(stderr, "error: line %d: missing data\n",
1126                      af->getlinenum());
1127     return false;
1128   }
1129
1130   /* now parse the numaliasm lines to read in the remainder of the table */
1131   for (int j = 0; j < numaliasm; j++) {
1132     std::string nl;
1133     if (!af->getline(nl))
1134       return false;
1135     mychomp(nl);
1136     aliasm[j] = NULL;
1137     iter = nl.begin();
1138     i = 0;
1139     start_piece = mystrsep(nl, iter);
1140     while (start_piece != nl.end()) {
1141       switch (i) {
1142         case 0: {
1143           if (nl.compare(start_piece - nl.begin(), 2, "AM", 2) != 0) {
1144             HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1145                              af->getlinenum());
1146             numaliasm = 0;
1147             free(aliasm);
1148             aliasm = NULL;
1149             return false;
1150           }
1151           break;
1152         }
1153         case 1: {
1154           // add the remaining of the line
1155           std::string::const_iterator end = nl.end();
1156           std::string chunk(start_piece, end);
1157           if (complexprefixes) {
1158             if (utf8)
1159               reverseword_utf(chunk);
1160             else
1161               reverseword(chunk);
1162           }
1163           aliasm[j] = mystrdup(chunk.c_str());
1164           break;
1165         }
1166         default:
1167           break;
1168       }
1169       ++i;
1170       start_piece = mystrsep(nl, iter);
1171     }
1172     if (!aliasm[j]) {
1173       numaliasm = 0;
1174       free(aliasm);
1175       aliasm = NULL;
1176       HUNSPELL_WARNING(stderr, "error: line %d: table is corrupt\n",
1177                        af->getlinenum());
1178       return false;
1179     }
1180   }
1181   return true;
1182 }
1183
1184 int HashMgr::is_aliasm() const {
1185   return (aliasm != NULL);
1186 }
1187
1188 char* HashMgr::get_aliasm(int index) const {
1189   if ((index > 0) && (index <= numaliasm))
1190     return aliasm[index - 1];
1191   HUNSPELL_WARNING(stderr, "error: bad morph. alias index: %d\n", index);
1192   return NULL;
1193 }