3rdparty/hunspell/1.7.0/src/parsers/xmlparser.cxx

   1 /* ***** BEGIN LICENSE BLOCK *****
   2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   3  *
   4  * Copyright (C) 2002-2017 Németh László
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version
   7  * 1.1 (the "License"); you may not use this file except in compliance with
   8  * the License. You may obtain a copy of the License at
   9  * http://www.mozilla.org/MPL/
  10  *
  11  * Software distributed under the License is distributed on an "AS IS" basis,
  12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  13  * for the specific language governing rights and limitations under the
  14  * License.
  15  *
  16  * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
  17  *
  18  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
  19  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
  20  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
  21  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
  22  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
  23  *
  24  * Alternatively, the contents of this file may be used under the terms of
  25  * either the GNU General Public License Version 2 or later (the "GPL"), or
  26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  27  * in which case the provisions of the GPL or the LGPL are applicable instead
  28  * of those above. If you wish to allow use of your version of this file only
  29  * under the terms of either the GPL or the LGPL, and not to allow others to
  30  * use your version of this file under the terms of the MPL, indicate your
  31  * decision by deleting the provisions above and replace them with the notice
  32  * and other provisions required by the GPL or the LGPL. If you do not delete
  33  * the provisions above, a recipient may use your version of this file under
  34  * the terms of any one of the MPL, the GPL or the LGPL.
  35  *
  36  * ***** END LICENSE BLOCK ***** */
  37
  38 #include <cstdlib>
  39 #include <cstring>
  40 #include <cstdio>
  41 #include <ctype.h>
  42
  43 #include "../hunspell/csutil.hxx"
  44 #include "xmlparser.hxx"
  45
  46 #ifndef W32
  47 using namespace std;
  48 #endif
  49
  50 enum { ST_NON_WORD, ST_WORD, ST_TAG, ST_CHAR_ENTITY, ST_OTHER_TAG, ST_ATTRIB };
  51
  52 static const char* __PATTERN__[][2] = {{"<!--", "-->"},
  53                                        {"<[cdata[", "]]>"},  // XML comment
  54                                        {"<", ">"}};
  55
  56 #define __PATTERN_LEN__ (sizeof(__PATTERN__) / (sizeof(char*) * 2))
  57
  58 // for checking attributes, eg. <img alt="text"> in HTML
  59 static const char* (*__PATTERN2__)[2] = NULL;
  60
  61 #define __PATTERN_LEN2__ 0
  62
  63 // for checking words with in-word patterns
  64 // for example, "exam<text:span>p</text:span>le" in ODT
  65 static const char* (*__PATTERN3__)[2] = NULL;
  66
  67 #define __PATTERN_LEN3__ 0
  68
  69 #define ENTITY_APOS "&apos;"
  70 #define UTF8_APOS "\xe2\x80\x99"
  71 #define APOSTROPHE "'"
  72
  73 XMLParser::XMLParser(const char* wordchars)
  74     : TextParser(wordchars)
  75     , pattern_num(0), pattern2_num(0), pattern3_num(0), prevstate(0), checkattr(0), quotmark(0) {
  76 }
  77
  78 XMLParser::XMLParser(const w_char* wordchars, int len)
  79     : TextParser(wordchars, len)
  80     , pattern_num(0), pattern2_num(0), pattern3_num(0), prevstate(0), checkattr(0), quotmark(0) {
  81 }
  82
  83 XMLParser::~XMLParser() {}
  84
  85 int XMLParser::look_pattern(const char* p[][2], unsigned int len, int column) {
  86   for (unsigned int i = 0; i < len; i++) {
  87     const char* j = line[actual].c_str() + head;
  88     const char* k = p[i][column];
  89     while ((*k != '\0') && (tolower(*j) == *k)) {
  90       j++;
  91       k++;
  92     }
  93     if (*k == '\0')
  94       return i;
  95   }
  96   return -1;
  97 }
  98
  99 /*
 100  * XML parser
 101  *
 102  */
 103
 104 bool XMLParser::next_token(const char* PATTERN[][2],
 105                            unsigned int PATTERN_LEN,
 106                            const char* PATTERN2[][2],
 107                            unsigned int PATTERN_LEN2,
 108                            const char* PATTERN3[][2],
 109                            unsigned int PATTERN_LEN3,
 110                            std::string& t) {
 111   t.clear();
 112   const char* latin1;
 113
 114   for (;;) {
 115     switch (state) {
 116       case ST_NON_WORD:  // non word chars
 117         prevstate = ST_NON_WORD;
 118         if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {
 119           checkattr = 0;
 120           if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {
 121             checkattr = 1;
 122           }
 123           state = ST_TAG;
 124         } else if (is_wordchar(line[actual].c_str() + head)) {
 125           state = ST_WORD;
 126           token = head;
 127         } else if ((latin1 = get_latin1(line[actual].c_str() + head))) {
 128           state = ST_WORD;
 129           token = head;
 130           head += strlen(latin1);
 131         } else if (line[actual][head] == '&') {
 132           state = ST_CHAR_ENTITY;
 133         }
 134         break;
 135       case ST_WORD:  // wordchar
 136         if ((latin1 = get_latin1(line[actual].c_str() + head))) {
 137           head += strlen(latin1);
 138         } else if ((is_wordchar((char*)APOSTROPHE) ||
 139                     (is_utf8() && is_wordchar((char*)UTF8_APOS))) &&
 140                    strncmp(line[actual].c_str() + head, ENTITY_APOS,
 141                            strlen(ENTITY_APOS)) == 0 &&
 142                    is_wordchar(line[actual].c_str() + head + strlen(ENTITY_APOS))) {
 143           head += strlen(ENTITY_APOS) - 1;
 144         } else if (is_utf8() &&
 145                    is_wordchar((char*)APOSTROPHE) &&  // add Unicode apostrophe
 146                                                       // to the WORDCHARS, if
 147                                                       // needed
 148                    strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) ==
 149                        0 &&
 150                    is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) {
 151           head += strlen(UTF8_APOS) - 1;
 152         } else if (!is_wordchar(line[actual].c_str() + head)) {
 153           // in-word patterns
 154           if ((pattern3_num = look_pattern(PATTERN3, PATTERN_LEN3, 0)) != -1) {
 155             size_t pos = line[actual].find(PATTERN3[pattern3_num][1], head);
 156             if (pos != std::string::npos) {
 157               size_t endpos = pos + strlen(PATTERN3[pattern3_num][1]) - 1;
 158               if (is_wordchar(line[actual].c_str() + endpos + 1)) {
 159                 head = endpos;
 160                 break;
 161               }
 162             }
 163           }
 164           state = prevstate;
 165           // return with the token, except in the case of in-word patterns
 166           if (alloc_token(token, &head, t))
 167             return true;
 168         }
 169         break;
 170       case ST_TAG:  // comment, labels, etc
 171         int i;
 172         if ((checkattr == 1) &&
 173             ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1) &&
 174             (strcmp(PATTERN2[i][0], PATTERN2[pattern2_num][0]) == 0)) {
 175           checkattr = 2;
 176         } else if ((checkattr > 0) && (line[actual][head] == '>')) {
 177           state = ST_NON_WORD;
 178         } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&
 179                    (strcmp(PATTERN[i][1], PATTERN[pattern_num][1]) == 0)) {
 180           state = ST_NON_WORD;
 181           head += strlen(PATTERN[pattern_num][1]) - 1;
 182         } else if ((strcmp(PATTERN[pattern_num][0], "<") == 0) &&
 183                    ((line[actual][head] == '"') ||
 184                     (line[actual][head] == '\''))) {
 185           quotmark = line[actual][head];
 186           state = ST_ATTRIB;
 187         }
 188         break;
 189       case ST_ATTRIB:  // non word chars
 190         prevstate = ST_ATTRIB;
 191         if (line[actual][head] == quotmark) {
 192           state = ST_TAG;
 193           if (checkattr == 2)
 194             checkattr = 1;
 195           // for IMG ALT
 196         } else if (is_wordchar(line[actual].c_str() + head) && (checkattr == 2)) {
 197           state = ST_WORD;
 198           token = head;
 199         } else if (line[actual][head] == '&') {
 200           state = ST_CHAR_ENTITY;
 201         }
 202         break;
 203       case ST_CHAR_ENTITY:  // SGML element
 204         if ((tolower(line[actual][head]) == ';')) {
 205           state = prevstate;
 206           head--;
 207         }
 208     }
 209     if (next_char(line[actual].c_str(), &head))
 210       return false;
 211   }
 212   //FIXME No return, in function returning non-void
 213 }
 214
 215 bool XMLParser::next_token(std::string& t) {
 216   return next_token(__PATTERN__, __PATTERN_LEN__, __PATTERN2__,
 217                     __PATTERN_LEN2__, __PATTERN3__, __PATTERN_LEN3__, t);
 218 }
 219
 220 // remove in-word patterns
 221 std::string XMLParser::get_word2(
 222         const char* PATTERN3[][2],
 223         unsigned int PATTERN_LEN3,
 224         const std::string &tok) {
 225   std::string word = tok;
 226   for (unsigned int i = 0; i < PATTERN_LEN3; i++) {
 227     size_t pos;
 228     while ((pos = word.find(PATTERN3[i][0])) != word.npos) {
 229       size_t endpos = word.find(PATTERN3[i][1], pos);
 230       if (endpos != word.npos) {
 231         word.erase(pos, endpos + strlen(PATTERN3[i][1]) - pos);
 232       } else
 233         return word;
 234     }
 235   }
 236   return word;
 237 }
 238
 239 int XMLParser::change_token(const char* word) {
 240   if (strstr(word, APOSTROPHE) != NULL || strchr(word, '"') != NULL ||
 241       strchr(word, '&') != NULL || strchr(word, '<') != NULL ||
 242       strchr(word, '>') != NULL) {
 243     std::string r(word);
 244     mystrrep(r, "&", "__namp;__");
 245     mystrrep(r, "__namp;__", "&amp;");
 246     mystrrep(r, APOSTROPHE, ENTITY_APOS);
 247     mystrrep(r, "\"", "&quot;");
 248     mystrrep(r, ">", "&gt;");
 249     mystrrep(r, "<", "&lt;");
 250     return TextParser::change_token(r.c_str());
 251   }
 252   return TextParser::change_token(word);
 253 }