3rdparty/hunspell/1.6.2/src/parsers/xmlparser.cxx

   1 /* ***** BEGIN LICENSE BLOCK *****
   2  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   3  *
   4  * Copyright (C) 2002-2017 Németh László
   5  *
   6  * The contents of this file are subject to the Mozilla Public License Version
   7  * 1.1 (the "License"); you may not use this file except in compliance with
   8  * the License. You may obtain a copy of the License at
   9  * http://www.mozilla.org/MPL/
  10  *
  11  * Software distributed under the License is distributed on an "AS IS" basis,
  12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
  13  * for the specific language governing rights and limitations under the
  14  * License.
  15  *
  16  * Hunspell is based on MySpell which is Copyright (C) 2002 Kevin Hendricks.
  17  *
  18  * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
  19  * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
  20  * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
  21  * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
  22  * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
  23  *
  24  * Alternatively, the contents of this file may be used under the terms of
  25  * either the GNU General Public License Version 2 or later (the "GPL"), or
  26  * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
  27  * in which case the provisions of the GPL or the LGPL are applicable instead
  28  * of those above. If you wish to allow use of your version of this file only
  29  * under the terms of either the GPL or the LGPL, and not to allow others to
  30  * use your version of this file under the terms of the MPL, indicate your
  31  * decision by deleting the provisions above and replace them with the notice
  32  * and other provisions required by the GPL or the LGPL. If you do not delete
  33  * the provisions above, a recipient may use your version of this file under
  34  * the terms of any one of the MPL, the GPL or the LGPL.
  35  *
  36  * ***** END LICENSE BLOCK ***** */
  37
  38 #include <cstdlib>
  39 #include <cstring>
  40 #include <cstdio>
  41 #include <ctype.h>
  42
  43 #include "../hunspell/csutil.hxx"
  44 #include "xmlparser.hxx"
  45
  46 #ifndef W32
  47 using namespace std;
  48 #endif
  49
  50 enum { ST_NON_WORD, ST_WORD, ST_TAG, ST_CHAR_ENTITY, ST_OTHER_TAG, ST_ATTRIB };
  51
  52 static const char* __PATTERN__[][2] = {{"<!--", "-->"},
  53                                        {"<[cdata[", "]]>"},  // XML comment
  54                                        {"<", ">"}};
  55
  56 #define __PATTERN_LEN__ (sizeof(__PATTERN__) / (sizeof(char*) * 2))
  57
  58 static const char* (*__PATTERN2__)[2] = NULL;
  59
  60 #define __PATTERN_LEN2__ 0
  61
  62 #define ENTITY_APOS "&apos;"
  63 #define UTF8_APOS "\xe2\x80\x99"
  64 #define APOSTROPHE "'"
  65
  66 XMLParser::XMLParser(const char* wordchars)
  67     : TextParser(wordchars)
  68     , pattern_num(0), pattern2_num(0), prevstate(0), checkattr(0), quotmark(0) {
  69 }
  70
  71 XMLParser::XMLParser(const w_char* wordchars, int len)
  72     : TextParser(wordchars, len)
  73     , pattern_num(0), pattern2_num(0), prevstate(0), checkattr(0), quotmark(0) {
  74 }
  75
  76 XMLParser::~XMLParser() {}
  77
  78 int XMLParser::look_pattern(const char* p[][2], unsigned int len, int column) {
  79   for (unsigned int i = 0; i < len; i++) {
  80     const char* j = line[actual].c_str() + head;
  81     const char* k = p[i][column];
  82     while ((*k != '\0') && (tolower(*j) == *k)) {
  83       j++;
  84       k++;
  85     }
  86     if (*k == '\0')
  87       return i;
  88   }
  89   return -1;
  90 }
  91
  92 /*
  93  * XML parser
  94  *
  95  */
  96
  97 bool XMLParser::next_token(const char* PATTERN[][2],
  98                            unsigned int PATTERN_LEN,
  99                            const char* PATTERN2[][2],
 100                            unsigned int PATTERN_LEN2,
 101                            std::string& t) {
 102   t.clear();
 103   const char* latin1;
 104
 105   for (;;) {
 106     switch (state) {
 107       case ST_NON_WORD:  // non word chars
 108         prevstate = ST_NON_WORD;
 109         if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {
 110           checkattr = 0;
 111           if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {
 112             checkattr = 1;
 113           }
 114           state = ST_TAG;
 115         } else if (is_wordchar(line[actual].c_str() + head)) {
 116           state = ST_WORD;
 117           token = head;
 118         } else if ((latin1 = get_latin1(line[actual].c_str() + head))) {
 119           state = ST_WORD;
 120           token = head;
 121           head += strlen(latin1);
 122         } else if (line[actual][head] == '&') {
 123           state = ST_CHAR_ENTITY;
 124         }
 125         break;
 126       case ST_WORD:  // wordchar
 127         if ((latin1 = get_latin1(line[actual].c_str() + head))) {
 128           head += strlen(latin1);
 129         } else if ((is_wordchar((char*)APOSTROPHE) ||
 130                     (is_utf8() && is_wordchar((char*)UTF8_APOS))) &&
 131                    strncmp(line[actual].c_str() + head, ENTITY_APOS,
 132                            strlen(ENTITY_APOS)) == 0 &&
 133                    is_wordchar(line[actual].c_str() + head + strlen(ENTITY_APOS))) {
 134           head += strlen(ENTITY_APOS) - 1;
 135         } else if (is_utf8() &&
 136                    is_wordchar((char*)APOSTROPHE) &&  // add Unicode apostrophe
 137                                                       // to the WORDCHARS, if
 138                                                       // needed
 139                    strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) ==
 140                        0 &&
 141                    is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) {
 142           head += strlen(UTF8_APOS) - 1;
 143         } else if (!is_wordchar(line[actual].c_str() + head)) {
 144           state = prevstate;
 145           if (alloc_token(token, &head, t))
 146             return true;
 147         }
 148         break;
 149       case ST_TAG:  // comment, labels, etc
 150         int i;
 151         if ((checkattr == 1) &&
 152             ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1) &&
 153             (strcmp(PATTERN2[i][0], PATTERN2[pattern2_num][0]) == 0)) {
 154           checkattr = 2;
 155         } else if ((checkattr > 0) && (line[actual][head] == '>')) {
 156           state = ST_NON_WORD;
 157         } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&
 158                    (strcmp(PATTERN[i][1], PATTERN[pattern_num][1]) == 0)) {
 159           state = ST_NON_WORD;
 160           head += strlen(PATTERN[pattern_num][1]) - 1;
 161         } else if ((strcmp(PATTERN[pattern_num][0], "<") == 0) &&
 162                    ((line[actual][head] == '"') ||
 163                     (line[actual][head] == '\''))) {
 164           quotmark = line[actual][head];
 165           state = ST_ATTRIB;
 166         }
 167         break;
 168       case ST_ATTRIB:  // non word chars
 169         prevstate = ST_ATTRIB;
 170         if (line[actual][head] == quotmark) {
 171           state = ST_TAG;
 172           if (checkattr == 2)
 173             checkattr = 1;
 174           // for IMG ALT
 175         } else if (is_wordchar(line[actual].c_str() + head) && (checkattr == 2)) {
 176           state = ST_WORD;
 177           token = head;
 178         } else if (line[actual][head] == '&') {
 179           state = ST_CHAR_ENTITY;
 180         }
 181         break;
 182       case ST_CHAR_ENTITY:  // SGML element
 183         if ((tolower(line[actual][head]) == ';')) {
 184           state = prevstate;
 185           head--;
 186         }
 187     }
 188     if (next_char(line[actual].c_str(), &head))
 189       return false;
 190   }
 191   //FIXME No return, in function returning non-void
 192 }
 193
 194 bool XMLParser::next_token(std::string& t) {
 195   return next_token(__PATTERN__, __PATTERN_LEN__, __PATTERN2__,
 196                     __PATTERN_LEN2__, t);
 197 }
 198
 199 int XMLParser::change_token(const char* word) {
 200   if (strstr(word, APOSTROPHE) != NULL || strchr(word, '"') != NULL ||
 201       strchr(word, '&') != NULL || strchr(word, '<') != NULL ||
 202       strchr(word, '>') != NULL) {
 203     std::string r(word);
 204     mystrrep(r, "&", "__namp;__");
 205     mystrrep(r, "__namp;__", "&amp;");
 206     mystrrep(r, APOSTROPHE, ENTITY_APOS);
 207     mystrrep(r, "\"", "&quot;");
 208     mystrrep(r, ">", "&gt;");
 209     mystrrep(r, "<", "&lt;");
 210     return TextParser::change_token(r.c_str());
 211   }
 212   return TextParser::change_token(word);
 213 }