src/3rdparty/hunspell/1.3.3/src/parsers/xmlparser.cxx

#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include <string>

#include "../hunspell/csutil.hxx"
#include "xmlparser.hxx"
   8
   9
  10 #ifndef W32
  11 using namespace std;
  12 #endif
  13
  14 enum { ST_NON_WORD, ST_WORD, ST_TAG, ST_CHAR_ENTITY, ST_OTHER_TAG, ST_ATTRIB };
  15
  16 static const char * __PATTERN__[][2] = {
  17         { "<!--", "-->" },
  18         { "<[cdata[", "]]>" }, // XML comment
  19         { "<", ">" }
  20 };
  21
  22 #define __PATTERN_LEN__ (sizeof(__PATTERN__) / (sizeof(char *) * 2))
  23
  24 static const char * __PATTERN2__[][2] = {
  25 };
  26
  27 #define __PATTERN_LEN2__ (sizeof(__PATTERN2__) / (sizeof(char *) * 2))
  28
  29 #define ENTITY_APOS "&apos;"
  30 #define UTF8_APOS "\xe2\x80\x99"
  31 #define APOSTROPHE "'"
  32
  33 XMLParser::XMLParser()
  34 {
  35 }
  36
  37 XMLParser::XMLParser(const char * wordchars)
  38 {
  39         init(wordchars);
  40 }
  41
  42 XMLParser::XMLParser(unsigned short * wordchars, int len)
  43 {
  44         init(wordchars, len);
  45 }
  46
  47 XMLParser::~XMLParser()
  48 {
  49 }
  50
  51 int XMLParser::look_pattern(const char * p[][2], unsigned int len, int column)
  52 {
  53         for (unsigned int i = 0; i < len; i++) {
  54                 char * j = line[actual] + head;
  55                 const char * k = p[i][column];
  56                 while ((*k != '\0') && (tolower(*j) == *k)) {
  57                         j++;
  58                         k++;
  59                 }
  60                 if (*k == '\0') return i;
  61         }
  62         return -1;
  63 }
  64
  65 /*
  66  * XML parser
  67  *
  68  */
  69
  70 char * XMLParser::next_token(const char * PATTERN[][2], unsigned int PATTERN_LEN, const char * PATTERN2[][2], unsigned int PATTERN_LEN2)
  71 {
  72         const char * latin1;
  73
  74         for (;;) {
  75                 switch (state)
  76                 {
  77                 case ST_NON_WORD: // non word chars
  78                         prevstate = ST_NON_WORD;
  79                         if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {
  80                                 checkattr = 0;
  81                                 if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {
  82                                         checkattr = 1;
  83                                 }
  84                                 state = ST_TAG;
  85                         } else if (is_wordchar(line[actual] + head)) {
  86                                 state = ST_WORD;
  87                                 token = head;
  88                         } else if ((latin1 = get_latin1(line[actual] + head))) {
  89                                 state = ST_WORD;
  90                                 token = head;
  91                                 head += strlen(latin1);
  92                         } else if (line[actual][head] == '&') {
  93                                 state = ST_CHAR_ENTITY;
  94                         }
  95                         break;
  96                 case ST_WORD: // wordchar
  97                         if ((latin1 = get_latin1(line[actual] + head))) {
  98                                 head += strlen(latin1);
  99                         } else if ((is_wordchar((char *) APOSTROPHE) || (is_utf8() && is_wordchar((char *) UTF8_APOS))) &&
 100                                         strncmp(line[actual] + head, ENTITY_APOS, strlen(ENTITY_APOS)) == 0 &&
 101                                         is_wordchar(line[actual] + head + strlen(ENTITY_APOS))) {
 102                                 head += strlen(ENTITY_APOS) - 1;
 103                         } else if (is_utf8() && is_wordchar((char *) APOSTROPHE) && // add Unicode apostrophe to the WORDCHARS, if needed
 104                                         strncmp(line[actual] + head, UTF8_APOS, strlen(UTF8_APOS)) == 0 &&
 105                                         is_wordchar(line[actual] + head + strlen(UTF8_APOS))) {
 106                                 head += strlen(UTF8_APOS) - 1;
 107                         } else if (! is_wordchar(line[actual] + head)) {
 108                                 state = prevstate;
 109                                 char * t = alloc_token(token, &head);
 110                                 if (t) return t;
 111                         }
 112                         break;
 113                 case ST_TAG: // comment, labels, etc
 114                         int i;
 115                         if ((checkattr == 1) && ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1)
 116                                 && (strcmp(PATTERN2[i][0],PATTERN2[pattern2_num][0]) == 0)) {
 117                                         checkattr = 2;
 118                         } else if ((checkattr > 0) && (line[actual][head] == '>')) {
 119                                         state = ST_NON_WORD;
 120                         } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&
 121                                 (strcmp(PATTERN[i][1],PATTERN[pattern_num][1]) == 0)) {
 122                                         state = ST_NON_WORD;
 123                                         head += strlen(PATTERN[pattern_num][1]) - 1;
 124                         } else if ( (strcmp(PATTERN[pattern_num][0], "<") == 0) &&
 125                                 ((line[actual][head] == '"') || (line[actual][head] == '\''))) {
 126                                 quotmark = line[actual][head];
 127                                 state = ST_ATTRIB;
 128                         }
 129                         break;
 130                 case ST_ATTRIB: // non word chars
 131                         prevstate = ST_ATTRIB;
 132                         if (line[actual][head] == quotmark) {
 133                                 state = ST_TAG;
 134                                 if (checkattr == 2) checkattr = 1;
 135                          // for IMG ALT
 136                         } else if (is_wordchar(line[actual] + head) && (checkattr == 2)) {
 137                                 state = ST_WORD;
 138                                 token = head;
 139                         } else if (line[actual][head] == '&') {
 140                                 state = ST_CHAR_ENTITY;
 141                         }
 142                         break;
 143                 case ST_CHAR_ENTITY: // SGML element
 144                         if ((tolower(line[actual][head]) == ';')) {
 145                                 state = prevstate;
 146                                 head--;
 147                         }
 148                 }
 149                 if (next_char(line[actual], &head)) return NULL;
 150         }
 151 }
 152
 153 char * XMLParser::next_token()
 154 {
 155         return next_token(__PATTERN__, __PATTERN_LEN__, __PATTERN2__, __PATTERN_LEN2__);
 156 }
 157
 158 int XMLParser::change_token(const char * word)
 159 {
 160         if (strstr(word, APOSTROPHE) != NULL ||
 161             strchr(word, '"') != NULL ||
 162             strchr(word, '&') != NULL ||
 163             strchr(word, '<') != NULL ||
 164             strchr(word, '>') != NULL) {
 165                 char r[MAXLNLEN];
 166                 strcpy(r, word);
 167                 return TextParser::change_token(mystrrep(mystrrep(mystrrep(mystrrep(mystrrep(mystrrep(r,
 168                         "&", "__namp;__"),
 169                         "__namp;__", "&amp;"),
 170                         APOSTROPHE, ENTITY_APOS),
 171                         "\"", "&quot;"),
 172                         ">", "&gt;"),
 173                         "<", "&lt;"));
 174         }
 175         return TextParser::change_token(word);
 176 }