]> git.lyx.org Git - features.git/blob - src/3rdparty/hunspell/1.3.3/src/parsers/xmlparser.cxx
add stripped down hunspell 1.3.3
[features.git] / src / 3rdparty / hunspell / 1.3.3 / src / parsers / xmlparser.cxx
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctype.h>
#include <vector>

#include "../hunspell/csutil.hxx"
#include "xmlparser.hxx"
8
9
10 #ifndef W32
11 using namespace std;
12 #endif
13
// Scanner states used by XMLParser::next_token().
enum {
        ST_NON_WORD,     // scanning non-word characters outside any tag
        ST_WORD,         // inside a word
        ST_TAG,          // inside a construct opened by a PATTERN entry
        ST_CHAR_ENTITY,  // after '&', skipping up to the closing ';'
        ST_OTHER_TAG,    // not referenced by XMLParser::next_token()
        ST_ATTRIB        // inside a quoted string within a "<" ... ">" tag
};

// {opening, closing} delimiter pairs. look_pattern() lower-cases the input
// before comparing, so entries must be written in lower case. Rows are tried
// in order and the first match wins, so the longer openers must come before
// the generic "<".
static const char * __PATTERN__[][2] = {
        { "<!--", "-->" },      // XML comment
        // NOTE(review): presumably meant for CDATA sections, but XML spells
        // the opener "<![CDATA[", so this row never matches one and CDATA
        // falls through to the generic "<" row - confirm before changing.
        { "<[cdata[", "]]>" },
        { "<", ">" }            // any other tag
};

#define __PATTERN_LEN__ (sizeof(__PATTERN__) / (sizeof(char *) * 2))

// XMLParser itself has no attribute patterns. An empty-brace initializer
// would declare a zero-length array, which ISO C++ does not allow, so the
// empty table is expressed as a null pointer-to-row with a length of 0.
// look_pattern() never dereferences the table when the length is 0.
static const char * (*__PATTERN2__)[2] = NULL;

#define __PATTERN_LEN2__ 0

#define ENTITY_APOS "&apos;"      // XML entity for the apostrophe
#define UTF8_APOS "\xe2\x80\x99"  // UTF-8 encoding of U+2019
#define APOSTROPHE "'"
32
33 XMLParser::XMLParser()
34 {
35 }
36
37 XMLParser::XMLParser(const char * wordchars)
38 {
39         init(wordchars);
40 }
41
42 XMLParser::XMLParser(unsigned short * wordchars, int len)
43 {
44         init(wordchars, len);
45 }
46
47 XMLParser::~XMLParser()
48 {
49 }
50
51 int XMLParser::look_pattern(const char * p[][2], unsigned int len, int column)
52 {
53         for (unsigned int i = 0; i < len; i++) {
54                 char * j = line[actual] + head;
55                 const char * k = p[i][column];
56                 while ((*k != '\0') && (tolower(*j) == *k)) {
57                         j++;
58                         k++;
59                 }
60                 if (*k == '\0') return i;
61         }
62         return -1;
63 }
64
65 /*
66  * XML parser
67  *
68  */
69
70 char * XMLParser::next_token(const char * PATTERN[][2], unsigned int PATTERN_LEN, const char * PATTERN2[][2], unsigned int PATTERN_LEN2)
71 {
72         const char * latin1;
73
74         for (;;) {
75                 switch (state)
76                 {
77                 case ST_NON_WORD: // non word chars
78                         prevstate = ST_NON_WORD;
79                         if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {
80                                 checkattr = 0;
81                                 if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {
82                                         checkattr = 1;
83                                 }
84                                 state = ST_TAG;
85                         } else if (is_wordchar(line[actual] + head)) {
86                                 state = ST_WORD;
87                                 token = head;
88                         } else if ((latin1 = get_latin1(line[actual] + head))) {
89                                 state = ST_WORD;
90                                 token = head;
91                                 head += strlen(latin1);
92                         } else if (line[actual][head] == '&') {
93                                 state = ST_CHAR_ENTITY;
94                         }                       
95                         break;
96                 case ST_WORD: // wordchar
97                         if ((latin1 = get_latin1(line[actual] + head))) {
98                                 head += strlen(latin1);
99                         } else if ((is_wordchar((char *) APOSTROPHE) || (is_utf8() && is_wordchar((char *) UTF8_APOS))) &&
100                                         strncmp(line[actual] + head, ENTITY_APOS, strlen(ENTITY_APOS)) == 0 &&
101                                         is_wordchar(line[actual] + head + strlen(ENTITY_APOS))) {
102                                 head += strlen(ENTITY_APOS) - 1;
103                         } else if (is_utf8() && is_wordchar((char *) APOSTROPHE) && // add Unicode apostrophe to the WORDCHARS, if needed
104                                         strncmp(line[actual] + head, UTF8_APOS, strlen(UTF8_APOS)) == 0 &&
105                                         is_wordchar(line[actual] + head + strlen(UTF8_APOS))) {
106                                 head += strlen(UTF8_APOS) - 1;
107                         } else if (! is_wordchar(line[actual] + head)) {
108                                 state = prevstate;
109                                 char * t = alloc_token(token, &head);
110                                 if (t) return t;
111                         }
112                         break;
113                 case ST_TAG: // comment, labels, etc
114                         int i;
115                         if ((checkattr == 1) && ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1)
116                                 && (strcmp(PATTERN2[i][0],PATTERN2[pattern2_num][0]) == 0)) {
117                                         checkattr = 2;
118                         } else if ((checkattr > 0) && (line[actual][head] == '>')) {
119                                         state = ST_NON_WORD;
120                         } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) && 
121                                 (strcmp(PATTERN[i][1],PATTERN[pattern_num][1]) == 0)) {
122                                         state = ST_NON_WORD;
123                                         head += strlen(PATTERN[pattern_num][1]) - 1;
124                         } else if ( (strcmp(PATTERN[pattern_num][0], "<") == 0) &&
125                                 ((line[actual][head] == '"') || (line[actual][head] == '\''))) {
126                                 quotmark = line[actual][head];
127                                 state = ST_ATTRIB;
128                         }
129                         break;
130                 case ST_ATTRIB: // non word chars
131                         prevstate = ST_ATTRIB;
132                         if (line[actual][head] == quotmark) {
133                                 state = ST_TAG;
134                                 if (checkattr == 2) checkattr = 1;
135                          // for IMG ALT
136                         } else if (is_wordchar(line[actual] + head) && (checkattr == 2)) {
137                                 state = ST_WORD;
138                                 token = head;
139                         } else if (line[actual][head] == '&') {
140                                 state = ST_CHAR_ENTITY;
141                         }                       
142                         break;
143                 case ST_CHAR_ENTITY: // SGML element
144                         if ((tolower(line[actual][head]) == ';')) {
145                                 state = prevstate;
146                                 head--;
147                         }
148                 }
149                 if (next_char(line[actual], &head)) return NULL;
150         }
151 }
152
153 char * XMLParser::next_token()
154 {
155         return next_token(__PATTERN__, __PATTERN_LEN__, __PATTERN2__, __PATTERN_LEN2__);
156 }
157
158 int XMLParser::change_token(const char * word)
159 {
160         if (strstr(word, APOSTROPHE) != NULL ||
161             strchr(word, '"') != NULL ||
162             strchr(word, '&') != NULL ||
163             strchr(word, '<') != NULL ||
164             strchr(word, '>') != NULL) {
165                 char r[MAXLNLEN];
166                 strcpy(r, word);
167                 return TextParser::change_token(mystrrep(mystrrep(mystrrep(mystrrep(mystrrep(mystrrep(r,
168                         "&", "__namp;__"),
169                         "__namp;__", "&amp;"),
170                         APOSTROPHE, ENTITY_APOS),
171                         "\"", "&quot;"),
172                         ">", "&gt;"),
173                         "<", "&lt;"));
174         }
175         return TextParser::change_token(word);
176 }