6 #include "../hunspell/csutil.hxx"
7 #include "xmlparser.hxx"
// Tokenizer states for XMLParser::next_token(). ST_NON_WORD, ST_WORD, ST_TAG,
// ST_CHAR_ENTITY and ST_ATTRIB appear as `case` labels there; ST_OTHER_TAG is
// not referenced in the visible part of this file.
14 enum { ST_NON_WORD, ST_WORD, ST_TAG, ST_CHAR_ENTITY, ST_OTHER_TAG, ST_ATTRIB };
// Table of {opening, closing} delimiter pairs. XMLParser::next_token() passes
// it as the PATTERN argument: column 0 is matched while in ST_NON_WORD,
// column 1 while in ST_TAG. look_pattern() lowercases the input before
// comparing, so entries are written in lowercase.
// NOTE(review): identifiers containing a double underscore are reserved for
// the C++ implementation; renaming would be safer but touches every user.
16 static const char * __PATTERN__[][2] = {
18 { "<[cdata[", "]]>" }, // CDATA-style section delimiters (not an XML comment)
// Number of {opening, closing} rows in __PATTERN__.
22 #define __PATTERN_LEN__ (sizeof(__PATTERN__) / (sizeof(char *) * 2))
// Second table of string pairs. XMLParser::next_token() passes it as the
// PATTERN2 argument: column 0 is matched in ST_NON_WORD and column 1 in
// ST_TAG while checkattr == 1. Its rows are not visible in this excerpt.
24 static const char * __PATTERN2__[][2] = {
// Number of rows in __PATTERN2__.
27 #define __PATTERN_LEN2__ (sizeof(__PATTERN2__) / (sizeof(char *) * 2))
// Apostrophe spellings that next_token() accepts inside a word and that
// change_token() rewrites.
// NOTE(review): ENTITY_APOS and APOSTROPHE expand to the same one-character
// literal here. If ENTITY_APOS was meant to be the XML entity spelling
// (ampersand + "apos;"), this text has been entity-decoded somewhere --
// confirm against the upstream source before relying on it.
29 #define ENTITY_APOS "'"
// UTF-8 byte sequence of U+2019 RIGHT SINGLE QUOTATION MARK.
30 #define UTF8_APOS "\xe2\x80\x99"
// Plain ASCII apostrophe.
31 #define APOSTROPHE "'"
// Default constructor (body not visible in this excerpt).
33 XMLParser::XMLParser()
// Constructor taking a C string.
// NOTE(review): presumably the set of word characters, judging by the
// parameter name -- confirm against the body / base class.
37 XMLParser::XMLParser(const char * wordchars)
// Constructor taking an array of 16-bit values plus its length.
// NOTE(review): presumably UTF-16 word characters -- confirm.
42 XMLParser::XMLParser(unsigned short * wordchars, int len)
// Destructor (body not visible in this excerpt).
47 XMLParser::~XMLParser()
// Tests whether any entry of a pattern table matches the text at the current
// scan position (line[actual] + head).
//   p      - table of string pairs
//   len    - number of rows in p
//   column - which string of each pair to compare (0 or 1)
// Returns the index of the first row whose selected string matches.
// NOTE(review): callers compare the result against -1, so the no-match return
// is presumably -1; that statement is not visible in this excerpt.
51 int XMLParser::look_pattern(const char * p[][2], unsigned int len, int column)
53 for (unsigned int i = 0; i < len; i++) {
// j walks the input, k walks the candidate pattern.
54 char * j = line[actual] + head;
55 const char * k = p[i][column];
// Only the input character is lowercased, so patterns must be lowercase to
// match. NOTE(review): *j is a plain char; passing a negative value (e.g. a
// UTF-8 byte >= 0x80 where char is signed) to tolower() is undefined
// behaviour -- a cast to unsigned char would make this safe.
56 while ((*k != '\0') && (tolower(*j) == *k)) {
// Reaching the pattern's terminator means every pattern character matched.
60 if (*k == '\0') return i;
// State machine that scans the current line from `head` and extracts the next
// word, skipping markup.
//   PATTERN / PATTERN_LEN   - table of {opening, closing} delimiters
//   PATTERN2 / PATTERN_LEN2 - second table, used together with `checkattr`
// Returns NULL once next_char() reports that no further character is
// available. A completed word is allocated with alloc_token(); the statement
// that returns it is not visible in this excerpt.
70 char * XMLParser::next_token(const char * PATTERN[][2], unsigned int PATTERN_LEN, const char * PATTERN2[][2], unsigned int PATTERN_LEN2)
77 case ST_NON_WORD: // non word chars
78 prevstate = ST_NON_WORD;
// Does an opening delimiter from PATTERN (column 0) start here?
79 if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) {
// Additionally check the opening strings of PATTERN2.
81 if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) {
85 } else if (is_wordchar(line[actual] + head)) {
// get_latin1() returned a non-null string for the text at head: skip its
// full length.
88 } else if ((latin1 = get_latin1(line[actual] + head))) {
91 head += strlen(latin1);
// An ampersand opens a character entity.
92 } else if (line[actual][head] == '&') {
93 state = ST_CHAR_ENTITY;
96 case ST_WORD: // wordchar
97 if ((latin1 = get_latin1(line[actual] + head))) {
98 head += strlen(latin1);
// If an apostrophe (ASCII, or U+2019 in UTF-8 mode) counts as a word
// character, an ENTITY_APOS that is followed by another word character is
// kept inside the word.
99 } else if ((is_wordchar((char *) APOSTROPHE) || (is_utf8() && is_wordchar((char *) UTF8_APOS))) &&
100 strncmp(line[actual] + head, ENTITY_APOS, strlen(ENTITY_APOS)) == 0 &&
101 is_wordchar(line[actual] + head + strlen(ENTITY_APOS))) {
// Advance by length - 1; presumably the next_char() call at the end of the
// loop consumes the final character.
102 head += strlen(ENTITY_APOS) - 1;
103 } else if (is_utf8() && is_wordchar((char *) APOSTROPHE) && // add Unicode apostrophe to the WORDCHARS, if needed
104 strncmp(line[actual] + head, UTF8_APOS, strlen(UTF8_APOS)) == 0 &&
105 is_wordchar(line[actual] + head + strlen(UTF8_APOS))) {
106 head += strlen(UTF8_APOS) - 1;
// First non-word character: the word is complete.
107 } else if (! is_wordchar(line[actual] + head)) {
109 char * t = alloc_token(token, &head);
113 case ST_TAG: // comment, labels, etc
// While checkattr == 1, look for a PATTERN2 column-1 string whose row has the
// same column-0 string as the row recorded in pattern2_num.
115 if ((checkattr == 1) && ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1)
116 && (strcmp(PATTERN2[i][0],PATTERN2[pattern2_num][0]) == 0)) {
// '>' reached while attribute checking is active.
118 } else if ((checkattr > 0) && (line[actual][head] == '>')) {
// A closing delimiter that equals the closer of the delimiter that opened
// this tag ends the tag.
120 } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) &&
121 (strcmp(PATTERN[i][1],PATTERN[pattern_num][1]) == 0)) {
123 head += strlen(PATTERN[pattern_num][1]) - 1;
// Inside a plain "<" tag, a single or double quote starts an attribute value;
// remember which quote character must close it.
124 } else if ( (strcmp(PATTERN[pattern_num][0], "<") == 0) &&
125 ((line[actual][head] == '"') || (line[actual][head] == '\''))) {
126 quotmark = line[actual][head];
130 case ST_ATTRIB: // inside a quoted attribute value
131 prevstate = ST_ATTRIB;
// The matching quote character closes the attribute value.
132 if (line[actual][head] == quotmark) {
134 if (checkattr == 2) checkattr = 1;
// Word characters are only considered here while checkattr == 2.
136 } else if (is_wordchar(line[actual] + head) && (checkattr == 2)) {
139 } else if (line[actual][head] == '&') {
140 state = ST_CHAR_ENTITY;
143 case ST_CHAR_ENTITY: // character entity, terminated by ';'
// NOTE(review): tolower() has no effect on a comparison with ';'.
144 if ((tolower(line[actual][head]) == ';')) {
// Step to the next character; a non-zero result ends tokenizing of this line.
149 if (next_char(line[actual], &head)) return NULL;
// Convenience overload: tokenizes using the file-level default tables
// __PATTERN__ and __PATTERN2__ together with their row counts.
153 char * XMLParser::next_token()
155 return next_token(__PATTERN__, __PATTERN_LEN__, __PATTERN2__, __PATTERN_LEN2__);
// Replaces the current token with `word`. If `word` contains an apostrophe,
// a double quote, '&', '<' or '>', it is first passed through a chain of
// mystrrep() substitutions; otherwise it is handed unchanged to
// TextParser::change_token(). The return value is whatever
// TextParser::change_token() returns.
158 int XMLParser::change_token(const char * word)
160 if (strstr(word, APOSTROPHE) != NULL ||
161 strchr(word, '"') != NULL ||
162 strchr(word, '&') != NULL ||
163 strchr(word, '<') != NULL ||
164 strchr(word, '>') != NULL) {
// Six nested replacements applied to `r` (its declaration is not visible in
// this excerpt; presumably a copy of `word`).
167 return TextParser::change_token(mystrrep(mystrrep(mystrrep(mystrrep(mystrrep(mystrrep(r,
// NOTE(review): "__namp;__" looks like a temporary placeholder for ampersands.
// As shown, it is replaced by a bare "&", and the next pair maps APOSTROPHE
// to an identical literal, so nothing would be escaped. The entity strings may
// have been decoded in this view -- confirm against the upstream source.
169 "__namp;__", "&"),
170 APOSTROPHE, ENTITY_APOS),
// None of the special characters is present: pass the word through untouched.
175 return TextParser::change_token(word);