6 #include "../hunspell/csutil.hxx"
7 #include "textparser.hxx"
13 // ISO-8859-1 HTML character entities
15 static const char * LATIN1[] = {
// Number of entries in the LATIN1 entity table (array size divided by pointer size).
47 #define LATIN1_LEN (sizeof(LATIN1) / sizeof(char *))
// NOTE(review): as written, ENTITY_APOS expands to the same one-character string as
// APOSTROPHE below. The name suggests an HTML entity spelling was intended; confirm the
// literal was not altered (e.g. entity-decoded) before relying on it.
49 #define ENTITY_APOS "'"
// Bytes E2 80 99: the UTF-8 encoding of U+2019 (right single quotation mark).
50 #define UTF8_APOS "\xe2\x80\x99"
// Plain ASCII apostrophe.
51 #define APOSTROPHE "'"
53 TextParser::TextParser() {
57 TextParser::TextParser(const char * wordchars)
62 TextParser::TextParser(unsigned short * wordchars, int len)
67 TextParser::~TextParser()
// is_wordchar: report whether the character starting at w counts as a word character.
// Returns 0 for an empty string. Two lookup paths are visible:
//  - a 16-bit path that combines wc.h and wc.l into idx and accepts the character when
//    unicodeisalpha(idx) is true or when it is found in wordchars_utf16 by flag_bsearch;
//  - an 8-bit path that reads the wordcharacters table.
// NOTE(review): the condition choosing between the paths and the code that fills `wc`
// are not visible in this excerpt; presumably a UTF-8 mode check and a UTF-8 to UTF-16
// conversion. Confirm against the full file.
71 int TextParser::is_wordchar(char * w)
73 if (*w == '\0') return 0;
78 idx = (wc.h << 8) + wc.l;
79 return (unicodeisalpha(idx) || (wordchars_utf16 && flag_bsearch(wordchars_utf16, *((unsigned short *) &wc), wclen)));
// (*w + 256) % 256 turns a possibly negative char value into a table index in 0..255.
81 return wordcharacters[(*w + 256) % 256];
// get_latin1: linear search of the LATIN1 table for an entry that is a prefix of s.
// Only strlen(LATIN1[i]) bytes are compared, so s may continue past the matched entry.
// Returns the matching table entry when one is found.
// NOTE(review): the not-found return is outside this excerpt; callers test the result
// for truthiness, so it is presumably NULL. Confirm.
85 const char * TextParser::get_latin1(char * s)
// strncmp yields 0 on a match, which stops the scan at the first matching index.
89 while ((i < LATIN1_LEN) &&
90 strncmp(LATIN1[i], s, strlen(LATIN1[i]))) i++;
91 if (i != LATIN1_LEN) return LATIN1[i];
// init(const char *): set up the parser for an 8-bit word-character set.
// wordchars: string whose bytes are to be treated as word characters; when null, a
// built-in set of the 52 ASCII letters is used instead.
// NOTE(review): the body of the MAXPREVLINE loop is not visible here; presumably it
// resets the stored previous lines. Confirm.
96 void TextParser::init(const char * wordchars)
98 for (int i = 0; i < MAXPREVLINE; i++) {
// Clear the whole 256-entry lookup table before marking the requested bytes.
108 for (j = 0; j < 256; j++) {
109 wordcharacters[j] = 0;
111 if (!wordchars) wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";
// Mark each byte of wordchars; (c + 256) % 256 maps a possibly negative char to 0..255.
// Note that strlen(wordchars) is re-evaluated on every iteration of this loop.
112 for (j = 0; j < strlen(wordchars); j++) {
113 wordcharacters[(wordchars[j] + 256) % 256] = 1;
// init(unsigned short *, int): set up the parser for a 16-bit word-character list.
// wc is stored as-is in wordchars_utf16 (the pointer is kept, not copied).
// NOTE(review): the handling of `len` and the body of the MAXPREVLINE loop are not
// visible in this excerpt; presumably len is stored in wclen, which is_wordchar passes
// to flag_bsearch. Confirm.
117 void TextParser::init(unsigned short * wc, int len)
119 for (int i = 0; i < MAXPREVLINE; i++) {
128 wordchars_utf16 = wc;
// next_char: advance *pos to the next character of line.
// Returns 1 when *pos already points at the terminating NUL.
// NOTE(review): the single-byte advance and the normal return value are not visible in
// this excerpt; presumably (*pos)++ and 0. Confirm.
132 int TextParser::next_char(char * line, int * pos) {
133 if (*(line + *pos) == '\0') return 1;
// A byte with its high bit set is treated as the start of a multi-byte sequence.
135 if (*(line + *pos) >> 7) {
136 // jump to next UTF-8 character
// Step past the lead byte, then skip every continuation byte (bit pattern 10xxxxxx).
137 for((*pos)++; (*(line + *pos) & 0xc0) == 0x80; (*pos)++);
// put_line: store a new current line in the circular buffer of MAXPREVLINE lines.
// Advances `actual` with wrap-around, then copies word into line[actual].
// NOTE(review): strcpy performs no length check and the size of line[actual] is not
// visible here; confirm callers cannot pass a string longer than the buffer.
145 void TextParser::put_line(char * word)
147 actual = (actual + 1) % MAXPREVLINE;
148 strcpy(line[actual], word);
// get_prevline: return a copy, made with mystrdup, of the line stored n entries before
// the current one in the circular buffer (n == 0 is the current line).
// Adding MAXPREVLINE before the modulo keeps the index non-negative for n <= actual + MAXPREVLINE.
// NOTE(review): mystrdup presumably heap-allocates, making the caller responsible for
// releasing the result. Confirm against csutil.
154 char * TextParser::get_prevline(int n)
156 return mystrdup(line[(actual + MAXPREVLINE - n) % MAXPREVLINE]);
// get_line: return the current line; equivalent to get_prevline(0).
159 char * TextParser::get_line()
161 return get_prevline(0);
// next_token: scan line[actual] from `head` and produce the next word token.
// Returns NULL when next_char reports the end of the line. The visible code shows a
// state machine whose state 0 handles non-word characters.
// NOTE(review): the switch header, the other case labels, state transitions and the
// return of the extracted token are not visible in this excerpt; comments below cover
// only the lines shown.
164 char * TextParser::next_token()
171 case 0: // non word chars
// A word character or a LATIN1 entity at `head` is tested here; for an entity, head is
// advanced by the entity's length.
172 if (is_wordchar(line[actual] + head)) {
175 } else if ((latin1 = get_latin1(line[actual] + head))) {
178 head += strlen(latin1);
// Second group of tests (its case label is not visible; presumably the in-word state).
// A LATIN1 entity is skipped over by its full length.
182 if ((latin1 = get_latin1(line[actual] + head))) {
183 head += strlen(latin1);
// ASCII apostrophe at head followed by a word character, when either apostrophe form
// is configured as a word character.
184 } else if ((is_wordchar((char *) APOSTROPHE) || (is_utf8() && is_wordchar((char *) UTF8_APOS))) && line[actual][head] == '\'' &&
185 is_wordchar(line[actual] + head + 1)) {
// UTF-8 apostrophe at head followed by a word character, in UTF-8 mode with the ASCII
// apostrophe configured as a word character.
187 } else if (is_utf8() && is_wordchar((char *) APOSTROPHE) && // add Unicode apostrophe to the WORDCHARS, if needed
188 strncmp(line[actual] + head, UTF8_APOS, strlen(UTF8_APOS)) == 0 &&
189 is_wordchar(line[actual] + head + strlen(UTF8_APOS))) {
// Advance to the last byte of the apostrophe sequence; presumably the next_char call
// at the bottom of the loop steps past that final byte. Confirm.
190 head += strlen(UTF8_APOS) - 1;
// First non-word character: extract the text between `token` and `head`.
191 } else if (! is_wordchar(line[actual] + head)) {
193 char * t = alloc_token(token, &head);
198 if (next_char(line[actual], &head)) return NULL;
202 int TextParser::get_tokenpos()
// change_token: replace the current token in line[actual] with word.
// The text from `head` onward is saved with mystrdup, word is written starting at
// offset `token`, and the saved tail is appended after it.
// NOTE(review): strcpy/strcat perform no length check and the size of line[actual] is
// not visible here; a replacement longer than the original token could overrun it.
// NOTE(review): release of `r`, any null check on it, and the return value are outside
// this excerpt. Confirm.
207 int TextParser::change_token(const char * word)
210 char * r = mystrdup(line[actual] + head);
211 strcpy(line[actual] + token, word);
212 strcat(line[actual], r);
// check_urls: scan line[actual] with url_head to find URL- or path-like spans.
// Returns when next_char reports the end of the line.
// NOTE(review): the switch header, state variables, the if-statement that opens the
// ":\\" / "://" test, and the body of the marking loop are not visible in this
// excerpt; presumably the loop flags positions url_token..url_head-1 in urlline.
// Confirm against the full file.
220 void TextParser::check_urls()
229 case 0: // non word chars
// A candidate span starts at a word character or at '/'; its start is kept in url_token.
230 if (is_wordchar(line[actual] + url_head)) {
232 url_token = url_head;
234 } else if (*(line[actual] + url_head) == '/') {
236 url_token = url_head;
// ch is the byte at the scan position; ":\" and "://" at this position are tested
// below, followed by the set of characters allowed to continue a candidate span.
241 char ch = *(line[actual] + url_head);
244 // MS-DOS, Windows path
245 (strncmp(line[actual] + url_head, ":\\", 2) == 0) ||
247 (strncmp(line[actual] + url_head, "://", 3) == 0)) {
249 } else if (! (is_wordchar(line[actual] + url_head) ||
250 (ch == '-') || (ch == '_') || (ch == '\\') ||
251 (ch == '.') || (ch == ':') || (ch == '/') ||
252 (ch == '~') || (ch == '%') || (ch == '*') ||
253 (ch == '$') || (ch == '[') || (ch == ']') ||
254 (ch == '?') || (ch == '!') ||
255 ((ch >= '0') && (ch <= '9')))) {
258 for (int i = url_token; i < url_head; i++) {
// The urlline entry at the current scan position is cleared before advancing.
266 *(urlline + url_head) = 0;
267 if (next_char(line[actual], &url_head)) return;
// get_url: advance *head across a run of nonzero urlline entries and report the
// urlline entry at token_pos.
// token_pos: index into urlline for the start of the token being examined.
// head: in/out position; incremented once for each position i, starting at *head,
//       where both urlline[i] and the corresponding byte of line[actual] are nonzero.
// Returns 0 whenever checkurl is set; otherwise returns urlline[token_pos].
// NOTE(review): urlline is presumably the per-position flag array filled by
// check_urls, with nonzero meaning "part of a URL". Confirm.
271 int TextParser::get_url(int token_pos, int * head)
273 for (int i = *head; urlline[i] && *(line[actual]+i); i++, (*head)++);
274 return checkurl ? 0 : urlline[token_pos];
277 void TextParser::set_url_checking(int check)
// alloc_token: copy the text line[actual][token .. *head) into a newly malloc'ed,
// NUL-terminated string.
// Returns NULL when get_url reports a nonzero value for this token.
// NOTE(review): the null check on malloc, the success return and any cleanup after the
// colon removal are not visible in this excerpt; the error message at the end is
// presumably the allocation-failure path. Confirm.
283 char * TextParser::alloc_token(int token, int * head)
// get_url receives a local copy of *head, so the caller's head is not modified by
// this call.
285 int url_head = *head;
286 if (get_url(token, &url_head)) return NULL;
287 char * t = (char *) malloc(*head - token + 1);
// The terminator is written first because strncpy copies exactly *head - token bytes
// and does not add one itself in that case.
289 t[*head - token] = '\0';
290 strncpy(t, line[actual] + token, *head - token);
291 // remove colon for Finnish and Swedish language
// NOTE(review): when *head == token this indexes t[-1]; confirm callers never request
// an empty token.
292 if (t[*head - token - 1] == ':') {
293 t[*head - token - 1] = '\0';
301 fprintf(stderr,"Error - Insufficient Memory\n");