src/3rdparty/hunspell/1.3.3/src/parsers/textparser.cxx

   1 #include <cstdlib>
   2 #include <cstring>
   3 #include <cstdio>
   4 #include <ctype.h>
   5
   6 #include "../hunspell/csutil.hxx"
   7 #include "textparser.hxx"
   8
   9 #ifndef W32
  10 using namespace std;
  11 #endif
  12
  13 // ISO-8859-1 HTML character entities
  14
  15 static const char * LATIN1[] = {
  16         "&Agrave;",
  17         "&Atilde;",
  18         "&Aring;",
  19         "&AElig;",
  20         "&Egrave;",
  21         "&Ecirc;",
  22         "&Igrave;",
  23         "&Iuml;",
  24         "&ETH;",
  25         "&Ntilde;",
  26         "&Ograve;",
  27         "&Oslash;",
  28         "&Ugrave;",
  29         "&THORN;",
  30         "&agrave;",
  31         "&atilde;",
  32         "&aring;",
  33         "&aelig;",
  34         "&egrave;",
  35         "&ecirc;",
  36         "&igrave;",
  37         "&iuml;",
  38         "&eth;",
  39         "&ntilde;",
  40         "&ograve;",
  41         "&oslash;",
  42         "&ugrave;",
  43         "&thorn;",
  44         "&yuml;"
  45 };
  46
  47 #define LATIN1_LEN (sizeof(LATIN1) / sizeof(char *))
  48
  49 #define ENTITY_APOS "&apos;"
  50 #define UTF8_APOS "\xe2\x80\x99"
  51 #define APOSTROPHE "'"
  52
  53 TextParser::TextParser() {
  54         init((char *) NULL);
  55 }
  56
  57 TextParser::TextParser(const char * wordchars)
  58 {
  59         init(wordchars);
  60 }
  61
  62 TextParser::TextParser(unsigned short * wordchars, int len)
  63 {
  64         init(wordchars, len);
  65 }
  66
  67 TextParser::~TextParser()
  68 {
  69 }
  70
  71 int TextParser::is_wordchar(char * w)
  72 {
  73         if (*w == '\0') return 0;
  74         if (utf8) {
  75                 w_char wc;
  76                 unsigned short idx;
  77                 u8_u16(&wc, 1, w);
  78                 idx = (wc.h << 8) + wc.l;
  79                 return (unicodeisalpha(idx) || (wordchars_utf16 && flag_bsearch(wordchars_utf16, *((unsigned short *) &wc), wclen)));
  80         } else {
  81                 return wordcharacters[(*w + 256) % 256];
  82         }
  83 }
  84
  85 const char * TextParser::get_latin1(char * s)
  86 {
  87         if (s[0] == '&') {
  88                 unsigned int i = 0;
  89                 while ((i < LATIN1_LEN) &&
  90                         strncmp(LATIN1[i], s, strlen(LATIN1[i]))) i++;
  91                 if (i != LATIN1_LEN) return LATIN1[i];
  92         }
  93         return NULL;
  94 }
  95
  96 void TextParser::init(const char * wordchars)
  97 {
  98         for (int i = 0; i < MAXPREVLINE; i++) {
  99                 line[i][0] = '\0';
 100         }
 101         actual = 0;
 102         head = 0;
 103         token = 0;
 104         state = 0;
 105         utf8 = 0;
 106         checkurl = 0;
 107         unsigned int j;
 108         for (j = 0; j < 256; j++) {
 109                 wordcharacters[j] = 0;
 110         }
 111         if (!wordchars) wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";
 112         for (j = 0; j < strlen(wordchars); j++) {
 113                 wordcharacters[(wordchars[j] + 256) % 256] = 1;
 114         }
 115 }
 116
 117 void TextParser::init(unsigned short * wc, int len)
 118 {
 119         for (int i = 0; i < MAXPREVLINE; i++) {
 120                 line[i][0] = '\0';
 121         }
 122         actual = 0;
 123         head = 0;
 124         token = 0;
 125         state = 0;
 126         utf8 = 1;
 127         checkurl = 0;
 128         wordchars_utf16 = wc;
 129         wclen = len;
 130 }
 131
 132 int TextParser::next_char(char * line, int * pos) {
 133         if (*(line + *pos) == '\0') return 1;
 134         if (utf8) {
 135             if (*(line + *pos) >> 7) {
 136                 // jump to next UTF-8 character
 137                 for((*pos)++; (*(line + *pos) & 0xc0) == 0x80; (*pos)++);
 138             } else {
 139                 (*pos)++;
 140             }
 141         } else (*pos)++;
 142         return 0;
 143 }
 144
 145 void TextParser::put_line(char * word)
 146 {
 147         actual = (actual + 1) % MAXPREVLINE;
 148         strcpy(line[actual], word);
 149         token = 0;
 150         head = 0;
 151         check_urls();
 152 }
 153
 154 char * TextParser::get_prevline(int n)
 155 {
 156         return mystrdup(line[(actual + MAXPREVLINE - n) % MAXPREVLINE]);
 157 }
 158
 159 char * TextParser::get_line()
 160 {
 161         return get_prevline(0);
 162 }
 163
 164 char * TextParser::next_token()
 165 {
 166         const char * latin1;
 167
 168         for (;;) {
 169                 switch (state)
 170                 {
 171                 case 0: // non word chars
 172                         if (is_wordchar(line[actual] + head)) {
 173                                 state = 1;
 174                                 token = head;
 175                         } else if ((latin1 = get_latin1(line[actual] + head))) {
 176                                 state = 1;
 177                                 token = head;
 178                                 head += strlen(latin1);
 179                         }
 180                         break;
 181                 case 1: // wordchar
 182                         if ((latin1 = get_latin1(line[actual] + head))) {
 183                                 head += strlen(latin1);
 184                         } else if ((is_wordchar((char *) APOSTROPHE) || (is_utf8() && is_wordchar((char *) UTF8_APOS))) && line[actual][head] == '\'' &&
 185                                         is_wordchar(line[actual] + head + 1)) {
 186                                 head++;
 187                         } else if (is_utf8() && is_wordchar((char *) APOSTROPHE) && // add Unicode apostrophe to the WORDCHARS, if needed
 188                                         strncmp(line[actual] + head, UTF8_APOS, strlen(UTF8_APOS)) == 0 &&
 189                                         is_wordchar(line[actual] + head + strlen(UTF8_APOS))) {
 190                                 head += strlen(UTF8_APOS) - 1;
 191                         } else if (! is_wordchar(line[actual] + head)) {
 192                                 state = 0;
 193                                 char * t = alloc_token(token, &head);
 194                                 if (t) return t;
 195                         }
 196                         break;
 197                 }
 198                 if (next_char(line[actual], &head)) return NULL;
 199         }
 200 }
 201
 202 int TextParser::get_tokenpos()
 203 {
 204         return token;
 205 }
 206
 207 int TextParser::change_token(const char * word)
 208 {
 209         if (word) {
 210                 char * r = mystrdup(line[actual] + head);
 211                 strcpy(line[actual] + token, word);
 212                 strcat(line[actual], r);
 213                 head = token;
 214                 free(r);
 215                 return 1;
 216         }
 217         return 0;
 218 }
 219
 220 void TextParser::check_urls()
 221 {
 222         int url_state = 0;
 223         int url_head = 0;
 224         int url_token = 0;
 225         int url = 0;
 226         for (;;) {
 227                 switch (url_state)
 228                 {
 229                 case 0: // non word chars
 230                         if (is_wordchar(line[actual] + url_head)) {
 231                                 url_state = 1;
 232                                 url_token = url_head;
 233                         // Unix path
 234                         } else if (*(line[actual] + url_head) == '/') {
 235                                 url_state = 1;
 236                                 url_token = url_head;
 237                                 url = 1;
 238                         }
 239                         break;
 240                 case 1: // wordchar
 241                         char ch = *(line[actual] + url_head);
 242                         // e-mail address
 243                         if ((ch == '@') ||
 244                             // MS-DOS, Windows path
 245                             (strncmp(line[actual] + url_head, ":\\", 2) == 0) ||
 246                             // URL
 247                             (strncmp(line[actual] + url_head, "://", 3) == 0)) {
 248                                 url = 1;
 249                         } else if (! (is_wordchar(line[actual] + url_head) ||
 250                           (ch == '-') || (ch == '_') || (ch == '\\') ||
 251                           (ch == '.') || (ch == ':') || (ch == '/') ||
 252                           (ch == '~') || (ch == '%') || (ch == '*') ||
 253                           (ch == '$') || (ch == '[') || (ch == ']') ||
 254                           (ch == '?') || (ch == '!') ||
 255                           ((ch >= '0') && (ch <= '9')))) {
 256                                 url_state = 0;
 257                                 if (url == 1) {
 258                                         for (int i = url_token; i < url_head; i++) {
 259                                                 *(urlline + i) = 1;
 260                                         }
 261                                 }
 262                                 url = 0;
 263                         }
 264                         break;
 265                 }
 266                 *(urlline + url_head) = 0;
 267                 if (next_char(line[actual], &url_head)) return;
 268         }
 269 }
 270
 271 int TextParser::get_url(int token_pos, int * head)
 272 {
 273         for (int i = *head; urlline[i] && *(line[actual]+i); i++, (*head)++);
 274         return checkurl ? 0 : urlline[token_pos];
 275 }
 276
 277 void TextParser::set_url_checking(int check)
 278 {
 279         checkurl = check;
 280 }
 281
 282
 283 char * TextParser::alloc_token(int token, int * head)
 284 {
 285     int url_head = *head;
 286     if (get_url(token, &url_head)) return NULL;
 287     char * t = (char *) malloc(*head - token + 1);
 288     if (t) {
 289         t[*head - token] = '\0';
 290         strncpy(t, line[actual] + token, *head - token);
 291         // remove colon for Finnish and Swedish language
 292         if (t[*head - token - 1] == ':') {
 293             t[*head - token - 1] = '\0';
 294             if (!t[0]) {
 295                 free(t);
 296                 return NULL;
 297             }
 298         }
 299         return t;
 300     }
 301     fprintf(stderr,"Error - Insufficient Memory\n");
 302     return NULL;
 303 }