src/tex2lyx/texparser.C

   1 /**
   2  * \file texparser.C
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author André Pönitz
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  */
  10
  11 #include <config.h>
  12
  13 #include "texparser.h"
  14
  15 #include <iostream>
  16 #include <sstream>
  17
  18 using std::cerr;
  19 using std::endl;
  20 using std::fill;
  21 using std::istream;
  22 using std::istringstream;
  23 using std::ostream;
  24 using std::string;
  25
  26
  27 namespace {
  28
  29 CatCode theCatcode[256];
  30
  31 void catInit()
  32 {
  33         fill(theCatcode, theCatcode + 256, catOther);
  34         fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
  35         fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
  36
  37         theCatcode[int('\\')] = catEscape;
  38         theCatcode[int('{')]  = catBegin;
  39         theCatcode[int('}')]  = catEnd;
  40         theCatcode[int('$')]  = catMath;
  41         theCatcode[int('&')]  = catAlign;
  42         theCatcode[10]   = catNewline;
  43         theCatcode[int('#')]  = catParameter;
  44         theCatcode[int('^')]  = catSuper;
  45         theCatcode[int('_')]  = catSub;
  46         theCatcode[0x7f] = catIgnore;
  47         theCatcode[int(' ')]  = catSpace;
  48         theCatcode[int('\t')] = catSpace;
  49         theCatcode[13]   = catIgnore;
  50         theCatcode[int('~')]  = catActive;
  51         theCatcode[int('%')]  = catComment;
  52
  53         // This is wrong!
  54         theCatcode[int('@')]  = catLetter;
  55 }
  56
  57 }
  58
  59
  60 //
  61 // catcodes
  62 //
  63
  64 CatCode catcode(unsigned char c)
  65 {
  66         return theCatcode[c];
  67 }
  68
  69
  70
  71 //
  72 // Token
  73 //
  74
  75 ostream & operator<<(ostream & os, Token const & t)
  76 {
  77         if (t.cat() == catComment)
  78                 os << '%' << t.cs() << '\n';
  79         else if (t.cat() == catSpace)
  80                 os << t.cs();
  81         else if (t.cat() == catEscape)
  82                 os << '\\' << t.cs() << ' ';
  83         else if (t.cat() == catLetter)
  84                 os << t.character();
  85         else if (t.cat() == catNewline)
  86                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
  87         else
  88                 os << '[' << t.character() << ',' << t.cat() << ']';
  89         return os;
  90 }
  91
  92
  93 string Token::asString() const
  94 {
  95         return cs_.size() ? cs_ : string(1, char_);
  96 }
  97
  98
  99 string Token::asInput() const
 100 {
 101         if (cat_ == catComment)
 102                 return '%' + cs_ + '\n';
 103         if (cat_ == catSpace || cat_ == catNewline)
 104                 return cs_;
 105         return char_ ? string(1, char_) : '\\' + cs_;
 106 }
 107
 108
 109 //
 110 // Parser
 111 //
 112
 113
 114 Parser::Parser(istream & is)
 115         : lineno_(0), pos_(0)
 116 {
 117         tokenize(is);
 118 }
 119
 120
 121 Parser::Parser(string const & s)
 122         : lineno_(0), pos_(0)
 123 {
 124         istringstream is(s);
 125         tokenize(is);
 126 }
 127
 128
 129 void Parser::push_back(Token const & t)
 130 {
 131         tokens_.push_back(t);
 132 }
 133
 134
 135 void Parser::pop_back()
 136 {
 137         tokens_.pop_back();
 138 }
 139
 140
 141 Token const & Parser::prev_token() const
 142 {
 143         static const Token dummy;
 144         return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
 145 }
 146
 147
 148 Token const & Parser::curr_token() const
 149 {
 150         static const Token dummy;
 151         return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
 152 }
 153
 154
 155 Token const & Parser::next_token() const
 156 {
 157         static const Token dummy;
 158         return good() ? tokens_[pos_] : dummy;
 159 }
 160
 161
 162 Token const & Parser::get_token()
 163 {
 164         static const Token dummy;
 165         //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
 166         return good() ? tokens_[pos_++] : dummy;
 167 }
 168
 169
 170 bool Parser::isParagraph() const
 171 {
 172         // A new paragraph in TeX ist started
 173         // - either by a newline, following any amount of whitespace
 174         //   characters (including zero), and another newline
 175         // - or the token \par
 176         if (curr_token().cat() == catNewline &&
 177             (curr_token().cs().size() > 1 ||
 178              (next_token().cat() == catSpace &&
 179               pos_ < tokens_.size() - 1 &&
 180               tokens_[pos_ + 1].cat() == catNewline)))
 181                 return true;
 182         if (curr_token().cat() == catEscape && curr_token().cs() == "par")
 183                 return true;
 184         return false;
 185 }
 186
 187
 188 void Parser::skip_spaces(bool skip_comments)
 189 {
 190         // We just silently return if we have no more tokens.
 191         // skip_spaces() should be callable at any time,
 192         // the caller must check p::good() anyway.
 193         while (good()) {
 194                 get_token();
 195                 if (isParagraph()) {
 196                         putback();
 197                         break;
 198                 }
 199                 if ( curr_token().cat() == catSpace ||
 200                      curr_token().cat() == catNewline ||
 201                     (curr_token().cat() == catComment && curr_token().cs().empty()))
 202                         continue;
 203                 if (skip_comments && curr_token().cat() == catComment)
 204                         cerr << "  Ignoring comment: " << curr_token().asInput();
 205                 else {
 206                         putback();
 207                         break;
 208                 }
 209         }
 210 }
 211
 212
 213 void Parser::unskip_spaces(bool skip_comments)
 214 {
 215         while (pos_ > 0) {
 216                 if ( curr_token().cat() == catSpace ||
 217                     (curr_token().cat() == catNewline && curr_token().cs().size() == 1))
 218                         putback();
 219                 else if (skip_comments && curr_token().cat() == catComment) {
 220                         // TODO: Get rid of this
 221                         cerr << "Unignoring comment: " << curr_token().asInput();
 222                         putback();
 223                 }
 224                 else
 225                         break;
 226         }
 227 }
 228
 229
 230 void Parser::putback()
 231 {
 232         --pos_;
 233 }
 234
 235
 236 bool Parser::good() const
 237 {
 238         return pos_ < tokens_.size();
 239 }
 240
 241
 242 char Parser::getChar()
 243 {
 244         if (!good())
 245                 error("The input stream is not well...");
 246         return tokens_[pos_++].character();
 247 }
 248
 249
 250 string Parser::getArg(char left, char right)
 251 {
 252         skip_spaces(true);
 253
 254         // This is needed if a partial file ends with a command without arguments,
 255         // e. g. \medskip
 256         if (! good())
 257                 return string();
 258
 259         string result;
 260         char c = getChar();
 261
 262         if (c != left)
 263                 putback();
 264         else
 265                 while ((c = getChar()) != right && good()) {
 266                         // Ignore comments
 267                         if (curr_token().cat() == catComment) {
 268                                 if (!curr_token().cs().empty())
 269                                         cerr << "Ignoring comment: " << curr_token().asInput();
 270                         }
 271                         else
 272                                 result += curr_token().asInput();
 273                 }
 274
 275         return result;
 276 }
 277
 278
 279 string Parser::getOpt()
 280 {
 281         string const res = getArg('[', ']');
 282         return res.size() ? '[' + res + ']' : string();
 283 }
 284
 285
 286 void Parser::tokenize(istream & is)
 287 {
 288         static bool init_done = false;
 289
 290         if (!init_done) {
 291                 catInit();
 292                 init_done = true;
 293         }
 294
 295         char c;
 296         while (is.get(c)) {
 297                 //cerr << "reading c: " << c << "\n";
 298
 299                 switch (catcode(c)) {
 300                         case catSpace: {
 301                                 string s(1, c);
 302                                 while (is.get(c) && catcode(c) == catSpace)
 303                                         s += c;
 304                                 if (catcode(c) != catSpace)
 305                                         is.putback(c);
 306                                 push_back(Token(s, catSpace));
 307                                 break;
 308                         }
 309
 310                         case catNewline: {
 311                                 ++lineno_;
 312                                 string s(1, c);
 313                                 while (is.get(c) && catcode(c) == catNewline) {
 314                                         ++lineno_;
 315                                         s += c;
 316                                 }
 317                                 if (catcode(c) != catNewline)
 318                                         is.putback(c);
 319                                 push_back(Token(s, catNewline));
 320                                 break;
 321                         }
 322
 323                         case catComment: {
 324                                 // We don't treat "%\n" combinations here specially because
 325                                 // we want to preserve them in the preamble
 326                                 string s;
 327                                 while (is.get(c) && catcode(c) != catNewline)
 328                                         s += c;
 329                                 // Note: The '%' at the beginning and the '\n' at the end
 330                                 // of the comment are not stored.
 331                                 ++lineno_;
 332                                 push_back(Token(s, catComment));
 333                                 break;
 334                         }
 335
 336                         case catEscape: {
 337                                 is.get(c);
 338                                 if (!is) {
 339                                         error("unexpected end of input");
 340                                 } else {
 341                                         string s(1, c);
 342                                         if (catcode(c) == catLetter) {
 343                                                 // collect letters
 344                                                 while (is.get(c) && catcode(c) == catLetter)
 345                                                         s += c;
 346                                                 if (catcode(c) != catLetter)
 347                                                         is.putback(c);
 348                                         }
 349                                         push_back(Token(s, catEscape));
 350                                 }
 351                                 break;
 352                         }
 353
 354                         case catIgnore: {
 355                                 if (c != 13)
 356                                         cerr << "ignoring a char: " << int(c) << "\n";
 357                                 break;
 358                         }
 359
 360                         default:
 361                                 push_back(Token(c, catcode(c)));
 362                 }
 363         }
 364 }
 365
 366
 367 void Parser::dump() const
 368 {
 369         cerr << "\nTokens: ";
 370         for (unsigned i = 0; i < tokens_.size(); ++i) {
 371                 if (i == pos_)
 372                         cerr << " <#> ";
 373                 cerr << tokens_[i];
 374         }
 375         cerr << " pos: " << pos_ << "\n";
 376 }
 377
 378
 379 void Parser::error(string const & msg)
 380 {
 381         cerr << "Line ~" << lineno_ << ":  parse error: " << msg << endl;
 382         dump();
 383         //exit(1);
 384 }
 385
 386
 387 string Parser::verbatimOption()
 388 {
 389         string res;
 390         if (next_token().character() == '[') {
 391                 Token t = get_token();
 392                 for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
 393                         if (t.cat() == catBegin) {
 394                                 putback();
 395                                 res += '{' + verbatim_item() + '}';
 396                         } else
 397                                 res += t.asString();
 398                 }
 399         }
 400         return res;
 401 }
 402
 403
 404 string Parser::verbatim_item()
 405 {
 406         if (!good())
 407                 error("stream bad");
 408         skip_spaces();
 409         if (next_token().cat() == catBegin) {
 410                 Token t = get_token(); // skip brace
 411                 string res;
 412                 for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
 413                         if (t.cat() == catBegin) {
 414                                 putback();
 415                                 res += '{' + verbatim_item() + '}';
 416                         }
 417                         else
 418                                 res += t.asInput();
 419                 }
 420                 return res;
 421         }
 422         return get_token().asInput();
 423 }
 424
 425
 426 void Parser::reset()
 427 {
 428         pos_ = 0;
 429 }
 430
 431
 432 void Parser::setCatCode(char c, CatCode cat)
 433 {
 434         theCatcode[(unsigned char)c] = cat;
 435 }
 436
 437
 438 CatCode Parser::getCatCode(char c) const
 439 {
 440         return theCatcode[(unsigned char)c];
 441 }