src/tex2lyx/Parser.cpp

   1 /**
   2  * \file Parser.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author André Pönitz
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  */
  10
  11 #include <config.h>
  12
  13 #include "Parser.h"
  14
  15 #include <iostream>
  16 #include <sstream>
  17
  18 using namespace std;
  19
  20 namespace lyx {
  21
  22 namespace {
  23
  24 CatCode theCatcode[256];
  25
  26 void catInit()
  27 {
  28         static bool init_done = false;
  29         if (init_done)
  30                 return;
  31         init_done = true;
  32
  33         fill(theCatcode, theCatcode + 256, catOther);
  34         fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
  35         fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
  36
  37         theCatcode[int('\\')] = catEscape;
  38         theCatcode[int('{')]  = catBegin;
  39         theCatcode[int('}')]  = catEnd;
  40         theCatcode[int('$')]  = catMath;
  41         theCatcode[int('&')]  = catAlign;
  42         theCatcode[int('\n')] = catNewline;
  43         theCatcode[int('#')]  = catParameter;
  44         theCatcode[int('^')]  = catSuper;
  45         theCatcode[int('_')]  = catSub;
  46         theCatcode[0x7f]      = catIgnore;
  47         theCatcode[int(' ')]  = catSpace;
  48         theCatcode[int('\t')] = catSpace;
  49         theCatcode[int('\r')] = catNewline;
  50         theCatcode[int('~')]  = catActive;
  51         theCatcode[int('%')]  = catComment;
  52
  53         // This is wrong!
  54         theCatcode[int('@')]  = catLetter;
  55 }
  56
  57
  58 /*!
  59  * Translate a line ending to '\n'.
  60  * \p c must have catcode catNewline, and it must be the last character read
  61  * from \p is.
  62  */
  63 char getNewline(istream & is, char c)
  64 {
  65         // we have to handle 3 different line endings:
  66         // - UNIX (\n)
  67         // - MAC  (\r)
  68         // - DOS  (\r\n)
  69         if (c == '\r') {
  70                 // MAC or DOS
  71                 if (is.get(c) && c != '\n') {
  72                         // MAC
  73                         is.putback(c);
  74                 }
  75                 return '\n';
  76         }
  77         // UNIX
  78         return c;
  79 }
  80
  81 }
  82
  83
  84 //
  85 // catcodes
  86 //
  87
  88 CatCode catcode(unsigned char c)
  89 {
  90         return theCatcode[c];
  91 }
  92
  93
  94
  95 //
  96 // Token
  97 //
  98
  99 ostream & operator<<(ostream & os, Token const & t)
 100 {
 101         if (t.cat() == catComment)
 102                 os << '%' << t.cs() << '\n';
 103         else if (t.cat() == catSpace)
 104                 os << t.cs();
 105         else if (t.cat() == catEscape)
 106                 os << '\\' << t.cs() << ' ';
 107         else if (t.cat() == catLetter)
 108                 os << t.character();
 109         else if (t.cat() == catNewline)
 110                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
 111         else
 112                 os << '[' << t.character() << ',' << t.cat() << ']';
 113         return os;
 114 }
 115
 116
 117 string Token::asString() const
 118 {
 119         return cs_.size() ? cs_ : string(1, char_);
 120 }
 121
 122
 123 string Token::asInput() const
 124 {
 125         if (cat_ == catComment)
 126                 return '%' + cs_ + '\n';
 127         if (cat_ == catSpace || cat_ == catNewline)
 128                 return cs_;
 129         return char_ ? string(1, char_) : '\\' + cs_;
 130 }
 131
 132
 133 //
 134 // Parser
 135 //
 136
 137
 138 Parser::Parser(istream & is)
 139         : lineno_(0), pos_(0), iss_(0), is_(is)
 140 {
 141         tokenize();
 142 }
 143
 144
 145 Parser::Parser(string const & s)
 146         : lineno_(0), pos_(0), iss_(new istringstream(s)), is_(*iss_)
 147 {
 148         tokenize();
 149 }
 150
 151
 152 Parser::~Parser()
 153 {
 154         delete iss_;
 155 }
 156
 157
 158 void Parser::push_back(Token const & t)
 159 {
 160         tokens_.push_back(t);
 161 }
 162
 163
 164 Token const & Parser::prev_token() const
 165 {
 166         static const Token dummy;
 167         return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
 168 }
 169
 170
 171 Token const & Parser::curr_token() const
 172 {
 173         static const Token dummy;
 174         return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
 175 }
 176
 177
 178 Token const & Parser::next_token() const
 179 {
 180         static const Token dummy;
 181         return good() ? tokens_[pos_] : dummy;
 182 }
 183
 184
 185 Token const & Parser::get_token()
 186 {
 187         static const Token dummy;
 188         //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
 189         return good() ? tokens_[pos_++] : dummy;
 190 }
 191
 192
 193 bool Parser::isParagraph() const
 194 {
 195         // A new paragraph in TeX ist started
 196         // - either by a newline, following any amount of whitespace
 197         //   characters (including zero), and another newline
 198         // - or the token \par
 199         if (curr_token().cat() == catNewline &&
 200             (curr_token().cs().size() > 1 ||
 201              (next_token().cat() == catSpace &&
 202               pos_ < tokens_.size() - 1 &&
 203               tokens_[pos_ + 1].cat() == catNewline)))
 204                 return true;
 205         if (curr_token().cat() == catEscape && curr_token().cs() == "par")
 206                 return true;
 207         return false;
 208 }
 209
 210
 211 void Parser::skip_spaces(bool skip_comments)
 212 {
 213         // We just silently return if we have no more tokens.
 214         // skip_spaces() should be callable at any time,
 215         // the caller must check p::good() anyway.
 216         while (good()) {
 217                 get_token();
 218                 if (isParagraph()) {
 219                         putback();
 220                         break;
 221                 }
 222                 if ( curr_token().cat() == catSpace ||
 223                      curr_token().cat() == catNewline ||
 224                     (curr_token().cat() == catComment && curr_token().cs().empty()))
 225                         continue;
 226                 if (skip_comments && curr_token().cat() == catComment)
 227                         cerr << "  Ignoring comment: " << curr_token().asInput();
 228                 else {
 229                         putback();
 230                         break;
 231                 }
 232         }
 233 }
 234
 235
 236 void Parser::unskip_spaces(bool skip_comments)
 237 {
 238         while (pos_ > 0) {
 239                 if ( curr_token().cat() == catSpace ||
 240                     (curr_token().cat() == catNewline && curr_token().cs().size() == 1))
 241                         putback();
 242                 else if (skip_comments && curr_token().cat() == catComment) {
 243                         // TODO: Get rid of this
 244                         cerr << "Unignoring comment: " << curr_token().asInput();
 245                         putback();
 246                 }
 247                 else
 248                         break;
 249         }
 250 }
 251
 252
 253 void Parser::putback()
 254 {
 255         --pos_;
 256 }
 257
 258
 259 bool Parser::good() const
 260 {
 261         return pos_ < tokens_.size();
 262 }
 263
 264
 265 char Parser::getChar()
 266 {
 267         if (!good())
 268                 error("The input stream is not well...");
 269         return tokens_[pos_++].character();
 270 }
 271
 272
 273 Parser::Arg Parser::getFullArg(char left, char right)
 274 {
 275         skip_spaces(true);
 276
 277         // This is needed if a partial file ends with a command without arguments,
 278         // e. g. \medskip
 279         if (! good())
 280                 return make_pair(false, string());
 281
 282         string result;
 283         char c = getChar();
 284
 285         if (c != left) {
 286                 putback();
 287                 return make_pair(false, string());
 288         } else
 289                 while ((c = getChar()) != right && good()) {
 290                         // Ignore comments
 291                         if (curr_token().cat() == catComment) {
 292                                 if (!curr_token().cs().empty())
 293                                         cerr << "Ignoring comment: " << curr_token().asInput();
 294                         }
 295                         else
 296                                 result += curr_token().asInput();
 297                 }
 298
 299         return make_pair(true, result);
 300 }
 301
 302
 303 string Parser::getArg(char left, char right)
 304 {
 305         return getFullArg(left, right).second;
 306 }
 307
 308
 309 string Parser::getFullOpt()
 310 {
 311         Arg arg = getFullArg('[', ']');
 312         if (arg.first)
 313                 return '[' + arg.second + ']';
 314         return string();
 315 }
 316
 317
 318 string Parser::getOpt()
 319 {
 320         string const res = getArg('[', ']');
 321         return res.empty() ? string() : '[' + res + ']';
 322 }
 323
 324
 325 string Parser::getFullParentheseArg()
 326 {
 327         Arg arg = getFullArg('(', ')');
 328         if (arg.first)
 329                 return '(' + arg.second + ')';
 330         return string();
 331 }
 332
 333
 334 string const Parser::verbatimEnvironment(string const & name)
 335 {
 336         if (!good())
 337                 return string();
 338
 339         ostringstream os;
 340         for (Token t = get_token(); good(); t = get_token()) {
 341                 if (t.cat() == catBegin) {
 342                         putback();
 343                         os << '{' << verbatim_item() << '}';
 344                 } else if (t.asInput() == "\\begin") {
 345                         string const env = getArg('{', '}');
 346                         os << "\\begin{" << env << '}'
 347                            << verbatimEnvironment(env)
 348                            << "\\end{" << env << '}';
 349                 } else if (t.asInput() == "\\end") {
 350                         string const end = getArg('{', '}');
 351                         if (end != name)
 352                                 cerr << "\\end{" << end
 353                                      << "} does not match \\begin{" << name
 354                                      << "}." << endl;
 355                         return os.str();
 356                 } else
 357                         os << t.asInput();
 358         }
 359         cerr << "unexpected end of input" << endl;
 360         return os.str();
 361 }
 362
 363
 364 void Parser::tokenize_one()
 365 {
 366         catInit();
 367         char c;
 368         if (!is_.get(c))
 369                 return;
 370         //cerr << "reading c: " << c << "\n";
 371
 372         switch (catcode(c)) {
 373         case catSpace: {
 374                 string s(1, c);
 375                 while (is_.get(c) && catcode(c) == catSpace)
 376                         s += c;
 377                 if (catcode(c) != catSpace)
 378                         is_.putback(c);
 379                 push_back(Token(s, catSpace));
 380                 break;
 381         }
 382
 383         case catNewline: {
 384                 ++lineno_;
 385                 string s(1, getNewline(is_, c));
 386                 while (is_.get(c) && catcode(c) == catNewline) {
 387                         ++lineno_;
 388                         s += getNewline(is_, c);
 389                 }
 390                 if (catcode(c) != catNewline)
 391                         is_.putback(c);
 392                 push_back(Token(s, catNewline));
 393                 break;
 394         }
 395
 396         case catComment: {
 397                 // We don't treat "%\n" combinations here specially because
 398                 // we want to preserve them in the preamble
 399                 string s;
 400                 while (is_.get(c) && catcode(c) != catNewline)
 401                         s += c;
 402                 // handle possible DOS line ending
 403                 if (catcode(c) == catNewline)
 404                         c = getNewline(is_, c);
 405                 // Note: The '%' at the beginning and the '\n' at the end
 406                 // of the comment are not stored.
 407                 ++lineno_;
 408                 push_back(Token(s, catComment));
 409                 break;
 410         }
 411
 412         case catEscape: {
 413                 is_.get(c);
 414                 if (!is_) {
 415                         error("unexpected end of input");
 416                 } else {
 417                         string s(1, c);
 418                         if (catcode(c) == catLetter) {
 419                                 // collect letters
 420                                 while (is_.get(c) && catcode(c) == catLetter)
 421                                         s += c;
 422                                 if (catcode(c) != catLetter)
 423                                         is_.putback(c);
 424                         }
 425                         push_back(Token(s, catEscape));
 426                 }
 427                 break;
 428         }
 429
 430         case catIgnore: {
 431                 cerr << "ignoring a char: " << int(c) << "\n";
 432                 break;
 433         }
 434
 435         default:
 436                 push_back(Token(c, catcode(c)));
 437         }
 438 }
 439
 440
 441 void Parser::tokenize()
 442 {
 443         while (is_)
 444                 tokenize_one();
 445 }
 446
 447
 448 void Parser::dump() const
 449 {
 450         cerr << "\nTokens: ";
 451         for (unsigned i = 0; i < tokens_.size(); ++i) {
 452                 if (i == pos_)
 453                         cerr << " <#> ";
 454                 cerr << tokens_[i];
 455         }
 456         cerr << " pos: " << pos_ << "\n";
 457 }
 458
 459
 460 void Parser::error(string const & msg)
 461 {
 462         cerr << "Line ~" << lineno_ << ":  parse error: " << msg << endl;
 463         dump();
 464         //exit(1);
 465 }
 466
 467
 468 string Parser::verbatimOption()
 469 {
 470         string res;
 471         if (next_token().character() == '[') {
 472                 Token t = get_token();
 473                 for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
 474                         if (t.cat() == catBegin) {
 475                                 putback();
 476                                 res += '{' + verbatim_item() + '}';
 477                         } else
 478                                 res += t.asString();
 479                 }
 480         }
 481         return res;
 482 }
 483
 484
 485 string Parser::verbatim_item()
 486 {
 487         if (!good())
 488                 error("stream bad");
 489         skip_spaces();
 490         if (next_token().cat() == catBegin) {
 491                 Token t = get_token(); // skip brace
 492                 string res;
 493                 for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
 494                         if (t.cat() == catBegin) {
 495                                 putback();
 496                                 res += '{' + verbatim_item() + '}';
 497                         }
 498                         else
 499                                 res += t.asInput();
 500                 }
 501                 return res;
 502         }
 503         return get_token().asInput();
 504 }
 505
 506
 507 void Parser::reset()
 508 {
 509         pos_ = 0;
 510 }
 511
 512
 513 void Parser::setCatCode(char c, CatCode cat)
 514 {
 515         theCatcode[(unsigned char)c] = cat;
 516 }
 517
 518
 519 CatCode Parser::getCatCode(char c) const
 520 {
 521         return theCatcode[(unsigned char)c];
 522 }
 523
 524
 525 } // namespace lyx