src/tex2lyx/Parser.cpp

   1 /**
   2  * \file Parser.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author André Pönitz
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  */
  10
  11 #include <config.h>
  12
  13 #include "Encoding.h"
  14 #include "Parser.h"
  15 #include "support/lstrings.h"
  16 #include "support/textutils.h"
  17
  18 #include <iostream>
  19
  20 using namespace std;
  21 using namespace lyx::support;
  22
  23 namespace lyx {
  24
  25 namespace {
  26
  27 /*!
  28  * Translate a line ending to '\n'.
  29  * \p c must have catcode catNewline, and it must be the last character read
  30  * from \p is.
  31  */
  32 char_type getNewline(idocstream & is, char_type c)
  33 {
  34         // we have to handle 3 different line endings:
  35         // - UNIX (\n)
  36         // - MAC  (\r)
  37         // - DOS  (\r\n)
  38         if (c == '\r') {
  39                 // MAC or DOS
  40                 char_type wc;
  41                 if (is.get(wc) && wc != '\n') {
  42                         // MAC
  43                         is.putback(wc);
  44                 }
  45                 return '\n';
  46         }
  47         // UNIX
  48         return c;
  49 }
  50
  51 }
  52
  53 //
  54 // Token
  55 //
  56
  57 ostream & operator<<(ostream & os, Token const & t)
  58 {
  59         if (t.cat() == catComment)
  60                 os << '%' << t.cs() << '\n';
  61         else if (t.cat() == catSpace)
  62                 os << t.cs();
  63         else if (t.cat() == catEscape)
  64                 os << '\\' << t.cs() << ' ';
  65         else if (t.cat() == catLetter)
  66                 os << t.cs();
  67         else if (t.cat() == catNewline)
  68                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
  69         else
  70                 os << '[' << t.cs() << ',' << t.cat() << ']';
  71         return os;
  72 }
  73
  74
  75 string Token::asInput() const
  76 {
  77         if (cat_ == catComment)
  78                 return '%' + cs_ + '\n';
  79         if (cat_ == catEscape)
  80                 return '\\' + cs_;
  81         return cs_;
  82 }
  83
  84
  85 bool Token::isAlnumASCII() const
  86 {
  87         return cat_ == catLetter ||
  88                (cat_ == catOther && cs_.length() == 1 && isDigitASCII(cs_[0]));
  89 }
  90
  91
  92 #ifdef FILEDEBUG
  93 void debugToken(std::ostream & os, Token const & t, unsigned int flags)
  94 {
  95         char sep = ' ';
  96         os << "t: " << t << " flags: " << flags;
  97         if (flags & FLAG_BRACE_LAST) { os << sep << "BRACE_LAST"; sep = '|'; }
  98         if (flags & FLAG_RIGHT     ) { os << sep << "RIGHT"     ; sep = '|'; }
  99         if (flags & FLAG_END       ) { os << sep << "END"       ; sep = '|'; }
 100         if (flags & FLAG_BRACK_LAST) { os << sep << "BRACK_LAST"; sep = '|'; }
 101         if (flags & FLAG_TEXTMODE  ) { os << sep << "TEXTMODE"  ; sep = '|'; }
 102         if (flags & FLAG_ITEM      ) { os << sep << "ITEM"      ; sep = '|'; }
 103         if (flags & FLAG_LEAVE     ) { os << sep << "LEAVE"     ; sep = '|'; }
 104         if (flags & FLAG_SIMPLE    ) { os << sep << "SIMPLE"    ; sep = '|'; }
 105         if (flags & FLAG_EQUATION  ) { os << sep << "EQUATION"  ; sep = '|'; }
 106         if (flags & FLAG_SIMPLE2   ) { os << sep << "SIMPLE2"   ; sep = '|'; }
 107         if (flags & FLAG_OPTION    ) { os << sep << "OPTION"    ; sep = '|'; }
 108         if (flags & FLAG_BRACED    ) { os << sep << "BRACED"    ; sep = '|'; }
 109         if (flags & FLAG_CELL      ) { os << sep << "CELL"      ; sep = '|'; }
 110         if (flags & FLAG_TABBING   ) { os << sep << "TABBING"   ; sep = '|'; }
 111         os << "\n";
 112 }
 113 #endif
 114
 115
 116 //
 117 // Parser
 118 //
 119
 120
/*!
 * Create a parser that reads from the caller-supplied stream \p is.
 * No private stream is allocated (iss_ stays null). The encoding name
 * starts as "UTF-8" and the catcode regime as NORMAL_CATCODES; curr_cat_
 * starts as UNDECIDED_CATCODES, so the first catInit() call sees a
 * mismatch and builds the catcode table.
 */
Parser::Parser(idocstream & is)
        : lineno_(0), pos_(0), iss_(0), is_(is), encoding_iconv_("UTF-8"),
          theCatcodesType_(NORMAL_CATCODES), curr_cat_(UNDECIDED_CATCODES)
{
}
 126
 127
/*!
 * Create a parser that reads from the string \p s.
 * \p s is converted with from_utf8() and wrapped in a stream allocated
 * here; the parser owns that stream (iss_) and reads through is_, which
 * refers to it. The destructor deletes the stream. All other members
 * start with the same values as in the stream constructor.
 */
Parser::Parser(string const & s)
        : lineno_(0), pos_(0),
          iss_(new idocstringstream(from_utf8(s))), is_(*iss_),
          encoding_iconv_("UTF-8"),
          theCatcodesType_(NORMAL_CATCODES), curr_cat_(UNDECIDED_CATCODES)
{
}
 135
 136
/*!
 * Destroy the stream allocated by the string constructor.
 * iss_ is null when the parser was built on an external stream, in which
 * case the delete does nothing.
 */
Parser::~Parser()
{
        delete iss_;
}
 141
 142
 143 void Parser::setEncoding(std::string const & e, int const & p)
 144 {
 145         // We may (and need to) use unsafe encodings here: Since the text is
 146         // converted to unicode while reading from is_, we never see text in
 147         // the original encoding of the parser, but operate on utf8 strings
 148         // instead. Therefore, we cannot misparse high bytes as {, } or \\.
 149         Encoding const * const enc = encodings.fromLaTeXName(e, p, true);
 150         if (!enc) {
 151                 cerr << "Unknown encoding " << e << ". Ignoring." << std::endl;
 152                 return;
 153         }
 154         setEncoding(enc->iconvName());
 155 }
 156
 157
 158 void Parser::catInit()
 159 {
 160         if (curr_cat_ == theCatcodesType_)
 161                 return;
 162         curr_cat_ = theCatcodesType_;
 163
 164         fill(theCatcode_, theCatcode_ + 256, catOther);
 165         fill(theCatcode_ + 'a', theCatcode_ + 'z' + 1, catLetter);
 166         fill(theCatcode_ + 'A', theCatcode_ + 'Z' + 1, catLetter);
 167         // This is wrong!
 168         theCatcode_[int('@')]  = catLetter;
 169
 170         if (theCatcodesType_ == NORMAL_CATCODES) {
 171                 theCatcode_[int('\\')] = catEscape;
 172                 theCatcode_[int('{')]  = catBegin;
 173                 theCatcode_[int('}')]  = catEnd;
 174                 theCatcode_[int('$')]  = catMath;
 175                 theCatcode_[int('&')]  = catAlign;
 176                 theCatcode_[int('\n')] = catNewline;
 177                 theCatcode_[int('#')]  = catParameter;
 178                 theCatcode_[int('^')]  = catSuper;
 179                 theCatcode_[int('_')]  = catSub;
 180                 theCatcode_[0x7f]      = catIgnore;
 181                 theCatcode_[int(' ')]  = catSpace;
 182                 theCatcode_[int('\t')] = catSpace;
 183                 theCatcode_[int('\r')] = catNewline;
 184                 theCatcode_[int('~')]  = catActive;
 185                 theCatcode_[int('%')]  = catComment;
 186         }
 187 }
 188
 189 CatCode Parser::catcode(char_type c) const
 190 {
 191         if (c < 256)
 192                 return theCatcode_[(unsigned char)c];
 193         return catOther;
 194 }
 195
 196
 197 void Parser::setCatcode(char c, CatCode cat)
 198 {
 199         theCatcode_[(unsigned char)c] = cat;
 200 }
 201
 202
/*!
 * Select the catcode regime \p t. The table itself is not touched here;
 * it is rebuilt by catInit(), which tokenize_one() calls before reading,
 * so tokens that are already in the token list keep their old catcodes.
 */
void Parser::setCatcodes(cat_type t)
{
        theCatcodesType_ = t;
}
 207
 208
/*!
 * Apply the encoding \p e to the input stream and remember its name.
 * \p e is handed to lyx::setEncoding(), so it is presumably expected to
 * be an iconv encoding name (the two-argument overload passes
 * Encoding::iconvName()) -- TODO confirm.
 */
void Parser::setEncoding(std::string const & e)
{
        //cerr << "setting encoding to " << e << std::endl;
        // Stream manipulator provided by the project's docstream code.
        is_ << lyx::setEncoding(e);
        encoding_iconv_ = e;
}
 215
 216
/*!
 * Append \p t to the end of the token list. The read position (pos_) is
 * not changed.
 */
void Parser::push_back(Token const & t)
{
        tokens_.push_back(t);
}
 221
 222
 223 // We return a copy here because the tokens_ vector may get reallocated
 224 Token const Parser::prev_token() const
 225 {
 226         static const Token dummy;
 227         return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
 228 }
 229
 230
 231 // We return a copy here because the tokens_ vector may get reallocated
 232 Token const Parser::curr_token() const
 233 {
 234         static const Token dummy;
 235         return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
 236 }
 237
 238
 239 // We return a copy here because the tokens_ vector may get reallocated
 240 Token const Parser::next_token()
 241 {
 242         static const Token dummy;
 243         return good() ? tokens_[pos_] : dummy;
 244 }
 245
 246
 247 // We return a copy here because the tokens_ vector may get reallocated
 248 Token const Parser::next_next_token()
 249 {
 250         static const Token dummy;
 251         // If good() has not been called after the last get_token() we need
 252         // to tokenize two more tokens.
 253         if (pos_ + 1 >= tokens_.size()) {
 254                 tokenize_one();
 255                 tokenize_one();
 256         }
 257         return pos_ + 1 < tokens_.size() ? tokens_[pos_ + 1] : dummy;
 258 }
 259
 260
 261 // We return a copy here because the tokens_ vector may get reallocated
 262 Token const Parser::get_token()
 263 {
 264         static const Token dummy;
 265         //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
 266         return good() ? tokens_[pos_++] : dummy;
 267 }
 268
 269
 270 bool Parser::isParagraph()
 271 {
 272         // A new paragraph in TeX ist started
 273         // - either by a newline, following any amount of whitespace
 274         //   characters (including zero), and another newline
 275         // - or the token \par
 276         if (curr_token().cat() == catNewline &&
 277             (curr_token().cs().size() > 1 ||
 278              (next_token().cat() == catSpace &&
 279               next_next_token().cat() == catNewline)))
 280                 return true;
 281         if (curr_token().cat() == catEscape && curr_token().cs() == "par")
 282                 return true;
 283         return false;
 284 }
 285
 286
 287 bool Parser::skip_spaces(bool skip_comments)
 288 {
 289         // We just silently return if we have no more tokens.
 290         // skip_spaces() should be callable at any time,
 291         // the caller must check p::good() anyway.
 292         bool skipped = false;
 293         while (good()) {
 294                 get_token();
 295                 if (isParagraph()) {
 296                         putback();
 297                         break;
 298                 }
 299                 if (curr_token().cat() == catSpace ||
 300                     curr_token().cat() == catNewline) {
 301                         skipped = true;
 302                         continue;
 303                 }
 304                 if ((curr_token().cat() == catComment && curr_token().cs().empty()))
 305                         continue;
 306                 if (skip_comments && curr_token().cat() == catComment) {
 307                         // If positions_ is not empty we are doing some kind
 308                         // of look ahead
 309                         if (!positions_.empty())
 310                                 cerr << "  Ignoring comment: "
 311                                      << curr_token().asInput();
 312                 } else {
 313                         putback();
 314                         break;
 315                 }
 316         }
 317         return skipped;
 318 }
 319
 320
 321 void Parser::unskip_spaces(bool skip_comments)
 322 {
 323         while (pos_ > 0) {
 324                 if ( curr_token().cat() == catSpace ||
 325                     (curr_token().cat() == catNewline && curr_token().cs().size() == 1))
 326                         putback();
 327                 else if (skip_comments && curr_token().cat() == catComment) {
 328                         // TODO: Get rid of this
 329                         // If positions_ is not empty we are doing some kind
 330                         // of look ahead
 331                         if (!positions_.empty())
 332                                 cerr << "Unignoring comment: "
 333                                      << curr_token().asInput();
 334                         putback();
 335                 }
 336                 else
 337                         break;
 338         }
 339 }
 340
 341
/*!
 * Step the read position back by one token, so that the token fetched
 * last is delivered again.
 * NOTE(review): there is no check for pos_ == 0; the caller has to make
 * sure that a token was actually fetched before.
 */
void Parser::putback()
{
        --pos_;
}
 346
 347
/*!
 * Remember the current read position on the position stack so that
 * popPosition() can return to it. skip_spaces() and unskip_spaces()
 * take a non-empty stack as a sign that some look ahead is in progress.
 */
void Parser::pushPosition()
{
        positions_.push_back(pos_);
}
 352
 353
/*!
 * Return to the position stored by the most recent pushPosition() and
 * remove it from the position stack.
 * NOTE(review): the stack is not checked for emptiness; presumably each
 * call has to be paired with an earlier pushPosition() -- confirm.
 */
void Parser::popPosition()
{
        pos_ = positions_.back();
        positions_.pop_back();
}
 359
 360
 361 bool Parser::good()
 362 {
 363         if (pos_ < tokens_.size())
 364                 return true;
 365         tokenize_one();
 366         return pos_ < tokens_.size();
 367 }
 368
 369
/*!
 * Fetch the next token and return its character().
 * If no token is available this is reported through error(), which only
 * prints a message; the function then carries on and returns the
 * character() of the default constructed token that get_token() yields.
 */
char Parser::getChar()
{
        if (!good())
                error("The input stream is not well...");
        return get_token().character();
}
 376
 377
 378 bool Parser::hasOpt()
 379 {
 380         // An optional argument can occur in any of the following forms:
 381         // - \foo[bar]
 382         // - \foo [bar]
 383         // - \foo
 384         //   [bar]
 385         // - \foo %comment
 386         //   [bar]
 387
 388         // remember current position
 389         unsigned int oldpos = pos_;
 390         // skip spaces and comments
 391         while (good()) {
 392                 get_token();
 393                 if (isParagraph()) {
 394                         putback();
 395                         break;
 396                 }
 397                 if (curr_token().cat() == catSpace ||
 398                     curr_token().cat() == catNewline ||
 399                     curr_token().cat() == catComment)
 400                         continue;
 401                 putback();
 402                 break;
 403         }
 404         bool const retval = (next_token().asInput() == "[");
 405         pos_ = oldpos;
 406         return retval;
 407 }
 408
 409
 410 Parser::Arg Parser::getFullArg(char left, char right, bool allow_escaping)
 411 {
 412         skip_spaces(true);
 413
 414         // This is needed if a partial file ends with a command without arguments,
 415         // e. g. \medskip
 416         if (! good())
 417                 return make_pair(false, string());
 418
 419         string result;
 420         Token t = get_token();
 421
 422         if (t.cat() == catComment || t.cat() == catEscape ||
 423             t.character() != left) {
 424                 putback();
 425                 return make_pair(false, string());
 426         } else {
 427                 for (t = get_token(); good(); t = get_token()) {
 428                         // Ignore comments
 429                         if (t.cat() == catComment) {
 430                                 if (!t.cs().empty())
 431                                         cerr << "Ignoring comment: " << t.asInput();
 432                                 continue;
 433                         }
 434                         if (allow_escaping) {
 435                                 if (t.cat() != catEscape && t.character() == right)
 436                                         break;
 437                         } else {
 438                                 if (t.character() == right) {
 439                                         if (t.cat() == catEscape)
 440                                                 result += '\\';
 441                                         break;
 442                                 }
 443                         }
 444                         result += t.asInput();
 445                 }
 446         }
 447         return make_pair(true, result);
 448 }
 449
 450
 451 string Parser::getArg(char left, char right, bool allow_escaping)
 452 {
 453         return getFullArg(left, right, allow_escaping).second;
 454 }
 455
 456
 457 string Parser::getFullOpt(bool keepws)
 458 {
 459         Arg arg = getFullArg('[', ']');
 460         if (arg.first)
 461                 return '[' + arg.second + ']';
 462         if (keepws)
 463                 unskip_spaces(true);
 464         return string();
 465 }
 466
 467
 468 string Parser::getOpt(bool keepws)
 469 {
 470         string const res = getArg('[', ']');
 471         if (res.empty()) {
 472                 if (keepws)
 473                         unskip_spaces(true);
 474                 return string();
 475         }
 476         return '[' + res + ']';
 477 }
 478
 479
 480 string Parser::getFullParentheseArg()
 481 {
 482         Arg arg = getFullArg('(', ')');
 483         if (arg.first)
 484                 return '(' + arg.second + ')';
 485         return string();
 486 }
 487
 488
 489 string const Parser::ertEnvironment(string const & name)
 490 {
 491         if (!good())
 492                 return string();
 493
 494         ostringstream os;
 495         for (Token t = get_token(); good(); t = get_token()) {
 496                 if (t.cat() == catBegin) {
 497                         putback();
 498                         os << '{' << verbatim_item() << '}';
 499                 } else if (t.asInput() == "\\begin") {
 500                         string const env = getArg('{', '}');
 501                         os << "\\begin{" << env << '}'
 502                            << ertEnvironment(env)
 503                            << "\\end{" << env << '}';
 504                 } else if (t.asInput() == "\\end") {
 505                         string const end = getArg('{', '}');
 506                         if (end != name)
 507                                 cerr << "\\end{" << end
 508                                      << "} does not match \\begin{" << name
 509                                      << "}." << endl;
 510                         return os.str();
 511                 } else
 512                         os << t.asInput();
 513         }
 514         cerr << "unexpected end of input" << endl;
 515         return os.str();
 516 }
 517
 518
 519 string const Parser::plainEnvironment(string const & name)
 520 {
 521         if (!good())
 522                 return string();
 523
 524         ostringstream os;
 525         for (Token t = get_token(); good(); t = get_token()) {
 526                 if (t.asInput() == "\\end") {
 527                         string const end = getArg('{', '}');
 528                         if (end == name)
 529                                 return os.str();
 530                         else
 531                                 os << "\\end{" << end << '}';
 532                 } else
 533                         os << t.asInput();
 534         }
 535         cerr << "unexpected end of input" << endl;
 536         return os.str();
 537 }
 538
 539
 540 string const Parser::plainCommand(char left, char right, string const & name)
 541 {
 542         if (!good())
 543                 return string();
 544         // check if first token is really the start character
 545         Token tok = get_token();
 546         if (tok.character() != left) {
 547                 cerr << "first character does not match start character of command \\" << name << endl;
 548                 return string();
 549         }
 550         ostringstream os;
 551         for (Token t = get_token(); good(); t = get_token()) {
 552                 if (t.character() == right) {
 553                         return os.str();
 554                 } else
 555                         os << t.asInput();
 556         }
 557         cerr << "unexpected end of input" << endl;
 558         return os.str();
 559 }
 560
 561
 562 string const Parser::verbatimStuff(string const & end_string)
 563 {
 564         if (!good())
 565                 return string();
 566
 567         ostringstream oss;
 568         size_t match_index = 0;
 569         setCatcodes(VERBATIM_CATCODES);
 570         for (Token t = get_token(); good(); t = get_token()) {
 571                 // FIXME t.asInput() might be longer than we need ?
 572                 if (t.asInput() == end_string.substr(match_index,
 573                                                      t.asInput().length())) {
 574                         match_index += t.asInput().length();
 575                         if (match_index >= end_string.length())
 576                                 break;
 577                 } else if (match_index) {
 578                         oss << end_string.substr(0, match_index) << t.asInput();
 579                         match_index = 0;
 580                 } else
 581                         oss << t.asInput();
 582         }
 583         setCatcodes(NORMAL_CATCODES);
 584         if (!good())
 585                 cerr << "unexpected end of input" << endl;
 586         return oss.str();
 587 }
 588
 589
 590 string const Parser::verbatimEnvironment(string const & name)
 591 {
 592         string s = verbatimStuff("\\end{" + name + "}");
 593         // ignore one newline at beginning or end of string
 594         if (prefixIs(s, "\n"))
 595                 s.erase(0,1);
 596         if (suffixIs(s, "\n"))
 597                 s.erase(s.length() - 1,1);
 598         return s;
 599 }
 600
 601
/*!
 * Read characters from the input stream and append at most one token to
 * the token list. Nothing is appended if the stream cannot be read, if
 * the character read is an ignored one (catIgnore), or if the input ends
 * right after an escape character.
 * The catcode table is brought up to date first, so a regime selected
 * with setCatcodes() takes effect with the next token read here.
 */
void Parser::tokenize_one()
{
        catInit();
        char_type c;
        if (!is_.get(c))
                return;

        switch (catcode(c)) {
        case catSpace: {
                // A run of space characters becomes a single token.
                docstring s(1, c);
                while (is_.get(c) && catcode(c) == catSpace)
                        s += c;
                // Give back the character that ended the run.
                // NOTE(review): after a failed get() this relies on c still
                // holding the last space, so that nothing is put back --
                // confirm.
                if (catcode(c) != catSpace)
                        is_.putback(c);
                push_back(Token(s, catSpace));
                break;
        }

        case catNewline: {
                // Adjacent line endings become a single token that holds one
                // character per line ending, each normalized by
                // getNewline(). isParagraph() uses the length of the token
                // to detect empty lines.
                ++lineno_;
                docstring s(1, getNewline(is_, c));
                while (is_.get(c) && catcode(c) == catNewline) {
                        ++lineno_;
                        s += getNewline(is_, c);
                }
                // Give back the character that ended the run.
                if (catcode(c) != catNewline)
                        is_.putback(c);
                push_back(Token(s, catNewline));
                break;
        }

        case catComment: {
                // We don't treat "%\n" combinations here specially because
                // we want to preserve them in the preamble
                docstring s;
                while (is_.get(c) && catcode(c) != catNewline)
                        s += c;
                // handle possible DOS line ending
                // (the call is made for its effect on the stream, the value
                // stored in c is not used afterwards)
                if (catcode(c) == catNewline)
                        c = getNewline(is_, c);
                // Note: The '%' at the beginning and the '\n' at the end
                // of the comment are not stored.
                // The line counter is advanced even if the comment was ended
                // by the end of the input instead of a line ending.
                ++lineno_;
                push_back(Token(s, catComment));
                break;
        }

        case catEscape: {
                // The name of a control sequence is either a run of letters
                // or one single character of any other kind. The escape
                // character itself is not stored.
                is_.get(c);
                if (!is_) {
                        error("unexpected end of input");
                } else {
                        docstring s(1, c);
                        if (catcode(c) == catLetter) {
                                // collect letters
                                while (is_.get(c) && catcode(c) == catLetter)
                                        s += c;
                                // Give back the character that ended the name.
                                if (catcode(c) != catLetter)
                                        is_.putback(c);
                        }
                        push_back(Token(s, catEscape));
                }
                break;
        }

        case catIgnore: {
                // The character is consumed, but no token is created.
                cerr << "ignoring a char: " << c << "\n";
                break;
        }

        default:
                // Every other character is a token of its own.
                push_back(Token(docstring(1, c), catcode(c)));
        }
        //cerr << tokens_.back();
}
 677
 678
 679 void Parser::dump() const
 680 {
 681         cerr << "\nTokens: ";
 682         for (unsigned i = 0; i < tokens_.size(); ++i) {
 683                 if (i == pos_)
 684                         cerr << " <#> ";
 685                 cerr << tokens_[i];
 686         }
 687         cerr << " pos: " << pos_ << "\n";
 688 }
 689
 690
/*!
 * Report the parse error \p msg on cerr together with the approximate
 * line number, followed by a dump of the token list.
 * Parsing is not aborted (the exit() call is commented out), so callers
 * carry on after reporting.
 */
void Parser::error(string const & msg)
{
        cerr << "Line ~" << lineno_ << ":  parse error: " << msg << endl;
        dump();
        //exit(1);
}
 697
 698
 699 string Parser::verbatimOption()
 700 {
 701         string res;
 702         if (next_token().character() == '[') {
 703                 Token t = get_token();
 704                 for (t = get_token(); t.character() != ']' && good(); t = get_token()) {
 705                         if (t.cat() == catBegin) {
 706                                 putback();
 707                                 res += '{' + verbatim_item() + '}';
 708                         } else
 709                                 res += t.cs();
 710                 }
 711         }
 712         return res;
 713 }
 714
 715
 716 string Parser::verbatim_item()
 717 {
 718         if (!good())
 719                 error("stream bad");
 720         skip_spaces();
 721         if (next_token().cat() == catBegin) {
 722                 Token t = get_token(); // skip brace
 723                 string res;
 724                 for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
 725                         if (t.cat() == catBegin) {
 726                                 putback();
 727                                 res += '{' + verbatim_item() + '}';
 728                         }
 729                         else
 730                                 res += t.asInput();
 731                 }
 732                 return res;
 733         }
 734         return get_token().asInput();
 735 }
 736
 737
/*!
 * Move the read position back to the first token. Tokens that were
 * already read stay in the token list and are delivered again.
 */
void Parser::reset()
{
        pos_ = 0;
}
 742
 743
 744 } // namespace lyx