src/tex2lyx/Parser.cpp

   1 /**
   2  * \file Parser.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author André Pönitz
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  */
  10
  11 #include <config.h>
  12
  13 #include "Encoding.h"
  14 #include "Parser.h"
  15 #include "support/foreach.h"
  16 #include "support/lstrings.h"
  17 #include "support/textutils.h"
  18
  19 #include <iostream>
  20
  21 using namespace std;
  22 using namespace lyx::support;
  23
  24 namespace lyx {
  25
  26 namespace {
  27
  28 /*!
  29  * Translate a line ending to '\n'.
  30  * \p c must have catcode catNewline, and it must be the last character read
  31  * from \p is.
  32  */
  33 char_type getNewline(iparserdocstream & is, char_type c)
  34 {
  35         // we have to handle 3 different line endings:
  36         // - UNIX (\n)
  37         // - MAC  (\r)
  38         // - DOS  (\r\n)
  39         if (c == '\r') {
  40                 // MAC or DOS
  41                 char_type wc;
  42                 if (is.get(wc) && wc != '\n') {
  43                         // MAC
  44                         is.putback(wc);
  45                 }
  46                 return '\n';
  47         }
  48         // UNIX
  49         return c;
  50 }
  51
  52 }
  53
  54 //
  55 // Token
  56 //
  57
  58 ostream & operator<<(ostream & os, Token const & t)
  59 {
  60         if (t.cat() == catComment)
  61                 os << '%' << t.cs() << '\n';
  62         else if (t.cat() == catSpace)
  63                 os << t.cs();
  64         else if (t.cat() == catEscape)
  65                 os << '\\' << t.cs() << ' ';
  66         else if (t.cat() == catLetter)
  67                 os << t.cs();
  68         else if (t.cat() == catNewline)
  69                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
  70         else
  71                 os << '[' << t.cs() << ',' << t.cat() << ']';
  72         return os;
  73 }
  74
  75
  76 string Token::asInput() const
  77 {
  78         if (cat_ == catComment)
  79                 return '%' + cs_ + '\n';
  80         if (cat_ == catEscape)
  81                 return '\\' + cs_;
  82         return cs_;
  83 }
  84
  85
  86 bool Token::isAlnumASCII() const
  87 {
  88         return cat_ == catLetter ||
  89                (cat_ == catOther && cs_.length() == 1 && isDigitASCII(cs_[0]));
  90 }
  91
  92
  93 #ifdef FILEDEBUG
  94 void debugToken(std::ostream & os, Token const & t, unsigned int flags)
  95 {
  96         char sep = ' ';
  97         os << "t: " << t << " flags: " << flags;
  98         if (flags & FLAG_BRACE_LAST) { os << sep << "BRACE_LAST"; sep = '|'; }
  99         if (flags & FLAG_RIGHT     ) { os << sep << "RIGHT"     ; sep = '|'; }
 100         if (flags & FLAG_END       ) { os << sep << "END"       ; sep = '|'; }
 101         if (flags & FLAG_BRACK_LAST) { os << sep << "BRACK_LAST"; sep = '|'; }
 102         if (flags & FLAG_TEXTMODE  ) { os << sep << "TEXTMODE"  ; sep = '|'; }
 103         if (flags & FLAG_ITEM      ) { os << sep << "ITEM"      ; sep = '|'; }
 104         if (flags & FLAG_LEAVE     ) { os << sep << "LEAVE"     ; sep = '|'; }
 105         if (flags & FLAG_SIMPLE    ) { os << sep << "SIMPLE"    ; sep = '|'; }
 106         if (flags & FLAG_EQUATION  ) { os << sep << "EQUATION"  ; sep = '|'; }
 107         if (flags & FLAG_SIMPLE2   ) { os << sep << "SIMPLE2"   ; sep = '|'; }
 108         if (flags & FLAG_OPTION    ) { os << sep << "OPTION"    ; sep = '|'; }
 109         if (flags & FLAG_BRACED    ) { os << sep << "BRACED"    ; sep = '|'; }
 110         if (flags & FLAG_CELL      ) { os << sep << "CELL"      ; sep = '|'; }
 111         if (flags & FLAG_TABBING   ) { os << sep << "TABBING"   ; sep = '|'; }
 112         os << "\n";
 113 }
 114 #endif
 115
 116
 117 //
 118 // Parser
 119 //
 120
 121
 122 Parser::Parser(idocstream & is)
 123         : lineno_(0), pos_(0), iss_(0), is_(is), encoding_iconv_("UTF-8"),
 124           theCatcodesType_(NORMAL_CATCODES), curr_cat_(UNDECIDED_CATCODES)
 125 {
 126 }
 127
 128
 129 Parser::Parser(string const & s)
 130         : lineno_(0), pos_(0),
 131           iss_(new idocstringstream(from_utf8(s))), is_(*iss_),
 132           encoding_iconv_("UTF-8"),
 133           theCatcodesType_(NORMAL_CATCODES), curr_cat_(UNDECIDED_CATCODES)
 134 {
 135 }
 136
 137
 138 Parser::~Parser()
 139 {
 140         delete iss_;
 141 }
 142
 143
 144 void Parser::deparse()
 145 {
 146         for(size_type i = pos_ ; i < tokens_.size() ; ++i) {
 147                 docstring const s = from_utf8(tokens_[i].asInput());
 148                 //cerr << "deparsing [" << to_utf8(s) << "]" <<endl;
 149                 foreach(char_type c, s)
 150                         is_.putback(c);
 151         }
 152         tokens_.erase(tokens_.begin() + pos_, tokens_.end());
 153         // make sure that next token is read
 154         tokenize_one();
 155 }
 156
 157
 158 void Parser::setEncoding(std::string const & e, int const & p)
 159 {
 160         // We may (and need to) use unsafe encodings here: Since the text is
 161         // converted to unicode while reading from is_, we never see text in
 162         // the original encoding of the parser, but operate on utf8 strings
 163         // instead. Therefore, we cannot misparse high bytes as {, } or \\.
 164         Encoding const * const enc = encodings.fromLaTeXName(e, p, true);
 165         if (!enc) {
 166                 cerr << "Unknown encoding " << e << ". Ignoring." << std::endl;
 167                 return;
 168         }
 169         setEncoding(enc->iconvName());
 170 }
 171
 172
 173 void Parser::catInit()
 174 {
 175         if (curr_cat_ == theCatcodesType_)
 176                 return;
 177         curr_cat_ = theCatcodesType_;
 178
 179         fill(theCatcode_, theCatcode_ + 256, catOther);
 180         fill(theCatcode_ + 'a', theCatcode_ + 'z' + 1, catLetter);
 181         fill(theCatcode_ + 'A', theCatcode_ + 'Z' + 1, catLetter);
 182         // This is wrong!
 183         theCatcode_[int('@')]  = catLetter;
 184
 185         if (theCatcodesType_ == NORMAL_CATCODES) {
 186                 theCatcode_[int('\\')] = catEscape;
 187                 theCatcode_[int('{')]  = catBegin;
 188                 theCatcode_[int('}')]  = catEnd;
 189                 theCatcode_[int('$')]  = catMath;
 190                 theCatcode_[int('&')]  = catAlign;
 191                 theCatcode_[int('\n')] = catNewline;
 192                 theCatcode_[int('#')]  = catParameter;
 193                 theCatcode_[int('^')]  = catSuper;
 194                 theCatcode_[int('_')]  = catSub;
 195                 theCatcode_[0x7f]      = catIgnore;
 196                 theCatcode_[int(' ')]  = catSpace;
 197                 theCatcode_[int('\t')] = catSpace;
 198                 theCatcode_[int('\r')] = catNewline;
 199                 theCatcode_[int('~')]  = catActive;
 200                 theCatcode_[int('%')]  = catComment;
 201         }
 202 }
 203
 204 CatCode Parser::catcode(char_type c) const
 205 {
 206         if (c < 256)
 207                 return theCatcode_[(unsigned char)c];
 208         return catOther;
 209 }
 210
 211
 212 void Parser::setCatcode(char c, CatCode cat)
 213 {
 214         theCatcode_[(unsigned char)c] = cat;
 215         deparse();
 216 }
 217
 218
 219 void Parser::setCatcodes(cat_type t)
 220 {
 221         theCatcodesType_ = t;
 222         deparse();
 223 }
 224
 225
 226 void Parser::setEncoding(std::string const & e)
 227 {
 228         //cerr << "setting encoding to " << e << std::endl;
 229         is_.docstream() << lyx::setEncoding(e);
 230         encoding_iconv_ = e;
 231 }
 232
 233
 234 void Parser::push_back(Token const & t)
 235 {
 236         tokens_.push_back(t);
 237 }
 238
 239
 240 // We return a copy here because the tokens_ vector may get reallocated
 241 Token const Parser::prev_token() const
 242 {
 243         static const Token dummy;
 244         return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
 245 }
 246
 247
 248 // We return a copy here because the tokens_ vector may get reallocated
 249 Token const Parser::curr_token() const
 250 {
 251         static const Token dummy;
 252         return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
 253 }
 254
 255
 256 // We return a copy here because the tokens_ vector may get reallocated
 257 Token const Parser::next_token()
 258 {
 259         static const Token dummy;
 260         return good() ? tokens_[pos_] : dummy;
 261 }
 262
 263
 264 // We return a copy here because the tokens_ vector may get reallocated
 265 Token const Parser::next_next_token()
 266 {
 267         static const Token dummy;
 268         // If good() has not been called after the last get_token() we need
 269         // to tokenize two more tokens.
 270         if (pos_ + 1 >= tokens_.size()) {
 271                 tokenize_one();
 272                 tokenize_one();
 273         }
 274         return pos_ + 1 < tokens_.size() ? tokens_[pos_ + 1] : dummy;
 275 }
 276
 277
 278 // We return a copy here because the tokens_ vector may get reallocated
 279 Token const Parser::get_token()
 280 {
 281         static const Token dummy;
 282         // if (good())
 283         //      cerr << "looking at token " << tokens_[pos_]
 284         //           << " pos: " << pos_ << '\n';
 285         return good() ? tokens_[pos_++] : dummy;
 286 }
 287
 288
 289 bool Parser::isParagraph()
 290 {
 291         // A new paragraph in TeX ist started
 292         // - either by a newline, following any amount of whitespace
 293         //   characters (including zero), and another newline
 294         // - or the token \par
 295         if (curr_token().cat() == catNewline &&
 296             (curr_token().cs().size() > 1 ||
 297              (next_token().cat() == catSpace &&
 298               next_next_token().cat() == catNewline)))
 299                 return true;
 300         if (curr_token().cat() == catEscape && curr_token().cs() == "par")
 301                 return true;
 302         return false;
 303 }
 304
 305
 306 bool Parser::skip_spaces(bool skip_comments)
 307 {
 308         // We just silently return if we have no more tokens.
 309         // skip_spaces() should be callable at any time,
 310         // the caller must check p::good() anyway.
 311         bool skipped = false;
 312         while (good()) {
 313                 get_token();
 314                 if (isParagraph()) {
 315                         putback();
 316                         break;
 317                 }
 318                 if (curr_token().cat() == catSpace ||
 319                     curr_token().cat() == catNewline) {
 320                         skipped = true;
 321                         continue;
 322                 }
 323                 if ((curr_token().cat() == catComment && curr_token().cs().empty()))
 324                         continue;
 325                 if (skip_comments && curr_token().cat() == catComment) {
 326                         // If positions_ is not empty we are doing some kind
 327                         // of look ahead
 328                         if (!positions_.empty())
 329                                 cerr << "  Ignoring comment: "
 330                                      << curr_token().asInput();
 331                 } else {
 332                         putback();
 333                         break;
 334                 }
 335         }
 336         return skipped;
 337 }
 338
 339
 340 void Parser::unskip_spaces(bool skip_comments)
 341 {
 342         while (pos_ > 0) {
 343                 if ( curr_token().cat() == catSpace ||
 344                     (curr_token().cat() == catNewline && curr_token().cs().size() == 1))
 345                         putback();
 346                 else if (skip_comments && curr_token().cat() == catComment) {
 347                         // TODO: Get rid of this
 348                         // If positions_ is not empty we are doing some kind
 349                         // of look ahead
 350                         if (!positions_.empty())
 351                                 cerr << "Unignoring comment: "
 352                                      << curr_token().asInput();
 353                         putback();
 354                 }
 355                 else
 356                         break;
 357         }
 358 }
 359
 360
 361 void Parser::putback()
 362 {
 363         --pos_;
 364 }
 365
 366
 367 void Parser::pushPosition()
 368 {
 369         positions_.push_back(pos_);
 370 }
 371
 372
 373 void Parser::popPosition()
 374 {
 375         pos_ = positions_.back();
 376         positions_.pop_back();
 377 }
 378
 379
 380 bool Parser::good()
 381 {
 382         if (pos_ < tokens_.size())
 383                 return true;
 384         tokenize_one();
 385         return pos_ < tokens_.size();
 386 }
 387
 388
 389 char Parser::getChar()
 390 {
 391         if (!good())
 392                 error("The input stream is not well...");
 393         return get_token().character();
 394 }
 395
 396
 397 bool Parser::hasOpt()
 398 {
 399         // An optional argument can occur in any of the following forms:
 400         // - \foo[bar]
 401         // - \foo [bar]
 402         // - \foo
 403         //   [bar]
 404         // - \foo %comment
 405         //   [bar]
 406
 407         // remember current position
 408         unsigned int oldpos = pos_;
 409         // skip spaces and comments
 410         while (good()) {
 411                 get_token();
 412                 if (isParagraph()) {
 413                         putback();
 414                         break;
 415                 }
 416                 if (curr_token().cat() == catSpace ||
 417                     curr_token().cat() == catNewline ||
 418                     curr_token().cat() == catComment)
 419                         continue;
 420                 putback();
 421                 break;
 422         }
 423         bool const retval = (next_token().asInput() == "[");
 424         pos_ = oldpos;
 425         return retval;
 426 }
 427
 428
 429 Parser::Arg Parser::getFullArg(char left, char right, bool allow_escaping)
 430 {
 431         skip_spaces(true);
 432
 433         // This is needed if a partial file ends with a command without arguments,
 434         // e. g. \medskip
 435         if (! good())
 436                 return make_pair(false, string());
 437
 438         string result;
 439         Token t = get_token();
 440
 441         if (t.cat() == catComment || t.cat() == catEscape ||
 442             t.character() != left) {
 443                 putback();
 444                 return make_pair(false, string());
 445         } else {
 446                 for (t = get_token(); good(); t = get_token()) {
 447                         // Ignore comments
 448                         if (t.cat() == catComment) {
 449                                 if (!t.cs().empty())
 450                                         cerr << "Ignoring comment: " << t.asInput();
 451                                 continue;
 452                         }
 453                         if (allow_escaping) {
 454                                 if (t.cat() != catEscape && t.character() == right)
 455                                         break;
 456                         } else {
 457                                 if (t.character() == right) {
 458                                         if (t.cat() == catEscape)
 459                                                 result += '\\';
 460                                         break;
 461                                 }
 462                         }
 463                         result += t.asInput();
 464                 }
 465         }
 466         return make_pair(true, result);
 467 }
 468
 469
 470 string Parser::getArg(char left, char right, bool allow_escaping)
 471 {
 472         return getFullArg(left, right, allow_escaping).second;
 473 }
 474
 475
 476 string Parser::getFullOpt(bool keepws)
 477 {
 478         Arg arg = getFullArg('[', ']');
 479         if (arg.first)
 480                 return '[' + arg.second + ']';
 481         if (keepws)
 482                 unskip_spaces(true);
 483         return string();
 484 }
 485
 486
 487 string Parser::getOpt(bool keepws)
 488 {
 489         string const res = getArg('[', ']');
 490         if (res.empty()) {
 491                 if (keepws)
 492                         unskip_spaces(true);
 493                 return string();
 494         }
 495         return '[' + res + ']';
 496 }
 497
 498
 499 string Parser::getFullParentheseArg()
 500 {
 501         Arg arg = getFullArg('(', ')');
 502         if (arg.first)
 503                 return '(' + arg.second + ')';
 504         return string();
 505 }
 506
 507
 508 string const Parser::ertEnvironment(string const & name)
 509 {
 510         if (!good())
 511                 return string();
 512
 513         ostringstream os;
 514         for (Token t = get_token(); good(); t = get_token()) {
 515                 if (t.cat() == catBegin) {
 516                         putback();
 517                         os << '{' << verbatim_item() << '}';
 518                 } else if (t.asInput() == "\\begin") {
 519                         string const env = getArg('{', '}');
 520                         os << "\\begin{" << env << '}'
 521                            << ertEnvironment(env)
 522                            << "\\end{" << env << '}';
 523                 } else if (t.asInput() == "\\end") {
 524                         string const end = getArg('{', '}');
 525                         if (end != name)
 526                                 cerr << "\\end{" << end
 527                                      << "} does not match \\begin{" << name
 528                                      << "}." << endl;
 529                         return os.str();
 530                 } else
 531                         os << t.asInput();
 532         }
 533         cerr << "unexpected end of input" << endl;
 534         return os.str();
 535 }
 536
 537
 538 string const Parser::plainEnvironment(string const & name)
 539 {
 540         if (!good())
 541                 return string();
 542
 543         ostringstream os;
 544         for (Token t = get_token(); good(); t = get_token()) {
 545                 if (t.asInput() == "\\end") {
 546                         string const end = getArg('{', '}');
 547                         if (end == name)
 548                                 return os.str();
 549                         else
 550                                 os << "\\end{" << end << '}';
 551                 } else
 552                         os << t.asInput();
 553         }
 554         cerr << "unexpected end of input" << endl;
 555         return os.str();
 556 }
 557
 558
 559 string const Parser::plainCommand(char left, char right, string const & name)
 560 {
 561         if (!good())
 562                 return string();
 563         // check if first token is really the start character
 564         Token tok = get_token();
 565         if (tok.character() != left) {
 566                 cerr << "first character does not match start character of command \\" << name << endl;
 567                 return string();
 568         }
 569         ostringstream os;
 570         for (Token t = get_token(); good(); t = get_token()) {
 571                 if (t.character() == right) {
 572                         return os.str();
 573                 } else
 574                         os << t.asInput();
 575         }
 576         cerr << "unexpected end of input" << endl;
 577         return os.str();
 578 }
 579
 580
 581 string const Parser::verbatimStuff(string const & end_string)
 582 {
 583         if (!good())
 584                 return string();
 585
 586         ostringstream oss;
 587         size_t match_index = 0;
 588         setCatcodes(VERBATIM_CATCODES);
 589         for (Token t = get_token(); good(); t = get_token()) {
 590                 // FIXME t.asInput() might be longer than we need ?
 591                 if (t.asInput() == end_string.substr(match_index,
 592                                                      t.asInput().length())) {
 593                         match_index += t.asInput().length();
 594                         if (match_index >= end_string.length())
 595                                 break;
 596                 } else if (match_index) {
 597                         oss << end_string.substr(0, match_index) << t.asInput();
 598                         match_index = 0;
 599                 } else
 600                         oss << t.asInput();
 601         }
 602         setCatcodes(NORMAL_CATCODES);
 603         if (!good())
 604                 cerr << "unexpected end of input" << endl;
 605         return oss.str();
 606 }
 607
 608
 609 string const Parser::verbatimEnvironment(string const & name)
 610 {
 611         string s = verbatimStuff("\\end{" + name + "}");
 612         // ignore one newline at beginning or end of string
 613         if (prefixIs(s, "\n"))
 614                 s.erase(0,1);
 615         if (suffixIs(s, "\n"))
 616                 s.erase(s.length() - 1,1);
 617         return s;
 618 }
 619
 620
 621 string Parser::verbatimOption()
 622 {
 623         string res;
 624         if (next_token().character() == '[') {
 625                 Token t = get_token();
 626                 for (t = get_token(); t.character() != ']' && good(); t = get_token()) {
 627                         if (t.cat() == catBegin) {
 628                                 putback();
 629                                 res += '{' + verbatim_item() + '}';
 630                         } else
 631                                 res += t.cs();
 632                 }
 633         }
 634         return res;
 635 }
 636
 637
 638 string Parser::verbatim_item()
 639 {
 640         if (!good())
 641                 error("stream bad");
 642         skip_spaces();
 643         if (next_token().cat() == catBegin) {
 644                 Token t = get_token(); // skip brace
 645                 string res;
 646                 for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
 647                         if (t.cat() == catBegin) {
 648                                 putback();
 649                                 res += '{' + verbatim_item() + '}';
 650                         }
 651                         else
 652                                 res += t.asInput();
 653                 }
 654                 return res;
 655         }
 656         return get_token().asInput();
 657 }
 658
 659
 660 void Parser::tokenize_one()
 661 {
 662         catInit();
 663         char_type c;
 664         if (!is_.get(c))
 665                 return;
 666
 667         switch (catcode(c)) {
 668         case catSpace: {
 669                 docstring s(1, c);
 670                 while (is_.get(c) && catcode(c) == catSpace)
 671                         s += c;
 672                 if (catcode(c) != catSpace)
 673                         is_.putback(c);
 674                 push_back(Token(s, catSpace));
 675                 break;
 676         }
 677
 678         case catNewline: {
 679                 ++lineno_;
 680                 docstring s(1, getNewline(is_, c));
 681                 while (is_.get(c) && catcode(c) == catNewline) {
 682                         ++lineno_;
 683                         s += getNewline(is_, c);
 684                 }
 685                 if (catcode(c) != catNewline)
 686                         is_.putback(c);
 687                 push_back(Token(s, catNewline));
 688                 break;
 689         }
 690
 691         case catComment: {
 692                 // We don't treat "%\n" combinations here specially because
 693                 // we want to preserve them in the preamble
 694                 docstring s;
 695                 while (is_.get(c) && catcode(c) != catNewline)
 696                         s += c;
 697                 // handle possible DOS line ending
 698                 if (catcode(c) == catNewline)
 699                         c = getNewline(is_, c);
 700                 // Note: The '%' at the beginning and the '\n' at the end
 701                 // of the comment are not stored.
 702                 ++lineno_;
 703                 push_back(Token(s, catComment));
 704                 break;
 705         }
 706
 707         case catEscape: {
 708                 is_.get(c);
 709                 if (!is_) {
 710                         error("unexpected end of input");
 711                 } else {
 712                         docstring s(1, c);
 713                         if (catcode(c) == catLetter) {
 714                                 // collect letters
 715                                 while (is_.get(c) && catcode(c) == catLetter)
 716                                         s += c;
 717                                 if (catcode(c) != catLetter)
 718                                         is_.putback(c);
 719                         }
 720                         push_back(Token(s, catEscape));
 721                 }
 722                 break;
 723         }
 724
 725         case catIgnore: {
 726                 cerr << "ignoring a char: " << c << "\n";
 727                 break;
 728         }
 729
 730         default:
 731                 push_back(Token(docstring(1, c), catcode(c)));
 732         }
 733         //cerr << tokens_.back();
 734 }
 735
 736
 737 void Parser::dump() const
 738 {
 739         cerr << "\nTokens: ";
 740         for (unsigned i = 0; i < tokens_.size(); ++i) {
 741                 if (i == pos_)
 742                         cerr << " <#> ";
 743                 cerr << tokens_[i];
 744         }
 745         cerr << " pos: " << pos_ << "\n";
 746 }
 747
 748
 749 void Parser::error(string const & msg)
 750 {
 751         cerr << "Line ~" << lineno_ << ":  parse error: " << msg << endl;
 752         dump();
 753         //exit(1);
 754 }
 755
 756
 757 void Parser::reset()
 758 {
 759         pos_ = 0;
 760 }
 761
 762
 763 } // namespace lyx