src/tex2lyx/Parser.cpp

   1 /**
   2  * \file Parser.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author André Pönitz
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  */
  10
  11 #include <config.h>
  12
  13 #include "Parser.h"
  14
  15 #include <iostream>
  16 #include <sstream>
  17
  18 using namespace std;
  19
  20 namespace lyx {
  21
  22 namespace {
  23
  24 CatCode theCatcode[256];
  25
  26 void catInit()
  27 {
  28         static bool init_done = false;
  29         if (init_done)
  30                 return;
  31         init_done = true;
  32
  33         fill(theCatcode, theCatcode + 256, catOther);
  34         fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
  35         fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
  36
  37         theCatcode[int('\\')] = catEscape;
  38         theCatcode[int('{')]  = catBegin;
  39         theCatcode[int('}')]  = catEnd;
  40         theCatcode[int('$')]  = catMath;
  41         theCatcode[int('&')]  = catAlign;
  42         theCatcode[int('\n')] = catNewline;
  43         theCatcode[int('#')]  = catParameter;
  44         theCatcode[int('^')]  = catSuper;
  45         theCatcode[int('_')]  = catSub;
  46         theCatcode[0x7f]      = catIgnore;
  47         theCatcode[int(' ')]  = catSpace;
  48         theCatcode[int('\t')] = catSpace;
  49         theCatcode[int('\r')] = catNewline;
  50         theCatcode[int('~')]  = catActive;
  51         theCatcode[int('%')]  = catComment;
  52
  53         // This is wrong!
  54         theCatcode[int('@')]  = catLetter;
  55 }
  56
  57
  58 /*!
  59  * Translate a line ending to '\n'.
  60  * \p c must have catcode catNewline, and it must be the last character read
  61  * from \p is.
  62  */
  63 char getNewline(istream & is, char c)
  64 {
  65         // we have to handle 3 different line endings:
  66         // - UNIX (\n)
  67         // - MAC  (\r)
  68         // - DOS  (\r\n)
  69         if (c == '\r') {
  70                 // MAC or DOS
  71                 if (is.get(c) && c != '\n') {
  72                         // MAC
  73                         is.putback(c);
  74                 }
  75                 return '\n';
  76         }
  77         // UNIX
  78         return c;
  79 }
  80
  81 }
  82
  83
  84 //
  85 // catcodes
  86 //
  87
  88 CatCode catcode(unsigned char c)
  89 {
  90         return theCatcode[c];
  91 }
  92
  93
  94
  95 //
  96 // Token
  97 //
  98
  99 ostream & operator<<(ostream & os, Token const & t)
 100 {
 101         if (t.cat() == catComment)
 102                 os << '%' << t.cs() << '\n';
 103         else if (t.cat() == catSpace)
 104                 os << t.cs();
 105         else if (t.cat() == catEscape)
 106                 os << '\\' << t.cs() << ' ';
 107         else if (t.cat() == catLetter)
 108                 os << t.character();
 109         else if (t.cat() == catNewline)
 110                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
 111         else
 112                 os << '[' << t.character() << ',' << t.cat() << ']';
 113         return os;
 114 }
 115
 116
 117 string Token::asString() const
 118 {
 119         return cs_.size() ? cs_ : string(1, char_);
 120 }
 121
 122
 123 string Token::asInput() const
 124 {
 125         if (cat_ == catComment)
 126                 return '%' + cs_ + '\n';
 127         if (cat_ == catSpace || cat_ == catNewline)
 128                 return cs_;
 129         return char_ ? string(1, char_) : '\\' + cs_;
 130 }
 131
 132
 133 //
 134 // Parser
 135 //
 136
 137
 138 Parser::Parser(istream & is)
 139         : lineno_(0), pos_(0), iss_(0), is_(is)
 140 {
 141 }
 142
 143
 144 Parser::Parser(string const & s)
 145         : lineno_(0), pos_(0), iss_(new istringstream(s)), is_(*iss_)
 146 {
 147 }
 148
 149
 150 Parser::~Parser()
 151 {
 152         delete iss_;
 153 }
 154
 155
 156 void Parser::push_back(Token const & t)
 157 {
 158         tokens_.push_back(t);
 159 }
 160
 161
 162 Token const & Parser::prev_token() const
 163 {
 164         static const Token dummy;
 165         return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
 166 }
 167
 168
 169 Token const & Parser::curr_token() const
 170 {
 171         static const Token dummy;
 172         return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
 173 }
 174
 175
 176 Token const & Parser::next_token()
 177 {
 178         static const Token dummy;
 179         return good() ? tokens_[pos_] : dummy;
 180 }
 181
 182
 183 Token const & Parser::get_token()
 184 {
 185         static const Token dummy;
 186         //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
 187         return good() ? tokens_[pos_++] : dummy;
 188 }
 189
 190
 191 bool Parser::isParagraph()
 192 {
 193         // A new paragraph in TeX ist started
 194         // - either by a newline, following any amount of whitespace
 195         //   characters (including zero), and another newline
 196         // - or the token \par
 197         if (curr_token().cat() == catNewline &&
 198             (curr_token().cs().size() > 1 ||
 199              (next_token().cat() == catSpace &&
 200               pos_ < tokens_.size() - 1 &&
 201               tokens_[pos_ + 1].cat() == catNewline)))
 202                 return true;
 203         if (curr_token().cat() == catEscape && curr_token().cs() == "par")
 204                 return true;
 205         return false;
 206 }
 207
 208
 209 void Parser::skip_spaces(bool skip_comments)
 210 {
 211         // We just silently return if we have no more tokens.
 212         // skip_spaces() should be callable at any time,
 213         // the caller must check p::good() anyway.
 214         while (good()) {
 215                 get_token();
 216                 if (isParagraph()) {
 217                         putback();
 218                         break;
 219                 }
 220                 if ( curr_token().cat() == catSpace ||
 221                      curr_token().cat() == catNewline ||
 222                     (curr_token().cat() == catComment && curr_token().cs().empty()))
 223                         continue;
 224                 if (skip_comments && curr_token().cat() == catComment)
 225                         cerr << "  Ignoring comment: " << curr_token().asInput();
 226                 else {
 227                         putback();
 228                         break;
 229                 }
 230         }
 231 }
 232
 233
 234 void Parser::unskip_spaces(bool skip_comments)
 235 {
 236         while (pos_ > 0) {
 237                 if ( curr_token().cat() == catSpace ||
 238                     (curr_token().cat() == catNewline && curr_token().cs().size() == 1))
 239                         putback();
 240                 else if (skip_comments && curr_token().cat() == catComment) {
 241                         // TODO: Get rid of this
 242                         cerr << "Unignoring comment: " << curr_token().asInput();
 243                         putback();
 244                 }
 245                 else
 246                         break;
 247         }
 248 }
 249
 250
 251 void Parser::putback()
 252 {
 253         --pos_;
 254 }
 255
 256
 257 bool Parser::good()
 258 {
 259         if (pos_ < tokens_.size())
 260                 return true;
 261         tokenize_one();
 262         return pos_ < tokens_.size();
 263 }
 264
 265
 266 char Parser::getChar()
 267 {
 268         if (!good())
 269                 error("The input stream is not well...");
 270         return tokens_[pos_++].character();
 271 }
 272
 273
 274 Parser::Arg Parser::getFullArg(char left, char right)
 275 {
 276         skip_spaces(true);
 277
 278         // This is needed if a partial file ends with a command without arguments,
 279         // e. g. \medskip
 280         if (! good())
 281                 return make_pair(false, string());
 282
 283         string result;
 284         char c = getChar();
 285
 286         if (c != left) {
 287                 putback();
 288                 return make_pair(false, string());
 289         } else
 290                 while ((c = getChar()) != right && good()) {
 291                         // Ignore comments
 292                         if (curr_token().cat() == catComment) {
 293                                 if (!curr_token().cs().empty())
 294                                         cerr << "Ignoring comment: " << curr_token().asInput();
 295                         }
 296                         else
 297                                 result += curr_token().asInput();
 298                 }
 299
 300         return make_pair(true, result);
 301 }
 302
 303
 304 string Parser::getArg(char left, char right)
 305 {
 306         return getFullArg(left, right).second;
 307 }
 308
 309
 310 string Parser::getFullOpt()
 311 {
 312         Arg arg = getFullArg('[', ']');
 313         if (arg.first)
 314                 return '[' + arg.second + ']';
 315         return string();
 316 }
 317
 318
 319 string Parser::getOpt()
 320 {
 321         string const res = getArg('[', ']');
 322         return res.empty() ? string() : '[' + res + ']';
 323 }
 324
 325
 326 string Parser::getFullParentheseArg()
 327 {
 328         Arg arg = getFullArg('(', ')');
 329         if (arg.first)
 330                 return '(' + arg.second + ')';
 331         return string();
 332 }
 333
 334
 335 string const Parser::verbatimEnvironment(string const & name)
 336 {
 337         if (!good())
 338                 return string();
 339
 340         ostringstream os;
 341         for (Token t = get_token(); good(); t = get_token()) {
 342                 if (t.cat() == catBegin) {
 343                         putback();
 344                         os << '{' << verbatim_item() << '}';
 345                 } else if (t.asInput() == "\\begin") {
 346                         string const env = getArg('{', '}');
 347                         os << "\\begin{" << env << '}'
 348                            << verbatimEnvironment(env)
 349                            << "\\end{" << env << '}';
 350                 } else if (t.asInput() == "\\end") {
 351                         string const end = getArg('{', '}');
 352                         if (end != name)
 353                                 cerr << "\\end{" << end
 354                                      << "} does not match \\begin{" << name
 355                                      << "}." << endl;
 356                         return os.str();
 357                 } else
 358                         os << t.asInput();
 359         }
 360         cerr << "unexpected end of input" << endl;
 361         return os.str();
 362 }
 363
 364
 365 void Parser::tokenize_one()
 366 {
 367         catInit();
 368         char c;
 369         if (!is_.get(c))
 370                 return;
 371         //cerr << "reading c: " << c << "\n";
 372
 373         switch (catcode(c)) {
 374         case catSpace: {
 375                 string s(1, c);
 376                 while (is_.get(c) && catcode(c) == catSpace)
 377                         s += c;
 378                 if (catcode(c) != catSpace)
 379                         is_.putback(c);
 380                 push_back(Token(s, catSpace));
 381                 break;
 382         }
 383
 384         case catNewline: {
 385                 ++lineno_;
 386                 string s(1, getNewline(is_, c));
 387                 while (is_.get(c) && catcode(c) == catNewline) {
 388                         ++lineno_;
 389                         s += getNewline(is_, c);
 390                 }
 391                 if (catcode(c) != catNewline)
 392                         is_.putback(c);
 393                 push_back(Token(s, catNewline));
 394                 break;
 395         }
 396
 397         case catComment: {
 398                 // We don't treat "%\n" combinations here specially because
 399                 // we want to preserve them in the preamble
 400                 string s;
 401                 while (is_.get(c) && catcode(c) != catNewline)
 402                         s += c;
 403                 // handle possible DOS line ending
 404                 if (catcode(c) == catNewline)
 405                         c = getNewline(is_, c);
 406                 // Note: The '%' at the beginning and the '\n' at the end
 407                 // of the comment are not stored.
 408                 ++lineno_;
 409                 push_back(Token(s, catComment));
 410                 break;
 411         }
 412
 413         case catEscape: {
 414                 is_.get(c);
 415                 if (!is_) {
 416                         error("unexpected end of input");
 417                 } else {
 418                         string s(1, c);
 419                         if (catcode(c) == catLetter) {
 420                                 // collect letters
 421                                 while (is_.get(c) && catcode(c) == catLetter)
 422                                         s += c;
 423                                 if (catcode(c) != catLetter)
 424                                         is_.putback(c);
 425                         }
 426                         push_back(Token(s, catEscape));
 427                 }
 428                 break;
 429         }
 430
 431         case catIgnore: {
 432                 cerr << "ignoring a char: " << int(c) << "\n";
 433                 break;
 434         }
 435
 436         default:
 437                 push_back(Token(c, catcode(c)));
 438         }
 439 }
 440
 441
 442 void Parser::dump() const
 443 {
 444         cerr << "\nTokens: ";
 445         for (unsigned i = 0; i < tokens_.size(); ++i) {
 446                 if (i == pos_)
 447                         cerr << " <#> ";
 448                 cerr << tokens_[i];
 449         }
 450         cerr << " pos: " << pos_ << "\n";
 451 }
 452
 453
 454 void Parser::error(string const & msg)
 455 {
 456         cerr << "Line ~" << lineno_ << ":  parse error: " << msg << endl;
 457         dump();
 458         //exit(1);
 459 }
 460
 461
 462 string Parser::verbatimOption()
 463 {
 464         string res;
 465         if (next_token().character() == '[') {
 466                 Token t = get_token();
 467                 for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
 468                         if (t.cat() == catBegin) {
 469                                 putback();
 470                                 res += '{' + verbatim_item() + '}';
 471                         } else
 472                                 res += t.asString();
 473                 }
 474         }
 475         return res;
 476 }
 477
 478
 479 string Parser::verbatim_item()
 480 {
 481         if (!good())
 482                 error("stream bad");
 483         skip_spaces();
 484         if (next_token().cat() == catBegin) {
 485                 Token t = get_token(); // skip brace
 486                 string res;
 487                 for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
 488                         if (t.cat() == catBegin) {
 489                                 putback();
 490                                 res += '{' + verbatim_item() + '}';
 491                         }
 492                         else
 493                                 res += t.asInput();
 494                 }
 495                 return res;
 496         }
 497         return get_token().asInput();
 498 }
 499
 500
 501 void Parser::reset()
 502 {
 503         pos_ = 0;
 504 }
 505
 506
 507 void Parser::setCatCode(char c, CatCode cat)
 508 {
 509         theCatcode[(unsigned char)c] = cat;
 510 }
 511
 512
 513 CatCode Parser::getCatCode(char c) const
 514 {
 515         return theCatcode[(unsigned char)c];
 516 }
 517
 518
 519 } // namespace lyx