src/tex2lyx/texparser.C

   1 /**
   2  * \file texparser.C
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author André Pönitz
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  */
  10
  11 #include <config.h>
  12
  13 #include "texparser.h"
  14
  15 #include <iostream>
  16 #include <sstream>
  17
  18 using std::cerr;
  19 using std::endl;
  20 using std::fill;
  21 using std::istream;
  22 using std::istringstream;
  23 using std::ostream;
  24 using std::string;
  25
  26
  27 namespace {
  28
  29 CatCode theCatcode[256];
  30
  31 void catInit()
  32 {
  33         fill(theCatcode, theCatcode + 256, catOther);
  34         fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
  35         fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
  36
  37         theCatcode[int('\\')] = catEscape;
  38         theCatcode[int('{')]  = catBegin;
  39         theCatcode[int('}')]  = catEnd;
  40         theCatcode[int('$')]  = catMath;
  41         theCatcode[int('&')]  = catAlign;
  42         theCatcode[int('\n')] = catNewline;
  43         theCatcode[int('#')]  = catParameter;
  44         theCatcode[int('^')]  = catSuper;
  45         theCatcode[int('_')]  = catSub;
  46         theCatcode[0x7f]      = catIgnore;
  47         theCatcode[int(' ')]  = catSpace;
  48         theCatcode[int('\t')] = catSpace;
  49         theCatcode[int('\r')] = catNewline;
  50         theCatcode[int('~')]  = catActive;
  51         theCatcode[int('%')]  = catComment;
  52
  53         // This is wrong!
  54         theCatcode[int('@')]  = catLetter;
  55 }
  56
  57
  58 /*!
  59  * Translate a line ending to '\n'.
  60  * \p c must have catcode catNewline, and it must be the last character read
  61  * from \p is.
  62  */
  63 char getNewline(istream & is, char c)
  64 {
  65         // we have to handle 3 different line endings:
  66         // - UNIX (\n)
  67         // - MAC  (\r)
  68         // - DOS  (\r\n)
  69         if (c == '\r') {
  70                 // MAC or DOS
  71                 if (is.get(c) && c != '\n') {
  72                         // MAC
  73                         is.putback(c);
  74                 }
  75                 return '\n';
  76         }
  77         // UNIX
  78         return c;
  79 }
  80
  81 }
  82
  83
  84 //
  85 // catcodes
  86 //
  87
  88 CatCode catcode(unsigned char c)
  89 {
  90         return theCatcode[c];
  91 }
  92
  93
  94
  95 //
  96 // Token
  97 //
  98
  99 ostream & operator<<(ostream & os, Token const & t)
 100 {
 101         if (t.cat() == catComment)
 102                 os << '%' << t.cs() << '\n';
 103         else if (t.cat() == catSpace)
 104                 os << t.cs();
 105         else if (t.cat() == catEscape)
 106                 os << '\\' << t.cs() << ' ';
 107         else if (t.cat() == catLetter)
 108                 os << t.character();
 109         else if (t.cat() == catNewline)
 110                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
 111         else
 112                 os << '[' << t.character() << ',' << t.cat() << ']';
 113         return os;
 114 }
 115
 116
 117 string Token::asString() const
 118 {
 119         return cs_.size() ? cs_ : string(1, char_);
 120 }
 121
 122
 123 string Token::asInput() const
 124 {
 125         if (cat_ == catComment)
 126                 return '%' + cs_ + '\n';
 127         if (cat_ == catSpace || cat_ == catNewline)
 128                 return cs_;
 129         return char_ ? string(1, char_) : '\\' + cs_;
 130 }
 131
 132
 133 //
 134 // Parser
 135 //
 136
 137
 138 Parser::Parser(istream & is)
 139         : lineno_(0), pos_(0)
 140 {
 141         tokenize(is);
 142 }
 143
 144
 145 Parser::Parser(string const & s)
 146         : lineno_(0), pos_(0)
 147 {
 148         istringstream is(s);
 149         tokenize(is);
 150 }
 151
 152
 153 void Parser::push_back(Token const & t)
 154 {
 155         tokens_.push_back(t);
 156 }
 157
 158
 159 void Parser::pop_back()
 160 {
 161         tokens_.pop_back();
 162 }
 163
 164
 165 Token const & Parser::prev_token() const
 166 {
 167         static const Token dummy;
 168         return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
 169 }
 170
 171
 172 Token const & Parser::curr_token() const
 173 {
 174         static const Token dummy;
 175         return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
 176 }
 177
 178
 179 Token const & Parser::next_token() const
 180 {
 181         static const Token dummy;
 182         return good() ? tokens_[pos_] : dummy;
 183 }
 184
 185
 186 Token const & Parser::get_token()
 187 {
 188         static const Token dummy;
 189         //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
 190         return good() ? tokens_[pos_++] : dummy;
 191 }
 192
 193
 194 bool Parser::isParagraph() const
 195 {
 196         // A new paragraph in TeX ist started
 197         // - either by a newline, following any amount of whitespace
 198         //   characters (including zero), and another newline
 199         // - or the token \par
 200         if (curr_token().cat() == catNewline &&
 201             (curr_token().cs().size() > 1 ||
 202              (next_token().cat() == catSpace &&
 203               pos_ < tokens_.size() - 1 &&
 204               tokens_[pos_ + 1].cat() == catNewline)))
 205                 return true;
 206         if (curr_token().cat() == catEscape && curr_token().cs() == "par")
 207                 return true;
 208         return false;
 209 }
 210
 211
 212 void Parser::skip_spaces(bool skip_comments)
 213 {
 214         // We just silently return if we have no more tokens.
 215         // skip_spaces() should be callable at any time,
 216         // the caller must check p::good() anyway.
 217         while (good()) {
 218                 get_token();
 219                 if (isParagraph()) {
 220                         putback();
 221                         break;
 222                 }
 223                 if ( curr_token().cat() == catSpace ||
 224                      curr_token().cat() == catNewline ||
 225                     (curr_token().cat() == catComment && curr_token().cs().empty()))
 226                         continue;
 227                 if (skip_comments && curr_token().cat() == catComment)
 228                         cerr << "  Ignoring comment: " << curr_token().asInput();
 229                 else {
 230                         putback();
 231                         break;
 232                 }
 233         }
 234 }
 235
 236
 237 void Parser::unskip_spaces(bool skip_comments)
 238 {
 239         while (pos_ > 0) {
 240                 if ( curr_token().cat() == catSpace ||
 241                     (curr_token().cat() == catNewline && curr_token().cs().size() == 1))
 242                         putback();
 243                 else if (skip_comments && curr_token().cat() == catComment) {
 244                         // TODO: Get rid of this
 245                         cerr << "Unignoring comment: " << curr_token().asInput();
 246                         putback();
 247                 }
 248                 else
 249                         break;
 250         }
 251 }
 252
 253
 254 void Parser::putback()
 255 {
 256         --pos_;
 257 }
 258
 259
 260 bool Parser::good() const
 261 {
 262         return pos_ < tokens_.size();
 263 }
 264
 265
 266 char Parser::getChar()
 267 {
 268         if (!good())
 269                 error("The input stream is not well...");
 270         return tokens_[pos_++].character();
 271 }
 272
 273
 274 string Parser::getArg(char left, char right)
 275 {
 276         skip_spaces(true);
 277
 278         // This is needed if a partial file ends with a command without arguments,
 279         // e. g. \medskip
 280         if (! good())
 281                 return string();
 282
 283         string result;
 284         char c = getChar();
 285
 286         if (c != left)
 287                 putback();
 288         else
 289                 while ((c = getChar()) != right && good()) {
 290                         // Ignore comments
 291                         if (curr_token().cat() == catComment) {
 292                                 if (!curr_token().cs().empty())
 293                                         cerr << "Ignoring comment: " << curr_token().asInput();
 294                         }
 295                         else
 296                                 result += curr_token().asInput();
 297                 }
 298
 299         return result;
 300 }
 301
 302
 303 string Parser::getOpt()
 304 {
 305         string const res = getArg('[', ']');
 306         return res.size() ? '[' + res + ']' : string();
 307 }
 308
 309
 310 void Parser::tokenize(istream & is)
 311 {
 312         static bool init_done = false;
 313
 314         if (!init_done) {
 315                 catInit();
 316                 init_done = true;
 317         }
 318
 319         char c;
 320         while (is.get(c)) {
 321                 //cerr << "reading c: " << c << "\n";
 322
 323                 switch (catcode(c)) {
 324                         case catSpace: {
 325                                 string s(1, c);
 326                                 while (is.get(c) && catcode(c) == catSpace)
 327                                         s += c;
 328                                 if (catcode(c) != catSpace)
 329                                         is.putback(c);
 330                                 push_back(Token(s, catSpace));
 331                                 break;
 332                         }
 333
 334                         case catNewline: {
 335                                 ++lineno_;
 336                                 string s(1, getNewline(is, c));
 337                                 while (is.get(c) && catcode(c) == catNewline) {
 338                                         ++lineno_;
 339                                         s += getNewline(is, c);
 340                                 }
 341                                 if (catcode(c) != catNewline)
 342                                         is.putback(c);
 343                                 push_back(Token(s, catNewline));
 344                                 break;
 345                         }
 346
 347                         case catComment: {
 348                                 // We don't treat "%\n" combinations here specially because
 349                                 // we want to preserve them in the preamble
 350                                 string s;
 351                                 while (is.get(c) && catcode(c) != catNewline)
 352                                         s += c;
 353                                 // handle possible DOS line ending
 354                                 if (catcode(c) == catNewline)
 355                                         c = getNewline(is, c);
 356                                 // Note: The '%' at the beginning and the '\n' at the end
 357                                 // of the comment are not stored.
 358                                 ++lineno_;
 359                                 push_back(Token(s, catComment));
 360                                 break;
 361                         }
 362
 363                         case catEscape: {
 364                                 is.get(c);
 365                                 if (!is) {
 366                                         error("unexpected end of input");
 367                                 } else {
 368                                         string s(1, c);
 369                                         if (catcode(c) == catLetter) {
 370                                                 // collect letters
 371                                                 while (is.get(c) && catcode(c) == catLetter)
 372                                                         s += c;
 373                                                 if (catcode(c) != catLetter)
 374                                                         is.putback(c);
 375                                         }
 376                                         push_back(Token(s, catEscape));
 377                                 }
 378                                 break;
 379                         }
 380
 381                         case catIgnore: {
 382                                 cerr << "ignoring a char: " << int(c) << "\n";
 383                                 break;
 384                         }
 385
 386                         default:
 387                                 push_back(Token(c, catcode(c)));
 388                 }
 389         }
 390 }
 391
 392
 393 void Parser::dump() const
 394 {
 395         cerr << "\nTokens: ";
 396         for (unsigned i = 0; i < tokens_.size(); ++i) {
 397                 if (i == pos_)
 398                         cerr << " <#> ";
 399                 cerr << tokens_[i];
 400         }
 401         cerr << " pos: " << pos_ << "\n";
 402 }
 403
 404
 405 void Parser::error(string const & msg)
 406 {
 407         cerr << "Line ~" << lineno_ << ":  parse error: " << msg << endl;
 408         dump();
 409         //exit(1);
 410 }
 411
 412
 413 string Parser::verbatimOption()
 414 {
 415         string res;
 416         if (next_token().character() == '[') {
 417                 Token t = get_token();
 418                 for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
 419                         if (t.cat() == catBegin) {
 420                                 putback();
 421                                 res += '{' + verbatim_item() + '}';
 422                         } else
 423                                 res += t.asString();
 424                 }
 425         }
 426         return res;
 427 }
 428
 429
 430 string Parser::verbatim_item()
 431 {
 432         if (!good())
 433                 error("stream bad");
 434         skip_spaces();
 435         if (next_token().cat() == catBegin) {
 436                 Token t = get_token(); // skip brace
 437                 string res;
 438                 for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
 439                         if (t.cat() == catBegin) {
 440                                 putback();
 441                                 res += '{' + verbatim_item() + '}';
 442                         }
 443                         else
 444                                 res += t.asInput();
 445                 }
 446                 return res;
 447         }
 448         return get_token().asInput();
 449 }
 450
 451
 452 void Parser::reset()
 453 {
 454         pos_ = 0;
 455 }
 456
 457
 458 void Parser::setCatCode(char c, CatCode cat)
 459 {
 460         theCatcode[(unsigned char)c] = cat;
 461 }
 462
 463
 464 CatCode Parser::getCatCode(char c) const
 465 {
 466         return theCatcode[(unsigned char)c];
 467 }