src/tex2lyx/text.C

   1 /** The .tex to .lyx converter
   2     \author André Pönitz (2003)
   3  */
   4
   5 // {[(
   6
#include <config.h>

#include "tex2lyx.h"
#include "FloatList.h"
#include "lyxtextclass.h"
#include "support/lstrings.h"
#include "support/tostr.h"

#include <algorithm>
#include <cctype>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>
  19
  20 using std::cerr;
  21 using std::endl;
  22 using std::map;
  23 using std::ostream;
  24 using std::ostringstream;
  25 using std::string;
  26 using std::vector;
  27
  28 using lyx::support::rtrim;
  29 using lyx::support::suffixIs;
  30
  31 namespace {
  32
  33 char const * known_latex_commands[] = { "ref", "cite", "label", "index",
  34 "printindex", "pageref", "url", 0 };
  35
  36 // LaTeX names for quotes
  37 char const * known_quotes[] = { "glqq", "grqq", "quotedblbase",
  38 "textquotedblleft", "quotesinglbase", "guilsinglleft", "guilsinglright", 0};
  39
  40 // the same as known_quotes with .lyx names
  41 char const * known_coded_quotes[] = { "gld", "grd", "gld",
  42 "grd", "gls", "fls", "frd", 0};
  43
  44 char const * known_sizes[] = { "tiny", "scriptsize", "footnotesize",
  45 "small", "normalsize", "large", "Large", "LARGE", "huge", "Huge", 0};
  46
  47 char const * known_coded_sizes[] = { "tiny", "scriptsize", "footnotesize",
  48 "small", "normal", "large", "larger", "largest",  "huge", "giant", 0};
  49
  50
  51 string cap(string s)
  52 {
  53         if (s.size())
  54                 s[0] = toupper(s[0]);
  55         return s;
  56 }
  57
  58
  59 // splits "x=z, y=b" into a map
  60 map<string, string> split_map(string const & s)
  61 {
  62         map<string, string> res;
  63         vector<string> v;
  64         split(s, v);
  65         for (size_t i = 0; i < v.size(); ++i) {
  66                 size_t const pos   = v[i].find('=');
  67                 string const index = v[i].substr(0, pos);
  68                 string const value = v[i].substr(pos + 1, string::npos);
  69                 res[trim(index)] = trim(value);
  70         }
  71         return res;
  72 }
  73
  74
  75 void begin_inset(ostream & os, string const & name)
  76 {
  77         os << "\n\\begin_inset " << name;
  78 }
  79
  80
  81 void end_inset(ostream & os)
  82 {
  83         os << "\n\\end_inset\n\n";
  84 }
  85
  86
  87 void skip_braces(Parser & p)
  88 {
  89         if (p.next_token().cat() != catBegin)
  90                 return;
  91         p.get_token();
  92         if (p.next_token().cat() == catEnd) {
  93                 p.get_token();
  94                 return;
  95         }
  96         p.putback();
  97 }
  98
  99
 100 void handle_ert(ostream & os, string const & s)
 101 {
 102         begin_inset(os, "ERT");
 103         os << "\nstatus Collapsed\n\n\\layout Standard\n\n";
 104         for (string::const_iterator it = s.begin(), et = s.end(); it != et; ++it) {
 105                 if (*it == '\\')
 106                         os << "\n\\backslash\n";
 107                 else
 108                         os << *it;
 109         }
 110         end_inset(os);
 111 }
 112
 113
 114 void handle_par(ostream & os)
 115 {
 116         if (active_environments.empty())
 117                 return;
 118         os << "\n\\layout ";
 119         string s = active_environment();
 120         if (s == "document" || s == "table")
 121                 os << "Standard\n\n";
 122         else if (s == "center")
 123                 os << "Standard\n\n\\align center\n";
 124         else if (s == "lyxcode")
 125                 os << "LyX-Code\n\n";
 126         else if (s == "lyxlist")
 127                 os << "List\n\n";
 128         else if (s == "thebibliography")
 129                 os << "Bibliography\n\n";
 130         else
 131                 os << cap(s) << "\n\n";
 132 }
 133
 134
 135 struct isLayout {
 136         isLayout(string const name) : name_(name) {}
 137         bool operator()(LyXLayout_ptr const & ptr) {
 138                 return ptr.get() && ptr->latexname() == name_;
 139         }
 140 private:
 141         string const name_;
 142 };
 143
 144
 145 LyXLayout_ptr findLayout(LyXTextClass const & textclass,
 146                          string const & name)
 147 {
 148         LyXTextClass::const_iterator it  = textclass.begin();
 149         LyXTextClass::const_iterator end = textclass.end();
 150         it = std::find_if(it, end, isLayout(name));
 151         return (it == end) ? LyXLayout_ptr() : *it;
 152 }
 153
 154
 155 void output_layout(ostream & os, LyXLayout_ptr const & layout_ptr,
 156                   Parser & p, bool outer, LyXTextClass const & textclass)
 157 {
 158         string name = layout_ptr->name();
 159         os << "\n\n\\layout " << name << "\n\n";
 160         if (layout_ptr->optionalargs > 0) {
 161                 string opt = p.getOpt();
 162                 if (opt.size()) {
 163                         begin_inset(os, "OptArg\n");
 164                         os << "collapsed true\n\n\\layout Standard\n\n" << opt;
 165                         end_inset(os);
 166                 }
 167         }
 168         parse_text(p, os, FLAG_ITEM, outer, textclass);
 169         os << "\n\n\\layout Standard\n\n";
 170 }
 171
 172 } // anonymous namespace
 173
 174
 175 void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
 176                 LyXTextClass const & textclass)
 177 {
 178         while (p.good()) {
 179                 LyXLayout_ptr layout_ptr;
 180                 Token const & t = p.get_token();
 181
 182 #ifdef FILEDEBUG
 183                 cerr << "t: " << t << " flags: " << flags << "\n";
 184 #endif
 185
 186                 if (flags & FLAG_ITEM) {
 187                         if (t.cat() == catSpace)
 188                                 continue;
 189
 190                         flags &= ~FLAG_ITEM;
 191                         if (t.cat() == catBegin) {
 192                                 // skip the brace and collect everything to the next matching
 193                                 // closing brace
 194                                 flags |= FLAG_BRACE_LAST;
 195                                 continue;
 196                         }
 197
 198                         // handle only this single token, leave the loop if done
 199                         flags |= FLAG_LEAVE;
 200                 }
 201
 202                 if (t.character() == ']' && (flags & FLAG_BRACK_LAST))
 203                         return;
 204
 205                 //
 206                 // cat codes
 207                 //
 208                 if (t.cat() == catMath) {
 209                         // we are inside some text mode thingy, so opening new math is allowed
 210                         begin_inset(os, "Formula ");
 211                         Token const & n = p.get_token();
 212                         if (n.cat() == catMath && outer) {
 213                                 // TeX's $$...$$ syntax for displayed math
 214                                 os << "\\[";
 215                                 parse_math(p, os, FLAG_SIMPLE, MATH_MODE);
 216                                 os << "\\]";
 217                                 p.get_token(); // skip the second '$' token
 218                         } else {
 219                                 // simple $...$  stuff
 220                                 p.putback();
 221                                 os << '$';
 222                                 parse_math(p, os, FLAG_SIMPLE, MATH_MODE);
 223                                 os << '$';
 224                         }
 225                         end_inset(os);
 226                 }
 227
 228                 else if (t.cat() == catSuper || t.cat() == catSub)
 229                         cerr << "catcode " << t << " illegal in text mode\n";
 230
 231                 else if (t.cat() == catLetter ||
 232                                t.cat() == catSpace ||
 233                                t.cat() == catOther ||
 234                                t.cat() == catAlign ||
 235                                t.cat() == catParameter)
 236                         os << t.character();
 237
 238                 else if (t.cat() == catNewline) {
 239                         if (p.next_token().cat() == catNewline) {
 240                                 p.get_token();
 241                                 handle_par(os);
 242                         } else {
 243                                 os << " "; // note the space
 244                         }
 245                 }
 246
 247                 else if (t.cat() == catActive) {
 248                         if (t.character() == '~') {
 249                                 if (active_environment() == "lyxcode")
 250                                         os << ' ';
 251                                 else
 252                                         os << "\\SpecialChar ~\n";
 253                         } else
 254                                 os << t.character();
 255                 }
 256
 257                 else if (t.cat() == catBegin) {
 258                         // special handling of size changes
 259                         bool const is_size = is_known(p.next_token().cs(), known_sizes);
 260                         string const s = parse_text(p, FLAG_BRACE_LAST, outer, textclass);
 261                         if (s.empty() && p.next_token().character() == '`')
 262                                 ; // ignore it in  {}``
 263                         else if (is_size || s == "[" || s == "]" || s == "*")
 264                                 os << s;
 265                         else {
 266                                 handle_ert(os, "{");
 267                                 os << s;
 268                                 handle_ert(os, "}");
 269                         }
 270                 }
 271
 272                 else if (t.cat() == catEnd) {
 273                         if (flags & FLAG_BRACE_LAST)
 274                                 return;
 275                         cerr << "stray '}' in text\n";
 276                         handle_ert(os, "}");
 277                 }
 278
 279                 else if (t.cat() == catOther)
 280                         os << string(1, t.character());
 281
 282                 else if (t.cat() == catComment)
 283                         handle_comment(p);
 284
 285                 //
 286                 // control sequences
 287                 //
 288
 289                 else if (t.cs() == "ldots") {
 290                         skip_braces(p);
 291                         os << "\n\\SpecialChar \\ldots{}\n";
 292                 }
 293
 294                 else if (t.cs() == "(") {
 295                         begin_inset(os, "Formula");
 296                         os << " \\(";
 297                         parse_math(p, os, FLAG_SIMPLE2, MATH_MODE);
 298                         os << "\\)";
 299                         end_inset(os);
 300                 }
 301
 302                 else if (t.cs() == "[") {
 303                         begin_inset(os, "Formula");
 304                         os << " \\[";
 305                         parse_math(p, os, FLAG_EQUATION, MATH_MODE);
 306                         os << "\\]";
 307                         end_inset(os);
 308                 }
 309
 310                 else if (t.cs() == "begin") {
 311                         string const name = p.getArg('{', '}');
 312                         active_environments.push_back(name);
 313                         if (is_math_env(name)) {
 314                                 begin_inset(os, "Formula ");
 315                                 os << "\\begin{" << name << "}";
 316                                 parse_math(p, os, FLAG_END, MATH_MODE);
 317                                 os << "\\end{" << name << "}";
 318                                 end_inset(os);
 319                                 continue;
 320                         }
 321
 322                         if (name == "tabular") {
 323                                 begin_inset(os, "Tabular ");
 324                                 handle_tabular(p, os, textclass);
 325                                 end_inset(os);
 326                                 continue;
 327                         }
 328
 329                         bool is_starred = suffixIs(name, '*');
 330                         string unstarred_name = rtrim(name, "*");
 331                         if (textclass.floats().typeExist(unstarred_name)) {
 332                                 string opts = p.getOpt();
 333                                 begin_inset(os, "Float " + unstarred_name + "\n");
 334                                 if (opts.size())
 335                                         os << "placement " << opts << '\n';
 336                                 os << "wide " << tostr(is_starred)
 337                                    << "\ncollapsed false\n\n"
 338                                    << "\\layout Standard\n";
 339                                 parse_text(p, os, FLAG_END, outer,
 340                                            textclass);
 341                                 end_inset(os);
 342                         } else if (name == "center") {
 343                                 handle_par(os);
 344                                 parse_text(p, os, FLAG_END, outer,
 345                                            textclass);
 346                         } else if (name == "enumerate" || name == "itemize"
 347                                         || name == "lyxlist") {
 348                                 size_t const n = active_environments.size();
 349                                 string const s = active_environments[n - 2];
 350                                 bool const deeper = s == "enumerate" || s == "itemize"
 351                                         || s == "lyxlist";
 352                                 if (deeper)
 353                                         os << "\n\\begin_deeper";
 354                                 os << "\n\\layout " << cap(name) << "\n\n";
 355                                 if (name == "lyxlist")
 356                                         p.verbatim_item(); // swallow next arg
 357                                 parse_text(p, os, FLAG_END, outer, textclass);
 358                                 if (deeper)
 359                                         os << "\n\\end_deeper\n";
 360                                 handle_par(os);
 361                         } else if (name == "thebibliography") {
 362                                 p.verbatim_item(); // swallow next arg
 363                                 parse_text(p, os, FLAG_END, outer, textclass);
 364                                 os << "\n\\layout Bibliography\n\n";
 365                         } else {
 366                                 handle_par(os);
 367                                 parse_text(p, os, FLAG_END, outer, textclass);
 368                         }
 369                 }
 370
 371                 else if (t.cs() == "end") {
 372                         if (flags & FLAG_END) {
 373                                 // eat environment name
 374                                 string const name = p.getArg('{', '}');
 375                                 if (name != active_environment())
 376                                         cerr << "\\end{" + name + "} does not match \\begin{"
 377                                                 + active_environment() + "}\n";
 378                                 active_environments.pop_back();
 379                                 handle_par(os);
 380                                 return;
 381                         }
 382                         p.error("found 'end' unexpectedly");
 383                 }
 384
 385                 else if (t.cs() == "item") {
 386                         p.skip_spaces();
 387                         string s;
 388                         if (p.next_token().character() == '[') {
 389                                 p.get_token(); // eat '['
 390                                 s = parse_text(p, FLAG_BRACK_LAST, outer, textclass);
 391                         }
 392                         handle_par(os);
 393                         os << s << ' ';
 394                 }
 395
 396                 else if (t.cs() == "def") {
 397                         string name = p.get_token().cs();
 398                         while (p.next_token().cat() != catBegin)
 399                                 name += p.get_token().asString();
 400                         handle_ert(os, "\\def\\" + name + '{' + p.verbatim_item() + '}');
 401                 }
 402
 403                 else if (t.cs() == "par") {
 404                         p.skip_spaces();
 405                         if (p.next_token().cs() != "\\begin")
 406                                 handle_par(os);
 407                         //cerr << "next token: '" << p.next_token().cs() << "'\n";
 408                 }
 409
 410                 // Must attempt to parse "Section*" before "Section".
 411                 else if ((p.next_token().asInput() == "*") &&
 412                          // The single '=' is meant here.
 413                          (layout_ptr = findLayout(textclass,
 414                                                   t.cs() + '*')).get() &&
 415                          layout_ptr->isCommand()) {
 416                         p.get_token();
 417                         output_layout(os, layout_ptr, p, outer, textclass);
 418                 }
 419
 420                 // The single '=' is meant here.
 421                 else if ((layout_ptr = findLayout(textclass, t.cs())).get() &&
 422                          layout_ptr->isCommand()) {
 423                         output_layout(os, layout_ptr, p, outer, textclass);
 424                 }
 425
 426                 else if (t.cs() == "includegraphics") {
 427                         map<string, string> opts = split_map(p.getArg('[', ']'));
 428                         string name = p.verbatim_item();
 429                         begin_inset(os, "Graphics ");
 430                         os << "\n\tfilename " << name << '\n';
 431                         if (opts.find("width") != opts.end())
 432                                 os << "\twidth " << opts["width"] << '\n';
 433                         if (opts.find("height") != opts.end())
 434                                 os << "\theight " << opts["height"] << '\n';
 435                         end_inset(os);
 436                 }
 437
 438                 else if (t.cs() == "footnote") {
 439                         begin_inset(os, "Foot\n");
 440                         os << "collapsed true\n\n\\layout Standard\n\n";
 441                         parse_text(p, os, FLAG_ITEM, false, textclass);
 442                         end_inset(os);
 443                 }
 444
 445                 else if (t.cs() == "ensuremath") {
 446                         string s = parse_text(p, FLAG_ITEM, false, textclass);
 447                         if (s == "±" || s == "³" || s == "²" || s == "µ")
 448                                 os << s;
 449                         else
 450                                 handle_ert(os, "\\ensuremath{" + s + "}");
 451                 }
 452
 453                 else if (t.cs() == "marginpar") {
 454                         begin_inset(os, "Marginal\n");
 455                         os << "collapsed true\n\n\\layout Standard\n\n";
 456                         parse_text(p, os, FLAG_ITEM, false, textclass);
 457                         end_inset(os);
 458                 }
 459
 460                 else if (t.cs() == "hfill") {
 461                         os << "\n\\hfill\n";
 462                         skip_braces(p);
 463                 }
 464
 465                 else if (t.cs() == "makeindex" || t.cs() == "maketitle")
 466                         skip_braces(p); // swallow this
 467
 468                 else if (t.cs() == "tableofcontents")
 469                         skip_braces(p); // swallow this
 470
 471                 else if (t.cs() == "textrm") {
 472                         os << "\n\\family roman \n";
 473                         parse_text(p, os, FLAG_ITEM, outer, textclass);
 474                         os << "\n\\family default \n";
 475                 }
 476
 477                 else if (t.cs() == "textsf") {
 478                         os << "\n\\family sans \n";
 479                         parse_text(p, os, FLAG_ITEM, outer, textclass);
 480                         os << "\n\\family default \n";
 481                 }
 482
 483                 else if (t.cs() == "texttt") {
 484                         os << "\n\\family typewriter \n";
 485                         parse_text(p, os, FLAG_ITEM, outer, textclass);
 486                         os << "\n\\family default \n";
 487                 }
 488
 489                 else if (t.cs() == "textit") {
 490                         os << "\n\\shape italic \n";
 491                         parse_text(p, os, FLAG_ITEM, outer, textclass);
 492                         os << "\n\\shape default \n";
 493                 }
 494
 495                 else if (t.cs() == "textsc") {
 496                         os << "\n\\noun on \n";
 497                         parse_text(p, os, FLAG_ITEM, outer, textclass);
 498                         os << "\n\\noun default \n";
 499                 }
 500
 501                 else if (t.cs() == "textbf") {
 502                         os << "\n\\series bold \n";
 503                         parse_text(p, os, FLAG_ITEM, outer, textclass);
 504                         os << "\n\\series default \n";
 505                 }
 506
 507                 else if (t.cs() == "underbar") {
 508                         os << "\n\\bar under \n";
 509                         parse_text(p, os, FLAG_ITEM, outer, textclass);
 510                         os << "\n\\bar default \n";
 511                 }
 512
 513                 else if (t.cs() == "emph" || t.cs() == "noun") {
 514                         os << "\n\\" << t.cs() << " on \n";
 515                         parse_text(p, os, FLAG_ITEM, outer, textclass);
 516                         os << "\n\\" << t.cs() << " default \n";
 517                 }
 518
 519                 else if (t.cs() == "bibitem") {
 520                         os << "\n\\layout Bibliography\n\\bibitem ";
 521                         os << p.getOpt();
 522                         os << '{' << p.verbatim_item() << '}' << "\n\n";
 523                 }
 524
 525                 else if (is_known(t.cs(), known_latex_commands)) {
 526                         begin_inset(os, "LatexCommand ");
 527                         os << '\\' << t.cs();
 528                         os << p.getOpt();
 529                         os << p.getOpt();
 530                         os << '{' << p.verbatim_item() << '}';
 531                         end_inset(os);
 532                 }
 533
 534                 else if (is_known(t.cs(), known_quotes)) {
 535                   char const ** where = is_known(t.cs(), known_quotes);
 536                         begin_inset(os, "Quotes ");
 537                         os << known_coded_quotes[where - known_quotes];
 538                         end_inset(os);
 539                         skip_braces(p);
 540                 }
 541
 542                 else if (is_known(t.cs(), known_sizes)) {
 543                   char const ** where = is_known(t.cs(), known_sizes);
 544                         os << "\n\\size " << known_coded_sizes[where - known_sizes] << "\n";
 545                 }
 546
 547                 else if (t.cs() == "LyX" || t.cs() == "TeX"
 548                          || t.cs() == "LaTeX") {
 549                         os << t.cs();
 550                         skip_braces(p); // eat {}
 551                 }
 552
 553                 else if (t.cs() == "LaTeXe") {
 554                         os << "LaTeX2e";
 555                         skip_braces(p); // eat {}
 556                 }
 557
 558                 else if (t.cs() == "lyxarrow") {
 559                         os << "\\SpecialChar \\menuseparator\n";
 560                         skip_braces(p);
 561                 }
 562
 563                 else if (t.cs() == "ldots") {
 564                         os << "\\SpecialChar \\ldots{}\n";
 565                         skip_braces(p);
 566                 }
 567
 568                 else if (t.cs() == "@") {
 569                         os << "\\SpecialChar \\@";
 570                         skip_braces(p);
 571                 }
 572
 573                 else if (t.cs() == "textasciitilde") {
 574                         os << '~';
 575                         skip_braces(p);
 576                 }
 577
 578                 else if (t.cs() == "textasciicircum") {
 579                         os << '^';
 580                         skip_braces(p);
 581                 }
 582
 583                 else if (t.cs() == "textbackslash") {
 584                         os << "\n\\backslash\n";
 585                         skip_braces(p);
 586                 }
 587
 588                 else if (t.cs() == "_" || t.cs() == "&" || t.cs() == "#" || t.cs() == "$"
 589                             || t.cs() == "{" || t.cs() == "}" || t.cs() == "%")
 590                         os << t.cs();
 591
 592                 else if (t.cs() == "char") {
 593                         if (p.next_token().character() == '`') {
 594                                 p.get_token();
 595                                 if (p.next_token().cs() == "\"") {
 596                                         p.get_token();
 597                                         os << '"';
 598                                         skip_braces(p);
 599                                 } else {
 600                                         handle_ert(os, "\\char`");
 601                                 }
 602                         } else {
 603                                 handle_ert(os, "\\char");
 604                         }
 605                 }
 606
 607                 else if (t.cs() == "\"") {
 608                         string const name = p.verbatim_item();
 609                              if (name == "a") os << 'ä';
 610                         else if (name == "o") os << 'ö';
 611                         else if (name == "u") os << 'ü';
 612                         else if (name == "A") os << 'Ä';
 613                         else if (name == "O") os << 'Ö';
 614                         else if (name == "U") os << 'Ü';
 615                         else handle_ert(os, "\"{" + name + "}");
 616                 }
 617
 618                 else if (t.cs() == "=" || t.cs() == "H" || t.cs() == "c"
 619                       || t.cs() == "^" || t.cs() == "'" || t.cs() == "~") {
 620                         // we need the trim as the LyX parser chokes on such spaces
 621                         os << "\n\\i \\" << t.cs() << "{"
 622                            << trim(parse_text(p, FLAG_ITEM, outer, textclass), " ") << "}\n";
 623                 }
 624
 625                 else if (t.cs() == "ss")
 626                         os << "ß";
 627
 628                 else if (t.cs() == "i" || t.cs() == "j")
 629                         os << "\\" << t.cs() << ' ';
 630
 631                 else if (t.cs() == "-")
 632                         os << "\\SpecialChar \\-\n";
 633
 634                 else if (t.cs() == "\\")
 635                         os << "\n\\newline\n";
 636
 637                 else if (t.cs() == "input")
 638                         handle_ert(os, "\\input{" + p.verbatim_item() + "}\n");
 639
 640                 else if (t.cs() == "fancyhead") {
 641                         ostringstream ss;
 642                         ss << "\\fancyhead";
 643                         ss << p.getOpt();
 644                         ss << '{' << p.verbatim_item() << "}\n";
 645                         handle_ert(os, ss.str());
 646                 }
 647
 648                 else {
 649                         //cerr << "#: " << t << " mode: " << mode << endl;
 650                         // heuristic: read up to next non-nested space
 651                         /*
 652                         string s = t.asInput();
 653                         string z = p.verbatim_item();
 654                         while (p.good() && z != " " && z.size()) {
 655                                 //cerr << "read: " << z << endl;
 656                                 s += z;
 657                                 z = p.verbatim_item();
 658                         }
 659                         cerr << "found ERT: " << s << endl;
 660                         handle_ert(os, s + ' ');
 661                         */
 662                         handle_ert(os, t.asInput() + ' ');
 663                 }
 664
 665                 if (flags & FLAG_LEAVE) {
 666                         flags &= ~FLAG_LEAVE;
 667                         break;
 668                 }
 669         }
 670 }
 671
 672
 673 string parse_text(Parser & p, unsigned flags, const bool outer,
 674                   LyXTextClass const & textclass)
 675 {
 676         ostringstream os;
 677         parse_text(p, os, flags, outer, textclass);
 678         return os.str();
 679 }
 680
 681
 682 // }])