1 /** The .tex to .lyx converter
2 \author André Pönitz (2003)
25 using std::istringstream;
27 using std::ostringstream;
// Delimiters that wrap() places around a command and its argument text.
35 char const OPEN = '<';
36 char const CLOSE = '>';
// Option names that handle_opt() looks for among the \documentclass options
// in order to set h_language. The trailing 0 terminates the table.
38 const char * known_languages[] = { "austrian", "babel", "bahasa",
39 "basque", "breton", "bulgarian", "catalan", "croatian", "czech", "danish",
40 "dutch", "english", "esperanto", "estonian", "finnish", "francais",
41 "frenchb", "galician", "germanb", "greek", "hebcal", "hebfont", "hebrew",
42 "hebrew_newcode", "hebrew_oldcode", "hebrew_p", "hyphen", "icelandic",
43 "irish", "italian", "latin", "lgrcmr", "lgrcmro", "lgrcmss", "lgrcmtt",
44 "lgrenc", "lgrlcmss", "lgrlcmtt", "lheclas", "lhecmr", "lhecmss",
45 "lhecmtt", "lhecrml", "lheenc", "lhefr", "lheredis", "lheshold",
46 "lheshscr", "lheshstk", "lsorbian", "magyar", "naustrian", "ngermanb",
47 "ngerman", "norsk", "polish", "portuges", "rlbabel", "romanian",
48 "russianb", "samin", "scottish", "serbian", "slovak", "slovene", "spanish",
49 "swedish", "turkish", "ukraineb", "usorbian", "welsh", 0};
// Font size options matched the same way; a hit is stored in
// h_paperfontsize. Also terminated by 0.
51 const char * known_fontsizes[] = { "10pt", "11pt", "12pt", 0 };
// Document header settings. main() prints each of them as one
// "\<name> <value>" line of the generated file.
// NOTE(review): "FIXME" looks like a placeholder for settings that have no
// real default yet -- confirm what LyX does when one is never overwritten.
56 string h_textclass = "FIXME";
57 string h_options = "FIXME";
58 string h_language = "FIXME";
59 string h_inputencoding = "FIXME";
60 string h_fontscheme = "FIXME";
61 string h_graphics = "default";
62 string h_paperfontsize = "FIXME";
63 string h_spacing = "single";
64 string h_papersize = "FIXME";
65 string h_paperpackage = "FIXME";
66 string h_use_geometry = "0";
67 string h_use_amsmath = "0";
68 string h_use_natbib = "0";
69 string h_use_numerical_citations = "0";
70 string h_paperorientation = "portrait";
71 string h_secnumdepth = "3";
72 string h_tocdepth = "3";
73 string h_paragraph_separation = "indent";
74 string h_defskip = "medskip";
75 string h_quotes_language = "2";
76 string h_quotes_times = "1";
77 string h_papercolumns = "1";
78 string h_papersides = "1";
79 string h_paperpagestyle = "default";
80 string h_tracking_changes = "0";
82 // indicates whether we are in the preamble
83 bool in_preamble = true;
85 // current stack of nested environments
// (\begin pushes the environment name, a matching \end pops it; \par uses
// the top entry as the layout name)
86 stack<string> active_environments;
// Returns a copy of a without the leading and trailing characters that occur
// in p. p is a set of characters and defaults to a single blank.
90 string const trim(string const & a, char const * p = " ")
// r / l: index of the last / first character of a that is not in p.
97 string::size_type r = a.find_last_not_of(p);
98 string::size_type l = a.find_first_not_of(p);
100 // Is this the minimal test? (lgb)
// Both searches fail together exactly when every character of a is in p.
// NOTE(review): the statement guarded by this test is not visible in this
// excerpt; presumably it returns an empty string.
101 if (r == string::npos && l == string::npos)
104 return a.substr(l, r - l + 1);
// Cuts s into pieces at every delim, reading them one by one with getline().
// NOTE(review): the stream setup and the loop body are not visible in this
// excerpt; presumably each piece t is appended to result.
108 void split(string const & s, vector<string> & result, char delim)
112 while (getline(is, t, delim))
// Builds one string from the entries of input, visiting them in index order.
// NOTE(review): the loop body is not visible in this excerpt; presumably
// delim is written between consecutive entries (the inverse of split()).
117 string join(vector<string> const & input, char delim)
120 for (size_t i = 0; i != input.size(); ++i) {
/**
 * Searches an option list for the first entry of a table of known options.
 *
 * The table is walked in order; the first table entry that also occurs in
 * opts is copied to target and removed from opts, so that only the options
 * nobody recognised remain in the list afterwards.
 *
 * @param opts   option list (e.g. the documentclass options); the matched
 *               entry is erased from it.
 * @param what   table of C strings terminated by a null pointer.
 * @param target receives the matched option; left untouched when nothing
 *               matches.
 */
void handle_opt(std::vector<std::string> & opts, char const ** what, std::string & target)
{
	if (!what || opts.empty())
		return;

	// Test the table entry (*what), not the table pointer: the pointer never
	// becomes null while it is incremented, so testing it walks past the
	// terminating 0 and hands a null pointer to the comparison in find().
	for ( ; *what; ++what) {
		std::vector<std::string>::iterator it =
			std::find(opts.begin(), opts.end(), *what);
		if (it != opts.end()) {
			//cerr << "### found option '" << *what << "'\n";
			target = *what;
			opts.erase(it);
			return;
		}
	}
}
// Writes an ERT inset to os: a collapsed inset header with a Standard
// layout, followed by the inset terminator.
// NOTE(review): the statement between header and terminator is not visible
// in this excerpt; presumably it writes s.
146 void handle_ert(ostream & os, string const & s)
148 os << "\n\\begin_inset ERT\nstatus Collapsed\n\n\\layout Standard\n\n";
150 os << "\n\\end_inset\n";
// Records the effect of one \usepackage request (package name plus its
// option string, which may be empty) on the document header globals.
154 void handle_package(string const & name, string const & options)
156 if (name == "a4wide") {
158 h_paperpackage = "widemarginsa4";
// NOTE(review): the bodies of most of the following branches are not
// visible in this excerpt, so their effect cannot be documented from here.
159 } else if (name == "ae")
161 else if (name == "aecompl")
163 else if (name == "amsmath")
165 else if (name == "amssymb")
167 else if (name == "babel")
169 else if (name == "fontenc")
171 else if (name == "inputenc")
// The option string is stored unchanged as the input encoding.
172 h_inputencoding = options;
173 else if (name == "makeidx")
175 else if (name == "verbatim")
// The request is appended to the preamble text, once in the form with
// options and once without. NOTE(review): the conditions selecting between
// the two are not visible; presumably this is the fallback for packages not
// listed above.
179 h_preamble += "\\usepackage[" + options + "]{" + name + "}\n";
181 h_preamble += "\\usepackage{" + name + "}\n";
// Returns "<cmd str>": the command name and its text separated by a blank
// and enclosed in OPEN / CLOSE.
186 string wrap(string const & cmd, string const & str)
188 return OPEN + cmd + ' ' + str + CLOSE;
// Two-argument variant: returns "<cmd str str2>".
192 string wrap(string const & cmd, string const & str, string const & str2)
194 return OPEN + cmd + ' ' + str + ' ' + str2 + CLOSE;
// Parsing mode handed to Parser::parse().
198 enum mode_type {UNDECIDED_MODE, TEXT_MODE, MATH_MODE};
// Maps a mode name to a mode_type: "mathmode" on one side, "textmode" and
// "forcetext" on the other.
// NOTE(review): the return statements are not visible in this excerpt;
// presumably oldmode is returned for any other string.
200 mode_type asMode(mode_type oldmode, string const & str)
202 if (str == "mathmode")
204 if (str == "textmode" || str == "forcetext")
210 // These are TeX's catcodes
// (only some of the enumerators are visible in this excerpt; the number in
// each trailing comment is the category number)
212 catEscape, // 0 backslash
222 catSpace, // 10 space
223 catLetter, // 11 a-zA-Z
224 catOther, // 12 none of the above
227 catInvalid // 15 <delete>
// Category of every character, indexed by its value.
230 CatCode theCatcode[256];
// Looks up the category of c. The parameter is unsigned so that every
// possible value is an index in the range 0..255.
233 inline CatCode catcode(unsigned char c)
235 return theCatcode[c];
// Bit flags for the flags argument of Parser::parse(). Each has its own bit
// so that several can be combined; a flag either names the token that ends
// the current parse or the kind of item to read.
240 FLAG_BRACE_LAST = 1 << 1, // last closing brace ends the parsing
241 FLAG_RIGHT = 1 << 2, // next \\right ends the parsing process
242 FLAG_END = 1 << 3, // next \\end ends the parsing process
243 FLAG_BRACK_LAST = 1 << 4, // next closing bracket ends the parsing
244 FLAG_TEXTMODE = 1 << 5, // we are in a box
245 FLAG_ITEM = 1 << 6, // read a (possibly braced token)
246 FLAG_LEAVE = 1 << 7, // leave the loop at the end
247 FLAG_SIMPLE = 1 << 8, // next $ leaves the loop
248 FLAG_EQUATION = 1 << 9, // next \] leaves the loop
249 FLAG_SIMPLE2 = 1 << 10, // next \) leaves the loop
250 FLAG_OPTION = 1 << 11, // read [...] style option
251 FLAG_BRACED = 1 << 12 // read {...} style argument
// Fill the category table: everything starts as catOther, the ASCII letters
// become catLetter, then individual characters get their special category.
257 fill(theCatcode, theCatcode + 256, catOther);
258 fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
259 fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
261 theCatcode['\\'] = catEscape;
262 theCatcode['{'] = catBegin;
263 theCatcode['}'] = catEnd;
// NOTE(review): `cat` is not among the CatCode enumerators visible in this
// excerpt; presumably the math-shift category is meant -- confirm the name.
264 theCatcode['$'] = cat;
265 theCatcode['&'] = catAlign;
266 theCatcode['\n'] = catNewline;
267 theCatcode['#'] = catParameter;
268 theCatcode['^'] = catSuper;
269 theCatcode['_'] = catSub;
270 theCatcode['
\7f'] = catIgnore;
// Blank and tab count as spaces, carriage return is treated like a newline,
// '~' is an active character and '%' starts a comment.
271 theCatcode[' '] = catSpace;
272 theCatcode['\t'] = catSpace;
273 theCatcode['\r'] = catNewline;
274 theCatcode['~'] = catActive;
275 theCatcode['%'] = catComment;
281 // Helper class for parsing
// A token is either a control sequence (name in cs_) or a single character
// together with its category.
/// empty token: no name, character 0, category catIgnore
287 Token() : cs_(), char_(0), cat_(catIgnore) {}
/// character token with the given category
289 Token(char c, CatCode cat) : cs_(), char_(c), cat_(cat) {}
/// control sequence token; its category is catIgnore
291 Token(string const & cs) : cs_(cs), char_(0), cat_(catIgnore) {}
/// control sequence name, empty for character tokens
294 string const & cs() const { return cs_; }
/// category of the token
296 CatCode cat() const { return cat_; }
/// character of the token, 0 for control sequences
298 char character() const { return char_; }
/// the control sequence name if there is one, else the character as string
300 string asString() const { return cs_.size() ? cs_ : string(1, char_); }
// Debug output of a token: either a backslash followed by the control
// sequence name, or "[character,category]".
// NOTE(review): the condition choosing between the two forms is not visible
// in this excerpt.
311 ostream & operator<<(ostream & os, Token const & t)
314 os << '\\' << t.cs();
316 os << '[' << t.character() << ',' << t.cat() << ']';
/// construct a parser for the given input stream
325 Parser(istream & is);
/// translate tokens; flags is a combination of FLAG_* bits, mode the
/// text/math mode to start in
330 string parse(unsigned flags, mode_type mode);
/// line counter, used as approximate position in error messages
332 int lineno() const { return lineno_; }
335 /// dump contents to screen
/// read characters up to the delimiter right
340 string getArg(char left, char right);
/// report a parse error on cerr
344 void error(string const & msg);
/// tokenize the stream contents up to the next \end_inset or end of stream
346 void tokenize(istream & is);
/// split a buffer into tokens according to the category table
348 void tokenize(string const & s);
/// skip space and newline characters, starting with c
350 void skipSpaceTokens(istream & is, char c);
/// append a token to the token list
352 void push_back(Token const & t);
/// token before the read position, an empty token if there is none
356 Token const & prevToken() const;
/// token at the read position without consuming it
358 Token const & nextToken() const;
/// token at the read position; advances the read position
360 Token const & getToken();
361 /// skips spaces if any
364 void lex(string const & s);
/// read a brace delimited group
368 string parse_verbatim_item();
/// read a bracket delimited option
370 string parse_verbatim_option();
/// the token list; pos_ is the read position in it
375 vector<Token> tokens_;
// Starts with line counter and read position at zero.
// NOTE(review): the constructor body is not visible in this excerpt;
// presumably it tokenizes is.
381 Parser::Parser(istream & is)
382 : lineno_(0), pos_(0)
// Appends t to the end of the token list.
388 void Parser::push_back(Token const & t)
390 tokens_.push_back(t);
// NOTE(review): body not visible in this excerpt; presumably removes the
// last token of the token list.
394 void Parser::pop_back()
// Returns the token just before the read position, or a default-constructed
// token when the read position is at the start.
400 Token const & Parser::prevToken() const
// static, so that the returned reference stays valid after the call
402 static const Token dummy;
403 return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
// Returns the token at the read position without consuming it, or a
// default-constructed token when all tokens have been read.
407 Token const & Parser::nextToken() const
// static, so that the returned reference stays valid after the call
409 static const Token dummy;
410 return good() ? tokens_[pos_] : dummy;
// Returns the token at the read position and advances past it. When all
// tokens have been read, a default-constructed token is returned and the
// position does not move.
414 Token const & Parser::getToken()
// static, so that the returned reference stays valid after the call
416 static const Token dummy;
417 //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
418 return good() ? tokens_[pos_++] : dummy;
// Loops for as long as the next token is a space or a newline.
// NOTE(review): the loop body is not visible in this excerpt; presumably it
// consumes that token.
422 void Parser::skipSpaces()
424 while (nextToken().cat() == catSpace || nextToken().cat() == catNewline)
// NOTE(review): body not visible in this excerpt; presumably moves the read
// position back by one token.
429 void Parser::putback()
// True while the read position is inside the token list, i.e. there is at
// least one more token to read.
435 bool Parser::good() const
437 return pos_ < tokens_.size();
// Returns the character of the token at the read position and advances past
// it.
441 char Parser::getChar()
// NOTE(review): the condition guarding this message is not visible in this
// excerpt. If it is !good(), the return below still indexes tokens_ at pos_
// after the message was printed -- confirm that this cannot read past the
// end of the token list.
444 error("The input stream is not well...");
445 return tokens_[pos_++].character();
// Reads characters until the delimiter right is seen or the tokens run out.
// NOTE(review): the handling of left and the loop body are not visible in
// this excerpt; presumably the characters read are collected and returned,
// and an empty string is returned when the argument does not start with left.
449 string Parser::getArg(char left, char right)
459 while ((c = getChar()) != right && good())
// Skips input characters, starting with c, for as long as they are spaces or
// newlines.
// NOTE(review): the loop body and the statement after the loop are not
// visible in this excerpt; judging by the debug message the first character
// that is not skipped is put back into the stream.
466 void Parser::skipSpaceTokens(istream & is, char c)
468 // skip trailing spaces
469 while (catcode(c) == catSpace || catcode(c) == catNewline)
472 //cerr << "putting back: " << c << "\n";
// Collects input from is and hands it to the string based tokenizer.
477 void Parser::tokenize(istream & is)
479 // eat everything up to the next \end_inset or end of stream
480 // and store it in s for further tokenization
// When the collected text ends in the 10 characters "\end_inset", that
// marker is cut off again.
485 if (s.size() >= 10 && s.substr(s.size() - 10) == "\\end_inset") {
486 s = s.substr(0, s.size() - 10);
490 // Remove the space after \end_inset
// NOTE(review): what happens to a character that is not a blank is not
// visible in this excerpt; presumably it is put back into the stream.
491 if (is.get(c) && c != ' ')
// Turns buffer into tokens, one character at a time, and appends them to
// the token list. The category of each character decides what happens.
// NOTE(review): the case labels of the switch are not visible in this
// excerpt; the grouping described below is inferred from the visible
// statements.
499 void Parser::tokenize(string const & buffer)
// One-time initialisation guard. NOTE(review): what it guards is not
// visible; presumably the setup of the category table.
501 static bool init_done = false;
// The stream is built from buffer.c_str(), so input is only seen up to the
// first NUL character, if there is one.
508 istringstream is(buffer.c_str(), ios::in | ios::binary);
512 //cerr << "reading c: " << c << "\n";
514 switch (catcode(c)) {
// Newline: if the following character is a newline as well a "par" control
// sequence token is produced, otherwise a plain newline token.
518 if (catcode(c) == catNewline)
519 push_back(Token("par"));
521 push_back(Token('\n', catNewline));
// Read on up to the end of the line (presumably the comment case).
529 while (is.get(c) && catcode(c) != catNewline)
539 error("unexpected end of input");
// Control sequence made of letters: read all following letters, then skip
// the spaces after the name.
542 if (catcode(c) == catLetter) {
544 while (is.get(c) && catcode(c) == catLetter)
546 skipSpaceTokens(is, c);
555 push_back(Token(c, catcode(c)));
557 skipSpaceTokens(is, c);
// The character is dropped; only its numeric value is reported.
562 cerr << "ignoring a char: " << int(c) << "\n";
// Every other character becomes a token of its own.
567 push_back(Token(c, catcode(c)));
// Debug aid: writes the token list and the current read position to cerr.
// NOTE(review): the loop body is not visible in this excerpt.
577 void Parser::dump() const
579 cerr << "\nTokens: ";
580 for (unsigned i = 0; i < tokens_.size(); ++i) {
585 cerr << " pos: " << pos_ << "\n";
// Reports msg on cerr, prefixed with the approximate line number.
// NOTE(review): any further statements of this function are not visible in
// this excerpt.
589 void Parser::error(string const & msg)
591 cerr << "Line ~" << lineno_ << ": parse error: " << msg << endl;
// Entry point: parses with no flags set and the mode still undecided.
597 string Parser::parse()
600 return parse(0, UNDECIDED_MODE);
// Reads a bracket delimited option, if the next token is '[', and returns
// its text. Brace groups inside the option are read with
// parse_verbatim_item() and put back between '{' and '}'.
// NOTE(review): the handling of all other tokens and the return statement
// are not visible in this excerpt.
604 string Parser::parse_verbatim_option()
607 if (nextToken().character() == '[') {
// This first getToken() consumes the '['. The loop variable below has the
// same name t and hides this one.
608 Token t = getToken();
609 for (Token t = getToken(); t.character() != ']' && good(); t = getToken()) {
610 if (t.cat() == catBegin) {
612 res += '{' + parse_verbatim_item() + '}';
// Reads a brace delimited group, if the next token opens one, and returns
// its text. Nested groups are read recursively and put back between '{'
// and '}'.
// NOTE(review): the handling of all other tokens and the return statement
// are not visible in this excerpt.
621 string Parser::parse_verbatim_item()
624 if (nextToken().cat() == catBegin) {
// This first getToken() consumes the opening brace. The loop variable below
// has the same name t and hides this one.
625 Token t = getToken();
626 for (Token t = getToken(); t.cat() != catEnd && good(); t = getToken()) {
627 if (t.cat() == catBegin) {
629 res += '{' + parse_verbatim_item() + '}';
// Main translation routine. Tokens are read with getToken() and the
// translated text is collected in result. flags is a combination of FLAG_*
// bits that selects what ends this call or what kind of item is read; mode
// tells whether the tokens are read as text or as math. The function calls
// itself for nested constructs (groups, options, formulas, environments).
// Preamble commands do not produce output but update the h_* header globals.
// NOTE(review): many lines of this function (loop header, braces, returns,
// else branches) are not visible in this excerpt; comments below describe
// the visible statements only.
639 string Parser::parse(unsigned flags, mode_type mode)
643 ostringstream result;
645 Token const & t = getToken();
648 cerr << "t: " << t << " flags: " << flags << "\n";
// FLAG_ITEM: read one token or one brace group.
653 if (flags & FLAG_ITEM) {
654 if (t.cat() == catSpace)
658 if (t.cat() == catBegin) {
659 // skip the brace and collect everything to the next matching
661 flags |= FLAG_BRACE_LAST;
665 // handle only this single token, leave the loop if done
// FLAG_BRACED: a brace group is required.
670 if (flags & FLAG_BRACED) {
671 if (t.cat() == catSpace)
674 if (t.cat() != catBegin) {
675 error("opening brace expected");
679 // skip the brace and collect everything to the next matching
// NOTE(review): this assignment replaces all flags, whereas the FLAG_ITEM
// branch above only adds the bit -- confirm that dropping the other flags is
// intended.
681 flags = FLAG_BRACE_LAST;
// FLAG_OPTION: read a bracket delimited option if one follows.
685 if (flags & FLAG_OPTION) {
686 if (t.cat() == catOther && t.character() == '[') {
687 result << parse(FLAG_BRACK_LAST, mode);
689 // no option found, put back token and we are done
// NOTE(review): `cat` is not among the CatCode enumerators visible in this
// excerpt; going by the comments below this is the '$' category.
698 if (t.cat() == cat) {
699 if (mode != MATH_MODE) {
700 // we are inside some text mode thingy, so opening new math is allowed
701 Token const & n = getToken();
702 if (n.cat() == cat) {
703 // TeX's $$...$$ syntax for displayed math
704 result << wrap("equation", parse(FLAG_SIMPLE, MATH_MODE));
705 getToken(); // skip the second '$' token
707 // simple $...$ stuff
709 result << wrap("simple", parse(FLAG_SIMPLE, MATH_MODE));
713 else if (flags & FLAG_SIMPLE) {
714 // this is the end of the formula
719 error("something strange in the parser\n");
// Letters are copied to the output unchanged.
724 else if (t.cat() == catLetter)
725 result << t.character();
// Spaces and newlines are copied as well, but only outside math mode.
727 else if (t.cat() == catSpace && mode != MATH_MODE) {
728 //if (result.empty() || result[result.size() - 1] != ' ')
729 result << t.character();
732 else if (t.cat() == catNewline && mode != MATH_MODE)
733 result << t.character();
// Parameter character: the token after it is wrapped as "macroarg".
735 else if (t.cat() == catParameter) {
736 Token const & n = getToken();
737 result << wrap("macroarg", string(1, n.character()));
740 else if (t.cat() == catActive)
741 result << wrap("active", string(1, t.character()));
// A brace group is parsed recursively up to its closing brace.
743 else if (t.cat() == catBegin)
744 result << wrap("braced", parse(FLAG_BRACE_LAST, mode));
746 else if (t.cat() == catEnd) {
747 if (flags & FLAG_BRACE_LAST)
749 error("found '}' unexpectedly");
751 //add(cell, '}', LM_TC_TEX);
// NOTE(review): grid, cell, cellcol and cellrow are not declared anywhere in
// the visible code. This looks like a leftover from a table/grid based
// parser -- confirm whether these lines are compiled at all.
755 else if (t.cat() == catAlign) {
757 //cerr << " column now " << cellcol << " max: " << grid.ncols() << "\n";
758 if (cellcol == grid.ncols()) {
759 //cerr << "adding column " << cellcol << "\n";
760 grid.addCol(cellcol - 1);
762 cell = &grid.cell(grid.index(cellrow, cellcol));
766 else if (t.character() == ']' && (flags & FLAG_BRACK_LAST)) {
767 //cerr << "finished reading option\n";
771 else if (t.cat() == catOther)
772 result << string(1, t.character());
// Comment: tokens are read up to the newline. This inner t hides the token
// t declared at the top of the function.
774 else if (t.cat() == catComment) {
777 Token const & t = getToken();
778 if (t.cat() == catNewline)
782 //result << wrap("comment", s);
790 else if (t.cs() == "lyxlock") {
// Macro definitions are copied to the preamble text: name, optional
// bracketed part, body.
794 else if (t.cs() == "newcommand" || t.cs() == "providecommand") {
795 string const name = parse_verbatim_item();
796 string const opts = getArg('[', ']');
797 string const body = parse_verbatim_item();
798 // only non-lyxspecific stuff
799 if (name != "noun" && name != "tabularnewline") {
800 h_preamble += "\\" + t.cs() + "{" + name + "}";
802 h_preamble += "[" + opts + "]";
803 h_preamble += "{" + body + "}\n";
// Inline and displayed formulas in backslash notation.
807 else if (t.cs() == "(")
808 result << wrap("simple", parse(FLAG_SIMPLE2, MATH_MODE));
810 else if (t.cs() == "[")
811 result << wrap("equation", parse(FLAG_EQUATION, MATH_MODE));
813 else if (t.cs() == "protect")
814 // ignore \\protect, will hopefully be re-added during output
// \end is only expected when this call was started for an environment
// (FLAG_END). The name must match the innermost open environment, which is
// then removed from the stack.
817 else if (t.cs() == "end") {
818 if (flags & FLAG_END) {
819 // eat environment name
820 string const name = getArg('{', '}');
821 if (name != active_environments.top())
822 error("\\end{" + name + "} does not match \\begin{"
823 + active_environments.top() + "}");
824 active_environments.pop();
827 error("found 'end' unexpectedly");
830 else if (t.cs() == ")") {
831 if (flags & FLAG_SIMPLE2)
833 error("found '\\)' unexpectedly");
836 else if (t.cs() == "]") {
837 if (flags & FLAG_EQUATION)
839 error("found '\\]' unexpectedly");
// NOTE(review): as in the alignment branch above, grid, cell, cellrow,
// cellcol and numbered are not declared in the visible code -- confirm
// whether these lines are compiled at all.
843 else if (t.cs() == "\\") {
844 grid.vcrskip(LyXLength(getArg('[', ']')), cellrow);
847 if (cellrow == grid.nrows())
848 grid.addRow(cellrow - 1);
849 if (grid.asHullstring())
850 grid.asHullstring()->numbered(cellrow, numbered);
851 cell = &grid.cell(grid.index(cellrow, cellcol));
// \documentclass: language and font size are picked out of the option list,
// the remaining options are kept in h_options, the class name in h_textclass.
854 else if (t.cs() == "documentclass") {
856 split(getArg('[', ']'), opts, ',');
857 handle_opt(opts, known_languages, h_language);
858 handle_opt(opts, known_fontsizes, h_paperfontsize);
859 h_options = join(opts, ',');
860 h_textclass = getArg('{', '}');
// \usepackage: a list of names without options is handled one package at a
// time.
863 else if (t.cs() == "usepackage") {
864 string const options = getArg('[', ']');
865 string const name = getArg('{', '}');
// NOTE(review): string::find() returns string::npos (a non-zero value) when
// there is no comma and 0 when the comma is the first character, so as a
// truth value this is true for names without a comma and false for a
// leading comma. Presumably `name.find(',') != string::npos` was meant.
866 if (options.empty() && name.find(',')) {
867 vector<string> vecnames;
868 split(name, vecnames, ',');
869 vector<string>::const_iterator it = vecnames.begin();
870 vector<string>::const_iterator end = vecnames.end();
871 for (; it != end; ++it) {
872 handle_package(trim(*it), string());
875 handle_package(name, options);
// \newenvironment: name, then the begin and the end definition. Nothing is
// written for the environment named "lyxcode".
879 else if (t.cs() == "newenvironment") {
880 string const name = getArg('{', '}');
882 string const begin = parse_verbatim_item();
884 string const end = parse_verbatim_item();
886 if (name != "lyxcode")
887 result << wrap("newenvironment", begin + end);
// \def: everything up to the opening brace of the body is collected, then
// the whole definition is written as ERT.
890 else if (t.cs() == "def") {
891 string const name = getToken().cs();
893 while (nextToken().cat() != catBegin)
894 res += getToken().asString();
895 handle_ert(result, "\\def" + res + '{' + parse_verbatim_item() + '}');
// \setcounter: the two counters with a header setting are stored there, any
// other counter is copied to the preamble text.
898 else if (t.cs() == "setcounter") {
899 string const name = getArg('{', '}');
900 string const content = getArg('{', '}');
901 if (name == "secnumdepth")
902 h_secnumdepth = content;
903 else if (name == "tocdepth")
904 h_tocdepth = content;
906 h_preamble += "\\setcounter{" + name + "}{" + content + "}\n";
// \setlength: the length name is read as a control sequence.
909 else if (t.cs() == "setlength") {
910 string const name = getToken().cs();
911 string const content = getArg('{', '}');
912 if (name == "parskip")
913 h_paragraph_separation = "skip";
// NOTE(review): this branch stores "skip" just like the parskip branch,
// while the default of h_paragraph_separation is "indent" -- confirm whether
// "indent" was meant here.
914 else if (name == "parindent")
915 h_paragraph_separation = "skip";
// NOTE(review): the command being handled is \setlength, but the text
// written to the preamble says \setcounter, and name has lost its leading
// backslash -- confirm; presumably "\\setlength{\\" + name + ... was meant.
917 h_preamble += "\\setcounter{" + name + "}{" + content + "}\n";
// \par starts a new paragraph in the layout named after the innermost open
// environment; nothing is written when no environment is open.
920 else if (t.cs() == "par") {
921 if (!active_environments.empty())
922 result << "\n\\layout " << active_environments.top() << "\n\n";
// Title, author and abstract become paragraphs of the layout with the same
// name; the argument is copied as it is.
925 else if (t.cs() == "title")
926 result << "\\layout Title\n\n" + parse_verbatim_item();
928 else if (t.cs() == "author")
929 result << "\\layout Author\n\n" + parse_verbatim_item();
931 else if (t.cs() == "abstract")
932 result << "\\layout Abstract\n\n" + parse_verbatim_item();
// \begin: remember the environment name and parse its contents up to the
// matching \end.
934 else if (t.cs() == "begin") {
935 string const name = getArg('{', '}');
936 active_environments.push(name);
937 result << parse(FLAG_END, mode);
// FLAG_LEAVE asks for the loop to be left after the current token; the bit
// is cleared first.
940 if (flags & FLAG_LEAVE) {
941 flags &= ~FLAG_LEAVE;
950 } // anonymous namespace
953 int main(int argc, char * argv[])
956 cerr << "Usage: " << argv[0] << " <infile.tex>" << endl;
961 ifstream is(argv[1]);
964 string s = p.parse();
965 cout << "# tex2lyx 0.0.2 created this file\n"
966 << "\\lyxformat 222\n"
967 << "\\textclass " << h_textclass << "\n"
968 << "\\begin_preamble\n" << h_preamble << "\\end_preamble\n"
969 << "\\options " << h_options << "\n"
970 << "\\language " << h_language << "\n"
971 << "\\inputencoding " << h_inputencoding << "\n"
972 << "\\fontscheme " << h_fontscheme << "\n"
973 << "\\graphics " << h_graphics << "\n"
974 << "\\paperfontsize " << h_paperfontsize << "\n"
975 << "\\spacing " << h_spacing << "\n"
976 << "\\papersize " << h_papersize << "\n"
977 << "\\paperpackage " << h_paperpackage << "\n"
978 << "\\use_geometry " << h_use_geometry << "\n"
979 << "\\use_amsmath " << h_use_amsmath << "\n"
980 << "\\use_natbib " << h_use_natbib << "\n"
981 << "\\use_numerical_citations " << h_use_numerical_citations << "\n"
982 << "\\paperorientation " << h_paperorientation << "\n"
983 << "\\secnumdepth " << h_secnumdepth << "\n"
984 << "\\tocdepth " << h_tocdepth << "\n"
985 << "\\paragraph_separation " << h_paragraph_separation << "\n"
986 << "\\defskip " << h_defskip << "\n"
987 << "\\quotes_language " << h_quotes_language << "\n"
988 << "\\quotes_times " << h_quotes_times << "\n"
989 << "\\papercolumns " << h_papercolumns << "\n"
990 << "\\papersides " << h_papersides << "\n"
991 << "\\paperpagestyle " << h_paperpagestyle << "\n"
992 << "\\tracking_changes " << h_tracking_changes << "\n"