2 /** The .tex to .lyx converter
3 \author André Pönitz (2003)
26 using std::istringstream;
28 using std::ostringstream;
// Delimiter characters that wrap() places around every marker it builds.
36 char const OPEN = '<';
37 char const CLOSE = '>';
// Class options that handle_opt() may move into h_language.
// The trailing 0 terminates the list so it can be walked without a length.
const char * known_languages[] = {
	"austrian", "babel", "bahasa", "basque", "breton", "bulgarian",
	"catalan", "croatian", "czech", "danish", "dutch", "english",
	"esperanto", "estonian", "finnish", "francais", "frenchb", "galician",
	"germanb", "greek", "hebcal", "hebfont", "hebrew", "hebrew_newcode",
	"hebrew_oldcode", "hebrew_p", "hyphen", "icelandic", "irish", "italian",
	"latin", "lgrcmr", "lgrcmro", "lgrcmss", "lgrcmtt", "lgrenc",
	"lgrlcmss", "lgrlcmtt", "lheclas", "lhecmr", "lhecmss", "lhecmtt",
	"lhecrml", "lheenc", "lhefr", "lheredis", "lheshold", "lheshscr",
	"lheshstk", "lsorbian", "magyar", "naustrian", "ngermanb", "ngerman",
	"norsk", "polish", "portuges", "rlbabel", "romanian", "russianb",
	"samin", "scottish", "serbian", "slovak", "slovene", "spanish",
	"swedish", "turkish", "ukraineb", "usorbian", "welsh",
	0
};
// Class options that handle_opt() may move into h_paperfontsize.
// Null-terminated like known_languages.
const char * known_fontsizes[] = {
	"10pt",
	"11pt",
	"12pt",
	0
};
// Values for the header lines that main() prints in front of the converted
// document. "FIXME" is a placeholder; the parsing code overwrites some of
// these (for example h_textclass, h_language and h_paperfontsize).

// document class and its options
string h_textclass("FIXME");
string h_options("FIXME");
string h_language("FIXME");
string h_inputencoding("FIXME");
string h_fontscheme("FIXME");
string h_graphics("default");

// paper and font size
string h_paperfontsize("FIXME");
string h_spacing("single");
string h_papersize("FIXME");
string h_paperpackage("FIXME");

// package switches
string h_use_geometry("0");
string h_use_amsmath("0");
string h_use_natbib("0");
string h_use_numerical_citations("0");

// layout
string h_paperorientation("portrait");
string h_secnumdepth("3");
string h_tocdepth("3");
string h_paragraph_separation("indent");
string h_defskip("medskip");
string h_quotes_language("2");
string h_quotes_times("1");
string h_papercolumns("1");
string h_papersides("1");
string h_paperpagestyle("default");
string h_tracking_changes("0");
// File-level parser state.
83 // indicates whether we are in the preamble
84 bool in_preamble = true;
86 // current stack of nested environments
// Parser::parse() pushes the name read after \begin and pops it again when
// the matching \end is handled.
87 stack<string> active_environments;
// Cuts s into pieces at every delim; the pieces are read with getline().
// NOTE(review): the statements that set up the stream and append each piece
// to result are not part of this excerpt.
91 void split(string const & s, vector<string> & result, char delim)
95 while (getline(is, t, delim))
// Walks all items of input by index. Presumably the inverse of split(),
// gluing the items together with delim between them -- the loop body is not
// visible here, confirm.
100 string join(vector<string> const & input, char delim)
103 for (size_t i = 0; i != input.size(); ++i) {
/// Moves the first known option found in \p opts into \p target.
///
/// \param opts    option list (e.g. from \documentclass[...]); a matched
///                option is removed from it.
/// \param what    null-terminated array of option names to look for, tried
///                in array order.
/// \param target  receives the matched name; left untouched without a match.
void handle_opt(vector<string> & opts, char const ** what, string & target)
{
	if (!what)
		return;
	// Stop at the terminating null entry. Testing the array pointer itself
	// is always true and runs past the end of the list when nothing matches.
	for ( ; *what; ++what) {
		vector<string>::iterator it = find(opts.begin(), opts.end(), *what);
		if (it != opts.end()) {
			//cerr << "### found option '" << *what << "'\n";
			target = *what;
			opts.erase(it);
			return;
		}
	}
}
// Writes the opening and closing lines of a collapsed ERT inset to os.
// NOTE(review): the statement between them, which presumably emits s, is not
// part of this excerpt.
126 void handle_ert(ostream & os, string const & s)
128 os << "\n\\begin_inset ERT\nstatus Collapsed\n\n\\layout Standard\n\n";
130 os << "\n\\end_inset\n";
134 string wrap(string const & cmd, string const & str)
136 return OPEN + cmd + ' ' + str + CLOSE;
140 string wrap(string const & cmd, string const & str, string const & str2)
142 return OPEN + cmd + ' ' + str + ' ' + str2 + CLOSE;
/// Parsing context passed to Parser::parse(); parse() starts out with
/// UNDECIDED_MODE and switches to MATH_MODE inside formulas.
enum mode_type {
	UNDECIDED_MODE,
	TEXT_MODE,
	MATH_MODE
};
// Translates a mode name into a mode_type. "mathmode" is tested first, then
// "textmode" / "forcetext".
// NOTE(review): the return statements, including what happens to oldmode for
// any other name, are not part of this excerpt.
148 mode_type asMode(mode_type oldmode, string const & str)
150 if (str == "mathmode")
152 if (str == "textmode" || str == "forcetext")
// Enumerators of the category-code type; the number in each trailing comment
// is the corresponding TeX catcode.
// NOTE(review): the enum header and the enumerators for codes 1-9, 13 and 14
// are not part of this excerpt.
158 // These are TeX's catcodes
160 catEscape, // 0 backslash
170 catSpace, // 10 space
171 catLetter, // 11 a-zA-Z
172 catOther, // 12 none of the above
175 catInvalid // 15 <delete>
// Category code for each of the 256 possible unsigned char values; read
// through catcode().
178 CatCode theCatcode[256];
// Returns the category code of c. Taking an unsigned char keeps the table
// index within 0..255.
181 inline CatCode catcode(unsigned char c)
183 return theCatcode[c];
// Bit flags for the `flags` argument of Parser::parse(). Every flag is a
// distinct bit, so several can be combined with |.
// NOTE(review): the enum header and any flag with value 1 << 0 are not part
// of this excerpt.
188 FLAG_BRACE_LAST = 1 << 1, // last closing brace ends the parsing
189 FLAG_RIGHT = 1 << 2, // next \\right ends the parsing process
190 FLAG_END = 1 << 3, // next \\end ends the parsing process
191 FLAG_BRACK_LAST = 1 << 4, // next closing bracket ends the parsing
192 FLAG_TEXTMODE = 1 << 5, // we are in a box
193 FLAG_ITEM = 1 << 6, // read a (possibly braced) token
194 FLAG_LEAVE = 1 << 7, // leave the loop at the end
195 FLAG_SIMPLE = 1 << 8, // next $ leaves the loop
196 FLAG_EQUATION = 1 << 9, // next \] leaves the loop
197 FLAG_SIMPLE2 = 1 << 10, // next \) leaves the loop
198 FLAG_OPTION = 1 << 11, // read [...] style option
199 FLAG_BRACED = 1 << 12 // read {...} style argument
205 fill(theCatcode, theCatcode + 256, catOther);
206 fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
207 fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
209 theCatcode['\\'] = catEscape;
210 theCatcode['{'] = catBegin;
211 theCatcode['}'] = catEnd;
212 theCatcode['$'] = cat;
213 theCatcode['&'] = catAlign;
214 theCatcode['\n'] = catNewline;
215 theCatcode['#'] = catParameter;
216 theCatcode['^'] = catSuper;
217 theCatcode['_'] = catSub;
218 theCatcode['
\7f'] = catIgnore;
219 theCatcode[' '] = catSpace;
220 theCatcode['\t'] = catSpace;
221 theCatcode['\r'] = catNewline;
222 theCatcode['~'] = catActive;
223 theCatcode['%'] = catComment;
// A token is either a control sequence (name held in cs_) or a single
// character together with its category code.
229 // Helper class for parsing
// Empty token: no control sequence, character 0, category catIgnore.
235 Token() : cs_(), char_(0), cat_(catIgnore) {}
// Character token with an explicit category code.
237 Token(char c, CatCode cat) : cs_(), char_(c), cat_(cat) {}
// Control-sequence token; its character is 0 and its category catIgnore.
239 Token(string const & cs) : cs_(cs), char_(0), cat_(catIgnore) {}
// Control-sequence name; empty for character tokens.
242 string const & cs() const { return cs_; }
244 CatCode cat() const { return cat_; }
246 char character() const { return char_; }
// The control-sequence name if there is one, otherwise the character as a
// one-character string.
248 string asString() const { return cs_.size() ? cs_ : string(1, char_); }
// Debug output of a token: either a backslash followed by the
// control-sequence name, or "[character,category]".
// NOTE(review): the condition selecting between the two forms is not part of
// this excerpt.
259 ostream & operator<<(ostream & os, Token const & t)
262 os << '\\' << t.cs();
264 os << '[' << t.character() << ',' << t.cat() << ']';
// Member declarations of Parser (the class header and several members are
// not part of this excerpt). The parser holds the tokenized input in
// tokens_ and reads it through a position index.
273 Parser(istream & is);
// Parses tokens according to the FLAG_* bits and the current mode.
278 string parse(unsigned flags, mode_type mode);
280 int lineno() const { return lineno_; }
283 /// dump contents to screen
// Called with the delimiter pairs '[' ']' and '{' '}' by parse().
288 string getArg(char left, char right);
// Prints a parse error message to cerr.
292 void error(string const & msg);
294 void tokenize(istream & is);
296 void tokenize(string const & s);
298 void skipSpaceTokens(istream & is, char c);
// Appends a token to tokens_.
300 void push_back(Token const & t);
// Token before the read position.
304 Token const & prevToken() const;
// Token at the read position, without consuming it.
306 Token const & nextToken() const;
// Token at the read position; advances past it.
308 Token const & getToken();
309 /// skips spaces if any
312 void lex(string const & s);
316 string parse_verbatim_item();
318 string parse_verbatim_option();
// The tokenized input.
323 vector<Token> tokens_;
// Starts with line number 0 and the read position at the first token.
// NOTE(review): the constructor body (presumably tokenizing is) is not part
// of this excerpt.
329 Parser::Parser(istream & is)
330 : lineno_(0), pos_(0)
// Appends t to the end of the token list.
336 void Parser::push_back(Token const & t)
338 tokens_.push_back(t);
// NOTE(review): body not part of this excerpt; by its name the counterpart of
// push_back() -- confirm.
342 void Parser::pop_back()
// Returns the token just before the read position, or a default-constructed
// token when the position is still at the start.
348 Token const & Parser::prevToken() const
350 static const Token dummy;
351 return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
// Peeks at the token at the read position without consuming it; returns a
// default-constructed token once the input is exhausted.
355 Token const & Parser::nextToken() const
357 static const Token dummy;
358 return good() ? tokens_[pos_] : dummy;
// Returns the token at the read position and advances past it; returns a
// default-constructed token (without advancing) once the input is exhausted.
362 Token const & Parser::getToken()
364 static const Token dummy;
365 //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
366 return good() ? tokens_[pos_++] : dummy;
// Loops for as long as the upcoming token is a space or a newline.
// NOTE(review): the loop body (presumably consuming that token) is not part
// of this excerpt.
370 void Parser::skipSpaces()
372 while (nextToken().cat() == catSpace || nextToken().cat() == catNewline)
// NOTE(review): body not part of this excerpt; presumably steps the read
// position back by one token -- confirm.
377 void Parser::putback()
// True while the read position is still inside the token list, i.e. while
// unread tokens remain.
383 bool Parser::good() const
385 return pos_ < tokens_.size();
// Returns the character of the token at the read position and advances.
// NOTE(review): the condition guarding the error() call is not part of this
// excerpt. If it is "input exhausted" and error() returns, the indexing on
// the last line would read past the end of tokens_ -- confirm.
389 char Parser::getChar()
392 error("The input stream is not well...");
393 return tokens_[pos_++].character();
// Reads characters with getChar() until `right` is seen or the input runs
// out. Callers use it for [...] and {...} arguments.
// NOTE(review): the handling of `left` and the accumulation of the result
// are not part of this excerpt.
397 string Parser::getArg(char left, char right)
407 while ((c = getChar()) != right && good())
// Loops while c is a space or newline character; called by tokenize() after
// a control sequence has been read.
// NOTE(review): the loop body and the put-back of the first non-space
// character are not part of this excerpt.
414 void Parser::skipSpaceTokens(istream & is, char c)
416 // skip trailing spaces
417 while (catcode(c) == catSpace || catcode(c) == catNewline)
420 //cerr << "putting back: " << c << "\n";
// Stream overload: collects input into a string s and strips a trailing
// "\end_inset" from it.
// NOTE(review): the read loop and the hand-over of s to the string overload
// are not part of this excerpt.
425 void Parser::tokenize(istream & is)
427 // eat everything up to the next \end_inset or end of stream
428 // and store it in s for further tokenization
// 10 is the length of the literal "\end_inset".
433 if (s.size() >= 10 && s.substr(s.size() - 10) == "\\end_inset") {
434 s = s.substr(0, s.size() - 10);
438 // Remove the space after \end_inset
439 if (is.get(c) && c != ' ')
// String overload: reads buffer character by character and appends one
// Token per lexical unit, dispatching on the character's category code.
// NOTE(review): the case labels of the switch and several statements are not
// part of this excerpt; the notes below describe only the visible lines.
447 void Parser::tokenize(string const & buffer)
// One-time initialisation guard.
449 static bool init_done = false;
// NOTE(review): c_str() ends the input at the first embedded NUL byte.
456 istringstream is(buffer.c_str(), ios::in | ios::binary);
460 //cerr << "reading c: " << c << "\n";
462 switch (catcode(c)) {
// A newline followed by another newline becomes a \par token; otherwise a
// plain newline token is stored.
466 if (catcode(c) == catNewline)
467 push_back(Token("par"));
469 push_back(Token('\n', catNewline));
// Reads on up to the next newline character.
477 while (is.get(c) && catcode(c) != catNewline)
487 error("unexpected end of input");
// A run of letters is gathered as one control-sequence name.
490 if (catcode(c) == catLetter) {
492 while (is.get(c) && catcode(c) == catLetter)
494 skipSpaceTokens(is, c);
503 push_back(Token(c, catcode(c)));
505 skipSpaceTokens(is, c);
510 cerr << "ignoring a char: " << int(c) << "\n";
// Every other character becomes a character token with its own category.
515 push_back(Token(c, catcode(c)));
// Debug aid: walks all tokens and writes them, followed by the current read
// position, to cerr.
// NOTE(review): the loop body is not part of this excerpt.
525 void Parser::dump() const
527 cerr << "\nTokens: ";
528 for (unsigned i = 0; i < tokens_.size(); ++i) {
533 cerr << " pos: " << pos_ << "\n";
// Reports a parse error with the approximate line number on cerr.
// NOTE(review): whether anything else happens afterwards (e.g. aborting) is
// not visible in this excerpt.
537 void Parser::error(string const & msg)
539 cerr << "Line ~" << lineno_ << ": parse error: " << msg << endl;
// Convenience entry point: parses with no flags set and the mode undecided.
545 string Parser::parse()
548 return parse(0, UNDECIDED_MODE);
// Reads a [...] option if the next token is '[' and collects its content
// into res; nested {...} groups are read with parse_verbatim_item() and put
// back inside braces.
// NOTE(review): the handling of all other tokens and the return statement
// are not part of this excerpt.
552 string Parser::parse_verbatim_option()
555 if (nextToken().character() == '[') {
// This first getToken() only consumes the '['; the loop variable below
// shadows it.
556 Token t = getToken();
557 for (Token t = getToken(); t.character() != ']' && good(); t = getToken()) {
558 if (t.cat() == catBegin) {
560 res += '{' + parse_verbatim_item() + '}';
// Reads a {...} group if the next token opens one and collects its content
// into res; nested groups recurse and are put back inside braces.
// NOTE(review): the handling of all other tokens, the non-braced case and
// the return statement are not part of this excerpt.
569 string Parser::parse_verbatim_item()
572 if (nextToken().cat() == catBegin) {
// This first getToken() only consumes the opening brace; the loop variable
// below shadows it.
573 Token t = getToken();
574 for (Token t = getToken(); t.cat() != catEnd && good(); t = getToken()) {
575 if (t.cat() == catBegin) {
577 res += '{' + parse_verbatim_item() + '}';
// Main translation routine: consumes tokens and appends the converted text
// to `result`. `flags` (FLAG_* bits) decides what ends this invocation,
// `mode` tells whether we are inside math.
// NOTE(review): the enclosing loop and several statements are not part of
// this excerpt; comments describe only the visible lines.
587 string Parser::parse(unsigned flags, mode_type mode)
591 ostringstream result;
593 Token const & t = getToken();
596 cerr << "t: " << t << " flags: " << flags << "\n";
// FLAG_ITEM: read one item, which is either a single token or a braced group.
601 if (flags & FLAG_ITEM) {
602 if (t.cat() == catSpace)
606 if (t.cat() == catBegin) {
607 // skip the brace and collect everything to the next matching
609 flags |= FLAG_BRACE_LAST;
613 // handle only this single token, leave the loop if done
// FLAG_BRACED: a {...} argument is mandatory.
618 if (flags & FLAG_BRACED) {
619 if (t.cat() == catSpace)
622 if (t.cat() != catBegin) {
623 error("opening brace expected");
627 // skip the brace and collect everything to the next matching
// NOTE(review): this assigns (dropping all other flags) where the FLAG_ITEM
// case above uses |= -- confirm that is intended.
629 flags = FLAG_BRACE_LAST;
// FLAG_OPTION: read a [...] option if one follows.
633 if (flags & FLAG_OPTION) {
634 if (t.cat() == catOther && t.character() == '[') {
635 result << parse(FLAG_BRACK_LAST, mode);
637 // no option found, put back token and we are done
// Handling of '$': opens inline ($...$) or displayed ($$...$$) math when not
// already in math mode, and ends the formula when FLAG_SIMPLE is set.
// NOTE(review): `cat` names nothing visible in this scope; it looks like a
// truncated enumerator for the '$' category (probably catMath) -- confirm
// against the CatCode enum.
646 if (t.cat() == cat) {
647 if (mode != MATH_MODE) {
648 // we are inside some text mode thingy, so opening new math is allowed
649 Token const & n = getToken();
650 if (n.cat() == cat) {
651 // TeX's $$...$$ syntax for displayed math
652 result << wrap("equation", parse(FLAG_SIMPLE, MATH_MODE));
653 getToken(); // skip the second '$' token
655 // simple $...$ stuff
657 result << wrap("simple", parse(FLAG_SIMPLE, MATH_MODE));
661 else if (flags & FLAG_SIMPLE) {
662 // this is the end of the formula
667 error("something strange in the parser\n");
// Dispatch on the category code of character tokens.
// Letters are copied through unchanged.
672 else if (t.cat() == catLetter)
673 result << t.character();
// Spaces and newlines are copied only outside math mode.
675 else if (t.cat() == catSpace && mode != MATH_MODE) {
676 //if (result.empty() || result[result.size() - 1] != ' ')
677 result << t.character();
680 else if (t.cat() == catNewline && mode != MATH_MODE)
681 result << t.character();
// '#': the following token's character becomes a "macroarg" marker.
683 else if (t.cat() == catParameter) {
684 Token const & n = getToken();
685 result << wrap("macroarg", string(1, n.character()));
688 else if (t.cat() == catActive)
689 result << wrap("active", string(1, t.character()));
// '{' starts a nested parse that runs up to the matching '}'.
691 else if (t.cat() == catBegin)
692 result << wrap("braced", parse(FLAG_BRACE_LAST, mode));
// '}' is only legal when this invocation was started for a braced group.
694 else if (t.cat() == catEnd) {
695 if (flags & FLAG_BRACE_LAST)
697 error("found '}' unexpectedly");
699 //add(cell, '}', LM_TC_TEX);
// NOTE(review): cellcol, cellrow, grid and cell have no declaration in this
// excerpt -- confirm they exist in this function.
703 else if (t.cat() == catAlign) {
705 //cerr << " column now " << cellcol << " max: " << grid.ncols() << "\n";
706 if (cellcol == grid.ncols()) {
707 //cerr << "adding column " << cellcol << "\n";
708 grid.addCol(cellcol - 1);
710 cell = &grid.cell(grid.index(cellrow, cellcol));
// ']' closes an option when this invocation is reading one.
714 else if (t.character() == ']' && (flags & FLAG_BRACK_LAST)) {
715 //cerr << "finished reading option\n";
719 else if (t.cat() == catOther)
720 result << string(1, t.character());
// '%' comment: tokens are read up to a newline. The inner `t` shadows the
// token being dispatched on.
722 else if (t.cat() == catComment) {
725 Token const & t = getToken();
726 if (t.cat() == catNewline)
730 //result << wrap("comment", s);
// Dispatch on control-sequence names (t.cs() carries no leading backslash).
738 else if (t.cs() == "lyxlock") {
// \newcommand / \providecommand: name, optional [...] and body are read
// verbatim and, unless the name is "noun" or "tabularnewline", copied to
// the preamble.
742 else if (t.cs() == "newcommand" || t.cs() == "providecommand") {
743 string const name = parse_verbatim_item();
744 string const opts = getArg('[', ']');
745 string const body = parse_verbatim_item();
746 // only non-lyxspecific stuff
747 if (name != "noun" && name != "tabularnewline") {
748 h_preamble += "\\" + t.cs() + "{" + name + "}";
750 h_preamble += "[" + opts + "]";
751 h_preamble += "{" + body + "}\n";
// \( ... \) is inline math, \[ ... \] is displayed math.
755 else if (t.cs() == "(")
756 result << wrap("simple", parse(FLAG_SIMPLE2, MATH_MODE));
758 else if (t.cs() == "[")
759 result << wrap("equation", parse(FLAG_EQUATION, MATH_MODE));
761 else if (t.cs() == "protect")
762 // ignore \\protect, will hopefully be re-added during output
// \end is only legal when this invocation was started by a \begin
// (FLAG_END); its name must match the innermost open environment.
765 else if (t.cs() == "end") {
766 if (flags & FLAG_END) {
767 // eat environment name
768 string const name = getArg('{', '}');
769 if (name != active_environments.top())
770 error("\\end{" + name + "} does not match \\begin{"
771 + active_environments.top() + "}");
772 active_environments.pop();
775 error("found 'end' unexpectedly");
// \) and \] are only legal as the end of the formula being parsed.
778 else if (t.cs() == ")") {
779 if (flags & FLAG_SIMPLE2)
781 error("found '\\)' unexpectedly");
784 else if (t.cs() == "]") {
785 if (flags & FLAG_EQUATION)
787 error("found '\\]' unexpectedly");
// NOTE(review): grid, cell, cellrow, cellcol, numbered and LyXLength have no
// declaration in this excerpt -- confirm they exist in this function.
791 else if (t.cs() == "\\") {
792 grid.vcrskip(LyXLength(getArg('[', ']')), cellrow);
795 if (cellrow == grid.nrows())
796 grid.addRow(cellrow - 1);
797 if (grid.asHullstring())
798 grid.asHullstring()->numbered(cellrow, numbered);
799 cell = &grid.cell(grid.index(cellrow, cellcol));
// Preamble commands: they update the h_* header values or are copied to
// h_preamble.
// \documentclass[opts]{class}: language and font size are pulled out of the
// option list; the remaining options and the class name are stored.
802 else if (t.cs() == "documentclass") {
804 split(getArg('[', ']'), opts, ',');
805 handle_opt(opts, known_languages, h_language);
806 handle_opt(opts, known_fontsizes, h_paperfontsize);
807 h_options = join(opts, ',');
808 h_textclass = getArg('{', '}');
// \usepackage[options]{name}: a list of known packages gets special
// treatment (mostly not visible in this excerpt); the remaining ones are
// copied to the preamble with or without their options.
811 else if (t.cs() == "usepackage") {
812 string const options = getArg('[', ']');
813 string const name = getArg('{', '}');
814 if (name == "a4wide") {
816 h_paperpackage = "widemarginsa4";
817 } else if (name == "ae")
819 else if (name == "aecompl")
821 else if (name == "amsmath")
823 else if (name == "amssymb")
825 else if (name == "babel")
827 else if (name == "fontenc")
829 else if (name == "inputenc")
830 h_inputencoding = options;
831 else if (name == "makeidx")
833 else if (name == "verbatim")
837 h_preamble += "\\usepackage[" + options + "]{" + name + "}\n";
839 h_preamble += "\\usepackage{" + name + "}\n";
// \newenvironment: begin and end code are read verbatim; everything except
// "lyxcode" is emitted as a "newenvironment" marker.
843 else if (t.cs() == "newenvironment") {
844 string const name = getArg('{', '}');
846 string const begin = parse_verbatim_item();
848 string const end = parse_verbatim_item();
850 if (name != "lyxcode")
851 result << wrap("newenvironment", begin + end);
// \def: everything up to the opening brace is collected into res and the
// definition is emitted as ERT.
// NOTE(review): `name` is not used on any visible line; confirm that res is
// initialised from it.
854 else if (t.cs() == "def") {
855 string const name = getToken().cs();
857 while (nextToken().cat() != catBegin)
858 res += getToken().asString();
859 handle_ert(result, "\\def" + res + '{' + parse_verbatim_item() + '}');
// \setcounter{name}{value}: secnumdepth and tocdepth go into the header
// values, any other counter is copied to the preamble.
862 else if (t.cs() == "setcounter") {
863 string const name = getArg('{', '}');
864 string const content = getArg('{', '}');
865 if (name == "secnumdepth")
866 h_secnumdepth = content;
867 else if (name == "tocdepth")
868 h_tocdepth = content;
870 h_preamble += "\\setcounter{" + name + "}{" + content + "}\n";
873 else if (t.cs() == "setlength") {
874 string const name = getToken().cs();
875 string const content = getArg('{', '}');
876 if (name == "parskip")
877 h_paragraph_separation = "skip";
878 else if (name == "parindent")
879 h_paragraph_separation = "skip";
881 h_preamble += "\\setcounter{" + name + "}{" + content + "}\n";
// \par starts a new paragraph in the layout named after the innermost open
// environment; nothing is written when no environment is open.
884 else if (t.cs() == "par") {
885 if (!active_environments.empty())
886 result << "\n\\layout " << active_environments.top() << "\n\n";
// \title, \author and \abstract each start a layout of the same name,
// followed by their argument read verbatim.
889 else if (t.cs() == "title")
890 result << "\\layout Title\n\n" + parse_verbatim_item();
892 else if (t.cs() == "author")
893 result << "\\layout Author\n\n" + parse_verbatim_item();
895 else if (t.cs() == "abstract")
896 result << "\\layout Abstract\n\n" + parse_verbatim_item();
// \begin{name}: remember the environment and parse its content with a nested
// invocation that runs up to the matching \end.
898 else if (t.cs() == "begin") {
899 string const name = getArg('{', '}');
900 active_environments.push(name);
901 result << parse(FLAG_END, mode);
// FLAG_LEAVE is cleared once it has been acted upon.
// NOTE(review): the statement that actually leaves the loop is not part of
// this excerpt.
904 if (flags & FLAG_LEAVE) {
905 flags &= ~FLAG_LEAVE;
914 } // anonymous namespace
917 int main(int argc, char * argv[])
920 cerr << "Usage: " << argv[0] << " <infile.tex>" << endl;
925 ifstream is(argv[1]);
928 string s = p.parse();
929 cout << "# tex2lyx 0.0.2 created this file\n"
930 << "\\lyxformat 222\n"
931 << "\\textclass " << h_textclass << "\n"
932 << "\\begin_preamble\n" << h_preamble << "\\end_preamble\n"
933 << "\\options " << h_options << "\n"
934 << "\\language " << h_language << "\n"
935 << "\\inputencoding " << h_inputencoding << "\n"
936 << "\\fontscheme " << h_fontscheme << "\n"
937 << "\\graphics " << h_graphics << "\n"
938 << "\\paperfontsize " << h_paperfontsize << "\n"
939 << "\\spacing " << h_spacing << "\n"
940 << "\\papersize " << h_papersize << "\n"
941 << "\\paperpackage " << h_paperpackage << "\n"
942 << "\\use_geometry " << h_use_geometry << "\n"
943 << "\\use_amsmath " << h_use_amsmath << "\n"
944 << "\\use_natbib " << h_use_natbib << "\n"
945 << "\\use_numerical_citations " << h_use_numerical_citations << "\n"
946 << "\\paperorientation " << h_paperorientation << "\n"
947 << "\\secnumdepth " << h_secnumdepth << "\n"
948 << "\\tocdepth " << h_tocdepth << "\n"
949 << "\\paragraph_separation " << h_paragraph_separation << "\n"
950 << "\\defskip " << h_defskip << "\n"
951 << "\\quotes_language " << h_quotes_language << "\n"
952 << "\\quotes_times " << h_quotes_times << "\n"
953 << "\\papercolumns " << h_papercolumns << "\n"
954 << "\\papersides " << h_papersides << "\n"
955 << "\\paperpagestyle " << h_paperpagestyle << "\n"
956 << "\\tracking_changes " << h_tracking_changes << "\n"