/** The .tex to .lyx converter \author André Pönitz (2003) */

// NOTE(review): the header names of the #include directives below are missing
// from this copy of the file -- they have to be restored before it can build.
#include #include #include #include #include #include #include #include #include

using std::atoi;
using std::cout;
using std::cerr;
using std::endl;
using std::fill;
using std::getline;
using std::ios;
using std::istream;
using std::ostream;
using std::vector;

namespace {

// Delimiters that wrap() places around the constructs it emits.
char const OPEN = '<';
char const CLOSE = '>';

// Class options that the \documentclass handler moves into h_language instead
// of leaving them in the generic option list. The list ends with a 0 entry.
const char * known_languages[] = { "austrian", "babel", "bahasa", "basque",
"breton", "bulgarian", "catalan", "croatian", "czech", "danish", "dutch",
"english", "esperanto", "estonian", "finnish", "francais", "frenchb",
"galician", "germanb", "greek", "hebcal", "hebfont", "hebrew",
"hebrew_newcode", "hebrew_oldcode", "hebrew_p", "hyphen", "icelandic",
"irish", "italian", "latin", "lgrcmr", "lgrcmro", "lgrcmss", "lgrcmtt",
"lgrenc", "lgrlcmss", "lgrlcmtt", "lheclas", "lhecmr", "lhecmss", "lhecmtt",
"lhecrml", "lheenc", "lhefr", "lheredis", "lheshold", "lheshscr", "lheshstk",
"lsorbian", "magyar", "naustrian", "ngermanb", "ngerman", "norsk", "polish",
"portuges", "rlbabel", "romanian", "russianb", "samin", "scottish", "serbian",
"slovak", "slovene", "spanish", "swedish", "turkish", "ukraineb", "usorbian",
"welsh", 0};

// Class options that the \documentclass handler moves into h_paperfontsize.
// The list ends with a 0 entry.
const char * known_fontsizes[] = { "10pt", "11pt", "12pt", 0 };

// some ugly stuff
//
// Values of the LyX file header, one variable per header line. main() prints
// them in front of the converted body; the parser overwrites some of them
// while it reads the LaTeX preamble. "FIXME" marks a value without a usable
// default.
string h_preamble;
string h_textclass = "FIXME";
string h_options = "FIXME";
string h_language = "FIXME";
string h_inputencoding = "FIXME";
string h_fontscheme = "FIXME";
string h_graphics = "default";
string h_paperfontsize = "FIXME";
string h_spacing = "single";
string h_papersize = "FIXME";
string h_paperpackage = "FIXME";
string h_use_geometry = "0";
string h_use_amsmath = "0";
string h_use_natbib = "0";
string h_use_numerical_citations = "0";
string h_paperorientation = "portrait";
string h_secnumdepth = "3";
string h_tocdepth = "3";
string h_paragraph_separation = "indent";
string h_defskip = "medskip";
string h_quotes_language = "2";
string h_quotes_times = "1";
string h_papercolumns = "1";
string h_papersides = "1"; string h_paperpagestyle = "default"; string h_tracking_changes = "0"; // indicates whether we are in the preamble bool in_preamble = true; // current stack of nested environments stack active_environments; void split(string const & s, vector & result, char delim) { istringstream is(s); string t; while (getline(is, t, delim)) result.push_back(t); } string join(vector const & input, char delim) { ostringstream os; for (size_t i = 0; i != input.size(); ++i) { if (i) os << delim; os << input[i]; } return os.str(); } void handle_opt(vector & opts, char const ** what, string & target) { for ( ; what; ++what) { vector::iterator it = find(opts.begin(), opts.end(), *what); if (it != opts.end()) { //cerr << "### found option '" << *what << "'\n"; target = *what; opts.erase(it); return; } } } void handle_ert(ostream & os, string const & s) { os << "\n\\begin_inset ERT\nstatus Collapsed\n\n\\layout Standard\n\n"; os << s; os << "\n\\end_inset\n"; } string wrap(string const & cmd, string const & str) { return OPEN + cmd + ' ' + str + CLOSE; } string wrap(string const & cmd, string const & str, string const & str2) { return OPEN + cmd + ' ' + str + ' ' + str2 + CLOSE; } enum mode_type {UNDECIDED_MODE, TEXT_MODE, MATH_MODE}; mode_type asMode(mode_type oldmode, string const & str) { if (str == "mathmode") return MATH_MODE; if (str == "textmode" || str == "forcetext") return TEXT_MODE; return oldmode; } bool stared(string const & s) { string::size_type const n = s.size(); return n && s[n - 1] == '*'; } // These are TeX's catcodes enum CatCode { catEscape, // 0 backslash catBegin, // 1 { catEnd, // 2 } cat, // 3 $ catAlign, // 4 & catNewline, // 5 ^^M catParameter, // 6 # catSuper, // 7 ^ catSub, // 8 _ catIgnore, // 9 catSpace, // 10 space catLetter, // 11 a-zA-Z catOther, // 12 none of the above catActive, // 13 ~ catComment, // 14 % catInvalid // 15 }; CatCode theCatcode[256]; inline CatCode catcode(unsigned char c) { return theCatcode[c]; } enum { 
FLAG_BRACE_LAST = 1 << 1, // last closing brace ends the parsing FLAG_RIGHT = 1 << 2, // next \\right ends the parsing process FLAG_END = 1 << 3, // next \\end ends the parsing process FLAG_BRACK_LAST = 1 << 4, // next closing bracket ends the parsing FLAG_TEXTMODE = 1 << 5, // we are in a box FLAG_ITEM = 1 << 6, // read a (possibly braced token) FLAG_LEAVE = 1 << 7, // leave the loop at the end FLAG_SIMPLE = 1 << 8, // next $ leaves the loop FLAG_EQUATION = 1 << 9, // next \] leaves the loop FLAG_SIMPLE2 = 1 << 10, // next \) leaves the loop FLAG_OPTION = 1 << 11, // read [...] style option FLAG_BRACED = 1 << 12 // read {...} style argument }; void catInit() { fill(theCatcode, theCatcode + 256, catOther); fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter); fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter); theCatcode['\\'] = catEscape; theCatcode['{'] = catBegin; theCatcode['}'] = catEnd; theCatcode['$'] = cat; theCatcode['&'] = catAlign; theCatcode['\n'] = catNewline; theCatcode['#'] = catParameter; theCatcode['^'] = catSuper; theCatcode['_'] = catSub; theCatcode[''] = catIgnore; theCatcode[' '] = catSpace; theCatcode['\t'] = catSpace; theCatcode['\r'] = catNewline; theCatcode['~'] = catActive; theCatcode['%'] = catComment; } // // Helper class for parsing // class Token { public: /// Token() : cs_(), char_(0), cat_(catIgnore) {} /// Token(char c, CatCode cat) : cs_(), char_(c), cat_(cat) {} /// Token(string const & cs) : cs_(cs), char_(0), cat_(catIgnore) {} /// string const & cs() const { return cs_; } /// CatCode cat() const { return cat_; } /// char character() const { return char_; } /// string asString() const { return cs_.size() ? 
cs_ : string(1, char_); } private: /// string cs_; /// char char_; /// CatCode cat_; }; ostream & operator<<(ostream & os, Token const & t) { if (t.cs().size()) os << '\\' << t.cs(); else os << '[' << t.character() << ',' << t.cat() << ']'; return os; } class Parser { public: /// Parser(istream & is); /// string parse(); /// string parse(unsigned flags, mode_type mode); /// int lineno() const { return lineno_; } /// void putback(); /// dump contents to screen void dump() const; private: /// string getArg(char left, char right); /// char getChar(); /// void error(string const & msg); /// void tokenize(istream & is); /// void tokenize(string const & s); /// void skipSpaceTokens(istream & is, char c); /// void push_back(Token const & t); /// void pop_back(); /// Token const & prevToken() const; /// Token const & nextToken() const; /// Token const & getToken(); /// skips spaces if any void skipSpaces(); /// void lex(string const & s); /// bool good() const; /// string parse_verbatim_item(); /// string parse_verbatim_option(); /// int lineno_; /// vector tokens_; /// unsigned pos_; }; Parser::Parser(istream & is) : lineno_(0), pos_(0) { tokenize(is); } void Parser::push_back(Token const & t) { tokens_.push_back(t); } void Parser::pop_back() { tokens_.pop_back(); } Token const & Parser::prevToken() const { static const Token dummy; return pos_ > 0 ? tokens_[pos_ - 1] : dummy; } Token const & Parser::nextToken() const { static const Token dummy; return good() ? tokens_[pos_] : dummy; } Token const & Parser::getToken() { static const Token dummy; //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n'; return good() ? 
tokens_[pos_++] : dummy; } void Parser::skipSpaces() { while (nextToken().cat() == catSpace || nextToken().cat() == catNewline) getToken(); } void Parser::putback() { --pos_; } bool Parser::good() const { return pos_ < tokens_.size(); } char Parser::getChar() { if (!good()) error("The input stream is not well..."); return tokens_[pos_++].character(); } string Parser::getArg(char left, char right) { skipSpaces(); string result; char c = getChar(); if (c != left) putback(); else while ((c = getChar()) != right && good()) result += c; return result; } void Parser::skipSpaceTokens(istream & is, char c) { // skip trailing spaces while (catcode(c) == catSpace || catcode(c) == catNewline) if (!is.get(c)) break; //cerr << "putting back: " << c << "\n"; is.putback(c); } void Parser::tokenize(istream & is) { // eat everything up to the next \end_inset or end of stream // and store it in s for further tokenization string s; char c; while (is.get(c)) { s += c; if (s.size() >= 10 && s.substr(s.size() - 10) == "\\end_inset") { s = s.substr(0, s.size() - 10); break; } } // Remove the space after \end_inset if (is.get(c) && c != ' ') is.unget(); // tokenize buffer tokenize(s); } void Parser::tokenize(string const & buffer) { static bool init_done = false; if (!init_done) { catInit(); init_done = true; } istringstream is(buffer.c_str(), ios::in | ios::binary); char c; while (is.get(c)) { //cerr << "reading c: " << c << "\n"; switch (catcode(c)) { case catNewline: { ++lineno_; is.get(c); if (catcode(c) == catNewline) push_back(Token("par")); else { push_back(Token('\n', catNewline)); is.putback(c); } break; } /* case catComment: { while (is.get(c) && catcode(c) != catNewline) ; ++lineno_; break; } */ case catEscape: { is.get(c); if (!is) { error("unexpected end of input"); } else { string s(1, c); if (catcode(c) == catLetter) { // collect letters while (is.get(c) && catcode(c) == catLetter) s += c; skipSpaceTokens(is, c); } push_back(Token(s)); } break; } case catSuper: case catSub: 
{ push_back(Token(c, catcode(c))); is.get(c); skipSpaceTokens(is, c); break; } case catIgnore: { cerr << "ignoring a char: " << int(c) << "\n"; break; } default: push_back(Token(c, catcode(c))); } } #ifdef FILEDEBUG dump(); #endif } void Parser::dump() const { cerr << "\nTokens: "; for (unsigned i = 0; i < tokens_.size(); ++i) { if (i == pos_) cerr << " <#> "; cerr << tokens_[i]; } cerr << " pos: " << pos_ << "\n"; } void Parser::error(string const & msg) { cerr << "Line ~" << lineno_ << ": parse error: " << msg << endl; dump(); //exit(1); } string Parser::parse() { skipSpaces(); return parse(0, UNDECIDED_MODE); } string Parser::parse_verbatim_option() { string res; if (nextToken().character() == '[') { Token t = getToken(); for (Token t = getToken(); t.character() != ']' && good(); t = getToken()) { if (t.cat() == catBegin) { putback(); res += '{' + parse_verbatim_item() + '}'; } else res += t.asString(); } } return res; } string Parser::parse_verbatim_item() { string res; if (nextToken().cat() == catBegin) { Token t = getToken(); for (Token t = getToken(); t.cat() != catEnd && good(); t = getToken()) { if (t.cat() == catBegin) { putback(); res += '{' + parse_verbatim_item() + '}'; } else res += t.asString(); } } return res; } string Parser::parse(unsigned flags, mode_type mode) { //int limits = 0; ostringstream result; while (good()) { Token const & t = getToken(); #ifdef FILEDEBUG cerr << "t: " << t << " flags: " << flags << "\n"; cell->dump(); cerr << "\n"; #endif if (flags & FLAG_ITEM) { if (t.cat() == catSpace) continue; flags &= ~FLAG_ITEM; if (t.cat() == catBegin) { // skip the brace and collect everything to the next matching // closing brace flags |= FLAG_BRACE_LAST; continue; } // handle only this single token, leave the loop if done flags |= FLAG_LEAVE; } if (flags & FLAG_BRACED) { if (t.cat() == catSpace) continue; if (t.cat() != catBegin) { error("opening brace expected"); return result.str(); } // skip the brace and collect everything to the next 
matching // closing brace flags = FLAG_BRACE_LAST; } if (flags & FLAG_OPTION) { if (t.cat() == catOther && t.character() == '[') { result << parse(FLAG_BRACK_LAST, mode); } else { // no option found, put back token and we are done putback(); } return result.str(); } // // cat codes // if (t.cat() == cat) { if (mode != MATH_MODE) { // we are inside some text mode thingy, so opening new math is allowed Token const & n = getToken(); if (n.cat() == cat) { // TeX's $$...$$ syntax for displayed math result << wrap("equation", parse(FLAG_SIMPLE, MATH_MODE)); getToken(); // skip the second '$' token } else { // simple $...$ stuff putback(); result << wrap("simple", parse(FLAG_SIMPLE, MATH_MODE)); } } else if (flags & FLAG_SIMPLE) { // this is the end of the formula return result.str(); } else { error("something strange in the parser\n"); break; } } else if (t.cat() == catLetter) result << t.character(); else if (t.cat() == catSpace && mode != MATH_MODE) { //if (result.empty() || result[result.size() - 1] != ' ') result << t.character(); } else if (t.cat() == catNewline && mode != MATH_MODE) result << t.character(); else if (t.cat() == catParameter) { Token const & n = getToken(); result << wrap("macroarg", string(1, n.character())); } else if (t.cat() == catActive) result << wrap("active", string(1, t.character())); else if (t.cat() == catBegin) result << wrap("braced", parse(FLAG_BRACE_LAST, mode)); else if (t.cat() == catEnd) { if (flags & FLAG_BRACE_LAST) return result.str(); error("found '}' unexpectedly"); //lyx::Assert(0); //add(cell, '}', LM_TC_TEX); } /* else if (t.cat() == catAlign) { ++cellcol; //cerr << " column now " << cellcol << " max: " << grid.ncols() << "\n"; if (cellcol == grid.ncols()) { //cerr << "adding column " << cellcol << "\n"; grid.addCol(cellcol - 1); } cell = &grid.cell(grid.index(cellrow, cellcol)); } */ else if (t.character() == ']' && (flags & FLAG_BRACK_LAST)) { //cerr << "finished reading option\n"; return result.str(); } else if (t.cat() 
== catOther) result << string(1, t.character()); else if (t.cat() == catComment) { string s; while (good()) { Token const & t = getToken(); if (t.cat() == catNewline) break; s += t.asString(); } //result << wrap("comment", s); skipSpaces(); } // // control sequences // else if (t.cs() == "lyxlock") { // ignored; } else if (t.cs() == "newcommand" || t.cs() == "providecommand") { string const name = parse_verbatim_item(); string const opts = getArg('[', ']'); string const body = parse_verbatim_item(); // only non-lyxspecific stuff if (name != "noun" && name != "tabularnewline") { h_preamble += "\\" + t.cs() + "{" + name + "}"; if (opts.size()) h_preamble += "[" + opts + "]"; h_preamble += "{" + body + "}\n"; } } else if (t.cs() == "(") result << wrap("simple", parse(FLAG_SIMPLE2, MATH_MODE)); else if (t.cs() == "[") result << wrap("equation", parse(FLAG_EQUATION, MATH_MODE)); else if (t.cs() == "protect") // ignore \\protect, will hopefully be re-added during output ; else if (t.cs() == "end") { if (flags & FLAG_END) { // eat environment name string const name = getArg('{', '}'); if (name != active_environments.top()) error("\\end{" + name + "} does not match \\begin{" + active_environments.top() + "}"); active_environments.pop(); return result.str(); } error("found 'end' unexpectedly"); } else if (t.cs() == ")") { if (flags & FLAG_SIMPLE2) return result.str(); error("found '\\)' unexpectedly"); } else if (t.cs() == "]") { if (flags & FLAG_EQUATION) return result.str(); error("found '\\]' unexpectedly"); } /* else if (t.cs() == "\\") { grid.vcrskip(LyXLength(getArg('[', ']')), cellrow); ++cellrow; cellcol = 0; if (cellrow == grid.nrows()) grid.addRow(cellrow - 1); if (grid.asHullstring()) grid.asHullstring()->numbered(cellrow, numbered); cell = &grid.cell(grid.index(cellrow, cellcol)); } */ else if (t.cs() == "documentclass") { vector opts; split(getArg('[', ']'), opts, ','); handle_opt(opts, known_languages, h_language); handle_opt(opts, known_fontsizes, 
h_paperfontsize); h_options = join(opts, ','); h_textclass = getArg('{', '}'); } else if (t.cs() == "usepackage") { string const options = getArg('[', ']'); string const name = getArg('{', '}'); if (name == "a4wide") { h_papersize = "a4"; h_paperpackage = "widemarginsa4"; } else if (name == "ae") h_fontscheme = "ae"; else if (name == "aecompl") h_fontscheme = "ae"; else if (name == "amsmath") h_use_amsmath = "1"; else if (name == "amssymb") h_use_amsmath = "1"; else if (name == "babel") ; // ignore this else if (name == "fontenc") ; // ignore this else if (name == "inputenc") h_inputencoding = options; else if (name == "makeidx") ; // ignore this else if (name == "verbatim") ; // ignore this else { if (options.size()) h_preamble += "\\usepackage[" + options + "]{" + name + "}\n"; else h_preamble += "\\usepackage{" + name + "}\n"; } } else if (t.cs() == "newenvironment") { string const name = getArg('{', '}'); skipSpaces(); string const begin = parse_verbatim_item(); skipSpaces(); string const end = parse_verbatim_item(); // ignore out mess if (name != "lyxcode") result << wrap("newenvironment", begin + end); } else if (t.cs() == "def") { string const name = getToken().cs(); string res; while (nextToken().cat() != catBegin) res += getToken().asString(); handle_ert(result, "\\def" + res + '{' + parse_verbatim_item() + '}'); } else if (t.cs() == "setcounter") { string const name = getArg('{', '}'); string const content = getArg('{', '}'); if (name == "secnumdepth") h_secnumdepth = content; else if (name == "tocdepth") h_tocdepth = content; else h_preamble += "\\setcounter{" + name + "}{" + content + "}\n"; } else if (t.cs() == "setlength") { string const name = getToken().cs(); string const content = getArg('{', '}'); if (name == "parskip") h_paragraph_separation = "skip"; else if (name == "parindent") h_paragraph_separation = "skip"; else h_preamble += "\\setcounter{" + name + "}{" + content + "}\n"; } else if (t.cs() == "par") { if (!active_environments.empty()) 
result << "\n\\layout " << active_environments.top() << "\n\n"; } else if (t.cs() == "title") result << "\\layout Title\n\n" + parse_verbatim_item(); else if (t.cs() == "author") result << "\\layout Author\n\n" + parse_verbatim_item(); else if (t.cs() == "abstract") result << "\\layout Abstract\n\n" + parse_verbatim_item(); else if (t.cs() == "begin") { string const name = getArg('{', '}'); active_environments.push(name); result << parse(FLAG_END, mode); } if (flags & FLAG_LEAVE) { flags &= ~FLAG_LEAVE; break; } } return result.str(); } } // anonymous namespace int main(int argc, char * argv[]) { if (argc <= 1) { cerr << "Usage: " << argv[0] << " " << endl; return 2; } string t; ifstream is(argv[1]); Parser p(is); //p.dump(); string s = p.parse(); cout << "# tex2lyx 0.0.2 created this file\n" << "\\lyxformat 222\n" << "\\textclass " << h_textclass << "\n" << "\\begin_preamble\n" << h_preamble << "\\end_preamble\n" << "\\options " << h_options << "\n" << "\\language " << h_language << "\n" << "\\inputencoding " << h_inputencoding << "\n" << "\\fontscheme " << h_fontscheme << "\n" << "\\graphics " << h_graphics << "\n" << "\\paperfontsize " << h_paperfontsize << "\n" << "\\spacing " << h_spacing << "\n" << "\\papersize " << h_papersize << "\n" << "\\paperpackage " << h_paperpackage << "\n" << "\\use_geometry " << h_use_geometry << "\n" << "\\use_amsmath " << h_use_amsmath << "\n" << "\\use_natbib " << h_use_natbib << "\n" << "\\use_numerical_citations " << h_use_numerical_citations << "\n" << "\\paperorientation " << h_paperorientation << "\n" << "\\secnumdepth " << h_secnumdepth << "\n" << "\\tocdepth " << h_tocdepth << "\n" << "\\paragraph_separation " << h_paragraph_separation << "\n" << "\\defskip " << h_defskip << "\n" << "\\quotes_language " << h_quotes_language << "\n" << "\\quotes_times " << h_quotes_times << "\n" << "\\papercolumns " << h_papercolumns << "\n" << "\\papersides " << h_papersides << "\n" << "\\paperpagestyle " << h_paperpagestyle << "\n" << 
"\\tracking_changes " << h_tracking_changes << "\n" << s << "\n" << "\\the_end"; return 0; }