From 84f1457439690a0c8694f063a08273c47cbf511b Mon Sep 17 00:00:00 2001 From: =?utf8?q?Andr=C3=A9=20P=C3=B6nitz?= Date: Wed, 12 Feb 2003 08:36:49 +0000 Subject: [PATCH] finish reader/writer split git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@6112 a592a061-630c-0410-9148-cb99ea01b6c8 --- src/tex2lyx/Makefile.am | 13 +- src/tex2lyx/tex2lyx.C | 574 +++++----------------------------------- src/tex2lyx/texparser.C | 36 +-- src/tex2lyx/texparser.h | 3 +- 4 files changed, 91 insertions(+), 535 deletions(-) diff --git a/src/tex2lyx/Makefile.am b/src/tex2lyx/Makefile.am index 403bdc380e..e604059055 100644 --- a/src/tex2lyx/Makefile.am +++ b/src/tex2lyx/Makefile.am @@ -2,6 +2,17 @@ include $(top_srcdir)/config/common.am INCLUDES = -I$(srcdir)/../ $(BOOST_INCLUDES) +noinst_LTLIBRARIES = libtexparser.la + +libtexparser_la_SOURCES = \ + texparser.C \ + texparser.h + +tex2lyx_LDADD = libtexparser.la + bin_PROGRAMS = tex2lyx -tex2lyx_SOURCES = tex2lyx.C +tex2lyx_SOURCES = \ + tex2lyx.C + texparser.C \ + texparser.h diff --git a/src/tex2lyx/tex2lyx.C b/src/tex2lyx/tex2lyx.C index 9e5d9a2f16..015573bbc3 100644 --- a/src/tex2lyx/tex2lyx.C +++ b/src/tex2lyx/tex2lyx.C @@ -13,7 +13,8 @@ #include #include -using std::atoi; +#include "texparser.h" + using std::cout; using std::cerr; using std::endl; @@ -151,6 +152,20 @@ void handle_ert(ostream & os, string const & s) } +void handle_par(ostream & os) +{ + if (active_environments.empty()) + return; + os << "\n\\layout "; + string s = active_environments.top(); + if (s == "document") + os << "Standard"; + else + os << s; + os << "\n\n"; +} + + void handle_package(string const & name, string const & options) { if (name == "a4wide") { @@ -195,458 +210,17 @@ string wrap(string const & cmd, string const & str, string const & str2) } -enum mode_type {UNDECIDED_MODE, TEXT_MODE, MATH_MODE}; - -mode_type asMode(mode_type oldmode, string const & str) -{ - if (str == "mathmode") - return MATH_MODE; - if (str == "textmode" || str == "forcetext") - return TEXT_MODE; - return oldmode; -} - - -// These are TeX's catcodes -enum CatCode { - catEscape, // 0 backslash - catBegin, // 1 { - catEnd, // 2 } - cat, // 3 $ - catAlign, // 4 & - catNewline, // 5 ^^M - catParameter, // 6 # - catSuper, // 7 ^ - catSub, // 8 _ - catIgnore, // 9 - catSpace, // 10 space - catLetter, // 11 a-zA-Z - catOther, // 12 none of the above - catActive, // 13 ~ - catComment, // 14 % - catInvalid // 15 -}; - -CatCode theCatcode[256]; - - -inline CatCode catcode(unsigned char c) -{ - return theCatcode[c]; -} - - -enum { - FLAG_BRACE_LAST = 1 << 1, // last closing brace ends the parsing - FLAG_RIGHT = 1 << 2, // next \\right ends the parsing process - FLAG_END = 1 << 3, // next \\end ends the parsing process - FLAG_BRACK_LAST = 1 << 4, // next closing bracket ends the parsing - FLAG_TEXTMODE = 1 << 5, // we are in a box - FLAG_ITEM = 1 << 6, // read a (possibly braced token) - FLAG_LEAVE = 1 << 7, // leave the loop at the end - FLAG_SIMPLE = 1 << 8, // next $ leaves the loop - FLAG_EQUATION = 1 << 9, // next \] leaves the loop - FLAG_SIMPLE2 = 1 << 10, // next \) leaves the loop - FLAG_OPTION = 1 << 11, // read [...] style option - FLAG_BRACED = 1 << 12 // read {...} style argument -}; - - -void catInit() -{ - fill(theCatcode, theCatcode + 256, catOther); - fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter); - fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter); - - theCatcode['\\'] = catEscape; - theCatcode['{'] = catBegin; - theCatcode['}'] = catEnd; - theCatcode['$'] = cat; - theCatcode['&'] = catAlign; - theCatcode['\n'] = catNewline; - theCatcode['#'] = catParameter; - theCatcode['^'] = catSuper; - theCatcode['_'] = catSub; - theCatcode[''] = catIgnore; - theCatcode[' '] = catSpace; - theCatcode['\t'] = catSpace; - theCatcode['\r'] = catNewline; - theCatcode['~'] = catActive; - theCatcode['%'] = catComment; -} - - - -// -// Helper class for parsing -// - -class Token { -public: - /// - Token() : cs_(), char_(0), cat_(catIgnore) {} - /// - Token(char c, CatCode cat) : cs_(), char_(c), cat_(cat) {} - /// - Token(string const & cs) : cs_(cs), char_(0), cat_(catIgnore) {} - - /// - string const & cs() const { return cs_; } - /// - CatCode cat() const { return cat_; } - /// - char character() const { return char_; } - /// - string asString() const { return cs_.size() ? cs_ : string(1, char_); } - -private: - /// - string cs_; - /// - char char_; - /// - CatCode cat_; -}; - -ostream & operator<<(ostream & os, Token const & t) -{ - if (t.cs().size()) - os << '\\' << t.cs(); - else - os << '[' << t.character() << ',' << t.cat() << ']'; - return os; -} - - -class Parser { - -public: - /// - Parser(istream & is); - - /// - string parse(); - /// - string parse(unsigned flags, mode_type mode); - /// - int lineno() const { return lineno_; } - /// - void putback(); - /// dump contents to screen - void dump() const; - -private: - /// - string getArg(char left, char right); - /// - char getChar(); - /// - void error(string const & msg); - /// - void tokenize(istream & is); - /// - void tokenize(string const & s); - /// - void skipSpaceTokens(istream & is, char c); - /// - void push_back(Token const & t); - /// - void pop_back(); - /// - Token const & prevToken() const; - /// - Token const & nextToken() const; - /// - Token const & getToken(); - /// skips spaces if any - void skipSpaces(); - /// - void lex(string const & s); - /// - bool good() const; - /// - string parse_verbatim_item(); - /// - string parse_verbatim_option(); - - /// - int lineno_; - /// - vector tokens_; - /// - unsigned pos_; -}; - - -Parser::Parser(istream & is) - : lineno_(0), pos_(0) -{ - tokenize(is); -} - - -void Parser::push_back(Token const & t) -{ - tokens_.push_back(t); -} - - -void Parser::pop_back() -{ - tokens_.pop_back(); -} - - -Token const & Parser::prevToken() const -{ - static const Token dummy; - return pos_ > 0 ? tokens_[pos_ - 1] : dummy; -} - - -Token const & Parser::nextToken() const -{ - static const Token dummy; - return good() ? tokens_[pos_] : dummy; -} - - -Token const & Parser::getToken() -{ - static const Token dummy; - //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n'; - return good() ? tokens_[pos_++] : dummy; -} - - -void Parser::skipSpaces() -{ - while (nextToken().cat() == catSpace || nextToken().cat() == catNewline) - getToken(); -} - - -void Parser::putback() -{ - --pos_; -} - - -bool Parser::good() const -{ - return pos_ < tokens_.size(); -} - - -char Parser::getChar() -{ - if (!good()) - error("The input stream is not well..."); - return tokens_[pos_++].character(); -} - - -string Parser::getArg(char left, char right) -{ - skipSpaces(); - - string result; - char c = getChar(); - - if (c != left) - putback(); - else - while ((c = getChar()) != right && good()) - result += c; - - return result; -} - - -void Parser::skipSpaceTokens(istream & is, char c) -{ - // skip trailing spaces - while (catcode(c) == catSpace || catcode(c) == catNewline) - if (!is.get(c)) - break; - //cerr << "putting back: " << c << "\n"; - is.putback(c); -} - - -void Parser::tokenize(istream & is) -{ - // eat everything up to the next \end_inset or end of stream - // and store it in s for further tokenization - string s; - char c; - while (is.get(c)) { - s += c; - if (s.size() >= 10 && s.substr(s.size() - 10) == "\\end_inset") { - s = s.substr(0, s.size() - 10); - break; - } - } - // Remove the space after \end_inset - if (is.get(c) && c != ' ') - is.unget(); - - // tokenize buffer - tokenize(s); -} - - -void Parser::tokenize(string const & buffer) -{ - static bool init_done = false; - - if (!init_done) { - catInit(); - init_done = true; - } - - istringstream is(buffer.c_str(), ios::in | ios::binary); - - char c; - while (is.get(c)) { - //cerr << "reading c: " << c << "\n"; - - switch (catcode(c)) { - case catNewline: { - ++lineno_; - is.get(c); - if (catcode(c) == catNewline) - push_back(Token("par")); - else { - push_back(Token('\n', catNewline)); - is.putback(c); - } - break; - } - -/* - case catComment: { - while (is.get(c) && catcode(c) != catNewline) - ; - ++lineno_; - break; - } -*/ - - case catEscape: { - is.get(c); - if (!is) { - error("unexpected end of input"); - } else { - string s(1, c); - if (catcode(c) == catLetter) { - // collect letters - while (is.get(c) && catcode(c) == catLetter) - s += c; - skipSpaceTokens(is, c); - } - push_back(Token(s)); - } - break; - } - - case catSuper: - case catSub: { - push_back(Token(c, catcode(c))); - is.get(c); - skipSpaceTokens(is, c); - break; - } - - case catIgnore: { - cerr << "ignoring a char: " << int(c) << "\n"; - break; - } - - default: - push_back(Token(c, catcode(c))); - } - } - -#ifdef FILEDEBUG - dump(); -#endif -} - - -void Parser::dump() const -{ - cerr << "\nTokens: "; - for (unsigned i = 0; i < tokens_.size(); ++i) { - if (i == pos_) - cerr << " <#> "; - cerr << tokens_[i]; - } - cerr << " pos: " << pos_ << "\n"; -} - - -void Parser::error(string const & msg) -{ - cerr << "Line ~" << lineno_ << ": parse error: " << msg << endl; - dump(); - //exit(1); -} - - -string Parser::parse() -{ - skipSpaces(); - return parse(0, UNDECIDED_MODE); -} - - -string Parser::parse_verbatim_option() -{ - string res; - if (nextToken().character() == '[') { - Token t = getToken(); - for (Token t = getToken(); t.character() != ']' && good(); t = getToken()) { - if (t.cat() == catBegin) { - putback(); - res += '{' + parse_verbatim_item() + '}'; - } else - res += t.asString(); - } - } - return res; -} - - -string Parser::parse_verbatim_item() -{ - string res; - if (nextToken().cat() == catBegin) { - Token t = getToken(); - for (Token t = getToken(); t.cat() != catEnd && good(); t = getToken()) { - if (t.cat() == catBegin) { - putback(); - res += '{' + parse_verbatim_item() + '}'; - } - else - res += t.asString(); - } - } - return res; -} - - -string Parser::parse(unsigned flags, mode_type mode) +string parse(Parser & p, unsigned flags, mode_type mode) { //int limits = 0; ostringstream result; - while (good()) { - Token const & t = getToken(); + while (p.good()) { + Token const & t = p.getToken(); #ifdef FILEDEBUG cerr << "t: " << t << " flags: " << flags << "\n"; - cell->dump(); + //cell->dump(); cerr << "\n"; #endif @@ -672,7 +246,7 @@ string Parser::parse(unsigned flags, mode_type mode) continue; if (t.cat() != catBegin) { - error("opening brace expected"); + p.error("opening brace expected"); return result.str(); } @@ -684,10 +258,10 @@ string Parser::parse(unsigned flags, mode_type mode) if (flags & FLAG_OPTION) { if (t.cat() == catOther && t.character() == '[') { - result << parse(FLAG_BRACK_LAST, mode); + result << parse(p, FLAG_BRACK_LAST, mode); } else { // no option found, put back token and we are done - putback(); + p.putback(); } return result.str(); } @@ -695,18 +269,18 @@ string Parser::parse(unsigned flags, mode_type mode) // // cat codes // - if (t.cat() == cat) { + if (t.cat() == catMath) { if (mode != MATH_MODE) { // we are inside some text mode thingy, so opening new math is allowed - Token const & n = getToken(); - if (n.cat() == cat) { + Token const & n = p.getToken(); + if (n.cat() == catMath) { // TeX's $$...$$ syntax for displayed math - result << wrap("equation", parse(FLAG_SIMPLE, MATH_MODE)); - getToken(); // skip the second '$' token + result << wrap("equation", parse(p, FLAG_SIMPLE, MATH_MODE)); + p.getToken(); // skip the second '$' token } else { // simple $...$ stuff - putback(); - result << wrap("simple", parse(FLAG_SIMPLE, MATH_MODE)); + p.putback(); + result << wrap("simple", parse(p, FLAG_SIMPLE, MATH_MODE)); } } @@ -716,7 +290,7 @@ string Parser::parse(unsigned flags, mode_type mode) } else { - error("something strange in the parser\n"); + p.error("something strange in the parser\n"); break; } } @@ -733,7 +307,7 @@ string Parser::parse(unsigned flags, mode_type mode) result << t.character(); else if (t.cat() == catParameter) { - Token const & n = getToken(); + Token const & n = p.getToken(); result << wrap("macroarg", string(1, n.character())); } @@ -741,12 +315,12 @@ string Parser::parse(unsigned flags, mode_type mode) result << wrap("active", string(1, t.character())); else if (t.cat() == catBegin) - result << wrap("braced", parse(FLAG_BRACE_LAST, mode)); + result << wrap("braced", parse(p, FLAG_BRACE_LAST, mode)); else if (t.cat() == catEnd) { if (flags & FLAG_BRACE_LAST) return result.str(); - error("found '}' unexpectedly"); + p.error("found '}' unexpectedly"); //lyx::Assert(0); //add(cell, '}', LM_TC_TEX); } @@ -773,14 +347,14 @@ string Parser::parse(unsigned flags, mode_type mode) else if (t.cat() == catComment) { string s; - while (good()) { - Token const & t = getToken(); + while (p.good()) { + Token const & t = p.getToken(); if (t.cat() == catNewline) break; s += t.asString(); } //result << wrap("comment", s); - skipSpaces(); + p.skipSpaces(); } // @@ -792,9 +366,9 @@ string Parser::parse(unsigned flags, mode_type mode) } else if (t.cs() == "newcommand" || t.cs() == "providecommand") { - string const name = parse_verbatim_item(); - string const opts = getArg('[', ']'); - string const body = parse_verbatim_item(); + string const name = p.verbatimItem(); + string const opts = p.getArg('[', ']'); + string const body = p.verbatimItem(); // only non-lyxspecific stuff if (name != "noun" && name != "tabularnewline") { h_preamble += "\\" + t.cs() + "{" + name + "}"; @@ -805,10 +379,10 @@ string Parser::parse(unsigned flags, mode_type mode) } else if (t.cs() == "(") - result << wrap("simple", parse(FLAG_SIMPLE2, MATH_MODE)); + result << wrap("simple", parse(p, FLAG_SIMPLE2, MATH_MODE)); else if (t.cs() == "[") - result << wrap("equation", parse(FLAG_EQUATION, MATH_MODE)); + result << wrap("equation", parse(p, FLAG_EQUATION, MATH_MODE)); else if (t.cs() == "protect") // ignore \\protect, will hopefully be re-added during output @@ -817,26 +391,26 @@ string Parser::parse(unsigned flags, mode_type mode) else if (t.cs() == "end") { if (flags & FLAG_END) { // eat environment name - string const name = getArg('{', '}'); + string const name = p.getArg('{', '}'); if (name != active_environments.top()) - error("\\end{" + name + "} does not match \\begin{" + p.error("\\end{" + name + "} does not match \\begin{" + active_environments.top() + "}"); active_environments.pop(); return result.str(); } - error("found 'end' unexpectedly"); + p.error("found 'end' unexpectedly"); } else if (t.cs() == ")") { if (flags & FLAG_SIMPLE2) return result.str(); - error("found '\\)' unexpectedly"); + p.error("found '\\)' unexpectedly"); } else if (t.cs() == "]") { if (flags & FLAG_EQUATION) return result.str(); - error("found '\\]' unexpectedly"); + p.error("found '\\]' unexpectedly"); } /* @@ -853,16 +427,16 @@ string Parser::parse(unsigned flags, mode_type mode) */ else if (t.cs() == "documentclass") { vector opts; - split(getArg('[', ']'), opts, ','); + split(p.getArg('[', ']'), opts, ','); handle_opt(opts, known_languages, h_language); handle_opt(opts, known_fontsizes, h_paperfontsize); h_options = join(opts, ','); - h_textclass = getArg('{', '}'); + h_textclass = p.getArg('{', '}'); } else if (t.cs() == "usepackage") { - string const options = getArg('[', ']'); - string const name = getArg('{', '}'); + string const options = p.getArg('[', ']'); + string const name = p.getArg('{', '}'); if (options.empty() && name.find(',')) { vector vecnames; split(name, vecnames, ','); @@ -877,27 +451,27 @@ string Parser::parse(unsigned flags, mode_type mode) } else if (t.cs() == "newenvironment") { - string const name = getArg('{', '}'); - skipSpaces(); - string const begin = parse_verbatim_item(); - skipSpaces(); - string const end = parse_verbatim_item(); + string const name = p.getArg('{', '}'); + p.skipSpaces(); + string const begin = p.verbatimItem(); + p.skipSpaces(); + string const end = p.verbatimItem(); // ignore out mess if (name != "lyxcode") result << wrap("newenvironment", begin + end); } else if (t.cs() == "def") { - string const name = getToken().cs(); + string const name = p.getToken().cs(); string res; - while (nextToken().cat() != catBegin) - res += getToken().asString(); - handle_ert(result, "\\def" + res + '{' + parse_verbatim_item() + '}'); + while (p.nextToken().cat() != catBegin) + res += p.getToken().asString(); + handle_ert(result, "\\def" + res + '{' + p.verbatimItem() + '}'); } else if (t.cs() == "setcounter") { - string const name = getArg('{', '}'); - string const content = getArg('{', '}'); + string const name = p.getArg('{', '}'); + string const content = p.getArg('{', '}'); if (name == "secnumdepth") h_secnumdepth = content; else if (name == "tocdepth") @@ -907,8 +481,8 @@ string Parser::parse(unsigned flags, mode_type mode) } else if (t.cs() == "setlength") { - string const name = getToken().cs(); - string const content = getArg('{', '}'); + string const name = p.getToken().cs(); + string const content = p.getArg('{', '}'); if (name == "parskip") h_paragraph_separation = "skip"; else if (name == "parindent") @@ -917,24 +491,22 @@ string Parser::parse(unsigned flags, mode_type mode) h_preamble += "\\setcounter{" + name + "}{" + content + "}\n"; } - else if (t.cs() == "par") { - if (!active_environments.empty()) - result << "\n\\layout " << active_environments.top() << "\n\n"; - } + else if (t.cs() == "par") + handle_par(result); else if (t.cs() == "title") - result << "\\layout Title\n\n" + parse_verbatim_item(); + result << "\\layout Title\n\n" + p.verbatimItem(); else if (t.cs() == "author") - result << "\\layout Author\n\n" + parse_verbatim_item(); + result << "\\layout Author\n\n" + p.verbatimItem(); else if (t.cs() == "abstract") - result << "\\layout Abstract\n\n" + parse_verbatim_item(); + result << "\\layout Abstract\n\n" + p.verbatimItem(); else if (t.cs() == "begin") { - string const name = getArg('{', '}'); + string const name = p.getArg('{', '}'); active_environments.push(name); - result << parse(FLAG_END, mode); + result << parse(p, FLAG_END, mode); } if (flags & FLAG_LEAVE) { @@ -957,11 +529,9 @@ int main(int argc, char * argv[]) return 2; } - string t; ifstream is(argv[1]); Parser p(is); - //p.dump(); - string s = p.parse(); + string s = parse(p, 0, UNDECIDED_MODE); cout << "# tex2lyx 0.0.2 created this file\n" << "\\lyxformat 222\n" << "\\textclass " << h_textclass << "\n" diff --git a/src/tex2lyx/texparser.C b/src/tex2lyx/texparser.C index a638b93890..b22ee50d3c 100644 --- a/src/tex2lyx/texparser.C +++ b/src/tex2lyx/texparser.C @@ -1,4 +1,6 @@ -#include "parser.h" + +#include "texparser.h" +#include "Lsstream.h" using std::cerr; using std::endl; @@ -8,6 +10,7 @@ using std::istream; using std::istringstream; using std::ostream; using std::string; +using std::vector; // @@ -42,7 +45,7 @@ void catInit() theCatcode['\\'] = catEscape; theCatcode['{'] = catBegin; theCatcode['}'] = catEnd; - theCatcode['$'] = cat; + theCatcode['$'] = catMath; theCatcode['&'] = catAlign; theCatcode['\n'] = catNewline; theCatcode['#'] = catParameter; @@ -172,29 +175,8 @@ void Parser::skipSpaceTokens(istream & is, char c) } -void Parser::tokenize(istream & is) -{ - // eat everything up to the next \end_inset or end of stream - // and store it in s for further tokenization - string s; - char c; - while (is.get(c)) { - s += c; - if (s.size() >= 10 && s.substr(s.size() - 10) == "\\end_inset") { - s = s.substr(0, s.size() - 10); - break; - } - } - // Remove the space after \end_inset - if (is.get(c) && c != ' ') - is.unget(); - // tokenize buffer - tokenize(s); -} - - -void Parser::tokenize(string const & buffer) +void Parser::tokenize(istream & is) { static bool init_done = false; @@ -203,8 +185,6 @@ void Parser::tokenize(string const & buffer) init_done = true; } - istringstream is(buffer.c_str(), ios::in | ios::binary); - char c; while (is.get(c)) { //cerr << "reading c: " << c << "\n"; @@ -265,10 +245,6 @@ void Parser::tokenize(string const & buffer) push_back(Token(c, catcode(c))); } } - -#ifdef FILEDEBUG - dump(); -#endif } diff --git a/src/tex2lyx/texparser.h b/src/tex2lyx/texparser.h index 99ccd5db9b..d550e9d320 100644 --- a/src/tex2lyx/texparser.h +++ b/src/tex2lyx/texparser.h @@ -3,6 +3,7 @@ #define PARSER_H #include "LString.h" +#include enum mode_type {UNDECIDED_MODE, TEXT_MODE, MATH_MODE}; @@ -110,8 +111,6 @@ public: /// void tokenize(istream & is); /// - void tokenize(string const & s); - /// void skipSpaceTokens(istream & is, char c); /// void push_back(Token const & t); -- 2.39.2