X-Git-Url: https://git.lyx.org/gitweb/?a=blobdiff_plain;f=src%2Ftex2lyx%2FParser.cpp;h=41ab92063c031a50b2a496d67dd75de963f039a6;hb=298730215c21735f16e7278a5d5a4469fb0b9859;hp=323667f1157cf4c9072900161d7d7262707d7d20;hpb=9abb7db46800e554f57e865a3e768602ffd9d6f1;p=lyx.git diff --git a/src/tex2lyx/Parser.cpp b/src/tex2lyx/Parser.cpp index 323667f115..41ab92063c 100644 --- a/src/tex2lyx/Parser.cpp +++ b/src/tex2lyx/Parser.cpp @@ -3,59 +3,33 @@ * This file is part of LyX, the document processor. * Licence details can be found in the file COPYING. * - * \author André Pönitz + * \author André Pönitz * * Full author contact details are available in file CREDITS. */ #include +#include "Encoding.h" #include "Parser.h" +#include "support/lstrings.h" +#include "support/textutils.h" #include -#include using namespace std; +using namespace lyx::support; namespace lyx { namespace { -CatCode theCatcode[256]; - -void catInit() -{ - fill(theCatcode, theCatcode + 256, catOther); - fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter); - fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter); - - theCatcode[int('\\')] = catEscape; - theCatcode[int('{')] = catBegin; - theCatcode[int('}')] = catEnd; - theCatcode[int('$')] = catMath; - theCatcode[int('&')] = catAlign; - theCatcode[int('\n')] = catNewline; - theCatcode[int('#')] = catParameter; - theCatcode[int('^')] = catSuper; - theCatcode[int('_')] = catSub; - theCatcode[0x7f] = catIgnore; - theCatcode[int(' ')] = catSpace; - theCatcode[int('\t')] = catSpace; - theCatcode[int('\r')] = catNewline; - theCatcode[int('~')] = catActive; - theCatcode[int('%')] = catComment; - - // This is wrong! - theCatcode[int('@')] = catLetter; -} - - /*! * Translate a line ending to '\n'. * \p c must have catcode catNewline, and it must be the last character read * from \p is. */ -char getNewline(istream & is, char c) +char_type getNewline(iparserdocstream & is, char_type c) { // we have to handle 3 different line endings: // - UNIX (\n) @@ -63,9 +37,10 @@ char getNewline(istream & is, char c) // - DOS (\r\n) if (c == '\r') { // MAC or DOS - if (is.get(c) && c != '\n') { + char_type wc; + if (is.get(wc) && wc != '\n') { // MAC - is.putback(c); + is.putback(wc); } return '\n'; } @@ -75,18 +50,6 @@ char getNewline(istream & is, char c) } - -// -// catcodes -// - -CatCode catcode(unsigned char c) -{ - return theCatcode[c]; -} - - - // // Token // @@ -100,28 +63,88 @@ ostream & operator<<(ostream & os, Token const & t) else if (t.cat() == catEscape) os << '\\' << t.cs() << ' '; else if (t.cat() == catLetter) - os << t.character(); + os << t.cs(); else if (t.cat() == catNewline) os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n"; else - os << '[' << t.character() << ',' << t.cat() << ']'; + os << '[' << t.cs() << ',' << t.cat() << ']'; return os; } -string Token::asString() const +string Token::asInput() const { - return cs_.size() ? cs_ : string(1, char_); + if (cat_ == catComment) + return '%' + cs_ + '\n'; + if (cat_ == catEscape) + return '\\' + cs_; + return cs_; } -string Token::asInput() const +bool Token::isAlnumASCII() const { - if (cat_ == catComment) - return '%' + cs_ + '\n'; - if (cat_ == catSpace || cat_ == catNewline) - return cs_; - return char_ ? string(1, char_) : '\\' + cs_; + return cat_ == catLetter || + (cat_ == catOther && cs_.length() == 1 && isDigitASCII(cs_[0])); +} + + +#ifdef FILEDEBUG +void debugToken(std::ostream & os, Token const & t, unsigned int flags) +{ + char sep = ' '; + os << "t: " << t << " flags: " << flags; + if (flags & FLAG_BRACE_LAST) { os << sep << "BRACE_LAST"; sep = '|'; } + if (flags & FLAG_RIGHT ) { os << sep << "RIGHT" ; sep = '|'; } + if (flags & FLAG_END ) { os << sep << "END" ; sep = '|'; } + if (flags & FLAG_BRACK_LAST) { os << sep << "BRACK_LAST"; sep = '|'; } + if (flags & FLAG_TEXTMODE ) { os << sep << "TEXTMODE" ; sep = '|'; } + if (flags & FLAG_ITEM ) { os << sep << "ITEM" ; sep = '|'; } + if (flags & FLAG_LEAVE ) { os << sep << "LEAVE" ; sep = '|'; } + if (flags & FLAG_SIMPLE ) { os << sep << "SIMPLE" ; sep = '|'; } + if (flags & FLAG_EQUATION ) { os << sep << "EQUATION" ; sep = '|'; } + if (flags & FLAG_SIMPLE2 ) { os << sep << "SIMPLE2" ; sep = '|'; } + if (flags & FLAG_OPTION ) { os << sep << "OPTION" ; sep = '|'; } + if (flags & FLAG_BRACED ) { os << sep << "BRACED" ; sep = '|'; } + if (flags & FLAG_CELL ) { os << sep << "CELL" ; sep = '|'; } + if (flags & FLAG_TABBING ) { os << sep << "TABBING" ; sep = '|'; } + os << "\n"; +} +#endif + + +// +// Wrapper +// + +void iparserdocstream::setEncoding(std::string const & e) +{ + is_ << lyx::setEncoding(e); +} + + +void iparserdocstream::putback(char_type c) +{ + s_ = c + s_; +} + + +void iparserdocstream::putback(docstring s) +{ + s_ = s + s_; +} + + +iparserdocstream & iparserdocstream::get(char_type &c) +{ + if (s_.empty()) + is_.get(c); + else { + //cerr << "unparsed: " << to_utf8(s_) <iconvName()); } -Token const & Parser::prev_token() const +void Parser::catInit() +{ + if (curr_cat_ == theCatcodesType_) + return; + curr_cat_ = theCatcodesType_; + + fill(theCatcode_, theCatcode_ + 256, catOther); + fill(theCatcode_ + 'a', theCatcode_ + 'z' + 1, catLetter); + fill(theCatcode_ + 'A', theCatcode_ + 'Z' + 1, catLetter); + // This is wrong! + theCatcode_[int('@')] = catLetter; + + if (theCatcodesType_ == NORMAL_CATCODES) { + theCatcode_[int('\\')] = catEscape; + theCatcode_[int('{')] = catBegin; + theCatcode_[int('}')] = catEnd; + theCatcode_[int('$')] = catMath; + theCatcode_[int('&')] = catAlign; + theCatcode_[int('\n')] = catNewline; + theCatcode_[int('#')] = catParameter; + theCatcode_[int('^')] = catSuper; + theCatcode_[int('_')] = catSub; + theCatcode_[0x7f] = catIgnore; + theCatcode_[int(' ')] = catSpace; + theCatcode_[int('\t')] = catSpace; + theCatcode_[int('\r')] = catNewline; + theCatcode_[int('~')] = catActive; + theCatcode_[int('%')] = catComment; + } +} + +CatCode Parser::catcode(char_type c) const +{ + if (c < 256) + return theCatcode_[(unsigned char)c]; + return catOther; +} + + +void Parser::setCatcode(char c, CatCode cat) +{ + theCatcode_[(unsigned char)c] = cat; + deparse(); +} + + +void Parser::setCatcodes(cat_type t) +{ + theCatcodesType_ = t; + deparse(); +} + + +bool Parser::setEncoding(std::string const & e) +{ + //cerr << "setting encoding to " << e << std::endl; + encoding_iconv_ = e; + // If the encoding is fixed, we must not change the stream encoding + // (because the whole input uses that encoding, e.g. if it comes from + // the clipboard). We still need to track the original encoding in + // encoding_iconv_, so that the generated output is correct. + if (!fixed_enc_) + is_.setEncoding(e); + return true; +} + + +void Parser::push_back(Token const & t) +{ + tokens_.push_back(t); +} + + +// We return a copy here because the tokens_ vector may get reallocated +Token const Parser::prev_token() const { static const Token dummy; return pos_ > 1 ? tokens_[pos_ - 2] : dummy; } -Token const & Parser::curr_token() const +// We return a copy here because the tokens_ vector may get reallocated +Token const Parser::curr_token() const { static const Token dummy; return pos_ > 0 ? tokens_[pos_ - 1] : dummy; } -Token const & Parser::next_token() const +// We return a copy here because the tokens_ vector may get reallocated +Token const Parser::next_token() +{ + static const Token dummy; + if (!good()) + return dummy; + if (pos_ >= tokens_.size()) + tokenize_one(); + return pos_ < tokens_.size() ? tokens_[pos_] : dummy; +} + + +// We return a copy here because the tokens_ vector may get reallocated +Token const Parser::next_next_token() { static const Token dummy; - return good() ? tokens_[pos_] : dummy; + if (!good()) + return dummy; + // If tokenize_one() has not been called after the last get_token() we + // need to tokenize two more tokens. + if (pos_ >= tokens_.size()) + tokenize_one(); + if (pos_ + 1 >= tokens_.size()) + tokenize_one(); + return pos_ + 1 < tokens_.size() ? tokens_[pos_ + 1] : dummy; } -Token const & Parser::get_token() +// We return a copy here because the tokens_ vector may get reallocated +Token const Parser::get_token() { static const Token dummy; - //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n'; - return good() ? tokens_[pos_++] : dummy; + if (!good()) + return dummy; + if (pos_ >= tokens_.size()) { + tokenize_one(); + if (pos_ >= tokens_.size()) + return dummy; + } + // cerr << "looking at token " << tokens_[pos_] + // << " pos: " << pos_ << '\n'; + return tokens_[pos_++]; } -bool Parser::isParagraph() const +bool Parser::isParagraph() { // A new paragraph in TeX ist started // - either by a newline, following any amount of whitespace @@ -195,8 +354,7 @@ bool Parser::isParagraph() const if (curr_token().cat() == catNewline && (curr_token().cs().size() > 1 || (next_token().cat() == catSpace && - pos_ < tokens_.size() - 1 && - tokens_[pos_ + 1].cat() == catNewline))) + next_next_token().cat() == catNewline))) return true; if (curr_token().cat() == catEscape && curr_token().cs() == "par") return true; @@ -204,28 +362,37 @@ bool Parser::isParagraph() const } -void Parser::skip_spaces(bool skip_comments) +bool Parser::skip_spaces(bool skip_comments) { // We just silently return if we have no more tokens. // skip_spaces() should be callable at any time, // the caller must check p::good() anyway. + bool skipped = false; while (good()) { get_token(); if (isParagraph()) { putback(); break; } - if ( curr_token().cat() == catSpace || - curr_token().cat() == catNewline || - (curr_token().cat() == catComment && curr_token().cs().empty())) + if (curr_token().cat() == catSpace || + curr_token().cat() == catNewline) { + skipped = true; + continue; + } + if ((curr_token().cat() == catComment && curr_token().cs().empty())) continue; - if (skip_comments && curr_token().cat() == catComment) - cerr << " Ignoring comment: " << curr_token().asInput(); - else { + if (skip_comments && curr_token().cat() == catComment) { + // If positions_ is not empty we are doing some kind + // of look ahead + if (!positions_.empty()) + cerr << " Ignoring comment: " + << curr_token().asInput(); + } else { putback(); break; } } + return skipped; } @@ -237,7 +404,11 @@ void Parser::unskip_spaces(bool skip_comments) putback(); else if (skip_comments && curr_token().cat() == catComment) { // TODO: Get rid of this - cerr << "Unignoring comment: " << curr_token().asInput(); + // If positions_ is not empty we are doing some kind + // of look ahead + if (!positions_.empty()) + cerr << "Unignoring comment: " + << curr_token().asInput(); putback(); } else @@ -252,21 +423,69 @@ void Parser::putback() } -bool Parser::good() const +void Parser::pushPosition() { - return pos_ < tokens_.size(); + positions_.push_back(pos_); } -char Parser::getChar() +void Parser::popPosition() { - if (!good()) - error("The input stream is not well..."); - return tokens_[pos_++].character(); + pos_ = positions_.back(); + positions_.pop_back(); + deparse(); +} + + +void Parser::dropPosition() +{ + positions_.pop_back(); +} + + +bool Parser::good() +{ + if (pos_ < tokens_.size()) + return true; + if (!is_.good()) + return false; + return is_.peek() != idocstream::traits_type::eof(); } -Parser::Arg Parser::getFullArg(char left, char right) +bool Parser::hasOpt() +{ + // An optional argument can occur in any of the following forms: + // - \foo[bar] + // - \foo [bar] + // - \foo + // [bar] + // - \foo %comment + // [bar] + + // remember current position + unsigned int oldpos = pos_; + // skip spaces and comments + while (good()) { + get_token(); + if (isParagraph()) { + putback(); + break; + } + if (curr_token().cat() == catSpace || + curr_token().cat() == catNewline || + curr_token().cat() == catComment) + continue; + putback(); + break; + } + bool const retval = (next_token().asInput() == "["); + pos_ = oldpos; + return retval; +} + + +Parser::Arg Parser::getFullArg(char left, char right, bool allow_escaping) { skip_spaces(true); @@ -276,49 +495,77 @@ Parser::Arg Parser::getFullArg(char left, char right) return make_pair(false, string()); string result; - char c = getChar(); + Token t = get_token(); - if (c != left) { + if (t.cat() == catComment || t.cat() == catEscape || + t.character() != left) { putback(); return make_pair(false, string()); - } else - while ((c = getChar()) != right && good()) { + } else { + while (good()) { + t = get_token(); // Ignore comments - if (curr_token().cat() == catComment) { - if (!curr_token().cs().empty()) - cerr << "Ignoring comment: " << curr_token().asInput(); + if (t.cat() == catComment) { + if (!t.cs().empty()) + cerr << "Ignoring comment: " << t.asInput(); + continue; } - else - result += curr_token().asInput(); + if (allow_escaping) { + if (t.cat() != catEscape && t.character() == right) + break; + } else { + if (t.character() == right) { + if (t.cat() == catEscape) + result += '\\'; + break; + } + } + result += t.asInput(); } - + } return make_pair(true, result); } -string Parser::getArg(char left, char right) +string Parser::getArg(char left, char right, bool allow_escaping) { - return getFullArg(left, right).second; + return getFullArg(left, right, allow_escaping).second; } -string Parser::getFullOpt() +string Parser::getFullOpt(bool keepws) { Arg arg = getFullArg('[', ']'); if (arg.first) return '[' + arg.second + ']'; - return arg.second; + if (keepws) + unskip_spaces(true); + return string(); } -string Parser::getOpt() +string Parser::getOpt(bool keepws) { string const res = getArg('[', ']'); - return res.empty() ? string() : '[' + res + ']'; + if (res.empty()) { + if (keepws) + unskip_spaces(true); + return string(); + } + return '[' + res + ']'; } -string const Parser::verbatimEnvironment(string const & name) +string Parser::getFullParentheseArg() +{ + Arg arg = getFullArg('(', ')'); + if (arg.first) + return '(' + arg.second + ')'; + return string(); +} + + +string const Parser::ertEnvironment(string const & name) { if (!good()) return string(); @@ -331,7 +578,7 @@ string const Parser::verbatimEnvironment(string const & name) } else if (t.asInput() == "\\begin") { string const env = getArg('{', '}'); os << "\\begin{" << env << '}' - << verbatimEnvironment(env) + << ertEnvironment(env) << "\\end{" << env << '}'; } else if (t.asInput() == "\\end") { string const end = getArg('{', '}'); @@ -348,106 +595,103 @@ string const Parser::verbatimEnvironment(string const & name) } -void Parser::tokenize(istream & is) +string const Parser::plainEnvironment(string const & name) { - static bool init_done = false; + if (!good()) + return string(); - if (!init_done) { - catInit(); - init_done = true; + ostringstream os; + for (Token t = get_token(); good(); t = get_token()) { + if (t.asInput() == "\\end") { + string const end = getArg('{', '}'); + if (end == name) + return os.str(); + else + os << "\\end{" << end << '}'; + } else + os << t.asInput(); } + cerr << "unexpected end of input" << endl; + return os.str(); +} - char c; - while (is.get(c)) { - //cerr << "reading c: " << c << "\n"; - - switch (catcode(c)) { - case catSpace: { - string s(1, c); - while (is.get(c) && catcode(c) == catSpace) - s += c; - if (catcode(c) != catSpace) - is.putback(c); - push_back(Token(s, catSpace)); - break; - } - case catNewline: { - ++lineno_; - string s(1, getNewline(is, c)); - while (is.get(c) && catcode(c) == catNewline) { - ++lineno_; - s += getNewline(is, c); - } - if (catcode(c) != catNewline) - is.putback(c); - push_back(Token(s, catNewline)); - break; - } +string const Parser::plainCommand(char left, char right, string const & name) +{ + if (!good()) + return string(); + // check if first token is really the start character + Token tok = get_token(); + if (tok.character() != left) { + cerr << "first character does not match start character of command \\" << name << endl; + return string(); + } + ostringstream os; + for (Token t = get_token(); good(); t = get_token()) { + if (t.character() == right) { + return os.str(); + } else + os << t.asInput(); + } + cerr << "unexpected end of input" << endl; + return os.str(); +} - case catComment: { - // We don't treat "%\n" combinations here specially because - // we want to preserve them in the preamble - string s; - while (is.get(c) && catcode(c) != catNewline) - s += c; - // handle possible DOS line ending - if (catcode(c) == catNewline) - c = getNewline(is, c); - // Note: The '%' at the beginning and the '\n' at the end - // of the comment are not stored. - ++lineno_; - push_back(Token(s, catComment)); - break; - } - case catEscape: { - is.get(c); - if (!is) { - error("unexpected end of input"); - } else { - string s(1, c); - if (catcode(c) == catLetter) { - // collect letters - while (is.get(c) && catcode(c) == catLetter) - s += c; - if (catcode(c) != catLetter) - is.putback(c); - } - push_back(Token(s, catEscape)); - } - break; - } +Parser::Arg Parser::verbatimStuff(string const & end_string, bool const allow_linebreak) +{ + if (!good()) + return Arg(false, string()); - case catIgnore: { - cerr << "ignoring a char: " << int(c) << "\n"; + pushPosition(); + ostringstream oss; + size_t match_index = 0; + setCatcodes(VERBATIM_CATCODES); + for (Token t = get_token(); good(); t = get_token()) { + // FIXME t.asInput() might be longer than we need ? + if (t.asInput() == end_string.substr(match_index, + t.asInput().length())) { + match_index += t.asInput().length(); + if (match_index >= end_string.length()) break; + } else { + if (!allow_linebreak && t.asInput() == "\n") { + cerr << "unexpected end of input" << endl; + popPosition(); + setCatcodes(NORMAL_CATCODES); + return Arg(false, string()); } - - default: - push_back(Token(c, catcode(c))); + if (match_index) { + oss << end_string.substr(0, match_index) + << t.asInput(); + match_index = 0; + } else + oss << t.asInput(); } } -} - -void Parser::dump() const -{ - cerr << "\nTokens: "; - for (unsigned i = 0; i < tokens_.size(); ++i) { - if (i == pos_) - cerr << " <#> "; - cerr << tokens_[i]; + if (!good()) { + cerr << "unexpected end of input" << endl; + popPosition(); + setCatcodes(NORMAL_CATCODES); + return Arg(false, string()); } - cerr << " pos: " << pos_ << "\n"; + setCatcodes(NORMAL_CATCODES); + dropPosition(); + return Arg(true, oss.str()); } -void Parser::error(string const & msg) +string const Parser::verbatimEnvironment(string const & name) { - cerr << "Line ~" << lineno_ << ": parse error: " << msg << endl; - dump(); - //exit(1); + //FIXME: do something if endstring is not found + string s = verbatimStuff("\\end{" + name + "}").second; + // ignore one newline at beginning or end of string + if (prefixIs(s, "\n")) + s.erase(0,1); + if (suffixIs(s, "\n")) + s.erase(s.length() - 1,1); + return s; } @@ -456,12 +700,12 @@ string Parser::verbatimOption() string res; if (next_token().character() == '[') { Token t = get_token(); - for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) { + for (t = get_token(); t.character() != ']' && good(); t = get_token()) { if (t.cat() == catBegin) { putback(); res += '{' + verbatim_item() + '}'; } else - res += t.asString(); + res += t.asInput(); } } return res; @@ -490,21 +734,106 @@ string Parser::verbatim_item() } -void Parser::reset() +void Parser::tokenize_one() { - pos_ = 0; + catInit(); + char_type c; + if (!is_.get(c)) + return; + + switch (catcode(c)) { + case catSpace: { + docstring s(1, c); + while (is_.get(c) && catcode(c) == catSpace) + s += c; + if (catcode(c) != catSpace) + is_.putback(c); + push_back(Token(s, catSpace)); + break; + } + + case catNewline: { + ++lineno_; + docstring s(1, getNewline(is_, c)); + while (is_.get(c) && catcode(c) == catNewline) { + ++lineno_; + s += getNewline(is_, c); + } + if (catcode(c) != catNewline) + is_.putback(c); + push_back(Token(s, catNewline)); + break; + } + + case catComment: { + // We don't treat "%\n" combinations here specially because + // we want to preserve them in the preamble + docstring s; + while (is_.get(c) && catcode(c) != catNewline) + s += c; + // handle possible DOS line ending + if (catcode(c) == catNewline) + c = getNewline(is_, c); + // Note: The '%' at the beginning and the '\n' at the end + // of the comment are not stored. + ++lineno_; + push_back(Token(s, catComment)); + break; + } + + case catEscape: { + is_.get(c); + if (!is_) { + error("unexpected end of input"); + } else { + docstring s(1, c); + if (catcode(c) == catLetter) { + // collect letters + while (is_.get(c) && catcode(c) == catLetter) + s += c; + if (catcode(c) != catLetter) + is_.putback(c); + } + push_back(Token(s, catEscape)); + } + break; + } + + case catIgnore: { + cerr << "ignoring a char: " << c << "\n"; + break; + } + + default: + push_back(Token(docstring(1, c), catcode(c))); + } + //cerr << tokens_.back(); +} + + +void Parser::dump() const +{ + cerr << "\nTokens: "; + for (unsigned i = 0; i < tokens_.size(); ++i) { + if (i == pos_) + cerr << " <#> "; + cerr << tokens_[i]; + } + cerr << " pos: " << pos_ << "\n"; } -void Parser::setCatCode(char c, CatCode cat) +void Parser::error(string const & msg) { - theCatcode[(unsigned char)c] = cat; + cerr << "Line ~" << lineno_ << ": parse error: " << msg << endl; + dump(); + //exit(1); } -CatCode Parser::getCatCode(char c) const +void Parser::reset() { - return theCatcode[(unsigned char)c]; + pos_ = 0; }