X-Git-Url: https://git.lyx.org/gitweb/?a=blobdiff_plain;f=src%2Ftex2lyx%2FParser.cpp;h=41ab92063c031a50b2a496d67dd75de963f039a6;hb=298730215c21735f16e7278a5d5a4469fb0b9859;hp=06027e880ad2d0209116f7ebf840535a739e3258;hpb=1a5891e1fd80e9272e195cc482724708f0ab96dd;p=lyx.git diff --git a/src/tex2lyx/Parser.cpp b/src/tex2lyx/Parser.cpp index 06027e880a..41ab92063c 100644 --- a/src/tex2lyx/Parser.cpp +++ b/src/tex2lyx/Parser.cpp @@ -3,7 +3,7 @@ * This file is part of LyX, the document processor. * Licence details can be found in the file COPYING. * - * \author André Pönitz + * \author André Pönitz * * Full author contact details are available in file CREDITS. */ @@ -12,55 +12,24 @@ #include "Encoding.h" #include "Parser.h" +#include "support/lstrings.h" #include "support/textutils.h" #include using namespace std; +using namespace lyx::support; namespace lyx { namespace { -CatCode theCatcode[256]; - -void catInit() -{ - static bool init_done = false; - if (init_done) - return; - init_done = true; - - fill(theCatcode, theCatcode + 256, catOther); - fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter); - fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter); - - theCatcode[int('\\')] = catEscape; - theCatcode[int('{')] = catBegin; - theCatcode[int('}')] = catEnd; - theCatcode[int('$')] = catMath; - theCatcode[int('&')] = catAlign; - theCatcode[int('\n')] = catNewline; - theCatcode[int('#')] = catParameter; - theCatcode[int('^')] = catSuper; - theCatcode[int('_')] = catSub; - theCatcode[0x7f] = catIgnore; - theCatcode[int(' ')] = catSpace; - theCatcode[int('\t')] = catSpace; - theCatcode[int('\r')] = catNewline; - theCatcode[int('~')] = catActive; - theCatcode[int('%')] = catComment; - - // This is wrong! - theCatcode[int('@')] = catLetter; -} - /*! * Translate a line ending to '\n'. * \p c must have catcode catNewline, and it must be the last character read * from \p is. */ -char_type getNewline(idocstream & is, char_type c) +char_type getNewline(iparserdocstream & is, char_type c) { // we have to handle 3 different line endings: // - UNIX (\n) @@ -79,16 +48,8 @@ char_type getNewline(idocstream & is, char_type c) return c; } -CatCode catcode(char_type c) -{ - if (c < 256) - return theCatcode[(unsigned char)c]; - return catOther; } -} - - // // Token // @@ -152,22 +113,67 @@ void debugToken(std::ostream & os, Token const & t, unsigned int flags) #endif +// +// Wrapper +// + +void iparserdocstream::setEncoding(std::string const & e) +{ + is_ << lyx::setEncoding(e); +} + + +void iparserdocstream::putback(char_type c) +{ + s_ = c + s_; +} + + +void iparserdocstream::putback(docstring s) +{ + s_ = s + s_; +} + + +iparserdocstream & iparserdocstream::get(char_type &c) +{ + if (s_.empty()) + is_.get(c); + else { + //cerr << "unparsed: " << to_utf8(s_) <iconvName()); +} + + +void Parser::catInit() +{ + if (curr_cat_ == theCatcodesType_) return; + curr_cat_ = theCatcodesType_; + + fill(theCatcode_, theCatcode_ + 256, catOther); + fill(theCatcode_ + 'a', theCatcode_ + 'z' + 1, catLetter); + fill(theCatcode_ + 'A', theCatcode_ + 'Z' + 1, catLetter); + // This is wrong! + theCatcode_[int('@')] = catLetter; + + if (theCatcodesType_ == NORMAL_CATCODES) { + theCatcode_[int('\\')] = catEscape; + theCatcode_[int('{')] = catBegin; + theCatcode_[int('}')] = catEnd; + theCatcode_[int('$')] = catMath; + theCatcode_[int('&')] = catAlign; + theCatcode_[int('\n')] = catNewline; + theCatcode_[int('#')] = catParameter; + theCatcode_[int('^')] = catSuper; + theCatcode_[int('_')] = catSub; + theCatcode_[0x7f] = catIgnore; + theCatcode_[int(' ')] = catSpace; + theCatcode_[int('\t')] = catSpace; + theCatcode_[int('\r')] = catNewline; + theCatcode_[int('~')] = catActive; + theCatcode_[int('%')] = catComment; } - //cerr << "setting encoding to " << enc->iconvName() << std::endl; - is_ << lyx::setEncoding(enc->iconvName()); - encoding_latex_ = e; +} + +CatCode Parser::catcode(char_type c) const +{ + if (c < 256) + return theCatcode_[(unsigned char)c]; + return catOther; +} + + +void Parser::setCatcode(char c, CatCode cat) +{ + theCatcode_[(unsigned char)c] = cat; + deparse(); +} + + +void Parser::setCatcodes(cat_type t) +{ + theCatcodesType_ = t; + deparse(); +} + + +bool Parser::setEncoding(std::string const & e) +{ + //cerr << "setting encoding to " << e << std::endl; + encoding_iconv_ = e; + // If the encoding is fixed, we must not change the stream encoding + // (because the whole input uses that encoding, e.g. if it comes from + // the clipboard). We still need to track the original encoding in + // encoding_iconv_, so that the generated output is correct. + if (!fixed_enc_) + is_.setEncoding(e); + return true; } @@ -216,7 +304,11 @@ Token const Parser::curr_token() const Token const Parser::next_token() { static const Token dummy; - return good() ? tokens_[pos_] : dummy; + if (!good()) + return dummy; + if (pos_ >= tokens_.size()) + tokenize_one(); + return pos_ < tokens_.size() ? tokens_[pos_] : dummy; } @@ -224,12 +316,14 @@ Token const Parser::next_token() Token const Parser::next_next_token() { static const Token dummy; - // If good() has not been called after the last get_token() we need - // to tokenize two more tokens. - if (pos_ + 1 >= tokens_.size()) { + if (!good()) + return dummy; + // If tokenize_one() has not been called after the last get_token() we + // need to tokenize two more tokens. + if (pos_ >= tokens_.size()) tokenize_one(); + if (pos_ + 1 >= tokens_.size()) tokenize_one(); - } return pos_ + 1 < tokens_.size() ? tokens_[pos_ + 1] : dummy; } @@ -238,8 +332,16 @@ Token const Parser::next_next_token() Token const Parser::get_token() { static const Token dummy; - //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n'; - return good() ? tokens_[pos_++] : dummy; + if (!good()) + return dummy; + if (pos_ >= tokens_.size()) { + tokenize_one(); + if (pos_ >= tokens_.size()) + return dummy; + } + // cerr << "looking at token " << tokens_[pos_] + // << " pos: " << pos_ << '\n'; + return tokens_[pos_++]; } @@ -331,23 +433,23 @@ void Parser::popPosition() { pos_ = positions_.back(); positions_.pop_back(); + deparse(); } -bool Parser::good() +void Parser::dropPosition() { - if (pos_ < tokens_.size()) - return true; - tokenize_one(); - return pos_ < tokens_.size(); + positions_.pop_back(); } -char Parser::getChar() +bool Parser::good() { - if (!good()) - error("The input stream is not well..."); - return get_token().character(); + if (pos_ < tokens_.size()) + return true; + if (!is_.good()) + return false; + return is_.peek() != idocstream::traits_type::eof(); } @@ -383,7 +485,7 @@ bool Parser::hasOpt() } -Parser::Arg Parser::getFullArg(char left, char right) +Parser::Arg Parser::getFullArg(char left, char right, bool allow_escaping) { skip_spaces(true); @@ -393,29 +495,41 @@ Parser::Arg Parser::getFullArg(char left, char right) return make_pair(false, string()); string result; - char c = getChar(); + Token t = get_token(); - if (c != left) { + if (t.cat() == catComment || t.cat() == catEscape || + t.character() != left) { putback(); return make_pair(false, string()); - } else - while ((c = getChar()) != right && good()) { + } else { + while (good()) { + t = get_token(); // Ignore comments - if (curr_token().cat() == catComment) { - if (!curr_token().cs().empty()) - cerr << "Ignoring comment: " << curr_token().asInput(); + if (t.cat() == catComment) { + if (!t.cs().empty()) + cerr << "Ignoring comment: " << t.asInput(); + continue; } - else - result += curr_token().asInput(); + if (allow_escaping) { + if (t.cat() != catEscape && t.character() == right) + break; + } else { + if (t.character() == right) { + if (t.cat() == catEscape) + result += '\\'; + break; + } + } + result += t.asInput(); } - + } return make_pair(true, result); } -string Parser::getArg(char left, char right) +string Parser::getArg(char left, char right, bool allow_escaping) { - return getFullArg(left, right).second; + return getFullArg(left, right, allow_escaping).second; } @@ -451,7 +565,7 @@ string Parser::getFullParentheseArg() } -string const Parser::verbatimEnvironment(string const & name) +string const Parser::ertEnvironment(string const & name) { if (!good()) return string(); @@ -464,7 +578,7 @@ string const Parser::verbatimEnvironment(string const & name) } else if (t.asInput() == "\\begin") { string const env = getArg('{', '}'); os << "\\begin{" << env << '}' - << verbatimEnvironment(env) + << ertEnvironment(env) << "\\end{" << env << '}'; } else if (t.asInput() == "\\end") { string const end = getArg('{', '}'); @@ -502,11 +616,129 @@ string const Parser::plainEnvironment(string const & name) } +string const Parser::plainCommand(char left, char right, string const & name) +{ + if (!good()) + return string(); + // check if first token is really the start character + Token tok = get_token(); + if (tok.character() != left) { + cerr << "first character does not match start character of command \\" << name << endl; + return string(); + } + ostringstream os; + for (Token t = get_token(); good(); t = get_token()) { + if (t.character() == right) { + return os.str(); + } else + os << t.asInput(); + } + cerr << "unexpected end of input" << endl; + return os.str(); +} + + +Parser::Arg Parser::verbatimStuff(string const & end_string, bool const allow_linebreak) +{ + if (!good()) + return Arg(false, string()); + + pushPosition(); + ostringstream oss; + size_t match_index = 0; + setCatcodes(VERBATIM_CATCODES); + for (Token t = get_token(); good(); t = get_token()) { + // FIXME t.asInput() might be longer than we need ? + if (t.asInput() == end_string.substr(match_index, + t.asInput().length())) { + match_index += t.asInput().length(); + if (match_index >= end_string.length()) + break; + } else { + if (!allow_linebreak && t.asInput() == "\n") { + cerr << "unexpected end of input" << endl; + popPosition(); + setCatcodes(NORMAL_CATCODES); + return Arg(false, string()); + } + if (match_index) { + oss << end_string.substr(0, match_index) + << t.asInput(); + match_index = 0; + } else + oss << t.asInput(); + } + } + + if (!good()) { + cerr << "unexpected end of input" << endl; + popPosition(); + setCatcodes(NORMAL_CATCODES); + return Arg(false, string()); + } + setCatcodes(NORMAL_CATCODES); + dropPosition(); + return Arg(true, oss.str()); +} + + +string const Parser::verbatimEnvironment(string const & name) +{ + //FIXME: do something if endstring is not found + string s = verbatimStuff("\\end{" + name + "}").second; + // ignore one newline at beginning or end of string + if (prefixIs(s, "\n")) + s.erase(0,1); + if (suffixIs(s, "\n")) + s.erase(s.length() - 1,1); + return s; +} + + +string Parser::verbatimOption() +{ + string res; + if (next_token().character() == '[') { + Token t = get_token(); + for (t = get_token(); t.character() != ']' && good(); t = get_token()) { + if (t.cat() == catBegin) { + putback(); + res += '{' + verbatim_item() + '}'; + } else + res += t.asInput(); + } + } + return res; +} + + +string Parser::verbatim_item() +{ + if (!good()) + error("stream bad"); + skip_spaces(); + if (next_token().cat() == catBegin) { + Token t = get_token(); // skip brace + string res; + for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) { + if (t.cat() == catBegin) { + putback(); + res += '{' + verbatim_item() + '}'; + } + else + res += t.asInput(); + } + return res; + } + return get_token().asInput(); +} + + void Parser::tokenize_one() { catInit(); char_type c; - if (!is_.get(c)) + if (!is_.get(c)) return; switch (catcode(c)) { @@ -519,7 +751,7 @@ void Parser::tokenize_one() push_back(Token(s, catSpace)); break; } - + case catNewline: { ++lineno_; docstring s(1, getNewline(is_, c)); @@ -532,7 +764,7 @@ void Parser::tokenize_one() push_back(Token(s, catNewline)); break; } - + case catComment: { // We don't treat "%\n" combinations here specially because // we want to preserve them in the preamble @@ -548,7 +780,7 @@ void Parser::tokenize_one() push_back(Token(s, catComment)); break; } - + case catEscape: { is_.get(c); if (!is_) { @@ -566,12 +798,12 @@ void Parser::tokenize_one() } break; } - + case catIgnore: { cerr << "ignoring a char: " << c << "\n"; break; } - + default: push_back(Token(docstring(1, c), catcode(c))); } @@ -599,61 +831,10 @@ void Parser::error(string const & msg) } -string Parser::verbatimOption() -{ - string res; - if (next_token().character() == '[') { - Token t = get_token(); - for (t = get_token(); t.character() != ']' && good(); t = get_token()) { - if (t.cat() == catBegin) { - putback(); - res += '{' + verbatim_item() + '}'; - } else - res += t.cs(); - } - } - return res; -} - - -string Parser::verbatim_item() -{ - if (!good()) - error("stream bad"); - skip_spaces(); - if (next_token().cat() == catBegin) { - Token t = get_token(); // skip brace - string res; - for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) { - if (t.cat() == catBegin) { - putback(); - res += '{' + verbatim_item() + '}'; - } - else - res += t.asInput(); - } - return res; - } - return get_token().asInput(); -} - - void Parser::reset() { pos_ = 0; } -void Parser::setCatCode(char c, CatCode cat) -{ - theCatcode[(unsigned char)c] = cat; -} - - -CatCode Parser::getCatCode(char c) const -{ - return theCatcode[(unsigned char)c]; -} - - } // namespace lyx