X-Git-Url: https://git.lyx.org/gitweb/?a=blobdiff_plain;f=src%2Ftex2lyx%2FParser.cpp;h=41ab92063c031a50b2a496d67dd75de963f039a6;hb=298730215c21735f16e7278a5d5a4469fb0b9859;hp=06027e880ad2d0209116f7ebf840535a739e3258;hpb=1a5891e1fd80e9272e195cc482724708f0ab96dd;p=lyx.git

diff --git a/src/tex2lyx/Parser.cpp b/src/tex2lyx/Parser.cpp
index 06027e880a..41ab92063c 100644
--- a/src/tex2lyx/Parser.cpp
+++ b/src/tex2lyx/Parser.cpp
@@ -3,7 +3,7 @@
  * This file is part of LyX, the document processor.
  * Licence details can be found in the file COPYING.
  *
- * \author André Pönitz 
+ * \author André Pönitz
  *
  * Full author contact details are available in file CREDITS.
  */
@@ -12,55 +12,24 @@
 
 #include "Encoding.h"
 #include "Parser.h"
+#include "support/lstrings.h"
 #include "support/textutils.h"
 
 #include <iostream>
 
 using namespace std;
+using namespace lyx::support;
 
 namespace lyx {
 
 namespace {
 
-CatCode theCatcode[256];
-
-void catInit()
-{
-	static bool init_done = false;
-	if (init_done) 
-		return;
-	init_done = true;
-
-	fill(theCatcode, theCatcode + 256, catOther);
-	fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
-	fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
-
-	theCatcode[int('\\')] = catEscape;
-	theCatcode[int('{')]  = catBegin;
-	theCatcode[int('}')]  = catEnd;
-	theCatcode[int('$')]  = catMath;
-	theCatcode[int('&')]  = catAlign;
-	theCatcode[int('\n')] = catNewline;
-	theCatcode[int('#')]  = catParameter;
-	theCatcode[int('^')]  = catSuper;
-	theCatcode[int('_')]  = catSub;
-	theCatcode[0x7f]      = catIgnore;
-	theCatcode[int(' ')]  = catSpace;
-	theCatcode[int('\t')] = catSpace;
-	theCatcode[int('\r')] = catNewline;
-	theCatcode[int('~')]  = catActive;
-	theCatcode[int('%')]  = catComment;
-
-	// This is wrong!
-	theCatcode[int('@')]  = catLetter;
-}
-
 /*!
  * Translate a line ending to '\n'.
  * \p c must have catcode catNewline, and it must be the last character read
  * from \p is.
  */
-char_type getNewline(idocstream & is, char_type c)
+char_type getNewline(iparserdocstream & is, char_type c)
 {
 	// we have to handle 3 different line endings:
 	// - UNIX (\n)
@@ -79,16 +48,8 @@ char_type getNewline(idocstream & is, char_type c)
 	return c;
 }
 
-CatCode catcode(char_type c)
-{
-	if (c < 256)
-		return theCatcode[(unsigned char)c];
-	return catOther;
 }
 
-}
-
-
 //
 // Token
 //
@@ -152,22 +113,67 @@ void debugToken(std::ostream & os, Token const & t, unsigned int flags)
 #endif
 
 
+//
+// Wrapper
+//
+
+void iparserdocstream::setEncoding(std::string const & e)
+{
+	is_ << lyx::setEncoding(e);
+}
+
+
+void iparserdocstream::putback(char_type c)
+{
+	s_ = c + s_;
+}
+
+
+void iparserdocstream::putback(docstring s)
+{
+	s_ = s + s_;
+}
+
+
+iparserdocstream & iparserdocstream::get(char_type &c)
+{
+	if (s_.empty())
+		is_.get(c);
+	else {
+		//cerr << "unparsed: " << to_utf8(s_) <<endl;
+		c = s_[0];
+		s_.erase(0,1);
+	}
+	return *this;
+}
+
+
 //
 // Parser
 //
 
 
-Parser::Parser(idocstream & is)
-	: lineno_(0), pos_(0), iss_(0), is_(is), encoding_latex_("utf8")
+Parser::Parser(idocstream & is, std::string const & fixedenc)
+	: lineno_(0), pos_(0), iss_(0), is_(is),
+	  encoding_iconv_(fixedenc.empty() ? "UTF-8" : fixedenc),
+	  theCatcodesType_(NORMAL_CATCODES), curr_cat_(UNDECIDED_CATCODES),
+	  fixed_enc_(!fixedenc.empty())
 {
+	if (fixed_enc_)
+		is_.setEncoding(fixedenc);
+	catInit();
 }
 
 
 Parser::Parser(string const & s)
-	: lineno_(0), pos_(0), 
-	  iss_(new idocstringstream(from_utf8(s))), is_(*iss_), 
-	  encoding_latex_("utf8")
+	: lineno_(0), pos_(0),
+	  iss_(new idocstringstream(from_utf8(s))), is_(*iss_),
+	  encoding_iconv_("UTF-8"),
+	  theCatcodesType_(NORMAL_CATCODES), curr_cat_(UNDECIDED_CATCODES),
+	  // An idocstringstream can not change the encoding
+	  fixed_enc_(true)
 {
+	catInit();
 }
 
 
@@ -177,16 +183,98 @@ Parser::~Parser()
 }
 
 
-void Parser::setEncoding(std::string const & e)
+void Parser::deparse()
+{
+	string s;
+	for(size_type i = pos_ ; i < tokens_.size() ; ++i) {
+		s += tokens_[i].asInput();
+	}
+	is_.putback(from_utf8(s));
+	tokens_.erase(tokens_.begin() + pos_, tokens_.end());
+	// make sure that next token is read
+	tokenize_one();
+}
+
+
+bool Parser::setEncoding(std::string const & e, int const & p)
 {
-	Encoding const * enc = encodings.fromLaTeXName(e);
+	// We may (and need to) use unsafe encodings here: Since the text is
+	// converted to unicode while reading from is_, we never see text in
+	// the original encoding of the parser, but operate on utf8 strings
+	// instead. Therefore, we cannot misparse high bytes as {, } or \\.
+	Encoding const * const enc = encodings.fromLaTeXName(e, p, true);
 	if (!enc) {
 		cerr << "Unknown encoding " << e << ". Ignoring." << std::endl;
+		return false;
+	}
+	return setEncoding(enc->iconvName());
+}
+
+
+void Parser::catInit()
+{
+	if (curr_cat_ == theCatcodesType_)
 		return;
+	curr_cat_ = theCatcodesType_;
+
+	fill(theCatcode_, theCatcode_ + 256, catOther);
+	fill(theCatcode_ + 'a', theCatcode_ + 'z' + 1, catLetter);
+	fill(theCatcode_ + 'A', theCatcode_ + 'Z' + 1, catLetter);
+	// This is wrong!
+	theCatcode_[int('@')]  = catLetter;
+
+	if (theCatcodesType_ == NORMAL_CATCODES) {
+		theCatcode_[int('\\')] = catEscape;
+		theCatcode_[int('{')]  = catBegin;
+		theCatcode_[int('}')]  = catEnd;
+		theCatcode_[int('$')]  = catMath;
+		theCatcode_[int('&')]  = catAlign;
+		theCatcode_[int('\n')] = catNewline;
+		theCatcode_[int('#')]  = catParameter;
+		theCatcode_[int('^')]  = catSuper;
+		theCatcode_[int('_')]  = catSub;
+		theCatcode_[0x7f]      = catIgnore;
+		theCatcode_[int(' ')]  = catSpace;
+		theCatcode_[int('\t')] = catSpace;
+		theCatcode_[int('\r')] = catNewline;
+		theCatcode_[int('~')]  = catActive;
+		theCatcode_[int('%')]  = catComment;
 	}
-	//cerr << "setting encoding to " << enc->iconvName() << std::endl;
-	is_ << lyx::setEncoding(enc->iconvName());
-	encoding_latex_ = e;
+}
+
+CatCode Parser::catcode(char_type c) const
+{
+	if (c < 256)
+		return theCatcode_[(unsigned char)c];
+	return catOther;
+}
+
+
+void Parser::setCatcode(char c, CatCode cat)
+{
+	theCatcode_[(unsigned char)c] = cat;
+	deparse();
+}
+
+
+void Parser::setCatcodes(cat_type t)
+{
+	theCatcodesType_ = t;
+	deparse();
+}
+
+
+bool Parser::setEncoding(std::string const & e)
+{
+	//cerr << "setting encoding to " << e << std::endl;
+	encoding_iconv_ = e;
+	// If the encoding is fixed, we must not change the stream encoding
+	// (because the whole input uses that encoding, e.g. if it comes from
+	// the clipboard). We still need to track the original encoding in
+	// encoding_iconv_, so that the generated output is correct.
+	if (!fixed_enc_)
+		is_.setEncoding(e);
+	return true;
 }
 
 
@@ -216,7 +304,11 @@ Token const Parser::curr_token() const
 Token const Parser::next_token()
 {
 	static const Token dummy;
-	return good() ? tokens_[pos_] : dummy;
+	if (!good())
+		return dummy;
+	if (pos_ >= tokens_.size())
+		tokenize_one();
+	return pos_ < tokens_.size() ? tokens_[pos_] : dummy;
 }
 
 
@@ -224,12 +316,14 @@ Token const Parser::next_token()
 Token const Parser::next_next_token()
 {
 	static const Token dummy;
-	// If good() has not been called after the last get_token() we need
-	// to tokenize two more tokens.
-	if (pos_ + 1 >= tokens_.size()) {
+	if (!good())
+		return dummy;
+	// If tokenize_one() has not been called after the last get_token() we
+	// need to tokenize two more tokens.
+	if (pos_ >= tokens_.size())
 		tokenize_one();
+	if (pos_ + 1 >= tokens_.size())
 		tokenize_one();
-	}
 	return pos_ + 1 < tokens_.size() ? tokens_[pos_ + 1] : dummy;
 }
 
@@ -238,8 +332,16 @@ Token const Parser::next_next_token()
 Token const Parser::get_token()
 {
 	static const Token dummy;
-	//cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
-	return good() ? tokens_[pos_++] : dummy;
+	if (!good())
+		return dummy;
+	if (pos_ >= tokens_.size()) {
+		tokenize_one();
+		if (pos_ >= tokens_.size())
+			return dummy;
+	}
+	// cerr << "looking at token " << tokens_[pos_] 
+	//      << " pos: " << pos_ << '\n';
+	return tokens_[pos_++];
 }
 
 
@@ -331,23 +433,23 @@ void Parser::popPosition()
 {
 	pos_ = positions_.back();
 	positions_.pop_back();
+	deparse();
 }
 
 
-bool Parser::good()
+void Parser::dropPosition()
 {
-	if (pos_ < tokens_.size())
-		return true;
-	tokenize_one();
-	return pos_ < tokens_.size();
+	positions_.pop_back();
 }
 
 
-char Parser::getChar()
+bool Parser::good()
 {
-	if (!good())
-		error("The input stream is not well...");
-	return get_token().character();
+	if (pos_ < tokens_.size())
+		return true;
+	if (!is_.good())
+		return false;
+	return is_.peek() != idocstream::traits_type::eof();
 }
 
 
@@ -383,7 +485,7 @@ bool Parser::hasOpt()
 }
 
 
-Parser::Arg Parser::getFullArg(char left, char right)
+Parser::Arg Parser::getFullArg(char left, char right, bool allow_escaping)
 {
 	skip_spaces(true);
 
@@ -393,29 +495,41 @@ Parser::Arg Parser::getFullArg(char left, char right)
 		return make_pair(false, string());
 
 	string result;
-	char c = getChar();
+	Token t = get_token();
 
-	if (c != left) {
+	if (t.cat() == catComment || t.cat() == catEscape ||
+	    t.character() != left) {
 		putback();
 		return make_pair(false, string());
-	} else
-		while ((c = getChar()) != right && good()) {
+	} else {
+		while (good()) {
+			t = get_token();
 			// Ignore comments
-			if (curr_token().cat() == catComment) {
-				if (!curr_token().cs().empty())
-					cerr << "Ignoring comment: " << curr_token().asInput();
+			if (t.cat() == catComment) {
+				if (!t.cs().empty())
+					cerr << "Ignoring comment: " << t.asInput();
+				continue;
 			}
-			else
-				result += curr_token().asInput();
+			if (allow_escaping) {
+				if (t.cat() != catEscape && t.character() == right)
+					break;
+			} else {
+				if (t.character() == right) {
+					if (t.cat() == catEscape)
+						result += '\\';
+					break;
+				}
+			}
+			result += t.asInput();
 		}
-
+	}
 	return make_pair(true, result);
 }
 
 
-string Parser::getArg(char left, char right)
+string Parser::getArg(char left, char right, bool allow_escaping)
 {
-	return getFullArg(left, right).second;
+	return getFullArg(left, right, allow_escaping).second;
 }
 
 
@@ -451,7 +565,7 @@ string Parser::getFullParentheseArg()
 }
 
 
-string const Parser::verbatimEnvironment(string const & name)
+string const Parser::ertEnvironment(string const & name)
 {
 	if (!good())
 		return string();
@@ -464,7 +578,7 @@ string const Parser::verbatimEnvironment(string const & name)
 		} else if (t.asInput() == "\\begin") {
 			string const env = getArg('{', '}');
 			os << "\\begin{" << env << '}'
-			   << verbatimEnvironment(env)
+			   << ertEnvironment(env)
 			   << "\\end{" << env << '}';
 		} else if (t.asInput() == "\\end") {
 			string const end = getArg('{', '}');
@@ -502,11 +616,129 @@ string const Parser::plainEnvironment(string const & name)
 }
 
 
+string const Parser::plainCommand(char left, char right, string const & name)
+{
+	if (!good())
+		return string();
+	// check if first token is really the start character
+	Token tok = get_token();
+	if (tok.character() != left) {
+		cerr << "first character does not match start character of command \\" << name << endl;
+		return string();
+	}
+	ostringstream os;
+	for (Token t = get_token(); good(); t = get_token()) {
+		if (t.character() == right) {
+			return os.str();
+		} else
+			os << t.asInput();
+	}
+	cerr << "unexpected end of input" << endl;
+	return os.str();
+}
+
+
+Parser::Arg Parser::verbatimStuff(string const & end_string, bool const allow_linebreak)
+{
+	if (!good())
+		return Arg(false, string());
+
+	pushPosition();
+	ostringstream oss;
+	size_t match_index = 0;
+	setCatcodes(VERBATIM_CATCODES);
+	for (Token t = get_token(); good(); t = get_token()) {
+		// FIXME t.asInput() might be longer than we need ?
+		if (t.asInput() == end_string.substr(match_index,
+						     t.asInput().length())) {
+			match_index += t.asInput().length();
+			if (match_index >= end_string.length())
+				break;
+		} else {
+			if (!allow_linebreak && t.asInput() == "\n") {
+				cerr << "unexpected end of input" << endl;
+				popPosition();
+				setCatcodes(NORMAL_CATCODES);
+				return Arg(false, string());
+			}
+			if (match_index) {
+				oss << end_string.substr(0, match_index) 
+				    << t.asInput();
+				match_index = 0;
+			} else
+				oss << t.asInput();
+		}
+	}
+
+	if (!good()) {
+		cerr << "unexpected end of input" << endl;
+		popPosition();
+		setCatcodes(NORMAL_CATCODES);
+		return Arg(false, string());
+	}
+	setCatcodes(NORMAL_CATCODES);
+	dropPosition();
+	return Arg(true, oss.str());
+}
+
+
+string const Parser::verbatimEnvironment(string const & name)
+{
+	//FIXME: do something if endstring is not found
+	string s = verbatimStuff("\\end{" + name + "}").second;
+	// ignore one newline at beginning or end of string
+	if (prefixIs(s, "\n"))
+		s.erase(0,1);
+	if (suffixIs(s, "\n"))
+		s.erase(s.length() - 1,1);
+	return s;
+}
+
+
+string Parser::verbatimOption()
+{
+	string res;
+	if (next_token().character() == '[') {
+		Token t = get_token();
+		for (t = get_token(); t.character() != ']' && good(); t = get_token()) {
+			if (t.cat() == catBegin) {
+				putback();
+				res += '{' + verbatim_item() + '}';
+			} else
+				res += t.asInput();
+		}
+	}
+	return res;
+}
+
+
+string Parser::verbatim_item()
+{
+	if (!good())
+		error("stream bad");
+	skip_spaces();
+	if (next_token().cat() == catBegin) {
+		Token t = get_token(); // skip brace
+		string res;
+		for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
+			if (t.cat() == catBegin) {
+				putback();
+				res += '{' + verbatim_item() + '}';
+			}
+			else
+				res += t.asInput();
+		}
+		return res;
+	}
+	return get_token().asInput();
+}
+
+
 void Parser::tokenize_one()
 {
 	catInit();
 	char_type c;
-	if (!is_.get(c)) 
+	if (!is_.get(c))
 		return;
 
 	switch (catcode(c)) {
@@ -519,7 +751,7 @@ void Parser::tokenize_one()
 		push_back(Token(s, catSpace));
 		break;
 	}
-		
+
 	case catNewline: {
 		++lineno_;
 		docstring s(1, getNewline(is_, c));
@@ -532,7 +764,7 @@ void Parser::tokenize_one()
 		push_back(Token(s, catNewline));
 		break;
 	}
-		
+
 	case catComment: {
 		// We don't treat "%\n" combinations here specially because
 		// we want to preserve them in the preamble
@@ -548,7 +780,7 @@ void Parser::tokenize_one()
 		push_back(Token(s, catComment));
 		break;
 	}
-		
+
 	case catEscape: {
 		is_.get(c);
 		if (!is_) {
@@ -566,12 +798,12 @@ void Parser::tokenize_one()
 		}
 		break;
 	}
-		
+
 	case catIgnore: {
 		cerr << "ignoring a char: " << c << "\n";
 		break;
 	}
-		
+
 	default:
 		push_back(Token(docstring(1, c), catcode(c)));
 	}
@@ -599,61 +831,10 @@ void Parser::error(string const & msg)
 }
 
 
-string Parser::verbatimOption()
-{
-	string res;
-	if (next_token().character() == '[') {
-		Token t = get_token();
-		for (t = get_token(); t.character() != ']' && good(); t = get_token()) {
-			if (t.cat() == catBegin) {
-				putback();
-				res += '{' + verbatim_item() + '}';
-			} else
-				res += t.cs();
-		}
-	}
-	return res;
-}
-
-
-string Parser::verbatim_item()
-{
-	if (!good())
-		error("stream bad");
-	skip_spaces();
-	if (next_token().cat() == catBegin) {
-		Token t = get_token(); // skip brace
-		string res;
-		for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
-			if (t.cat() == catBegin) {
-				putback();
-				res += '{' + verbatim_item() + '}';
-			}
-			else
-				res += t.asInput();
-		}
-		return res;
-	}
-	return get_token().asInput();
-}
-
-
 void Parser::reset()
 {
 	pos_ = 0;
 }
 
 
-void Parser::setCatCode(char c, CatCode cat)
-{
-	theCatcode[(unsigned char)c] = cat;
-}
-
-
-CatCode Parser::getCatCode(char c) const
-{
-	return theCatcode[(unsigned char)c];
-}
-
-
 } // namespace lyx