X-Git-Url: https://git.lyx.org/gitweb/?a=blobdiff_plain;f=src%2Ftex2lyx%2FParser.cpp;h=75b8ba4bdd3be80e6e7187cc13875f5fadf7eb91;hb=487c8b5bd34b1de999d213d83e27916a334d4891;hp=cdd5c3992160ac3a3d39d03b70a1f0b2bed9a074;hpb=04591a6e34a0ac52632f3a309df9ff375a91b713;p=lyx.git

diff --git a/src/tex2lyx/Parser.cpp b/src/tex2lyx/Parser.cpp
index cdd5c39921..75b8ba4bdd 100644
--- a/src/tex2lyx/Parser.cpp
+++ b/src/tex2lyx/Parser.cpp
@@ -3,37 +3,34 @@
  * This file is part of LyX, the document processor.
  * Licence details can be found in the file COPYING.
  *
- * \author André Pönitz
+ * \author André Pönitz
  *
  * Full author contact details are available in file CREDITS.
  */
 
 #include <config.h>
 
+#include "Encoding.h"
 #include "Parser.h"
+#include "support/textutils.h"
 
 #include <iostream>
-#include <sstream>
 
+using namespace std;
 
 namespace lyx {
 
-using std::cerr;
-using std::endl;
-using std::fill;
-using std::istream;
-using std::istringstream;
-using std::ostringstream;
-using std::ostream;
-using std::string;
-
-
 namespace {
 
 CatCode theCatcode[256];
 
 void catInit()
 {
+	static bool init_done = false;
+	if (init_done) 
+		return;
+	init_done = true;
+
 	fill(theCatcode, theCatcode + 256, catOther);
 	fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
 	fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
@@ -58,13 +55,12 @@ void catInit()
 	theCatcode[int('@')]  = catLetter;
 }
 
-
 /*!
  * Translate a line ending to '\n'.
  * \p c must have catcode catNewline, and it must be the last character read
  * from \p is.
  */
-char getNewline(istream & is, char c)
+char_type getNewline(idocstream & is, char_type c)
 {
 	// we have to handle 3 different line endings:
 	// - UNIX (\n)
@@ -72,9 +68,10 @@ char getNewline(istream & is, char c)
 	// - DOS  (\r\n)
 	if (c == '\r') {
 		// MAC or DOS
-		if (is.get(c) && c != '\n') {
+		char_type wc;
+		if (is.get(wc) && wc != '\n') {
 			// MAC
-			is.putback(c);
+			is.putback(wc);
 		}
 		return '\n';
 	}
@@ -82,18 +79,14 @@ char getNewline(istream & is, char c)
 	return c;
 }
 
-}
-
-
-//
-// catcodes
-//
-
-CatCode catcode(unsigned char c)
+CatCode catcode(char_type c)
 {
-	return theCatcode[c];
+	if (c < 256)
+		return theCatcode[(unsigned char)c];
+	return catOther;
 }
 
+}
 
 
 //
@@ -109,29 +102,54 @@ ostream & operator<<(ostream & os, Token const & t)
 	else if (t.cat() == catEscape)
 		os << '\\' << t.cs() << ' ';
 	else if (t.cat() == catLetter)
-		os << t.character();
+		os << t.cs();
 	else if (t.cat() == catNewline)
 		os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
 	else
-		os << '[' << t.character() << ',' << t.cat() << ']';
+		os << '[' << t.cs() << ',' << t.cat() << ']';
 	return os;
 }
 
 
-string Token::asString() const
+string Token::asInput() const
 {
-	return cs_.size() ? cs_ : string(1, char_);
+	if (cat_ == catComment)
+		return '%' + cs_ + '\n';
+	if (cat_ == catEscape)
+		return '\\' + cs_;
+	return cs_;
 }
 
 
-string Token::asInput() const
+bool Token::isAlnumASCII() const
 {
-	if (cat_ == catComment)
-		return '%' + cs_ + '\n';
-	if (cat_ == catSpace || cat_ == catNewline)
-		return cs_;
-	return char_ ? string(1, char_) : '\\' + cs_;
+	return cat_ == catLetter ||
+	       (cat_ == catOther && cs_.length() == 1 && isDigitASCII(cs_[0]));
+}
+
+
+#ifdef FILEDEBUG
+void debugToken(std::ostream & os, Token const & t, unsigned int flags)
+{
+	char sep = ' ';
+	os << "t: " << t << " flags: " << flags;
+	if (flags & FLAG_BRACE_LAST) { os << sep << "BRACE_LAST"; sep = '|'; }
+	if (flags & FLAG_RIGHT     ) { os << sep << "RIGHT"     ; sep = '|'; }
+	if (flags & FLAG_END       ) { os << sep << "END"       ; sep = '|'; }
+	if (flags & FLAG_BRACK_LAST) { os << sep << "BRACK_LAST"; sep = '|'; }
+	if (flags & FLAG_TEXTMODE  ) { os << sep << "TEXTMODE"  ; sep = '|'; }
+	if (flags & FLAG_ITEM      ) { os << sep << "ITEM"      ; sep = '|'; }
+	if (flags & FLAG_LEAVE     ) { os << sep << "LEAVE"     ; sep = '|'; }
+	if (flags & FLAG_SIMPLE    ) { os << sep << "SIMPLE"    ; sep = '|'; }
+	if (flags & FLAG_EQUATION  ) { os << sep << "EQUATION"  ; sep = '|'; }
+	if (flags & FLAG_SIMPLE2   ) { os << sep << "SIMPLE2"   ; sep = '|'; }
+	if (flags & FLAG_OPTION    ) { os << sep << "OPTION"    ; sep = '|'; }
+	if (flags & FLAG_BRACED    ) { os << sep << "BRACED"    ; sep = '|'; }
+	if (flags & FLAG_CELL      ) { os << sep << "CELL"      ; sep = '|'; }
+	if (flags & FLAG_TABBING   ) { os << sep << "TABBING"   ; sep = '|'; }
+	os << "\n";
 }
+#endif
 
 
 //
@@ -139,55 +157,85 @@ string Token::asInput() const
 //
 
 
-Parser::Parser(istream & is)
-	: lineno_(0), pos_(0)
+Parser::Parser(idocstream & is)
+	: lineno_(0), pos_(0), iss_(0), is_(is), encoding_latex_("utf8")
 {
-	tokenize(is);
 }
 
 
 Parser::Parser(string const & s)
-	: lineno_(0), pos_(0)
+	: lineno_(0), pos_(0), 
+	  iss_(new idocstringstream(from_utf8(s))), is_(*iss_), 
+	  encoding_latex_("utf8")
 {
-	istringstream is(s);
-	tokenize(is);
 }
 
 
-void Parser::push_back(Token const & t)
+Parser::~Parser()
 {
-	tokens_.push_back(t);
+	delete iss_;
+}
+
+
+void Parser::setEncoding(std::string const & e)
+{
+	Encoding const * enc = encodings.fromLaTeXName(e);
+	if (!enc) {
+		cerr << "Unknown encoding " << e << ". Ignoring." << std::endl;
+		return;
+	}
+	//cerr << "setting encoding to " << enc->iconvName() << std::endl;
+	is_ << lyx::setEncoding(enc->iconvName());
+	encoding_latex_ = e;
 }
 
 
-void Parser::pop_back()
+void Parser::push_back(Token const & t)
 {
-	tokens_.pop_back();
+	tokens_.push_back(t);
 }
 
 
-Token const & Parser::prev_token() const
+// We return a copy here because the tokens_ vector may get reallocated
+Token const Parser::prev_token() const
 {
 	static const Token dummy;
 	return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
 }
 
 
-Token const & Parser::curr_token() const
+// We return a copy here because the tokens_ vector may get reallocated
+Token const Parser::curr_token() const
 {
 	static const Token dummy;
 	return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
 }
 
 
-Token const & Parser::next_token() const
+// We return a copy here because the tokens_ vector may get reallocated
+Token const Parser::next_token()
 {
 	static const Token dummy;
 	return good() ? tokens_[pos_] : dummy;
 }
 
 
-Token const & Parser::get_token()
+// We return a copy here because the tokens_ vector may get reallocated
+Token const Parser::next_next_token()
+{
+	static const Token dummy;
+	// If good() has not been called after the last get_token() we need
+	// to tokenize two more tokens.
+	if (pos_ + 1 >= tokens_.size()) {
+		tokenize_one();
+		tokenize_one();
+	}
+	return pos_ + 1 < tokens_.size() ? tokens_[pos_ + 1] : dummy;
+}
+
+
+// We return a copy here because the tokens_ vector may get reallocated
+Token const Parser::get_token()
 {
 	static const Token dummy;
 	//cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
@@ -195,7 +243,7 @@ Token const & Parser::get_token()
 }
 
 
-bool Parser::isParagraph() const
+bool Parser::isParagraph()
 {
 	// A new paragraph in TeX ist started
 	// - either by a newline, following any amount of whitespace
@@ -204,8 +252,7 @@ bool Parser::isParagraph() const
 	if (curr_token().cat() == catNewline &&
 	    (curr_token().cs().size() > 1 ||
 	     (next_token().cat() == catSpace &&
-	      pos_ < tokens_.size() - 1 &&
-	      tokens_[pos_ + 1].cat() == catNewline)))
+	      next_next_token().cat() == catNewline)))
 		return true;
 	if (curr_token().cat() == catEscape && curr_token().cs() == "par")
 		return true;
@@ -213,28 +260,37 @@ bool Parser::isParagraph() const
 }
 
 
-void Parser::skip_spaces(bool skip_comments)
+bool Parser::skip_spaces(bool skip_comments)
 {
 	// We just silently return if we have no more tokens.
 	// skip_spaces() should be callable at any time,
 	// the caller must check p::good() anyway.
+	bool skipped = false;
 	while (good()) {
 		get_token();
 		if (isParagraph()) {
 			putback();
 			break;
 		}
-		if ( curr_token().cat() == catSpace ||
-		     curr_token().cat() == catNewline ||
-		    (curr_token().cat() == catComment && curr_token().cs().empty()))
+		if (curr_token().cat() == catSpace ||
+		    curr_token().cat() == catNewline) {
+			skipped = true;
+			continue;
+		}
+		if ((curr_token().cat() == catComment && curr_token().cs().empty()))
 			continue;
-		if (skip_comments && curr_token().cat() == catComment)
-			cerr << "  Ignoring comment: " << curr_token().asInput();
-		else {
+		if (skip_comments && curr_token().cat() == catComment) {
+			// If positions_ is not empty we are doing some kind
+			// of look ahead
+			if (!positions_.empty())
+				cerr << "  Ignoring comment: "
+				     << curr_token().asInput();
+		} else {
 			putback();
 			break;
 		}
 	}
+	return skipped;
 }
 
 
@@ -246,7 +302,11 @@ void Parser::unskip_spaces(bool skip_comments)
 			putback();
 		else if (skip_comments && curr_token().cat() == catComment) {
 			// TODO: Get rid of this
-			cerr << "Unignoring comment: " << curr_token().asInput();
+			// If positions_ is not empty we are doing some kind
+			// of look ahead
+			if (!positions_.empty())
+				cerr << "Unignoring comment: "
+				     << curr_token().asInput();
 			putback();
 		}
 		else
@@ -261,8 +321,24 @@ void Parser::putback()
 }
 
 
-bool Parser::good() const
+void Parser::pushPosition()
 {
+	positions_.push_back(pos_);
+}
+
+
+void Parser::popPosition()
+{
+	pos_ = positions_.back();
+	positions_.pop_back();
+}
+
+
+bool Parser::good()
+{
+	if (pos_ < tokens_.size())
+		return true;
+	tokenize_one();
 	return pos_ < tokens_.size();
 }
 
@@ -271,7 +347,39 @@ char Parser::getChar()
 {
 	if (!good())
 		error("The input stream is not well...");
-	return tokens_[pos_++].character();
+	return get_token().character();
+}
+
+
+bool Parser::hasOpt()
+{
+	// An optional argument can occur in any of the following forms:
+	// - \foo[bar]
+	// - \foo [bar]
+	// - \foo
+	//   [bar]
+	// - \foo %comment
+	//   [bar]
+
+	// remember current position
+	unsigned int oldpos = pos_;
+	// skip spaces and comments
+	while (good()) {
+		get_token();
+		if (isParagraph()) {
+			putback();
+			break;
+		}
+		if (curr_token().cat() == catSpace ||
+		    curr_token().cat() == catNewline ||
+		    curr_token().cat() == catComment)
+			continue;
+		putback();
+		break;
+	}
+	bool const retval = (next_token().asInput() == "[");
+	pos_ = oldpos;
+	return retval;
 }
 
 
@@ -282,15 +390,22 @@ Parser::Arg Parser::getFullArg(char left, char right)
 	// This is needed if a partial file ends with a command without arguments,
 	// e. g. \medskip
 	if (! good())
-		return std::make_pair(false, string());
+		return make_pair(false, string());
 
 	string result;
 	char c = getChar();
 
 	if (c != left) {
 		putback();
-		return std::make_pair(false, string());
-	} else
+		return make_pair(false, string());
+	} else {
+		// a single '\' is only allowed within \verb, no matter what the delimiter is,
+		// for example "\verb+\+" (reported as bug #4468)
+		// To support this, we allow single '\' if it is the only character
+		// within equal delimiters
+		if (next_token().cat() == catEscape)
+			if (next_token().character() == right && right == left)
+				result += '\\';
 		while ((c = getChar()) != right && good()) {
 			// Ignore comments
 			if (curr_token().cat() == catComment) {
@@ -300,8 +415,8 @@ Parser::Arg Parser::getFullArg(char left, char right)
 			else
 				result += curr_token().asInput();
 		}
-
-	return std::make_pair(true, result);
+	}
+	return make_pair(true, result);
 }
 
 
@@ -311,19 +426,35 @@ string Parser::getArg(char left, char right)
 }
 
 
-string Parser::getFullOpt()
+string Parser::getFullOpt(bool keepws)
 {
 	Arg arg = getFullArg('[', ']');
 	if (arg.first)
 		return '[' + arg.second + ']';
-	return arg.second;
+	if (keepws)
+		unskip_spaces(true);
+	return string();
 }
 
 
-string Parser::getOpt()
+string Parser::getOpt(bool keepws)
 {
 	string const res = getArg('[', ']');
-	return res.empty() ? string() : '[' + res + ']';
+	if (res.empty()) {
+		if (keepws)
+			unskip_spaces(true);
+		return string();
+	}
+	return '[' + res + ']';
+}
+
+
+string Parser::getFullParentheseArg()
+{
+	Arg arg = getFullArg('(', ')');
+	if (arg.first)
+		return '(' + arg.second + ')';
+	return string();
 }
 
 
@@ -357,86 +488,123 @@ string const Parser::verbatimEnvironment(string const & name)
 }
 
 
-void Parser::tokenize(istream & is)
+string const Parser::plainEnvironment(string const & name)
 {
-	static bool init_done = false;
+	if (!good())
+		return string();
 
-	if (!init_done) {
-		catInit();
-		init_done = true;
+	ostringstream os;
+	for (Token t = get_token(); good(); t = get_token()) {
+		if (t.asInput() == "\\end") {
+			string const end = getArg('{', '}');
+			if (end == name)
+				return os.str();
+			else
+				os << "\\end{" << end << '}';
+		} else
+			os << t.asInput();
 	}
+	cerr << "unexpected end of input" << endl;
+	return os.str();
+}
 
-	char c;
-	while (is.get(c)) {
-		//cerr << "reading c: " << c << "\n";
 
-		switch (catcode(c)) {
-			case catSpace: {
-				string s(1, c);
-				while (is.get(c) && catcode(c) == catSpace)
-					s += c;
-				if (catcode(c) != catSpace)
-					is.putback(c);
-				push_back(Token(s, catSpace));
-				break;
-			}
-
-			case catNewline: {
-				++lineno_;
-				string s(1, getNewline(is, c));
-				while (is.get(c) && catcode(c) == catNewline) {
-					++lineno_;
-					s += getNewline(is, c);
-				}
-				if (catcode(c) != catNewline)
-					is.putback(c);
-				push_back(Token(s, catNewline));
-				break;
-			}
+string const Parser::plainCommand(char left, char right, string const & name)
+{
+	if (!good())
+		return string();
+	// check if first token is really the start character
+	Token tok = get_token();
+	if (tok.character() != left) {
+		cerr << "first character does not match start character of command \\" << name << endl;
+		return string();
+	}
+	ostringstream os;
+	for (Token t = get_token(); good(); t = get_token()) {
+		if (t.character() == right) {
+			return os.str();
+		} else
+			os << t.asInput();
+	}
+	cerr << "unexpected end of input" << endl;
+	return os.str();
+}
 
-			case catComment: {
-				// We don't treat "%\n" combinations here specially because
-				// we want to preserve them in the preamble
-				string s;
-				while (is.get(c) && catcode(c) != catNewline)
-					s += c;
-				// handle possible DOS line ending
-				if (catcode(c) == catNewline)
-					c = getNewline(is, c);
-				// Note: The '%' at the beginning and the '\n' at the end
-				// of the comment are not stored.
-				++lineno_;
-				push_back(Token(s, catComment));
-				break;
-			}
 
-			case catEscape: {
-				is.get(c);
-				if (!is) {
-					error("unexpected end of input");
-				} else {
-					string s(1, c);
-					if (catcode(c) == catLetter) {
-						// collect letters
-						while (is.get(c) && catcode(c) == catLetter)
-							s += c;
-						if (catcode(c) != catLetter)
-							is.putback(c);
-					}
-					push_back(Token(s, catEscape));
-				}
-				break;
-			}
+void Parser::tokenize_one()
+{
+	catInit();
+	char_type c;
+	if (!is_.get(c)) 
+		return;
 
-			case catIgnore: {
-				cerr << "ignoring a char: " << int(c) << "\n";
-				break;
+	switch (catcode(c)) {
+	case catSpace: {
+		docstring s(1, c);
+		while (is_.get(c) && catcode(c) == catSpace)
+			s += c;
+		if (catcode(c) != catSpace)
+			is_.putback(c);
+		push_back(Token(s, catSpace));
+		break;
+	}
+		
+	case catNewline: {
+		++lineno_;
+		docstring s(1, getNewline(is_, c));
+		while (is_.get(c) && catcode(c) == catNewline) {
+			++lineno_;
+			s += getNewline(is_, c);
+		}
+		if (catcode(c) != catNewline)
+			is_.putback(c);
+		push_back(Token(s, catNewline));
+		break;
+	}
+		
+	case catComment: {
+		// We don't treat "%\n" combinations here specially because
+		// we want to preserve them in the preamble
+		docstring s;
+		while (is_.get(c) && catcode(c) != catNewline)
+			s += c;
+		// handle possible DOS line ending
+		if (catcode(c) == catNewline)
+			c = getNewline(is_, c);
+		// Note: The '%' at the beginning and the '\n' at the end
+		// of the comment are not stored.
+		++lineno_;
+		push_back(Token(s, catComment));
+		break;
+	}
+		
+	case catEscape: {
+		is_.get(c);
+		if (!is_) {
+			error("unexpected end of input");
+		} else {
+			docstring s(1, c);
+			if (catcode(c) == catLetter) {
+				// collect letters
+				while (is_.get(c) && catcode(c) == catLetter)
+					s += c;
+				if (catcode(c) != catLetter)
+					is_.putback(c);
 			}
-
-			default:
-				push_back(Token(c, catcode(c)));
+			push_back(Token(s, catEscape));
 		}
+		break;
+	}
+		
+	case catIgnore: {
+		cerr << "ignoring a char: " << c << "\n";
+		break;
+	}
+		
+	default:
+		push_back(Token(docstring(1, c), catcode(c)));
 	}
+	//cerr << tokens_.back();
 }
 
 
@@ -465,12 +633,12 @@ string Parser::verbatimOption()
 	string res;
 	if (next_token().character() == '[') {
 		Token t = get_token();
-		for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
+		for (t = get_token(); t.character() != ']' && good(); t = get_token()) {
 			if (t.cat() == catBegin) {
 				putback();
 				res += '{' + verbatim_item() + '}';
 			} else
-				res += t.asString();
+				res += t.cs();
 		}
 	}
 	return res;