X-Git-Url: https://git.lyx.org/gitweb/?a=blobdiff_plain;f=src%2FLexer.cpp;h=098faa9737282c3fc40458ec67665b26b8b881ef;hb=4ea6d81437ec7b5bea165d78ca37cb163e262042;hp=5136c672073d6dc451360a22f6b088915c1a66f3;hpb=e7645f24391648fb08cf6b6d808fcabec76be744;p=lyx.git

diff --git a/src/Lexer.cpp b/src/Lexer.cpp
index 5136c67207..098faa9737 100644
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -4,7 +4,7 @@
  * Licence details can be found in the file COPYING.
  *
  * \author Alejandro Aguilar Sierra
- * \author Lars Gullik Bjønnes
+ * \author Lars Gullik Bjønnes
  * \author Jean-Marc Lasgouttes
  * \author John Levon
  *
@@ -14,12 +14,14 @@
 #include <config.h>
 
 #include "Lexer.h"
+#include "Format.h"
 
 #include "support/convert.h"
 #include "support/debug.h"
 #include "support/FileName.h"
 #include "support/filetools.h"
 #include "support/gzstream.h"
+#include "support/lassert.h"
 #include "support/lstrings.h"
 #include "support/lyxalgo.h"
 #include "support/types.h"
@@ -146,7 +148,7 @@ public:
 	}
 };
 
-} // end of anon namespace
+} // namespace
 
 
 Lexer::Pimpl::Pimpl(LexerKeyword * tab, int num)
@@ -173,7 +175,9 @@ void Lexer::Pimpl::printError(string const & message) const
 {
 	string const tmpmsg = subst(message, "$$Token", getString());
 	lyxerr << "LyX: " << tmpmsg << " [around line " << lineno
-		<< " of file " << to_utf8(makeDisplayPath(name)) << ']' << endl;
+		<< " of file " << to_utf8(makeDisplayPath(name))
+		<< " current token: '" << getString() << "'"
+		<< " context: '" << context << "']" << endl;
 }
 
 
@@ -235,9 +239,7 @@ void Lexer::Pimpl::popTable()
 bool Lexer::Pimpl::setFile(FileName const & filename)
 {
 	// Check the format of the file.
-	string const format = filename.guessFormatFromContents();
-
-	if (format == "gzip" || format == "zip" || format == "compress") {
+	if (theFormats().isZippedFile(filename)) {
 		LYXERR(Debug::LYXLEX, "lyxlex: compressed");
 		// The check only outputs a debug message, because it triggers
 		// a bug in compaq cxx 6.2, where is_open() returns 'true' for
@@ -247,9 +249,10 @@ bool Lexer::Pimpl::setFile(FileName const & filename)
 				"file or stream already set.");
 		gz_.open(filename.toFilesystemEncoding().c_str(), ios::in);
 		is.rdbuf(&gz_);
-		name = filename.absFilename();
+		name = filename.absFileName();
 		lineno = 0;
-		return gz_.is_open() && is.good();
+		if (!gz_.is_open() || !is.good())
+			return false;
 	} else {
 		LYXERR(Debug::LYXLEX, "lyxlex: UNcompressed");
 
@@ -260,12 +263,25 @@ bool Lexer::Pimpl::setFile(FileName const & filename)
 			LYXERR(Debug::LYXLEX, "Error in Lexer::setFile: "
 				"file or stream already set.");
 		}
-		fb_.open(filename.toFilesystemEncoding().c_str(), ios::in);
+		fb_.open(filename.toSafeFilesystemEncoding().c_str(), ios::in);
 		is.rdbuf(&fb_);
-		name = filename.absFilename();
+		name = filename.absFileName();
 		lineno = 0;
-		return fb_.is_open() && is.good();
+		if (!fb_.is_open() || !is.good())
+			return false;
+	}
+
+	// Skip byte order mark.
+	if (is.peek() == 0xef) {
+		is.get();
+		if (is.peek() == 0xbb) {
+			is.get();
+			LASSERT(is.get() == 0xbf, /**/);
+		} else
+			is.unget();
 	}
+
+	return true;
 }
 
 
@@ -304,18 +320,12 @@ bool Lexer::Pimpl::next(bool esc /* = false */)
 	}
 
 
-	unsigned char c = 0; // getc() returns an int
 	char cc = 0;
 	status = 0;
 	while (is && !status) {
 		is.get(cc);
-		c = cc;
+		unsigned char c = cc;
 
-		// skip ','s
-		if (esc && c == ',')
-			continue;
-
-		
 		if (c == commentChar) {
 			// Read rest of line (fast :-)
 #if 1
@@ -337,9 +347,8 @@ bool Lexer::Pimpl::next(bool esc /* = false */)
 
 			if (esc) {
 
-				bool escaped = false;
 				do {
-					escaped = false;
+					bool escaped = false;
 					is.get(cc);
 					c = cc;
 					if (c == '\r') continue;
@@ -380,13 +389,13 @@ bool Lexer::Pimpl::next(bool esc /* = false */)
 			break;
 		}
 
-		if (!esc && c == ',')
+		if (c == ',')
 			continue;              /* Skip ','s */
 
-			// using relational operators with chars other
-			// than == and != is not safe. And if it is done
-			// the type _have_ to be unsigned. It usually a
-			// lot better to use the functions from cctype
+		// using relational operators with chars other
+		// than == and != is not safe. And if it is done
+		// the type _have_ to be unsigned. It usually a
+		// lot better to use the functions from cctype
 		if (c > ' ' && is)  {
 			buff.clear();
 
@@ -404,7 +413,7 @@ bool Lexer::Pimpl::next(bool esc /* = false */)
 			status = LEX_TOKEN;
 		}
 
-		if (!esc && c == '\r' && is) {
+		if (c == '\r' && is) {
 			// The Windows support has lead to the
 			// possibility of "\r\n" at the end of
 			// a line.  This will stop LyX choking
@@ -461,7 +470,7 @@ bool Lexer::Pimpl::eatLine()
 		is.get(cc);
 		c = cc;
 		//LYXERR(Debug::LYXLEX, "Lexer::EatLine read char: `" << c << '\'');
-		if (c != '\r')
+		if (c != '\r' && is)
 			buff.push_back(c);
 	}
 
@@ -502,7 +511,7 @@ bool Lexer::Pimpl::nextToken()
 		char cc = 0;
 		is.get(cc);
 		c = cc;
-		if (c >= ' ' && is) {
+		if ((c >= ' ' || c == '\t') && is) {
 			buff.clear();
 
 			if (c == '\\') { // first char == '\\'
@@ -516,7 +525,7 @@ bool Lexer::Pimpl::nextToken()
 					buff.push_back(c);
 					is.get(cc);
 					c = cc;
-				} while (c >= ' ' && c != '\\' && is);
+				} while ((c >= ' ' || c == '\t') && c != '\\' && is);
 			}
 
 			if (c == '\\')
@@ -558,7 +567,7 @@ void Lexer::Pimpl::pushToken(string const & pt)
 //////////////////////////////////////////////////////////////////////
 
 Lexer::Lexer()
-	: pimpl_(new Pimpl(0, 0))
+	: pimpl_(new Pimpl(0, 0)), lastReadOk_(false)
 {}
 
 
@@ -639,6 +648,7 @@ void Lexer::setCommentChar(char c)
 	pimpl_->setCommentChar(c);
 }
 
+
 int Lexer::lex()
 {
 	return pimpl_->lex();
@@ -683,23 +693,23 @@ double Lexer::getFloat() const
 }
 
 
-string const Lexer::getString() const
+string const Lexer::getString(bool trim) const
 {
 	lastReadOk_ = pimpl_->status == LEX_DATA || pimpl_->status == LEX_TOKEN;
 
 	if (lastReadOk_)
-	return pimpl_->getString();
+		return trim ? support::trim(pimpl_->getString(), "\t ") : pimpl_->getString();
 
 	return string();
 }
 
 
-docstring const Lexer::getDocString() const
+docstring const Lexer::getDocString(bool trim) const
 {
 	lastReadOk_ = pimpl_->status == LEX_DATA || pimpl_->status == LEX_TOKEN;
 
 	if (lastReadOk_)
-		return pimpl_->getDocString();
+		return trim ? support::trim(pimpl_->getDocString(), "\t ") : pimpl_->getDocString();
 
 	return docstring();
 }
@@ -708,28 +718,27 @@ docstring const Lexer::getDocString() const
 // I would prefer to give a tag number instead of an explicit token
 // here, but it is not possible because Buffer::readDocument uses
 // explicit tokens (JMarc)
-string const Lexer::getLongString(string const & endtoken)
+docstring Lexer::getLongString(docstring const & endtoken)
 {
-	string str;
-	string prefix;
+	docstring str;
+	docstring prefix;
 	bool firstline = true;
 
 	while (pimpl_->is) { //< eatLine only reads from is, not from pushTok
 		if (!eatLine())
 			// blank line in the file being read
 			continue;
+		docstring tmpstr = getDocString();
+		docstring const token = trim(tmpstr, " \t");
 
-		string const token = trim(getString(), " \t");
-
-		LYXERR(Debug::PARSER, "LongString: `" << getString() << '\'');
+		LYXERR(Debug::PARSER, "LongString: `" << tmpstr << '\'');
 
 		// We do a case independent comparison, like searchKeyword does.
-		if (compare_ascii_no_case(token, endtoken) == 0)
+		if (compare_no_case(token, endtoken) == 0)
 			break;
 
-		string tmpstr = getString();
 		if (firstline) {
-			size_t i = tmpstr.find_first_not_of(' ');
+			size_t i = tmpstr.find_first_not_of(from_ascii(" \t"));
 			if (i != string::npos)
 				prefix = tmpstr.substr(0, i);
 			firstline = false;
@@ -738,14 +747,14 @@ string const Lexer::getLongString(string const & endtoken)
 
 		// further lines in long strings may have the same
 		// whitespace prefix as the first line. Remove it.
-		if (prefix.length() && prefixIs(tmpstr, prefix))
-			tmpstr.erase(0, prefix.length() - 1);
+		if (!prefix.empty() && prefixIs(tmpstr, prefix))
+			tmpstr.erase(0, prefix.length());
 
-		str += ltrim(tmpstr, "\t") + '\n';
+		str += tmpstr + '\n';
 	}
 
 	if (!pimpl_->is)
-		printError("Long string not ended by `" + endtoken + '\'');
+		printError("Long string not ended by `" + to_utf8(endtoken) + '\'');
 
 	return str;
 }
@@ -753,7 +762,7 @@ string const Lexer::getLongString(string const & endtoken)
 
 bool Lexer::getBool() const
 {
-	string const s = pimpl_->getString();	
+	string const s = pimpl_->getString();
 	if (s == "false" || s == "0") {
 		lastReadOk_ = true;
 		return false;
@@ -765,6 +774,7 @@ bool Lexer::getBool() const
 	pimpl_->printError("Bad boolean `$$Token'. "
 				 "Use \"false\" or \"true\"");
 	lastReadOk_ = false;
+	return false;
 }
 
 
@@ -902,12 +912,24 @@ string Lexer::quoteString(string const & arg)
 }
 
 
+// same for docstring
+docstring Lexer::quoteString(docstring const & arg)
+{
+	docstring res;
+	res += '"';
+	res += subst(subst(arg, from_ascii("\\"), from_ascii("\\\\")),
+		     from_ascii("\""), from_ascii("\\\""));
+	res += '"';
+	return res;
+}
+
+
 Lexer & Lexer::operator>>(char const * required)
 {
 	string token;
 	*this >> token;
 	if (token != required) {
-		LYXERR0("Missing '" << required << "'-tag in " << pimpl_->context 
+		LYXERR0("Missing '" << required << "'-tag in " << pimpl_->context
 			<< ". Got " << token << " instead. Line: " << lineNumber());
 		pushToken(token);
 	}