From 0dc7f34e00e06e8b378236e32f86023313067b7c Mon Sep 17 00:00:00 2001
From: Thibaut Cuvelier <tcuvelier@lyx.org>
Date: Sat, 19 Feb 2022 02:10:45 +0100
Subject: [PATCH] unicodesymbols: parse supplementary lines that give
 alternative ways to encode symbols in raw LaTeX.

---
 lib/unicodesymbols      |   2 +
 src/Encoding.cpp        | 126 +++++++++++++++++++++++-----------------
 src/Encoding.h          |   5 ++
 src/insets/InsetERT.cpp |   2 +
 4 files changed, 81 insertions(+), 54 deletions(-)

diff --git a/lib/unicodesymbols b/lib/unicodesymbols
index 043f4dcc43..1de6910f36 100644
--- a/lib/unicodesymbols
+++ b/lib/unicodesymbols
@@ -60,6 +60,7 @@
 #
 0x00a0 "~"                        "" "force=cp862;cp1255;cp1256;koi8-u;iso8859-6;iso8859-7;utf8-platex,notermination=both" "~" "" # NO-BREAK SPACE
 0x00a1 "\\textexclamdown"         "" "force=cp862;cp1255;euc-jp;euc-jp-platex;euc-kr;utf8-platex" # INVERTED EXCLAMATION MARK
+0x00a1 "!`" ""
 0x00a2 "\\textcent"               "textcomp" "force=cp862;cp1255;cp1256;euc-jp;euc-jp-platex;jis;shift-jis-platex" #"\\mathcent" "txfonts|pxfonts" # CENT SIGN
 0x00a3 "\\pounds"                 "" "force=cp862;cp1255;cp1256;iso8859-7;euc-jp;euc-jp-platex;jis;shift-jis-platex" "\\pounds" "" # Â£ POUND SIGN
 0x00a4 "\\textcurrency"           "textcomp" "force=cp1256;euc-cn;euc-jp;euc-jp-platex;euc-kr;gbk;iso8859-6;utf8-platex" # CURRENCY SYMBOL
@@ -90,6 +91,7 @@
 0x00bd "\\textonehalf"            "textcomp" "force=cp862;cp1255;cp1256;iso8859-7;euc-kr" "\\sfrac{1}{2}" "xfrac" # 1/2 FRACTION
 0x00be "\\textthreequarters"      "textcomp" "force=cp1255;cp1256;euc-kr" "\\sfrac{3}{4}" "xfrac" # 3/4 FRACTION
 0x00bf "\\textquestiondown"       "" "force=cp862;cp1255;euc-jp;euc-kr" # INVERTED QUESTION MARK
+0x00bf "?`" ""
 0x00c0 "\\`{A}"                   "" "mathalpha,force=euc-jp" "\\grave{A}" # LATIN CAPITAL LETTER A WITH GRAVE
 0x00c1 "\\'{A}"                   "" "mathalpha,force=euc-jp" "\\acute{A}" # LATIN CAPITAL LETTER A WITH ACUTE
 0x00c2 "\\^{A}"                   "" "mathalpha,force=euc-jp" "\\hat{A}" # LATIN CAPITAL LETTER A WITH CIRCUMFLEX
diff --git a/src/Encoding.cpp b/src/Encoding.cpp
index b0d56495cd..4e47daef8b 100644
--- a/src/Encoding.cpp
+++ b/src/Encoding.cpp
@@ -50,7 +50,7 @@ CharInfoMap unicodesymbols;
 typedef set<char_type> CharSet;
 typedef map<string, CharSet> CharSetMap;
 CharSet forced;
-CharSetMap forcedselected;
+CharSetMap forcedSelected;
 
 typedef set<char_type> MathAlphaSet;
 MathAlphaSet mathalpha;
@@ -99,7 +99,7 @@ CharInfo::CharInfo(
 Encoding::Encoding(string const & n, string const & l, string const & g,
 		   string const & i, bool f, bool u, Encoding::Package p)
 	: name_(n), latexName_(l), guiName_(g), iconvName_(i), fixedwidth_(f),
-	  unsafe_(u), forced_(&forcedselected[n]), package_(p)
+	  unsafe_(u), forced_(&forcedSelected[n]), package_(p)
 {
 	if (n == "ascii") {
 		// ASCII can encode 128 code points and nothing else
@@ -705,20 +705,20 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
 {
 	// We must read the symbolsfile first, because the Encoding
 	// constructor depends on it.
-	CharSetMap forcednotselected;
-	Lexer symbolslex;
-	symbolslex.setFile(symbolsfile);
+	CharSetMap forcedNotSelected;
+	Lexer symbolsLex;
+	symbolsLex.setFile(symbolsfile);
 	bool getNextToken = true;
-	while (symbolslex.isOK()) {
+	while (symbolsLex.isOK()) {
 		char_type symbol;
 
 		if (getNextToken) {
-			if (!symbolslex.next(true))
+			if (!symbolsLex.next(true))
 				break;
 		} else
 			getNextToken = true;
 
-		istringstream is(symbolslex.getString());
+		istringstream is(symbolsLex.getString());
 		// reading symbol directly does not work if
 		// char_type == wchar_t.
 		uint32_t tmp;
@@ -726,20 +726,38 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
 			break;
 		symbol = tmp;
 
-		if (!symbolslex.next(true))
+		// Supplementary entry: the symbol is already in the table, so this line only
+		// supplies alternative LaTeX commands (a text command, then a math command).
+		auto const knownSymbol = unicodesymbols.find(symbol);
+		if (knownSymbol != unicodesymbols.end()) {
+			if (!symbolsLex.next(true))
+				break;
+			docstring const textCommand = symbolsLex.getDocString();
+			if (!symbolsLex.next(true))
+				break;
+			docstring const mathCommand = symbolsLex.getDocString();
+			if (!textCommand.empty())
+				knownSymbol->second.addTextCommand(textCommand);
+			if (!mathCommand.empty())
+				knownSymbol->second.addMathCommand(mathCommand);
+			continue;
+		}
+
+		// The symbol is not in the table yet: parse the line as a complete new entry.
+		if (!symbolsLex.next(true))
 			break;
-		docstring textcommand = symbolslex.getDocString();
-		if (!symbolslex.next(true))
+		docstring textCommand = symbolsLex.getDocString();
+		if (!symbolsLex.next(true))
 			break;
-		string textpreamble = symbolslex.getString();
-		if (!symbolslex.next(true))
+		string textPreamble = symbolsLex.getString();
+		if (!symbolsLex.next(true))
 			break;
-		string sflags = symbolslex.getString();
+		string sflags = symbolsLex.getString();
 
-		string tipashortcut;
+		string tipaShortcut;
 		int flags = 0;
 
-		if (suffixIs(textcommand, '}'))
+		if (suffixIs(textCommand, '}'))
 			flags |= CharInfoTextNoTermination;
 		while (!sflags.empty()) {
 			string flag;
@@ -753,13 +771,13 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
 				vector<string> encs =
 					getVectorFromString(flag.substr(6), ";");
 				for (auto const & enc : encs)
-					forcedselected[enc].insert(symbol);
+					forcedSelected[enc].insert(symbol);
 				flags |= CharInfoForceSelected;
 			} else if (prefixIs(flag, "force!=")) {
 				vector<string> encs =
 					getVectorFromString(flag.substr(7), ";");
 				for (auto const & enc : encs)
-					forcednotselected[enc].insert(symbol);
+					forcedNotSelected[enc].insert(symbol);
 				flags |= CharInfoForceSelected;
 			} else if (flag == "mathalpha") {
 				mathalpha.insert(symbol);
@@ -773,8 +791,8 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
 			} else if (flag == "notermination=none") {
 				flags &= ~CharInfoTextNoTermination;
 				flags &= ~CharInfoMathNoTermination;
-			} else if (contains(flag, "tipaShortcut=")) {
-				tipashortcut = split(flag, '=');
+			} else if (contains(flag, "tipashortcut=")) {
+				tipaShortcut = split(flag, '=');
 			} else if (flag == "deprecated") {
 				flags |= CharInfoDeprecated;
 			} else {
@@ -786,25 +804,25 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
 		}
 		// mathCommand and mathPreamble have been added for 1.6.0.
 		// make them optional so that old files still work.
-		int const lineno = symbolslex.lineNumber();
+		int const lineNo = symbolsLex.lineNumber();
 		bool breakout = false;
-		docstring mathcommand;
-		string mathpreamble;
-		if (symbolslex.next(true)) {
-			if (symbolslex.lineNumber() != lineno) {
+		docstring mathCommand;
+		string mathPreamble;
+		if (symbolsLex.next(true)) {
+			if (symbolsLex.lineNumber() != lineNo) {
 				// line in old format without mathCommand and mathPreamble
 				getNextToken = false;
 			} else {
-				mathcommand = symbolslex.getDocString();
-				if (suffixIs(mathcommand, '}'))
+				mathCommand = symbolsLex.getDocString();
+				if (suffixIs(mathCommand, '}'))
 					flags |= CharInfoMathNoTermination;
-				if (symbolslex.next(true)) {
-					if (symbolslex.lineNumber() != lineno) {
+				if (symbolsLex.next(true)) {
+					if (symbolsLex.lineNumber() != lineNo) {
 						// line in new format with mathCommand only
 						getNextToken = false;
 					} else {
 						// line in new format with mathCommand and mathPreamble
-						mathpreamble = symbolslex.getString();
+						mathPreamble = symbolsLex.getString();
 					}
 				} else
 					breakout = true;
@@ -814,20 +832,20 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
 		}
 
 		// backward compatibility
-		if (mathpreamble == "esintoramsmath")
-			mathpreamble = "esint|amsmath";
+		if (mathPreamble == "esintoramsmath")
+			mathPreamble = "esint|amsmath";
 
-		if (!textpreamble.empty())
-			if (textpreamble[0] != '\\')
+		if (!textPreamble.empty())
+			if (textPreamble[0] != '\\')
 				flags |= CharInfoTextFeature;
-		if (!mathpreamble.empty())
-			if (mathpreamble[0] != '\\')
+		if (!mathPreamble.empty())
+			if (mathPreamble[0] != '\\')
 				flags |= CharInfoMathFeature;
 
 		CharInfo info = CharInfo(
-			textcommand, mathcommand,
-			textpreamble, mathpreamble,
-			tipashortcut, flags);
+				textCommand, mathCommand,
+				textPreamble, mathPreamble,
+				tipaShortcut, flags);
 		LYXERR(Debug::INFO, "Read unicode symbol " << symbol << " '"
 		                                           << to_utf8(info.textCommand()) << "' '" << info.textPreamble()
 		                                           << " '" << info.textFeature() << ' ' << info.textNoTermination()
@@ -851,12 +869,12 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
 		et_end
 	};
 
-	LexerKeyword encodingtags[] = {
+	LexerKeyword encodingTags[] = {
 		{ "encoding", et_encoding },
 		{ "end", et_end }
 	};
 
-	Lexer lex(encodingtags);
+	Lexer lex(encodingTags);
 	lex.setFile(encfile);
 	lex.setContext("Encodings::read");
 	while (lex.isOK()) {
@@ -866,21 +884,21 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
 			lex.next();
 			string const name = lex.getString();
 			lex.next();
-			string const latexname = lex.getString();
+			string const latexName = lex.getString();
 			lex.next();
-			string const guiname = lex.getString();
+			string const guiName = lex.getString();
 			lex.next();
-			string const iconvname = lex.getString();
+			string const iconvName = lex.getString();
 			lex.next();
 			string const width = lex.getString();
-			bool fixedwidth = false;
+			bool fixedWidth = false;
 			bool unsafe = false;
 			if (width == "fixed")
-				fixedwidth = true;
+				fixedWidth = true;
 			else if (width == "variable")
-				fixedwidth = false;
+				fixedWidth = false;
 			else if (width == "variableunsafe") {
-				fixedwidth = false;
+				fixedWidth = false;
 				unsafe = true;
 			}
 			else
@@ -901,9 +919,9 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
 				lex.printError("Unknown package");
 
 			LYXERR(Debug::INFO, "Reading encoding " << name);
-			encodinglist[name] = Encoding(name, latexname,
-				guiname, iconvname, fixedwidth, unsafe,
-				package);
+			encodinglist[name] = Encoding(name, latexName,
+			                              guiName, iconvName, fixedWidth, unsafe,
+			                              package);
 
 			if (lex.lex() != et_end)
 				lex.printError("Missing end");
@@ -920,9 +938,9 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile)
 		}
 	}
 
-	// Move all information from forcednotselected to forcedselected
-	for (CharSetMap::const_iterator it1 = forcednotselected.begin(); it1 != forcednotselected.end(); ++it1) {
-		for (CharSetMap::iterator it2 = forcedselected.begin(); it2 != forcedselected.end(); ++it2) {
+	// Move all information from forcedNotSelected to forcedSelected
+	for (CharSetMap::const_iterator it1 = forcedNotSelected.begin(); it1 != forcedNotSelected.end(); ++it1) {
+		for (CharSetMap::iterator it2 = forcedSelected.begin(); it2 != forcedSelected.end(); ++it2) {
 			if (it2->first != it1->first)
 				it2->second.insert(it1->second.begin(), it1->second.end());
 		}
diff --git a/src/Encoding.h b/src/Encoding.h
index 2bfbda051d..6016f05f02 100644
--- a/src/Encoding.h
+++ b/src/Encoding.h
@@ -69,6 +69,11 @@ public:
 		std::vector<docstring> const & text_commands, std::vector<docstring> const & math_commands,
 		std::string const & text_preamble, std::string const & math_preamble,
 		std::string const & tipa_shortcut, unsigned int flags);
+	/// Append \p newTextCommand to the text-mode LaTeX commands of this symbol.
+	void addTextCommand(docstring const & newTextCommand) { text_commands_.push_back(newTextCommand); }
+	/// Append \p newMathCommand to the math-mode LaTeX commands of this symbol.
+	void addMathCommand(docstring const & newMathCommand) { math_commands_.push_back(newMathCommand); }
+
 	// we assume that at least one command is nonempty when using unicodesymbols
 	bool isUnicodeSymbol() const { return !text_commands_.empty() || !math_commands_.empty(); }
 	/// LaTeX command (text mode) for this character
diff --git a/src/insets/InsetERT.cpp b/src/insets/InsetERT.cpp
index 00f497f031..b3e73ffded 100644
--- a/src/insets/InsetERT.cpp
+++ b/src/insets/InsetERT.cpp
@@ -158,6 +158,8 @@ void InsetERT::docbook(XMLStream & xs, OutputParams const & runparams) const
 			os_trimmed.insert(4, from_ascii("}"));
 		}
 
+		std::cout << to_utf8(os_trimmed) << std::endl; // NOTE(review): looks like leftover debug output printed to stdout on every pass through this DocBook code path — confirm and drop before merging
+
 		// Look into the global table of Unicode characters if there is a match.
 		bool termination;
 		docstring rem;
-- 
2.39.5