From f2d2b7cc7dc88d2ba018ac3db5f3358c510020e3 Mon Sep 17 00:00:00 2001 From: Georg Baum Date: Sun, 24 Feb 2013 15:44:16 +0100 Subject: [PATCH] Fix some tex2lyx accent bugs This fixes bug #8554 and some recently introduced bugs: - Encodings::fromLaTeXCommand() can now handle all combining characters, not only the one letter ones - The remainder returned from Encodings::fromLaTeXCommand() must never be thrown away in tex2lyx, but output as ERT - No special case for combining diacritical marks needed anymore in parse_text() - No special cases for accents and IPA combining diacritical marks needed anymore in parse_text() - special tipa short cuts may only be recognized if the tipa package is loaded - Use requirements returned by Encodings::fromLaTeXCommand() instead of hardcoded registering of tipa and tipx - Get rid of the name2 variable in parse_text(): We must use name, otherwise the extra stuff that might have been put into name vanishes --- src/Encoding.cpp | 95 ++++++++++-- src/tex2lyx/test/test-insets.lyx.lyx | 122 +-------------- src/tex2lyx/text.cpp | 221 +++++++++++---------------- 3 files changed, 183 insertions(+), 255 deletions(-) diff --git a/src/Encoding.cpp b/src/Encoding.cpp index 82f2ea91f0..1f7665ff2e 100644 --- a/src/Encoding.cpp +++ b/src/Encoding.cpp @@ -559,14 +559,28 @@ docstring Encodings::fromLaTeXCommand(docstring const & cmd, int cmdtype, bool const mathmode = cmdtype & MATH_CMD; bool const textmode = cmdtype & TEXT_CMD; docstring symbols; - size_t i = 0; size_t const cmdend = cmd.size(); + size_t prefix = 0; CharInfoMap::const_iterator const uniend = unicodesymbols.end(); - for (size_t j = 0; j < cmdend; ++j) { + for (size_t i = 0, j = 0; j < cmdend; ++j) { // Also get the char after a backslash - if (j + 1 < cmdend && cmd[j] == '\\') + if (j + 1 < cmdend && cmd[j] == '\\') { ++j; + prefix = 1; + // Detect things like \=*{e} as well + if (j + 3 < cmdend && cmd[j+1] == '*' && + cmd[j+2] == '{') { + ++j; + prefix = 2; + } + } + // position 
of the last character before a possible macro + // argument + size_t m = j; // If a macro argument follows, get it, too + // Do it here only for single character commands. Other + // combining commands need this too, but they are handled in + // the loop below for performance reasons. if (j + 1 < cmdend && cmd[j + 1] == '{') { size_t k = j + 1; int count = 1; @@ -579,12 +593,19 @@ docstring Encodings::fromLaTeXCommand(docstring const & cmd, int cmdtype, } if (k != docstring::npos) j = k; + } else if (m + 1 < cmdend && isAlphaASCII(cmd[m])) { + while (m + 2 < cmdend && isAlphaASCII(cmd[m+1])) + m++; } // Start with this substring and try augmenting it when it is // the prefix of some command in the unicodesymbols file - docstring const subcmd = cmd.substr(i, j - i + 1); + docstring subcmd = cmd.substr(i, j - i + 1); CharInfoMap::const_iterator it = unicodesymbols.begin(); + // First part of subcmd which might be a combining character + docstring combcmd = (m == j) ? docstring() : cmd.substr(i, m - i + 1); + // The combining character of combcmd if it exists + CharInfoMap::const_iterator combining = uniend; size_t unicmd_size = 0; char_type c = 0; for (; it != uniend; ++it) { @@ -592,6 +613,9 @@ docstring Encodings::fromLaTeXCommand(docstring const & cmd, int cmdtype, : docstring(); docstring const text = textmode ? it->second.textcommand : docstring(); + if (!combcmd.empty() && it->second.combining() && + (math == combcmd || text == combcmd)) + combining = it; size_t cur_size = max(math.size(), text.size()); // The current math or text unicode command cannot // match, or we already matched a longer one @@ -618,6 +642,26 @@ docstring Encodings::fromLaTeXCommand(docstring const & cmd, int cmdtype, // If this is an exact match, we found a (longer) // matching entry in the unicodesymbols file. + if (math != tmp && text != tmp) + continue; + // If we found a combining command, we need to append + // the macro argument if this has not been done above. 
+ if (tmp == combcmd && combining != uniend && + k < cmdend && cmd[k] == '{') { + size_t l = k; + int count = 1; + while (l < cmdend && count && l != docstring::npos) { + l = cmd.find_first_of(from_ascii("{}"), l + 1); + if (cmd[l] == '{') + ++count; + else + --count; + } + if (l != docstring::npos) { + j = l; + subcmd = cmd.substr(i, j - i + 1); + } + } // If the entry doesn't start with '\', we take note // of the match and continue (this is not a ultimate // acceptance, as some other entry may match a longer @@ -627,12 +671,13 @@ docstring Encodings::fromLaTeXCommand(docstring const & cmd, int cmdtype, // (nonletter) char macro, or nothing else follows, // or what follows is a nonletter char, or the last // character is a }. - if ((math == tmp || text == tmp) - && (tmp[0] != '\\' - || (tmp.size() == 2 && !isAlphaASCII(tmp[1])) + else if (tmp[0] != '\\' + || (tmp.size() == prefix + 1 && + !isAlphaASCII(tmp[1]) && + (prefix == 1 || !isAlphaASCII(tmp[2]))) || k == cmdend || !isAlphaASCII(cmd[k]) - || tmp[tmp.size() - 1] == '}') + || tmp[tmp.size() - 1] == '}' ) { c = it->first; j = k - 1; @@ -654,7 +699,39 @@ docstring Encodings::fromLaTeXCommand(docstring const & cmd, int cmdtype, } if (unicmd_size) symbols += c; - else if (j + 1 == cmdend) { + else if (combining != uniend && + prefixIs(subcmd, combcmd + '{')) { + // We know that subcmd starts with combcmd and + // contains an argument in braces. + docstring const arg = subcmd.substr( + combcmd.length() + 1, + subcmd.length() - combcmd.length() - 2); + // If arg is a single character we can construct a + // combining sequence. + char_type a; + bool argcomb = false; + if (arg.size() == 1 && isAlnumASCII(arg[0])) + a = arg[0]; + else { + // Use the version of fromLaTeXCommand() that + // parses only one command, since we cannot + // use more than one character. 
+ bool dummy = false; + set r; + a = fromLaTeXCommand(arg, cmdtype, argcomb, + dummy, &r); + if (a && req && !argcomb) + req->insert(r.begin(), r.end()); + } + if (a && !argcomb) { + // In unicode the combining character comes + // after its base + symbols += a; + symbols += combining->first; + unicmd_size = 2; + } + } + if (j + 1 == cmdend && !unicmd_size) { // No luck. Return what remains rem = cmd.substr(i); if (needsTermination && !rem.empty()) { diff --git a/src/tex2lyx/test/test-insets.lyx.lyx b/src/tex2lyx/test/test-insets.lyx.lyx index 5f2554cbe0..6d49239ab1 100644 --- a/src/tex2lyx/test/test-insets.lyx.lyx +++ b/src/tex2lyx/test/test-insets.lyx.lyx @@ -5761,19 +5761,7 @@ TIPA \begin_inset IPA \begin_layout Standard -e̥ -\begin_inset ERT -status collapsed - -\begin_layout Plain Layout - -\backslash -r{e} -\end_layout - -\end_inset - - e̬ e̤ e˷ e̼ e̪ e̺ e̻ e +e̥ e̊ e̬ e̤ ḛ e̼ e̪ e̺ e̻ e \begin_inset script superscript \begin_layout Standard @@ -5782,83 +5770,7 @@ h \end_inset - e̹ e̜ e̟ -\begin_inset ERT -status collapsed - -\begin_layout Plain Layout - -\backslash -={*} -\end_layout - -\end_inset - - -\begin_inset ERT -status collapsed - -\begin_layout Plain Layout -{ -\end_layout - -\end_inset - -e -\begin_inset ERT -status collapsed - -\begin_layout Plain Layout -} -\end_layout - -\end_inset - - -\begin_inset ERT -status collapsed - -\begin_layout Plain Layout - -\backslash -b{e} -\end_layout - -\end_inset - - ë e̽ -\begin_inset ERT -status collapsed - -\begin_layout Plain Layout - -\backslash -s -\end_layout - -\end_inset - - -\begin_inset ERT -status collapsed - -\begin_layout Plain Layout -{ -\end_layout - -\end_inset - -e -\begin_inset ERT -status collapsed - -\begin_layout Plain Layout -} -\end_layout - -\end_inset - - e̩ e̯ e˞e + e̹ e̜ e̟ e̠ e̠ ë e̽ e̩ e̩ e̯ e˞e \begin_inset script superscript \begin_layout Standard @@ -5912,23 +5824,11 @@ l \end_inset - e˺ -\begin_inset ERT -status collapsed - -\begin_layout Plain Layout - -\backslash -H{e} 
-\end_layout - -\end_inset - - + e˺ e̋ \end_layout \begin_layout Standard -è ē é eȅ ě ê e᷄ e᷅ e᷈ ĕ +è ē é ȅ ě ê e᷄ e᷅ e᷈ ĕ \end_layout \end_inset @@ -6053,19 +5953,7 @@ tz \begin_inset IPA \begin_layout Standard -:;eˈˌ|‖. -\begin_inset ERT -status collapsed - -\begin_layout Plain Layout - -\backslash -t{*} -\end_layout - -\end_inset - - +:;eˈˌ|‖.͡* \begin_inset ERT status collapsed diff --git a/src/tex2lyx/text.cpp b/src/tex2lyx/text.cpp index 2a988bcb6b..8d84955752 100644 --- a/src/tex2lyx/text.cpp +++ b/src/tex2lyx/text.cpp @@ -2425,6 +2425,9 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer, if (!s.empty()) { context.check_layout(os); os << to_utf8(s); + if (!rem.empty()) + output_ert_inset(os, + to_utf8(rem), context); } else // we did not find a non-ert version output_ert_inset(os, name, context); @@ -3255,34 +3258,11 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer, p.skip_spaces(); } - // the TIPA Combining diacritical marks - else if (is_known(t.cs(), known_tipa_marks) || t.cs() == "textvertline") { - preamble.registerAutomaticallyLoadedPackage("tipa"); - preamble.registerAutomaticallyLoadedPackage("tipx"); - context.check_layout(os); - if (t.cs() == "textvertline") { - os << "|"; - skip_braces(p); - continue; - } - // try to see whether the string is in unicodesymbols - bool termination; - docstring rem; - string content = trimSpaceAndEol(p.verbatim_item()); - string command = t.asInput() + "{" + content + "}"; - set req; - docstring s = encodings.fromLaTeXCommand(from_utf8(command), - Encodings::TEXT_CMD | Encodings::MATH_CMD, - termination, rem, &req); - if (!s.empty()) { - if (!rem.empty()) - cerr << "When parsing " << command - << ", result is " << to_utf8(s) - << "+" << to_utf8(rem) << endl; - os << content << to_utf8(s); - } else - // we did not find a non-ert version - output_ert_inset(os, command, context); + else if (t.cs() == "textvertline") { + // FIXME: This is not correct, \textvertline is higher 
than | + os << "|"; + skip_braces(p); + continue; } else if (t.cs() == "tone" ) { @@ -3304,11 +3284,9 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer, Encodings::TEXT_CMD | Encodings::MATH_CMD, termination, rem, &req); if (!s.empty()) { - if (!rem.empty()) - cerr << "When parsing " << command - << ", result is " << to_utf8(s) - << "+" << to_utf8(rem) << endl; os << to_utf8(s); + if (!rem.empty()) + output_ert_inset(os, to_utf8(rem), context); } else // we did not find a non-ert version output_ert_inset(os, command, context); @@ -3908,33 +3886,6 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer, else if (t.cs() == "=" && (flags & FLAG_TABBING)) output_ert_inset(os, t.asInput(), context); - // accents (see Table 6 in Comprehensive LaTeX Symbol List) - else if (t.cs().size() == 1 - && contains("\"'.=^`bcdHkrtuv~", t.cs())) { - context.check_layout(os); - // try to see whether the string is in unicodesymbols - bool termination; - docstring rem; - string command = t.asInput() + "{" - + trimSpaceAndEol(p.verbatim_item()) - + "}"; - set req; - docstring s = encodings.fromLaTeXCommand(from_utf8(command), - Encodings::TEXT_CMD | Encodings::MATH_CMD, - termination, rem, &req); - if (!s.empty()) { - if (!rem.empty()) - cerr << "When parsing " << command - << ", result is " << to_utf8(s) - << "+" << to_utf8(rem) << endl; - os << to_utf8(s); - for (set::const_iterator it = req.begin(); it != req.end(); ++it) - preamble.registerAutomaticallyLoadedPackage(*it); - } else - // we did not find a non-ert version - output_ert_inset(os, command, context); - } - else if (t.cs() == "\\") { context.check_layout(os); if (p.hasOpt()) @@ -4607,15 +4558,12 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer, // try to see whether the string is in unicodesymbols // Only use text mode commands, since we are in text mode here, // and math commands may be invalid (bug 6797) - bool termination; - docstring rem; - set req; string 
name = t.asInput(); // handle the dingbats and Cyrillic if (name == "\\ding" || name == "\\textcyr") name = name + '{' + p.getArg('{', '}') + '}'; // handle the ifsym characters - if (name == "\\textifsymbol") { + else if (name == "\\textifsymbol") { string const optif = p.getFullOpt(); string const argif = p.getArg('{', '}'); name = name + optif + '{' + argif + '}'; @@ -4623,84 +4571,99 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer, // handle the \ascii characters // the case of \ascii within braces, as LyX outputs it, is already // handled for t.cat() == catBegin - if (name == "\\ascii") { + else if (name == "\\ascii") { // the code is "\asci\xxx" name = "{" + name + p.get_token().asInput() + "}"; skip_braces(p); } // handle some TIPA special characters - if (name == "\\textglobfall") { - name = "End"; - skip_braces(p); - } - if (name == "\\textdoublevertline") { - name = "\\textbardbl"; - skip_braces(p); - } - if (name == "\\!" ) { - if (p.next_token().asInput() == "b") { - p.get_token(); // eat 'b' - name = "\\texthtb"; + else if (preamble.isPackageUsed("tipa")) { + if (name == "\\textglobfall") { + name = "End"; skip_braces(p); - } - if (p.next_token().asInput() == "d") { - p.get_token(); - name = "\\texthtd"; - skip_braces(p); - } - if (p.next_token().asInput() == "g") { - p.get_token(); - name = "\\texthtg"; - skip_braces(p); - } - if (p.next_token().asInput() == "G") { - p.get_token(); - name = "\\texthtscg"; - skip_braces(p); - } - if (p.next_token().asInput() == "j") { - p.get_token(); - name = "\\texthtbardotlessj"; - skip_braces(p); - } - if (p.next_token().asInput() == "o") { + } else if (name == "\\s") { + // fromLaTeXCommand() does not yet + // recognize tipa short cuts + name = "\\textsyllabic"; + } else if (name == "\\=" && + p.next_token().asInput() == "*") { + // fromLaTeXCommand() does not yet + // recognize tipa short cuts p.get_token(); - name = "\\textbullseye"; + name = "\\b"; + } else if (name == 
"\\textdoublevertline") { + // FIXME: This is not correct, + // \textvertline is higher than \textbardbl + name = "\\textbardbl"; skip_braces(p); + } else if (name == "\\!" ) { + if (p.next_token().asInput() == "b") { + p.get_token(); // eat 'b' + name = "\\texthtb"; + skip_braces(p); + } else if (p.next_token().asInput() == "d") { + p.get_token(); + name = "\\texthtd"; + skip_braces(p); + } else if (p.next_token().asInput() == "g") { + p.get_token(); + name = "\\texthtg"; + skip_braces(p); + } else if (p.next_token().asInput() == "G") { + p.get_token(); + name = "\\texthtscg"; + skip_braces(p); + } else if (p.next_token().asInput() == "j") { + p.get_token(); + name = "\\texthtbardotlessj"; + skip_braces(p); + } else if (p.next_token().asInput() == "o") { + p.get_token(); + name = "\\textbullseye"; + skip_braces(p); + } + } else if (name == "\\*" ) { + if (p.next_token().asInput() == "k") { + p.get_token(); + name = "\\textturnk"; + skip_braces(p); + } else if (p.next_token().asInput() == "r") { + p.get_token(); // eat 'b' + name = "\\textturnr"; + skip_braces(p); + } else if (p.next_token().asInput() == "t") { + p.get_token(); + name = "\\textturnt"; + skip_braces(p); + } else if (p.next_token().asInput() == "w") { + p.get_token(); + name = "\\textturnw"; + skip_braces(p); + } } } - if (name == "\\*" ) { - if (p.next_token().asInput() == "k") { - p.get_token(); - name = "\\textturnk"; - skip_braces(p); - } - if (p.next_token().asInput() == "r") { - p.get_token(); // eat 'b' - name = "\\textturnr"; - skip_braces(p); - } - if (p.next_token().asInput() == "t") { - p.get_token(); - name = "\\textturnt"; - skip_braces(p); - } - if (p.next_token().asInput() == "w") { - p.get_token(); - name = "\\textturnw"; - skip_braces(p); - } + if ((name.size() == 2 && + contains("\"'.=^`bcdHkrtuv~", name[1]) && + p.next_token().asInput() != "*") || + is_known(name.substr(1), known_tipa_marks)) { + // name is a command that corresponds to a + // combining character in unicodesymbols. 
+ // Append the argument, fromLaTeXCommand() + // will either convert it to a single + // character or a combining sequence. + name += '{' + p.verbatim_item() + '}'; } // now get the character from unicodesymbols + bool termination; + docstring rem; + set req; docstring s = encodings.fromLaTeXCommand(from_utf8(name), Encodings::TEXT_CMD, termination, rem, &req); if (!s.empty()) { - if (!rem.empty()) - cerr << "When parsing " << t.cs() - << ", result is " << to_utf8(s) - << "+" << to_utf8(rem) << endl; context.check_layout(os); os << to_utf8(s); + if (!rem.empty()) + output_ert_inset(os, to_utf8(rem), context); if (termination) skip_spaces_braces(p); for (set::const_iterator it = req.begin(); it != req.end(); ++it) @@ -4720,14 +4683,14 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer, output_ert_inset(os, s + ' ', context); */ else { - string name2 = t.asInput(); - if (p.next_token().asInput() == "*") { + if (t.asInput() == name && + p.next_token().asInput() == "*") { // Starred commands like \vspace*{} p.get_token(); // Eat '*' - name2 += '*'; + name += '*'; } - if (!parse_command(name2, p, os, outer, context)) - output_ert_inset(os, name2, context); + if (!parse_command(name, p, os, outer, context)) + output_ert_inset(os, name, context); } } -- 2.39.2