git.lyx.org Git - lyx.git/commitdiff
move convertLaTeXCommands from BiblioInfo to Encodings
author Juergen Spitzmueller <spitz@lyx.org>
Sat, 20 Jul 2024 15:37:45 +0000 (17:37 +0200)
committer Juergen Spitzmueller <spitz@lyx.org>
Sat, 20 Jul 2024 15:39:58 +0000 (17:39 +0200)
So it can also be used in other contexts.

This also includes an improvement of math parsing.

src/BiblioInfo.cpp
src/Encoding.cpp
src/Encoding.h

index d70de5d1698dafb148513a0c8f7f4ad37122d650..d933bda4c966f65245567a5aed895c70b0a2df23 100644 (file)
@@ -329,141 +329,6 @@ bool multipleAuthors(docstring const & author)
 }
 
 
-// converts a string containing LaTeX commands into unicode
-// for display.
-docstring convertLaTeXCommands(docstring const & str)
-{
-       docstring val = str;
-       docstring ret;
-
-       bool scanning_cmd = false;
-       bool scanning_math = false;
-       bool is_section = false;
-       bool escaped = false; // used to catch \$, etc.
-       while (!val.empty()) {
-               char_type const ch = val[0];
-
-               // if we're scanning math, we output everything until we
-               // find an unescaped $, at which point we break out.
-               if (scanning_math) {
-                       if (escaped)
-                               escaped = false;
-                       else if (ch == '\\')
-                               escaped = true;
-                       else if (ch == '$')
-                               scanning_math = false;
-                       ret += ch;
-                       val = val.substr(1);
-                       continue;
-               }
-
-               // if we're scanning a command name, then we just
-               // discard characters until we hit something that
-               // isn't alpha.
-               if (scanning_cmd) {
-                       if (!is_section && ch == 'S') {
-                               is_section = true;
-                               val = val.substr(1);
-                               continue;
-                       }
-                       if (isAlphaASCII(ch)) {
-                               is_section = false;
-                               val = val.substr(1);
-                               escaped = false;
-                               continue;
-                       } else if (is_section) {
-                               ret.push_back(0x00a7);
-                               is_section = false;
-                               continue;
-                       }
-                       // so we're done with this command.
-                       // now we fall through and check this character.
-                       is_section = false;
-                       scanning_cmd = false;
-               }
-
-               // was the last character a \? If so, then this is something like:
-               // \\ or \$, so we'll just output it. That's probably not always right...
-               if (escaped) {
-                       // exception: output \, as THIN SPACE
-                       if (ch == ',')
-                               ret.push_back(0x2009);
-                       else
-                               ret += ch;
-                       val = val.substr(1);
-                       escaped = false;
-                       continue;
-               }
-
-               if (ch == '~') {
-                       ret += char_type(0x00a0);
-                       val = val.substr(1);
-                       continue;
-               }
-
-               if (ch == '$') {
-                       ret += ch;
-                       val = val.substr(1);
-                       scanning_math = true;
-                       continue;
-               }
-
-               // Change text mode accents in the form
-               // {\v a} to \v{a} (see #9340).
-               // FIXME: This is a sort of mini-tex2lyx.
-               //        Use the real tex2lyx instead!
-               static regex const tma_reg("^\\{\\\\[bcCdfGhHkrtuUv]\\s\\w\\}");
-               if (regex_search(to_utf8(val), tma_reg)) {
-                       val = val.substr(1);
-                       val.replace(2, 1, from_ascii("{"));
-                       continue;
-               }
-
-               // Apart from the above, we just ignore braces
-               if (ch == '{' || ch == '}') {
-                       val = val.substr(1);
-                       continue;
-               }
-
-               // we're going to check things that look like commands, so if
-               // this doesn't, just output it.
-               if (ch != '\\') {
-                       ret += ch;
-                       val = val.substr(1);
-                       continue;
-               }
-
-               // ok, could be a command of some sort
-               // let's see if it corresponds to some unicode
-               // unicodesymbols has things in the form: \"{u},
-               // whereas we may see things like: \"u. So we'll
-               // look for that and change it, if necessary.
-               // FIXME: This is a sort of mini-tex2lyx.
-               //        Use the real tex2lyx instead!
-               static regex const reg("^\\\\\\W\\w");
-               if (regex_search(to_utf8(val), reg)) {
-                       val.insert(3, from_ascii("}"));
-                       val.insert(2, from_ascii("{"));
-               }
-               bool termination;
-               docstring rem;
-               docstring const cnvtd = Encodings::fromLaTeXCommand(val,
-                               Encodings::TEXT_CMD, termination, rem);
-               if (!cnvtd.empty()) {
-                       // it did, so we'll take that bit and proceed with what's left
-                       ret += cnvtd;
-                       val = rem;
-                       continue;
-               }
-               // it's a command of some sort
-               scanning_cmd = true;
-               escaped = true;
-               val = val.substr(1);
-       }
-       return ret;
-}
-
-
 // Escape '<' and '>' and remove richtext markers (e.g. {!this is richtext!}) from a string.
 docstring processRichtext(docstring const & str, bool richtext)
 {
@@ -639,7 +504,7 @@ docstring const BibTeXInfo::getAuthorList(Buffer const * buf,
                        retval = constructName(authors[0], citenameform) + (buf ? buf->B_(etal) : from_ascii(etal));
        }
 
-       return convertLaTeXCommands(retval);
+       return Encodings::convertLaTeXCommands(retval);
 }
 
 
@@ -1068,7 +933,7 @@ docstring const & BibTeXInfo::getInfo(BibTeXInfoList const & xrefs,
        }
 
        if (!richtext && !info_.empty()) {
-               info_ = convertLaTeXCommands(processRichtext(info_, false));
+               info_ = Encodings::convertLaTeXCommands(processRichtext(info_, false));
                return info_;
        }
        if (richtext && !info_richtext_.empty())
@@ -1090,11 +955,11 @@ docstring const & BibTeXInfo::getInfo(BibTeXInfoList const & xrefs,
        }
 
        if (richtext) {
-               info_richtext_ = convertLaTeXCommands(processRichtext(info_, true));
+               info_richtext_ = Encodings::convertLaTeXCommands(processRichtext(info_, true));
                return info_richtext_;
        }
 
-       info_ = convertLaTeXCommands(processRichtext(info_, false));
+       info_ = Encodings::convertLaTeXCommands(processRichtext(info_, false));
        return info_;
 }
 
@@ -1110,7 +975,7 @@ docstring const BibTeXInfo::getLabel(BibTeXInfoList const & xrefs,
 
        if (!loclabel.empty() && !next) {
                loclabel = processRichtext(loclabel, ci.richtext);
-               loclabel = convertLaTeXCommands(loclabel);
+               loclabel = Encodings::convertLaTeXCommands(loclabel);
        }
 
        return loclabel;
index 2dd3c9de8f6fd1e06c70657716a9ad593bfb6f5c..538eead253b50eb5bbd0f9fd709dd910cd256ac8 100644 (file)
@@ -26,6 +26,7 @@
 #include <algorithm>
 #include <cstdint>
 #include <iterator>
+#include <regex>
 #include <sstream>
 
 using namespace std;
@@ -613,6 +614,156 @@ docstring Encodings::fromLaTeXCommand(docstring const & cmd, int cmdtype,
 }
 
 
+docstring Encodings::convertLaTeXCommands(docstring const & str)
+{
+       docstring val = str;
+       docstring ret;
+       docstring mret; // pending math content; converted and appended to ret only at the closing $
+
+       bool scanning_cmd = false;
+       bool scanning_math = false;
+       bool is_section = false;
+       bool escaped = false; // used to catch \$, etc.
+       while (!val.empty()) {
+               char_type const ch = val[0];
+
+               // if we're scanning math, we collect everything in mret until we
+               // find an unescaped $, at which point we convert it and break out.
+               if (scanning_math) {
+                       if (escaped)
+                               escaped = false;
+                       else if (ch == '\\')
+                               escaped = true;
+                       else if (ch == '$') {
+                               scanning_math = false;
+                               bool termination;
+                               docstring rem;
+                               ret += fromLaTeXCommand(mret, MATH_CMD, termination, rem);
+                               // parse remaining math
+                               while (!rem.empty()) { // NOTE(review): assumes fromLaTeXCommand shrinks rem on every pass - confirm this cannot loop forever
+                                       docstring rrem;
+                                       // split command from normal text
+                                       docstring cmd = split(rem, rrem, '\\');
+                                       ret += rrem;
+                                       // done if no command was found
+                                       if (cmd.empty())
+                                               break;
+                                       // convert the command; what is left over ends up in rem
+                                       ret += fromLaTeXCommand(from_ascii("\\") + cmd, MATH_CMD, termination, rem);
+                               }
+                               mret = docstring();
+                       }
+                       mret += ch; // NOTE(review): also reached for the closing $, right after mret was reset - confirm intended
+                       val = val.substr(1);
+                       continue;
+               }
+
+               // if we're scanning a command name, then we just
+               // discard characters until we hit something that
+               // isn't alpha.
+               if (scanning_cmd) {
+                       if (!is_section && ch == 'S') {
+                               is_section = true;
+                               val = val.substr(1);
+                               continue;
+                       }
+                       if (isAlphaASCII(ch)) {
+                               is_section = false;
+                               val = val.substr(1);
+                               escaped = false;
+                               continue;
+                       } else if (is_section) {
+                               ret.push_back(0x00a7);
+                               is_section = false;
+                               continue;
+                       }
+                       // so we're done with this command.
+                       // now we fall through and check this character.
+                       is_section = false;
+                       scanning_cmd = false;
+               }
+
+               // was the last character a \? If so, then this is something like:
+               // \\ or \$, so we'll just output it. That's probably not always right...
+               if (escaped) {
+                       // exception: output \, as THIN SPACE
+                       if (ch == ',')
+                               ret.push_back(0x2009);
+                       else
+                               ret += ch;
+                       val = val.substr(1);
+                       escaped = false;
+                       continue;
+               }
+
+               if (ch == '~') {
+                       ret += char_type(0x00a0);
+                       val = val.substr(1);
+                       continue;
+               }
+
+               if (ch == '$') {
+                       val = val.substr(1);
+                       scanning_math = true; // the opening $ itself is not copied to ret
+                       continue;
+               }
+
+               // Change text mode accents in the form
+               // {\v a} to \v{a} (see #9340).
+               // FIXME: This is a sort of mini-tex2lyx.
+               //        Use the real tex2lyx instead!
+               static regex const tma_reg("^\\{\\\\[bcCdfGhHkrtuUv]\\s\\w\\}");
+               if (regex_search(to_utf8(val), tma_reg)) {
+                       val = val.substr(1);
+                       val.replace(2, 1, from_ascii("{"));
+                       continue;
+               }
+
+               // Apart from the above, we just ignore braces
+               if (ch == '{' || ch == '}') {
+                       val = val.substr(1);
+                       continue;
+               }
+
+               // we're going to check things that look like commands, so if
+               // this doesn't, just output it.
+               if (ch != '\\') {
+                       ret += ch;
+                       val = val.substr(1);
+                       continue;
+               }
+
+               // ok, could be a command of some sort
+               // let's see if it corresponds to some unicode
+               // unicodesymbols has things in the form: \"{u},
+               // whereas we may see things like: \"u. So we'll
+               // look for that and change it, if necessary.
+               // FIXME: This is a sort of mini-tex2lyx.
+               //        Use the real tex2lyx instead!
+               static regex const reg("^\\\\\\W\\w");
+               if (regex_search(to_utf8(val), reg)) {
+                       val.insert(3, from_ascii("}"));
+                       val.insert(2, from_ascii("{"));
+               }
+               bool termination;
+               docstring rem;
+               docstring const cnvtd = fromLaTeXCommand(val,
+                               TEXT_CMD, termination, rem);
+               if (!cnvtd.empty()) {
+                       // it did, so we'll take that bit and proceed with what's left
+                       ret += cnvtd;
+                       val = rem;
+                       continue;
+               }
+               // it's a command of some sort
+               scanning_cmd = true;
+               escaped = true;
+               val = val.substr(1);
+       }
+       return ret;
+}
+
+
 CharInfo const & Encodings::unicodeCharInfo(char_type c)
 {
        static CharInfo empty;
index 4913ddc238346193941fc5d2e5addc8e7ad12ac9..3177132514b23ab0784db3e0f2ef30df743dd8bd 100644 (file)
@@ -349,6 +349,9 @@ public:
        static char_type fromLaTeXCommand(docstring const & cmd, int cmdtype,
                        bool & combining, bool & needsTermination,
                        std::set<std::string> * req = nullptr);
+       /// converts a string containing LaTeX commands into unicode
+       /// for display.
+       static docstring convertLaTeXCommands(docstring const & str);
        ///
        enum LatexCmd {
                ///