From 7a03fa6f1d4e4d7930fff68c549608515b9e8d76 Mon Sep 17 00:00:00 2001 From: Kornel Benko Date: Fri, 12 Oct 2018 16:47:07 +0200 Subject: [PATCH] Advanced search with format: Prepare latex for find Our findadv expects something like prefix + 'search' so that the regex (which is latexified too) can work on 'search' (In the source, the prefix is denoted by lead_as_string) The latex output contains structs like \foreignlaguage(abc}{xx\textbf{boldxx\textcolor{blue}{blue 1 blue 2} XX}} which would never match the simple prefix. Now the above is converted to \foreignlaguage(abc}{xx}\\ \foreignlaguage(abc}{\textbf{boldxx}} \foreignlaguage(abc}{\textbf{\textcolor{blue}{blue 1 blue 2}}}\\ \foreignlaguage(abc}{\textbf{ XX}} Of course, more than one language or color in an inset can be searched for now. --- src/lyxfind.cpp | 497 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 402 insertions(+), 95 deletions(-) diff --git a/src/lyxfind.cpp b/src/lyxfind.cpp index f59840cead..03a8d20d06 100644 --- a/src/lyxfind.cpp +++ b/src/lyxfind.cpp @@ -901,22 +901,26 @@ static Features identifyFeatures(string const & s) /* * Faster search for the related closing parenthesis */ -static int findclosing(string p, int start, int end) + static int findclosing(string p, int start, int end) { int skip = 0; int depth = 0; + int lastunclosed = start-1; for (int i = start; i < end; i += 1 + skip) { char c; c = p[i]; skip = 0; if (c == '\\') skip = 1; - else if (c == '{') depth++; + else if (c == '{') { + depth++; + lastunclosed = i; + } else if (c == '}') { if (depth == 0) return(i); --depth; } } - return(-1); + return(0 - lastunclosed); } /* @@ -975,6 +979,373 @@ static string removefontinfo(string par) return(par); } + +class LangInfo { + public: + enum Type { + Invalid, + Valid, + LastValid, + }; + Type valid; + + /*LangInfo(LangInfo &orig) : + par(orig.par), + maxoffset(orig.maxoffset), + search(orig.search) {valid = Invalid;} + */ + LangInfo(string par, string search1 = "", int start = 0, int end = -1) + : par(par), + _tokenend(0), + _dataEnd(0), + actualdeptindex(0) + { + valid = Invalid; + _tokenstart = start; + if (end > int(par.length())) { + maxoffset = par.length(); + } + else if (end > 0) + maxoffset = end; + else + maxoffset = par.length(); + if (!search1.empty()) + _search = search1; + } + bool nextInfo(); // of the same type, from the last start in the same reagion + bool firstInfo(string search, int datastart); + void setDataEnd(int value); + void setDataStart(int value); + int getDataStart() { return _dataStart;}; + string name() { return _search;}; + string lasttoken() { if (valid == Valid) return _foundtoken; else return "";}; + int getStart() { return _tokenstart;}; + int getTokenEnd() { return _tokenend;}; + int getEnd() { return _dataEnd;}; + bool isValid() { return (valid == Valid); }; + void process(ostringstream &os); + void output(ostringstream &os, int); + void addIntervall(int upper); + void addIntervall(int low, int upper); /* if explicit */ + void handleParentheses(int lastpos); + string show(int lastpos); + private: + string par; + string _search; + string _foundtoken; + int _tokenstart; + int _tokenend; + int _dataStart; + int _dataEnd; + bool atEnd; + size_t maxoffset; + int depts[20]; + int closes[20]; + int actualdeptindex; + int ignoreIntervalls[10][2]; + int ignoreidx; +}; + +void LangInfo::setDataEnd(int dataend) +{ + if (dataend < _tokenend) { + _dataEnd = _tokenend; + // cout << "Wrong data start, too low\n"; + } + else if (size_t(dataend) > par.length()) { + // cout << "Wrong data start, too high\n"; + _dataEnd = par.length(); + } + else + _dataEnd = dataend; +} + +void LangInfo::setDataStart(int datastart) +{ + if (datastart < _tokenend) { + _dataStart = _tokenend; + // cout << "Wrong data start, too low\n"; + } + else if (size_t(datastart) > par.length()) { + // cout << "Wrong data start, too high\n"; + _dataStart = par.length(); + } + else + _dataStart = datastart; + //cout << "found entry at " << _tokenstart << "\n"; + actualdeptindex = 1; /* == Number of open brases */ + depts[0] = _dataStart; + closes[0] = -1; + depts[1] = _dataStart; + ignoreidx = 0; + ignoreIntervalls[ignoreidx][0] = _dataStart; + if ((par[_dataStart] == '{') && (par[_dataStart+1] == '}')) { + // First candidates to be ignored + ignoreIntervalls[ignoreidx][1] = _dataStart+2; + } + else + ignoreIntervalls[ignoreidx][1] = _dataStart; +} + +void LangInfo::handleParentheses(int lastpos) +{ + int skip = 0; + for (int i = depts[actualdeptindex]; i < lastpos; i+= 1 + skip) { + char c; + c = par[i]; + skip = 0; + if (c == '\\') skip = 1; + else if (c == '{') { + actualdeptindex++; + depts[actualdeptindex] = i+1; + closes[actualdeptindex] = -1; + } + else if (c == '}') { + if (actualdeptindex <= 0) { + LYXERR0("ERROR ERROR ERROR"); /* should never happen! */ + } + else { + closes[actualdeptindex] = i+1; + actualdeptindex--; + } + } + } +} + +void LangInfo::addIntervall(int low, int upper) +{ + int idx; + if (low == upper) return; + for (idx = ignoreidx+1; idx > 0; --idx) { + if (low > ignoreIntervalls[idx-1][1]) { + break; + } + } + if (idx > ignoreidx) { + ignoreIntervalls[idx][0] = low; + ignoreIntervalls[idx][1] = upper; + } + else { + // Expand only if one of the new bounds is inside the interwall + if (((low <= ignoreIntervalls[idx][1]) && (low >= ignoreIntervalls[idx][0])) || + ((upper <= ignoreIntervalls[idx][1]) && (upper >= ignoreIntervalls[idx][0]))) { + if (low < ignoreIntervalls[idx][0]) + ignoreIntervalls[idx][0] = low; + if (upper > ignoreIntervalls[idx][1]) + ignoreIntervalls[idx][1] = upper; + } + } + ignoreidx = idx; /* because upper is in all cases bigger */ +} + +void LangInfo::addIntervall(int upper) +{ + int low; + if (actualdeptindex >= 0) + low = depts[actualdeptindex]; /* the position of last unclosed '{' */ + else { + LYXERR0("ERROR ERROR ERROR2"); + low = upper; + } + addIntervall(low, upper); +} + +string LangInfo::show(int lastpos) +{ + ostringstream os; + + os << par.substr(_tokenstart, _tokenend - _tokenstart); + int idx = 0; + for (int i = _dataStart; i < lastpos;) { + if (i <= ignoreIntervalls[idx][0]) { + os << par.substr(i, ignoreIntervalls[idx][0] - i); + i = ignoreIntervalls[idx][1]; + } + idx++; + if (idx > ignoreidx) { + os << par.substr(i, lastpos-i); + break; + } + } + for (int i = actualdeptindex; i > 0; --i) + os << "}"; + return os.str(); +} + +void LangInfo::output(ostringstream &os, int lastpos) +{ + // get number of chars to output + int idx = 0; /* int intervalls */ + int count = 0; + for (int i = _dataStart; i < lastpos;) { + if (i <= ignoreIntervalls[idx][0]) { + count += ignoreIntervalls[idx][0] - i; + i = ignoreIntervalls[idx][1]; + } + idx++; + if (idx > ignoreidx) { + count += lastpos-i; + break; + } + } + //cout << "Number of output chars would be " << count + actualdeptindex << "\n"; + if (count > 0) { + // Now the acual data + os << par.substr(_tokenstart, _tokenend - _tokenstart); + idx = 0; + for (int i = _dataStart; i < lastpos;) { + if (i <= ignoreIntervalls[idx][0]) { + os << par.substr(i, ignoreIntervalls[idx][0] - i); + i = ignoreIntervalls[idx][1]; + } + idx++; + if (idx > ignoreidx) { + os << par.substr(i, lastpos-i); + break; + } + } + for (int i = actualdeptindex; i > 0; --i) + os << "}"; + } + handleParentheses(lastpos); +} + +bool LangInfo::nextInfo() +{ + int start = _tokenstart; + + // cout << par << "\n"; + if (valid == Invalid) + _dataEnd = _tokenstart; + else if (valid == LastValid) + return false; + // cout << "Start search at " << _tokenclose << " for \"" << _search << "\n"; + size_t foundstart = par.find(_search, _dataEnd); + if (foundstart == string::npos) { + if (valid == Valid) + valid = LastValid; + return false; // not found + } + if (foundstart >= maxoffset) + return false; + start = foundstart; + int closelang = findclosing(par, start + _search.length(), maxoffset); + if (closelang < 0) + return false; + if (size_t(closelang) >= maxoffset) + return false; + if (par[closelang] != '}') + return false; + valid = Valid; + _foundtoken = par.substr(start, closelang - start + 2); + _tokenstart = start; + _tokenend = closelang+2; + setDataStart(_tokenend); + closelang = findclosing(par, _dataStart, maxoffset); + if (closelang < 0) { + _dataEnd = maxoffset; + atEnd = true; + } + else { + _dataEnd = closelang; + atEnd = false; + } + return true; +} + +bool LangInfo::firstInfo(string search1, int datastart) +{ + if (!search1.empty()) { + if (_search.compare(search1) != 0) { + _tokenstart = datastart; + _search = search1; + valid = Invalid; + } + } + return nextInfo(); +} + +void LangInfo::process(ostringstream &os) +{ + LangInfo color(*this); + (void) color.firstInfo("\\textcolor{", _dataStart); + while (color.isValid() && (color.getStart() < _dataEnd)) { + bool isEmpty = false; + if (color.getDataStart() == color.getEnd()) { + // Empty, e.g. par[color.getDataStart()] == '}' + isEmpty = true; + } + else if ((par[color.getDataStart()] == '{') && (par[color.getDataStart()+1] == '}')) { + // color starts with '{}', discard it + if (color.getDataStart()+2 == color.getEnd()) + isEmpty = true; + else { + // discard the first '{}' + addIntervall(color.getDataStart(), color.getDataStart()+2); + } + } + if (isEmpty) { + // it is emty, so ignore and go to next color + addIntervall(color.getStart(), color.getEnd()+1); + } + else { + if (par[color.getStart()-1] != '{') { + output(os, color.getStart()); + addIntervall(color.getStart()); + } + // Check if color empty + output(os, color.getEnd()+1); + addIntervall(color.getEnd()+1); + } + for (int i = color.getEnd()+1; par[i] == '}'; i++) { + handleParentheses(i+1); + addIntervall(i+1); + } + color.nextInfo(); + } + if (par[_dataEnd] != '}') + output(os, _dataEnd-1); + else + output(os, _dataEnd); +} + +/* + * Called only if the par starts with lang spec + */ + +string splitForColors(string par) { + ostringstream os; + LangInfo firstLanguage(par, "\\foreignlanguage{"); + if (firstLanguage.firstInfo("\\foreignlanguage{", 0)) { + LangInfo nextLanguage(firstLanguage); + nextLanguage.setDataEnd(firstLanguage.getDataStart()); + if (nextLanguage.firstInfo("\\foreignlanguage{", firstLanguage.getTokenEnd())) { + firstLanguage.setDataEnd(nextLanguage.getStart()); + } + firstLanguage.process(os); + while (nextLanguage.isValid()) { + nextLanguage.process(os); + // To handle the gap, we need the end of last languuage to start of next + int gapstart = nextLanguage.getEnd()+1; + int gapend; + nextLanguage.nextInfo(); + if (nextLanguage.isValid()) + gapend = nextLanguage.getStart(); + else + gapend = par.length(); + // Now handle the gap, if there is one + if (gapend > gapstart) { + // cout << "Gap found, size = " << gapend - gapstart << "\n"; + firstLanguage.setDataEnd(gapend); + firstLanguage.setDataStart(gapstart); + firstLanguage.process(os); + } + } + } + string s = os.str(); + return s; +} + /* * Try to unify the language specs in the latexified text. * Resulting modified string is set to "", if @@ -982,9 +1353,6 @@ static string removefontinfo(string par) */ static string correctlanguagesetting(string par, bool from_regex, bool withformat) { - static string langstart = "\\foreignlanguage{"; - static int llen = langstart.length(); - static bool removefirstlang = false; static Features regex_f; static int missed = 0; static bool regex_with_format = false; @@ -994,113 +1362,43 @@ static string correctlanguagesetting(string par, bool from_regex, bool withforma while ((parlen > 0) && (par[parlen-1] == '\n')) { parlen--; } + string result = removefontinfo(par.substr(0, parlen)); + result = splitForColors(result); + LYXERR(Debug::FIND, "Converted: \"" << result << "\""); + bool handle_colors = false; if (from_regex) { missed = 0; if (withformat) { - regex_f = identifyFeatures(par); + regex_f = identifyFeatures(result); + string features = ""; for (auto it = regex_f.cbegin(); it != regex_f.cend(); ++it) { string a = it->first; regex_with_format = true; + if (a.compare(0,10,"textcolor{") == 0) + handle_colors = true; + features += " " + a; // LYXERR0("Identified regex format:" << a); } + LYXERR(Debug::FIND, "Identified Features" << features); } } else if (regex_with_format) { - Features info = identifyFeatures(par); + Features info = identifyFeatures(result); for (auto it = regex_f.cbegin(); it != regex_f.cend(); ++it) { string a = it->first; bool b = it->second; if (b && ! info[a]) { missed++; - // LYXERR0("Missed(" << missed << ", srclen = " << parlen ); + LYXERR(Debug::FIND, "Missed(" << missed << " " << a <<", srclen = " << parlen ); return(""); } + else if (a.compare(0,10,"textcolor{") == 0) + handle_colors = true; } } else { // LYXERR0("No regex formats"); } - string result = removefontinfo(par.substr(0, parlen)); - parlen = result.length(); - if (result.compare(0, llen, langstart) == 0) { - if (from_regex) { - removefirstlang = false; - } - int i = findclosing(result, llen, parlen); - if (removefirstlang) { - if (i < 0) - result = ""; - else { - int closepos = findclosing(result, i+2, parlen); - if (closepos > 0) { - result = result.substr(i+2, closepos-i-2) + result.substr(closepos+1, parlen - closepos-1); - } - else { - result = result.substr(i+2, parlen-i-2); - } - } - } - else if (i > 0) { - // skip '}{' after the language spec - string samelang = ""; - int startpos = i+2; - int closepos; - while(true) { - closepos = findclosing(result, startpos, parlen); - if (closepos >0) { - if (result[closepos+1] == '{') { - samelang += result.substr(startpos, closepos-startpos); - startpos = closepos + 2; - } - else { - samelang += result.substr(startpos, closepos-startpos); - result = result.substr(0, i+2) + samelang + result.substr(closepos); - closepos = i+2 + samelang.length(); - break; - } - } - else { - result = result.substr(0, i+2) + samelang + result.substr(startpos) +"}"; - closepos = result.length() - 1; - break; - } - } - size_t insertpos = result.find(langstart, i+2); - - if (insertpos == string::npos) - insertpos = result.length(); - if (closepos < 0) { - if (insertpos == result.length()) { - // there are no closing in par, and no next lang spec - result += "}"; - } - else { - // Add '}' at insertpos only, because closing is missing - result = result.substr(0,insertpos) + "}" + result.substr(insertpos, parlen-insertpos); - } - } - else if ((size_t) closepos > insertpos) { - // Add '}' at insertpos and remove from closepos if closepos > insertpos - result = result.substr(0,insertpos) + "}" + result.substr(insertpos, closepos - insertpos) + result.substr(closepos+1, parlen -closepos-1); - } - else { - // here closepos < insertpos - if ((size_t) closepos +1 < insertpos) { - result = result.substr(0, closepos) + result.substr(closepos+1, insertpos-closepos-1) + "}" + result.substr(insertpos); - } - } - } - else { - // result not good?, no closing '}' for \foreignlanguage{ ...>>> found - // For i == 0, it is empty language spec - // and for i < 0 it is Error - } - } - else { - if (from_regex) { - removefirstlang = true; - } - } // remove possible disturbing macros while (regex_replace(result, result, "\\\\(noindent )", "")) ; @@ -1127,6 +1425,10 @@ static string correctlanguagesetting(string par, bool from_regex, bool withforma } } + if (handle_colors) { + while (regex_replace(result, result, "(\\{\\\\textcolor\\{[a-z]+\\}\\{)\\s*\\{\\}\\s*", "$1")); + while (regex_replace(result, result, "\\{\\\\textcolor\\{[a-z]+\\}\\{\\s*\\}\\s*\\}", "")); + } return(result); } @@ -1309,6 +1611,9 @@ int MatchStringAdv::findAux(DocIterator const & cur, int len, bool at_begin) con if (m.size() > 1) leadingsize = m[1].second - m[1].first; int result; + for (size_t i = 0; i < m.size(); i++) { + LYXERR(Debug::FIND, "Match " << i << " is " << m[i].second - m[i].first << " long"); + } if (close_wildcards == 0) result = m[0].second - m[0].first; @@ -1393,7 +1698,7 @@ string MatchStringAdv::normalize(docstring const & s, bool hack_braces) const while (regex_replace(t, t, "\\\\((emph|noun|text(bf|sl|sf|it|tt|color\\{[a-z]+\\})|(u|uu)line|(s|x)out|uwave)|((sub)?(((sub)?section)|paragraph)|part)\\*?)(\\{(\\{\\})?\\})+", "")) LYXERR(Debug::FIND, " further removing stale empty \\emph{}, \\textbf{} macros from: " << t); - while (regex_replace(t, t, "\\\\foreignlanguage\\{[a-z]+\\}(\\{(\\\\item |\\{\\})?\\})+", "")); + while (regex_replace(t, t, "\\\\(foreignlanguage|textcolor)\\{[a-z]+\\}(\\{(\\\\item |\\{\\})?\\})+", "")); // FIXME - check what preceeds the brace if (hack_braces) { if (opt.ignoreformat) @@ -1477,9 +1782,8 @@ docstring latexifyFromCursor(DocIterator const & cur, int len) endpos = cur.pos() + len; TeXOnePar(buf, *cur.innerText(), cur.pit(), os, runparams, string(), cur.pos(), endpos); - LYXERR(Debug::FIND, "Latexified text: '" << lyx::to_utf8(ods.str()) << "'"); string s = correctlanguagesetting(lyx::to_utf8(ods.str()), false, false); - LYXERR(Debug::FIND, "Latexified text: '" << s << "'"); + LYXERR(Debug::FIND, "Latexified +modified text: '" << s << "'"); return(lyx::from_utf8(s)); } else if (cur.inMathed()) { // Retrieve the math environment type, and add '$' or '$[' or others (\begin{equation}) accordingly @@ -1538,7 +1842,7 @@ int findAdvFinalize(DocIterator & cur, MatchStringAdv const & match) cur.forwardPos(); } while (cur && cur.depth() > d && match(cur) > 0); cur = old_cur; - LASSERT(match(cur) > 0, return 0); + if (match(cur) <= 0) return 0; LYXERR(Debug::FIND, "Ok"); // Compute the match length @@ -1594,7 +1898,10 @@ int findForwardAdv(DocIterator & cur, MatchStringAdv & match) match_len_zero_count = 0; } else { - LYXERR(Debug::FIND, "match_len2_zero_count: " << match_len_zero_count << ", match_len was " << match_len); + if (++match_len_zero_count > 3) { + LYXERR(Debug::FIND, "match_len2_zero_count: " << match_len_zero_count << ", match_len was " << match_len); + match_len_zero_count = 0; + } break; } } -- 2.39.2