From 0f35e3141bc5b6baf2eb44bc63a34b5427db2cd5 Mon Sep 17 00:00:00 2001
From: Juergen Spitzmueller
Date: Fri, 5 Jul 2024 14:05:26 +0200
Subject: [PATCH] Remove performance bottleneck in getAuthors()

The regex is expensive, which is especially noticeable with very long
author lists.

This introduces a case-insensitive subst() variant which is much faster.

The case-insensitive branch lowercases the search string as well, so it
also matches when oldstr contains uppercase characters, and it mirrors
each replacement into the lowercased copy instead of lowercasing the
whole string again after every substitution.

(cherry picked from commit 8ba74fe9589fca3b00134e4d4f1fc130ad960c69,
with the adjustments described above)
---
 src/BiblioInfo.cpp       |  7 +------
 src/support/lstrings.cpp | 31 ++++++++++++++++++++++++-------
 src/support/lstrings.h   |  3 ++-
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/src/BiblioInfo.cpp b/src/BiblioInfo.cpp
index 253fb3759c..e2ea4bde54 100644
--- a/src/BiblioInfo.cpp
+++ b/src/BiblioInfo.cpp
@@ -279,12 +279,7 @@ vector const getAuthors(docstring const & author)
 	// Then, we temporarily make all " and " strings to ampersands in order
 	// to handle them later on a per-char level. Note that arbitrary casing
 	// ("And", "AND", "aNd", ...) is allowed in bibtex (#10465).
-	static regex const and_reg("(.* )([aA][nN][dD])( .*)");
-	smatch sub;
-	string res = to_utf8(iname);
-	while (regex_match(res, sub, and_reg))
-		res = sub.str(1) + "&" + sub.str(3);
-	iname = from_utf8(res);
+	iname = subst(iname, from_ascii(" and "), from_ascii(" & "), false);
 	// Now we traverse through the string and replace the "&" by the proper
 	// output in- and outside groups
 	docstring name;
diff --git a/src/support/lstrings.cpp b/src/support/lstrings.cpp
index 61ea5bf36a..600885f80c 100644
--- a/src/support/lstrings.cpp
+++ b/src/support/lstrings.cpp
@@ -913,16 +913,32 @@ String const subst_string(String const & a,
 
 
 docstring const subst_string(docstring const & a,
-		docstring const & oldstr, docstring const & newstr)
+		docstring const & oldstr, docstring const & newstr,
+		bool const case_sens)
 {
 	LASSERT(!oldstr.empty(), return a);
 	docstring lstr = a;
 	size_t i = 0;
 	size_t const olen = oldstr.length();
-	while ((i = lstr.find(oldstr, i)) != string::npos) {
-		lstr.replace(i, olen, newstr);
-		i += newstr.length(); // We need to be sure that we don't
-		// use the same i over and over again.
+	if (case_sens)
+		while ((i = lstr.find(oldstr, i)) != string::npos) {
+			lstr.replace(i, olen, newstr);
+			i += newstr.length(); // We need to be sure that we don't
+			// use the same i over and over again.
+		}
+	else {
+		// Search in lowercased copies of both strings so that the
+		// match ignores case whatever the casing of oldstr is.
+		docstring const lcold = lowercase(oldstr);
+		docstring lcstr = lowercase(lstr);
+		while ((i = lcstr.find(lcold, i)) != string::npos) {
+			lstr.replace(i, olen, newstr);
+			// Mirror the edit so that the indices of both strings stay
+			// aligned without lowercasing everything again on each hit.
+			lcstr.replace(i, olen, newstr);
+			i += newstr.length(); // We need to be sure that we don't
+			// use the same i over and over again.
+		}
 	}
 	return lstr;
 }
@@ -951,9 +967,10 @@ string const subst(string const & a,
 
 
 docstring const subst(docstring const & a,
-		docstring const & oldstr, docstring const & newstr)
+		docstring const & oldstr, docstring const & newstr,
+		bool case_sens)
 {
-	return subst_string(a, oldstr, newstr);
+	return subst_string(a, oldstr, newstr, case_sens);
 }
 
 
diff --git a/src/support/lstrings.h b/src/support/lstrings.h
index 390d29c66a..b406f30994 100644
--- a/src/support/lstrings.h
+++ b/src/support/lstrings.h
@@ -196,7 +196,8 @@ std::string const subst(std::string const & a,
 
 /// substitutes all instances of \a oldstr with \a newstr
 docstring const subst(docstring const & a,
-		docstring const & oldstr, docstring const & newstr);
+		docstring const & oldstr, docstring const & newstr,
+		bool case_sens = true);
 
 /// Count all occurrences of char \a chr inside \a str
 int count_char(std::string const & str, char chr);
-- 
2.39.5