src/BiblioInfo.cpp

   1 /**
   2  * \file BiblioInfo.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Angus Leeming
   7  * \author Herbert Voß
   8  * \author Richard Kimberly Heck
   9  * \author Julien Rioux
  10  * \author Jürgen Spitzmüller
  11  *
  12  * Full author contact details are available in file CREDITS.
  13  */
  14
  15 #include <config.h>
  16
  17 #include "BiblioInfo.h"
  18
  19 #include "Buffer.h"
  20 #include "BufferParams.h"
  21 #include "Citation.h"
  22 #include "Encoding.h"
  23 #include "Language.h"
  24 #include "TextClass.h"
  25 #include "TocBackend.h"
  26 #include "xml.h"
  27
  28 #include "support/convert.h"
  29 #include "support/debug.h"
  30 #include "support/docstream.h"
  31 #include "support/FileName.h"
  32 #include "support/gettext.h"
  33 #include "support/lassert.h"
  34 #include "support/lstrings.h"
  35 #include "support/textutils.h"
  36
  37 #include <map>
  38 #include <regex>
  39 #include <set>
  40
  41 using namespace std;
  42 using namespace lyx::support;
  43
  44
  45 namespace lyx {
  46
  47 namespace {
  48
  49 // Remove placeholders from names
  50 docstring renormalize(docstring const & input)
  51 {
  52         docstring res = subst(input, from_ascii("$$space!"), from_ascii(" "));
  53         return subst(res, from_ascii("$$comma!"), from_ascii(","));
  54 }
  55
  56
  57 // Split the surname into prefix ("von-part") and family name
  58 pair<docstring, docstring> parseSurname(docstring const & sname)
  59 {
  60         // Split the surname into its tokens
  61         vector<docstring> pieces = getVectorFromString(sname, from_ascii(" "));
  62         if (pieces.size() < 2)
  63                 return make_pair(docstring(), sname);
  64
  65         // Now we look for pieces that begin with a lower case letter.
  66         // All except for the very last token constitute the "von-part".
  67         docstring prefix;
  68         vector<docstring>::const_iterator it = pieces.begin();
  69         vector<docstring>::const_iterator const en = pieces.end();
  70         bool first = true;
  71         for (; it != en; ++it) {
  72                 if ((*it).empty())
  73                         continue;
  74                 // If this is the last piece, then what we now have is
  75                 // the family name, notwithstanding the casing.
  76                 if (it + 1 == en)
  77                         break;
  78                 char_type const c = (*it)[0];
  79                 // If the piece starts with a upper case char, we assume
  80                 // this is part of the surname.
  81                 if (!isLower(c))
  82                         break;
  83                 // Nothing of the former, so add this piece to the prename
  84                 if (!first)
  85                         prefix += " ";
  86                 else
  87                         first = false;
  88                 prefix += *it;
  89         }
  90
  91         // Reconstruct the family name.
  92         // Note that if we left the loop with because it + 1 == en,
  93         // then this will still do the right thing, i.e., make surname
  94         // just be the last piece.
  95         docstring surname;
  96         first = true;
  97         for (; it != en; ++it) {
  98                 if (!first)
  99                         surname += " ";
 100                 else
 101                         first = false;
 102                 surname += *it;
 103         }
 104         return make_pair(prefix, surname);
 105 }
 106
 107
 108 struct name_parts {
 109         docstring surname;
 110         docstring prename;
 111         docstring suffix;
 112         docstring prefix;
 113 };
 114
 115
 116 // gets the name parts (prename, surname, prefix, suffix) from an author-type string
 117 name_parts nameParts(docstring const & iname)
 118 {
 119         name_parts res;
 120         if (iname.empty())
 121                 return res;
 122
 123         // First we check for goupings (via {...}) and replace blanks and
 124         // commas inside groups with temporary placeholders
 125         docstring name;
 126         int gl = 0;
 127         docstring::const_iterator p = iname.begin();
 128         while (p != iname.end()) {
 129                 // count grouping level
 130                 if (*p == '{')
 131                         ++gl;
 132                 else if (*p == '}')
 133                         --gl;
 134                 // generate string with probable placeholders
 135                 if (*p == ' ' && gl > 0)
 136                         name += from_ascii("$$space!");
 137                 else if (*p == ',' && gl > 0)
 138                         name += from_ascii("$$comma!");
 139                 else
 140                         name += *p;
 141                 ++p;
 142         }
 143
 144         // Now we look for a comma, and take the last name to be everything
 145         // preceding the right-most one, so that we also get the name suffix
 146         // (aka "jr" part).
 147         vector<docstring> pieces = getVectorFromString(name);
 148         if (pieces.size() > 1) {
 149                 // Whether we have a name suffix or not, the prename is
 150                 // always last item
 151                 res.prename = renormalize(pieces.back());
 152                 // The family name, conversely, is always the first item.
 153                 // However, it might contain a prefix (aka "von" part)
 154                 docstring const sname = pieces.front();
 155                 res.prefix = renormalize(parseSurname(sname).first);
 156                 res.surname = renormalize(parseSurname(sname).second);
 157                 // If we have three pieces (the maximum allowed by BibTeX),
 158                 // the second one is the name suffix.
 159                 if (pieces.size() > 2)
 160                         res.suffix = renormalize(pieces.at(1));
 161                 return res;
 162         }
 163
 164         // OK, so now we want to look for the last name.
 165         // Split on spaces, to get various tokens.
 166         pieces = getVectorFromString(name, from_ascii(" "));
 167         // No space: Only a family name given
 168         if (pieces.size() < 2) {
 169                 res.surname = renormalize(pieces.back());
 170                 return res;
 171         }
 172         // If we get two pieces, assume "prename surname"
 173         if (pieces.size() == 2) {
 174                 res.prename = renormalize(pieces.front());
 175                 res.surname = renormalize(pieces.back());
 176                 return res;
 177         }
 178
 179         // More than 3 pieces: A name prefix (aka "von" part) might be included.
 180         // We look for the first piece that begins with a lower case letter
 181         // (which is the name prefix, if it is not the last token) or the last token.
 182         docstring prename;
 183         vector<docstring>::const_iterator it = pieces.begin();
 184         vector<docstring>::const_iterator const en = pieces.end();
 185         bool first = true;
 186         for (; it != en; ++it) {
 187                 if ((*it).empty())
 188                         continue;
 189                 char_type const c = (*it)[0];
 190                 // If the piece starts with a lower case char, we assume
 191                 // this is the name prefix and thus prename is complete.
 192                 if (isLower(c))
 193                         break;
 194                 // Same if this is the last piece, which is always the surname.
 195                 if (it + 1 == en)
 196                         break;
 197                 // Nothing of the former, so add this piece to the prename
 198                 if (!first)
 199                         prename += " ";
 200                 else
 201                         first = false;
 202                 prename += *it;
 203         }
 204
 205         // Now reconstruct the family name and strip the prefix.
 206         // Note that if we left the loop because it + 1 == en,
 207         // then this will still do the right thing, i.e., make surname
 208         // just be the last piece.
 209         docstring surname;
 210         first = true;
 211         for (; it != en; ++it) {
 212                 if (!first)
 213                         surname += " ";
 214                 else
 215                         first = false;
 216                 surname += *it;
 217         }
 218         res.prename = renormalize(prename);
 219         res.prefix = renormalize(parseSurname(surname).first);
 220         res.surname = renormalize(parseSurname(surname).second);
 221         return res;
 222 }
 223
 224
 225 docstring constructName(docstring const & name, string const & scheme)
 226 {
 227         // re-constructs a name from name parts according
 228         // to a given scheme
 229         docstring const prename = nameParts(name).prename;
 230         docstring const surname = nameParts(name).surname;
 231         docstring const prefix = nameParts(name).prefix;
 232         docstring const suffix = nameParts(name).suffix;
 233         string res = scheme;
 234         static regex const reg1("(.*)(\\{%prename%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 235         static regex const reg2("(.*)(\\{%suffix%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 236         static regex const reg3("(.*)(\\{%prefix%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 237         smatch sub;
 238         // Changing the first parameter of regex_match() may corrupt the
 239         // second one. In this case we use the temporary string tmp.
 240         if (regex_match(scheme, sub, reg1)) {
 241                 res = sub.str(1);
 242                 if (!prename.empty())
 243                         res += sub.str(3);
 244                 res += sub.str(5);
 245         }
 246         if (regex_match(res, sub, reg2)) {
 247                 string tmp = sub.str(1);
 248                 if (!suffix.empty())
 249                         tmp += sub.str(3);
 250                 res = tmp + sub.str(5);
 251         }
 252         if (regex_match(res, sub, reg3)) {
 253                 string tmp = sub.str(1);
 254                 if (!prefix.empty())
 255                         tmp += sub.str(3);
 256                 res = tmp + sub.str(5);
 257         }
 258         docstring result = from_ascii(res);
 259         result = subst(result, from_ascii("%prename%"), prename);
 260         result = subst(result, from_ascii("%surname%"), surname);
 261         result = subst(result, from_ascii("%prefix%"), prefix);
 262         result = subst(result, from_ascii("%suffix%"), suffix);
 263         return result;
 264 }
 265
 266
 267 vector<docstring> const getAuthors(docstring const & author)
 268 {
 269         // We check for goupings (via {...}) and only consider " and "
 270         // outside groups as author separator. This is to account
 271         // for cases such as {{Barnes and Noble, Inc.}}, which
 272         // need to be treated as one single family name.
 273         // We use temporary placeholders in order to differentiate the
 274         // diverse " and " cases.
 275
 276         // First, we temporarily replace all ampersands. It is rather unusual
 277         // in author names, but can happen (consider cases such as "C \& A Corp.").
 278         docstring iname = subst(author, from_ascii("&"), from_ascii("$$amp!"));
 279         // Then, we temporarily make all " and " strings to ampersands in order
 280         // to handle them later on a per-char level. Note that arbitrary casing
 281         // ("And", "AND", "aNd", ...) is allowed in bibtex (#10465).
 282         static regex const and_reg("(.* )([aA][nN][dD])( .*)");
 283         smatch sub;
 284         string res = to_utf8(iname);
 285         while (regex_match(res, sub, and_reg))
 286                 res = sub.str(1) + "&" + sub.str(3);
 287         iname = from_utf8(res);
 288         // Now we traverse through the string and replace the "&" by the proper
 289         // output in- and outside groups
 290         docstring name;
 291         int gl = 0;
 292         docstring::const_iterator p = iname.begin();
 293         while (p != iname.end()) {
 294                 // count grouping level
 295                 if (*p == '{')
 296                         ++gl;
 297                 else if (*p == '}')
 298                         --gl;
 299                 // generate string with probable placeholders
 300                 if (*p == '&') {
 301                         if (gl > 0)
 302                                 // Inside groups, we output "and"
 303                                 name += from_ascii("and");
 304                         else
 305                                 // Outside groups, we output a separator
 306                                 name += from_ascii("$$namesep!");
 307                 }
 308                 else
 309                         name += *p;
 310                 ++p;
 311         }
 312
 313         // re-insert the literal ampersands
 314         name = subst(name, from_ascii("$$amp!"), from_ascii("&"));
 315
 316         // Now construct the actual vector
 317         return getVectorFromString(name, from_ascii(" $$namesep! "));
 318 }
 319
 320
 321 bool multipleAuthors(docstring const & author)
 322 {
 323         return getAuthors(author).size() > 1;
 324 }
 325
 326
 327 // converts a string containing LaTeX commands into unicode
 328 // for display.
 329 docstring convertLaTeXCommands(docstring const & str)
 330 {
 331         docstring val = str;
 332         docstring ret;
 333
 334         bool scanning_cmd = false;
 335         bool scanning_math = false;
 336         bool is_section = false;
 337         bool escaped = false; // used to catch \$, etc.
 338         while (!val.empty()) {
 339                 char_type const ch = val[0];
 340
 341                 // if we're scanning math, we output everything until we
 342                 // find an unescaped $, at which point we break out.
 343                 if (scanning_math) {
 344                         if (escaped)
 345                                 escaped = false;
 346                         else if (ch == '\\')
 347                                 escaped = true;
 348                         else if (ch == '$')
 349                                 scanning_math = false;
 350                         ret += ch;
 351                         val = val.substr(1);
 352                         continue;
 353                 }
 354
 355                 // if we're scanning a command name, then we just
 356                 // discard characters until we hit something that
 357                 // isn't alpha.
 358                 if (scanning_cmd) {
 359                         if (!is_section && ch == 'S') {
 360                                 is_section = true;
 361                                 val = val.substr(1);
 362                                 continue;
 363                         }
 364                         if (isAlphaASCII(ch)) {
 365                                 is_section = false;
 366                                 val = val.substr(1);
 367                                 escaped = false;
 368                                 continue;
 369                         } else if (is_section) {
 370                                 ret.push_back(0x00a7);
 371                                 is_section = false;
 372                                 continue;
 373                         }
 374                         // so we're done with this command.
 375                         // now we fall through and check this character.
 376                         is_section = false;
 377                         scanning_cmd = false;
 378                 }
 379
 380                 // was the last character a \? If so, then this is something like:
 381                 // \\ or \$, so we'll just output it. That's probably not always right...
 382                 if (escaped) {
 383                         // exception: output \, as THIN SPACE
 384                         if (ch == ',')
 385                                 ret.push_back(0x2009);
 386                         else
 387                                 ret += ch;
 388                         val = val.substr(1);
 389                         escaped = false;
 390                         continue;
 391                 }
 392
 393                 if (ch == '~') {
 394                         ret += char_type(0x00a0);
 395                         val = val.substr(1);
 396                         continue;
 397                 }
 398
 399                 if (ch == '$') {
 400                         ret += ch;
 401                         val = val.substr(1);
 402                         scanning_math = true;
 403                         continue;
 404                 }
 405
 406                 // Change text mode accents in the form
 407                 // {\v a} to \v{a} (see #9340).
 408                 // FIXME: This is a sort of mini-tex2lyx.
 409                 //        Use the real tex2lyx instead!
 410                 static regex const tma_reg("^\\{\\\\[bcCdfGhHkrtuUv]\\s\\w\\}");
 411                 if (regex_search(to_utf8(val), tma_reg)) {
 412                         val = val.substr(1);
 413                         val.replace(2, 1, from_ascii("{"));
 414                         continue;
 415                 }
 416
 417                 // Apart from the above, we just ignore braces
 418                 if (ch == '{' || ch == '}') {
 419                         val = val.substr(1);
 420                         continue;
 421                 }
 422
 423                 // we're going to check things that look like commands, so if
 424                 // this doesn't, just output it.
 425                 if (ch != '\\') {
 426                         ret += ch;
 427                         val = val.substr(1);
 428                         continue;
 429                 }
 430
 431                 // ok, could be a command of some sort
 432                 // let's see if it corresponds to some unicode
 433                 // unicodesymbols has things in the form: \"{u},
 434                 // whereas we may see things like: \"u. So we'll
 435                 // look for that and change it, if necessary.
 436                 // FIXME: This is a sort of mini-tex2lyx.
 437                 //        Use the real tex2lyx instead!
 438                 static regex const reg("^\\\\\\W\\w");
 439                 if (regex_search(to_utf8(val), reg)) {
 440                         val.insert(3, from_ascii("}"));
 441                         val.insert(2, from_ascii("{"));
 442                 }
 443                 bool termination;
 444                 docstring rem;
 445                 docstring const cnvtd = Encodings::fromLaTeXCommand(val,
 446                                 Encodings::TEXT_CMD, termination, rem);
 447                 if (!cnvtd.empty()) {
 448                         // it did, so we'll take that bit and proceed with what's left
 449                         ret += cnvtd;
 450                         val = rem;
 451                         continue;
 452                 }
 453                 // it's a command of some sort
 454                 scanning_cmd = true;
 455                 escaped = true;
 456                 val = val.substr(1);
 457         }
 458         return ret;
 459 }
 460
 461
 462 // Escape '<' and '>' and remove richtext markers (e.g. {!this is richtext!}) from a string.
 463 docstring processRichtext(docstring const & str, bool richtext)
 464 {
 465         docstring val = str;
 466         docstring ret;
 467
 468         bool scanning_rich = false;
 469         while (!val.empty()) {
 470                 char_type const ch = val[0];
 471                 if (ch == '{' && val.size() > 1 && val[1] == '!') {
 472                         // beginning of rich text
 473                         scanning_rich = true;
 474                         val = val.substr(2);
 475                         continue;
 476                 }
 477                 if (scanning_rich && ch == '!' && val.size() > 1 && val[1] == '}') {
 478                         // end of rich text
 479                         scanning_rich = false;
 480                         val = val.substr(2);
 481                         continue;
 482                 }
 483                 if (richtext) {
 484                         if (scanning_rich)
 485                                 ret += ch;
 486                         else {
 487                                 // we need to escape '<' and '>'
 488                                 if (ch == '<')
 489                                         ret += "&lt;";
 490                                 else if (ch == '>')
 491                                         ret += "&gt;";
 492                                 else
 493                                         ret += ch;
 494                         }
 495                 } else if (!scanning_rich /* && !richtext */)
 496                         ret += ch;
 497                 // else the character is discarded, which will happen only if
 498                 // richtext == false and we are scanning rich text
 499                 val = val.substr(1);
 500         }
 501         return ret;
 502 }
 503
 504 } // namespace
 505
 506
 507 //////////////////////////////////////////////////////////////////////
 508 //
 509 // BibTeXInfo
 510 //
 511 //////////////////////////////////////////////////////////////////////
 512
 513 BibTeXInfo::BibTeXInfo(docstring const & key, docstring const & type)
 514         : is_bibtex_(true), bib_key_(key), num_bib_key_(0), entry_type_(type),
 515           info_(), format_(), modifier_(0)
 516 {}
 517
 518
 519
 520 docstring const BibTeXInfo::getAuthorOrEditorList(Buffer const * buf,
 521                                           bool full, bool forceshort) const
 522 {
 523         docstring author = operator[]("author");
 524         if (author.empty())
 525                 author = operator[]("editor");
 526
 527         return getAuthorList(buf, author, full, forceshort);
 528 }
 529
 530
 531 docstring const BibTeXInfo::getAuthorList(Buffer const * buf,
 532                 docstring const & author, bool const full, bool const forceshort,
 533                 bool const allnames, bool const beginning) const
 534 {
 535         // Maxnames treshold depend on engine
 536         size_t maxnames = buf ?
 537                 buf->params().documentClass().max_citenames() : 2;
 538
 539         if (!is_bibtex_) {
 540                 docstring const opt = label();
 541                 if (opt.empty())
 542                         return docstring();
 543
 544                 docstring authors;
 545                 docstring const remainder = trim(split(opt, authors, '('));
 546                 if (remainder.empty())
 547                         // in this case, we didn't find a "(",
 548                         // so we don't have author (year)
 549                         return docstring();
 550                 if (full) {
 551                         // Natbib syntax is "Jones et al.(1990)Jones, Baker, and Williams"
 552                         docstring const fullauthors = trim(rsplit(remainder, ')'));
 553                         if (!fullauthors.empty())
 554                                 return fullauthors;
 555                 }
 556                 return authors;
 557         }
 558
 559         if (author.empty())
 560                 return author;
 561
 562         // OK, we've got some names. Let's format them.
 563         // Try to split the author list
 564         vector<docstring> const authors = getAuthors(author);
 565
 566         docstring retval;
 567
 568         CiteEngineType const engine_type = buf ? buf->params().citeEngineType()
 569                                                : ENGINE_TYPE_DEFAULT;
 570
 571         // These are defined in the styles
 572         string const etal =
 573                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_etal")
 574                     : " et al.";
 575         string const namesep =
 576                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_namesep")
 577                    : ", ";
 578         string const lastnamesep =
 579                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_lastnamesep")
 580                     : ", and ";
 581         string const pairnamesep =
 582                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_pairnamesep")
 583                      : " and ";
 584         string firstnameform =
 585                         buf ? buf->params().documentClass().getCiteMacro(engine_type, "!firstnameform")
 586                              : "{%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}{%prename%[[, %prename%]]}";
 587         if (!beginning)
 588                 firstnameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!firstbynameform")
 589                                              : "%prename% {%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}";
 590         string othernameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!othernameform")
 591                              : "{%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}{%prename%[[, %prename%]]}";
 592         if (!beginning)
 593                 othernameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!otherbynameform")
 594                                              : "%prename% {%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}";
 595         string citenameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!citenameform")
 596                              : "{%prefix%[[%prefix% ]]}%surname%";
 597
 598         // Shorten the list (with et al.) if forceshort is set
 599         // and the list can actually be shortened, else if maxcitenames
 600         // is passed and full is not set.
 601         bool shorten = forceshort && authors.size() > 1;
 602         vector<docstring>::const_iterator it = authors.begin();
 603         vector<docstring>::const_iterator en = authors.end();
 604         for (size_t i = 0; it != en; ++it, ++i) {
 605                 if (i >= maxnames && !full) {
 606                         shorten = true;
 607                         break;
 608                 }
 609                 if (*it == "others") {
 610                         retval += buf ? buf->B_(etal) : from_ascii(etal);
 611                         break;
 612                 }
 613                 if (i > 0 && i == authors.size() - 1) {
 614                         if (authors.size() == 2)
 615                                 retval += buf ? buf->B_(pairnamesep) : from_ascii(pairnamesep);
 616                         else
 617                                 retval += buf ? buf->B_(lastnamesep) : from_ascii(lastnamesep);
 618                 } else if (i > 0)
 619                         retval += buf ? buf->B_(namesep) : from_ascii(namesep);
 620                 if (allnames)
 621                         retval += (i == 0) ? constructName(*it, firstnameform)
 622                                 : constructName(*it, othernameform);
 623                 else
 624                         retval += constructName(*it, citenameform);
 625         }
 626         if (shorten) {
 627                 if (allnames)
 628                         retval = constructName(authors[0], firstnameform) + (buf ? buf->B_(etal) : from_ascii(etal));
 629                 else
 630                         retval = constructName(authors[0], citenameform) + (buf ? buf->B_(etal) : from_ascii(etal));
 631         }
 632
 633         return convertLaTeXCommands(retval);
 634 }
 635
 636
 637 docstring const BibTeXInfo::getYear() const
 638 {
 639         if (is_bibtex_) {
 640                 // first try legacy year field
 641                 docstring year = operator[]("year");
 642                 if (!year.empty())
 643                         return year;
 644                 // now try biblatex's date field
 645                 year = operator[]("date");
 646                 // Format is [-]YYYY-MM-DD*/[-]YYYY-MM-DD*
 647                 // We only want the years.
 648                 static regex const yreg("[-]?([\\d]{4}).*");
 649                 static regex const ereg(".*/[-]?([\\d]{4}).*");
 650                 smatch sm;
 651                 string const date = to_utf8(year);
 652                 if (!regex_match(date, sm, yreg))
 653                         // cannot parse year.
 654                         return docstring();
 655                 year = from_ascii(sm[1]);
 656                 // check for an endyear
 657                 if (regex_match(date, sm, ereg))
 658                         year += char_type(0x2013) + from_ascii(sm[1]);
 659                 return year;
 660         }
 661
 662         docstring const opt = label();
 663         if (opt.empty())
 664                 return docstring();
 665
 666         docstring authors;
 667         docstring tmp = split(opt, authors, '(');
 668         if (tmp.empty())
 669                 // we don't have author (year)
 670                 return docstring();
 671         docstring year;
 672         tmp = split(tmp, year, ')');
 673         return year;
 674 }
 675
 676
 677 void BibTeXInfo::getLocators(docstring & doi, docstring & url, docstring & file) const
 678 {
 679         if (is_bibtex_) {
 680                 // get "doi" entry from citation record
 681                 doi = operator[]("doi");
 682                 if (!doi.empty() && !prefixIs(doi,from_ascii("http")))
 683                         doi = "https://doi.org/" + doi;
 684                 // get "url" entry from citation record
 685                 url = operator[]("url");
 686                 // get "file" entry from citation record
 687                 file = operator[]("file");
 688
 689                 // Jabref case, "file" field has a format (depending on exporter):
 690                 // Description:Location:Filetype;Description:Location:Filetype...
 691                 // or simply:
 692                 // Location;Location;...
 693                 // We will strip out the locations and return an \n-separated list
 694                 if (!file.empty()) {
 695                         docstring filelist;
 696                         vector<docstring> files = getVectorFromString(file, from_ascii(";"));
 697                         for (auto const & f : files) {
 698                                 // first try if we have Description:Location:Filetype
 699                                 docstring ret, filedest, tmp;
 700                                 ret = split(f, tmp, ':');
 701                                 tmp = split(ret, filedest, ':');
 702                                 if (filedest.empty())
 703                                         // we haven't, so use the whole string
 704                                         filedest = f;
 705                                 // TODO howto deal with relative directories?
 706                                 FileName fn(to_utf8(filedest));
 707                                 if (fn.exists()) {
 708                                         if (!filelist.empty())
 709                                                 filelist += '\n';
 710                                         filelist += "file:///" + filedest;
 711                                 }
 712                         }
 713                         if (!filelist.empty())
 714                                 file = filelist;
 715                 }
 716
 717                 // kbibtex case, "localfile" field with format:
 718                 // file1.pdf;file2.pdf
 719                 // We will strip out the locations and return an \n-separated list
 720                 docstring kfile;
 721                 if (file.empty())
 722                         kfile = operator[]("localfile");
 723                 if (!kfile.empty()) {
 724                         docstring filelist;
 725                         vector<docstring> files = getVectorFromString(kfile, from_ascii(";"));
 726                         for (auto const & f : files) {
 727                                 // TODO howto deal with relative directories?
 728                                 FileName fn(to_utf8(f));
 729                                 if (fn.exists()) {
 730                                         if (!filelist.empty())
 731                                                 filelist += '\n';
 732                                         filelist = "file:///" + f;
 733                                 }
 734                         }
 735                         if (!filelist.empty())
 736                                 file = filelist;
 737                 }
 738
 739                 if (!url.empty())
 740                         return;
 741
 742                 // try biblatex specific fields, see its manual
 743                 // 3.13.7 "Electronic Publishing Information"
 744                 docstring eprinttype = operator[]("eprinttype");
 745                 docstring eprint = operator[]("eprint");
 746                 if (eprint.empty())
 747                         return;
 748
 749                 if (eprinttype == "arxiv")
 750                         url = "https://arxiv.org/abs/" + eprint;
 751                 if (eprinttype == "jstor")
 752                         url = "https://www.jstor.org/stable/" + eprint;
 753                 if (eprinttype == "pubmed")
 754                         url = "http://www.ncbi.nlm.nih.gov/pubmed/" + eprint;
 755                 if (eprinttype == "hdl")
 756                         url = "https://hdl.handle.net/" + eprint;
 757                 if (eprinttype == "googlebooks")
 758                         url = "http://books.google.com/books?id=" + eprint;
 759
 760                 return;
 761         }
 762
 763         // Here can be handled the bibliography environment. All one could do
 764         // here is let LyX scan the entry for URL or HRef insets.
 765 }
 766
 767
 768 namespace {
 769
 770 docstring parseOptions(docstring const & format, string & optkey,
 771                     docstring & ifpart, docstring & elsepart);
 772
 773 // Calls parseOptions to deal with an embedded option, such as:
 774 //   {%number%[[, no.~%number%]]}
 775 // which must appear at the start of format. ifelsepart gets the
 776 // whole of the option, and we return what's left after the option.
 777 // we return format if there is an error.
 778 docstring parseEmbeddedOption(docstring const & format, docstring & ifelsepart)
 779 {
 780         LASSERT(format[0] == '{' && format[1] == '%', return format);
 781         string optkey;
 782         docstring ifpart;
 783         docstring elsepart;
 784         docstring const rest = parseOptions(format, optkey, ifpart, elsepart);
 785         if (format == rest) { // parse error
 786                 LYXERR0("ERROR! Couldn't parse `" << format <<"'.");
 787                 return format;
 788         }
 789         LASSERT(rest.size() <= format.size(),
 790                 { ifelsepart = docstring(); return format; });
 791         ifelsepart = format.substr(0, format.size() - rest.size());
 792         return rest;
 793 }
 794
 795
 796 // Gets a "clause" from a format string, where the clause is
 797 // delimited by '[[' and ']]'. Returns what is left after the
 798 // clause is removed, and returns format if there is an error.
 799 docstring getClause(docstring const & format, docstring & clause)
 800 {
 801         docstring fmt = format;
 802         // remove '[['
 803         fmt = fmt.substr(2);
 804         // we'll remove characters from the front of fmt as we
 805         // deal with them
 806         while (!fmt.empty()) {
 807                 if (fmt[0] == ']' && fmt.size() > 1 && fmt[1] == ']') {
 808                         // that's the end
 809                         fmt = fmt.substr(2);
 810                         break;
 811                 }
 812                 // check for an embedded option
 813                 if (fmt[0] == '{' && fmt.size() > 1 && fmt[1] == '%') {
 814                         docstring part;
 815                         docstring const rest = parseEmbeddedOption(fmt, part);
 816                         if (fmt == rest) {
 817                                 LYXERR0("ERROR! Couldn't parse embedded option in `" << format <<"'.");
 818                                 return format;
 819                         }
 820                         clause += part;
 821                         fmt = rest;
 822                 } else { // it's just a normal character
 823                                 clause += fmt[0];
 824                                 fmt = fmt.substr(1);
 825                 }
 826         }
 827         return fmt;
 828 }
 829
 830
 831 // parse an options string, which must appear at the start of the
 832 // format parameter. puts the parsed bits in optkey, ifpart, and
 833 // elsepart and returns what's left after the option is removed.
 834 // if there's an error, it returns format itself.
 835 docstring parseOptions(docstring const & format, string & optkey,
 836                     docstring & ifpart, docstring & elsepart)
 837 {
 838         LASSERT(format[0] == '{' && format[1] == '%', return format);
 839         // strip '{%'
 840         docstring fmt = format.substr(2);
 841         size_t pos = fmt.find('%'); // end of key
 842         if (pos == string::npos) {
 843                 LYXERR0("Error parsing  `" << format <<"'. Can't find end of key.");
 844                 return format;
 845         }
 846         optkey = to_utf8(fmt.substr(0, pos));
 847         fmt = fmt.substr(pos + 1);
 848         // [[format]] should be next
 849         if (fmt[0] != '[' || fmt[1] != '[') {
 850                 LYXERR0("Error parsing  `" << format <<"'. Can't find '[[' after key.");
 851                 return format;
 852         }
 853
 854         docstring curfmt = fmt;
 855         fmt = getClause(curfmt, ifpart);
 856         if (fmt == curfmt) {
 857                 LYXERR0("Error parsing  `" << format <<"'. Couldn't get if clause.");
 858                 return format;
 859         }
 860
 861         if (fmt[0] == '}') // we're done, no else clause
 862                 return fmt.substr(1);
 863
 864         // else part should follow
 865         if (fmt[0] != '[' || fmt[1] != '[') {
 866                 LYXERR0("Error parsing  `" << format <<"'. Can't find else clause.");
 867                 return format;
 868         }
 869
 870         curfmt = fmt;
 871         fmt = getClause(curfmt, elsepart);
 872         // we should be done
 873         if (fmt == curfmt || fmt[0] != '}') {
 874                 LYXERR0("Error parsing  `" << format <<"'. Can't find end of option.");
 875                 return format;
 876         }
 877         return fmt.substr(1);
 878 }
 879
 880
 881 } // namespace
 882
 883 /* FIXME
 884 Bug #9131 revealed an oddity in how we are generating citation information
 885 when more than one key is given. We end up building a longer and longer format
 886 string as we go, which we then have to re-parse, over and over and over again,
 887 rather than generating the information for the individual keys and then putting
 888 all of that together. We do that to deal with the way separators work, from what
 889 I can tell, but it still feels like a hack. Fixing this would require quite a
 890 bit of work, however.
 891 */
 892 docstring BibTeXInfo::expandFormat(docstring const & format,
 893                 BibTeXInfoList const & xrefs, int & counter, Buffer const & buf,
 894                 CiteItem const & ci, bool next, bool second) const
 895 {
 896         // incorrect use of macros could put us in an infinite loop
 897         static int const max_passes = 5000;
 898         // the use of overly large keys can lead to performance problems, due
 899         // to eventual attempts to convert LaTeX macros to unicode. See bug
 900         // #8944. By default, the size is limited to 128 (in CiteItem), but
 901         // for specific purposes (such as XHTML export), it needs to be enlarged
 902         // This is perhaps not the best solution, but it will have to do for now.
 903         size_t const max_keysize = ci.max_key_size;
 904         odocstringstream ret; // return value
 905         string key;
 906         bool scanning_key = false;
 907         bool scanning_rich = false;
 908
 909         CiteEngineType const engine_type = buf.params().citeEngineType();
 910         docstring fmt = format;
 911         // we'll remove characters from the front of fmt as we
 912         // deal with them
 913         while (!fmt.empty()) {
 914                 if (counter > max_passes) {
 915                         LYXERR0("Recursion limit reached while parsing `"
 916                                 << format << "'.");
 917                         return _("ERROR!");
 918                 }
 919
 920                 char_type thischar = fmt[0];
 921                 if (thischar == '%') {
 922                         // beginning or end of key
 923                         if (scanning_key) {
 924                                 // end of key
 925                                 scanning_key = false;
 926                                 // so we replace the key with its value, which may be empty
 927                                 if (key[0] == '!') {
 928                                         // macro
 929                                         string const val =
 930                                                 buf.params().documentClass().getCiteMacro(engine_type, key);
 931                                         fmt = from_utf8(val) + fmt.substr(1);
 932                                         counter += 1;
 933                                         continue;
 934                                 } else if (prefixIs(key, "B_")) {
 935                                         // a translatable bit (to the Buffer language)
 936                                         string const val =
 937                                                 buf.params().documentClass().getCiteMacro(engine_type, key);
 938                                         docstring const trans =
 939                                                 translateIfPossible(from_utf8(val), buf.params().language->code());
 940                                         ret << trans;
 941                                 } else if (key[0] == '_') {
 942                                         // a translatable bit (to the GUI language)
 943                                         string const val =
 944                                                 buf.params().documentClass().getCiteMacro(engine_type, key);
 945                                         docstring const trans =
 946                                                 translateIfPossible(from_utf8(val));
 947                                         ret << trans;
 948                                 } else {
 949                                         docstring const val =
 950                                                 getValueForKey(key, buf, ci, xrefs, max_keysize);
 951                                         if (!scanning_rich)
 952                                                 ret << from_ascii("{!<span class=\"bib-" + key + "\">!}");
 953                                         ret << val;
 954                                         if (!scanning_rich)
 955                                                 ret << from_ascii("{!</span>!}");
 956                                 }
 957                         } else {
 958                                 // beginning of key
 959                                 key.clear();
 960                                 scanning_key = true;
 961                         }
 962                 }
 963                 else if (thischar == '{') {
 964                         // beginning of option?
 965                         if (scanning_key) {
 966                                 LYXERR0("ERROR: Found `{' when scanning key in `" << format << "'.");
 967                                 return _("ERROR!");
 968                         }
 969                         if (fmt.size() > 1) {
 970                                 if (fmt[1] == '%') {
 971                                         // it is the beginning of an optional format
 972                                         string optkey;
 973                                         docstring ifpart;
 974                                         docstring elsepart;
 975                                         docstring const newfmt =
 976                                                 parseOptions(fmt, optkey, ifpart, elsepart);
 977                                         if (newfmt == fmt) // parse error
 978                                                 return _("ERROR!");
 979                                         fmt = newfmt;
 980                                         docstring const val =
 981                                                 getValueForKey(optkey, buf, ci, xrefs);
 982                                         if (optkey == "next" && next)
 983                                                 ret << ifpart; // without expansion
 984                                         else if (optkey == "second" && second) {
 985                                                 int newcounter = 0;
 986                                                 ret << expandFormat(ifpart, xrefs, newcounter, buf,
 987                                                         ci, next);
 988                                         } else if (!val.empty()) {
 989                                                 int newcounter = 0;
 990                                                 ret << expandFormat(ifpart, xrefs, newcounter, buf,
 991                                                         ci, next);
 992                                         } else if (!elsepart.empty()) {
 993                                                 int newcounter = 0;
 994                                                 ret << expandFormat(elsepart, xrefs, newcounter, buf,
 995                                                         ci, next);
 996                                         }
 997                                         // fmt will have been shortened for us already
 998                                         continue;
 999                                 }
1000                                 if (fmt[1] == '!') {
1001                                         // beginning of rich text
1002                                         scanning_rich = true;
1003                                         fmt = fmt.substr(2);
1004                                         ret << from_ascii("{!");
1005                                         continue;
1006                                 }
1007                         }
1008                         // we are here if '{' was not followed by % or !.
1009                         // So it's just a character.
1010                         ret << thischar;
1011                 }
1012                 else if (scanning_rich && thischar == '!'
1013                          && fmt.size() > 1 && fmt[1] == '}') {
1014                         // end of rich text
1015                         scanning_rich = false;
1016                         fmt = fmt.substr(2);
1017                         ret << from_ascii("!}");
1018                         continue;
1019                 }
1020                 else if (scanning_key)
1021                         key += char(thischar);
1022                 else {
1023                         try {
1024                                 ret.put(thischar);
1025                         } catch (EncodingException & /* e */) {
1026                                 LYXERR0("Uncodable character '" << docstring(1, thischar) << " in citation label!");
1027                         }
1028                 }
1029                 fmt = fmt.substr(1);
1030         } // for loop
1031         if (scanning_key) {
1032                 LYXERR0("Never found end of key in `" << format << "'!");
1033                 return _("ERROR!");
1034         }
1035         if (scanning_rich) {
1036                 LYXERR0("Never found end of rich text in `" << format << "'!");
1037                 return _("ERROR!");
1038         }
1039         return ret.str();
1040 }
1041
1042
1043 docstring const & BibTeXInfo::getInfo(BibTeXInfoList const & xrefs,
1044         Buffer const & buf, CiteItem const & ci, docstring const & format_in) const
1045 {
1046         bool const richtext = ci.richtext;
1047
1048         CiteEngineType const engine_type = buf.params().citeEngineType();
1049         DocumentClass const & dc = buf.params().documentClass();
1050         docstring const & format = format_in.empty()?
1051                                 from_utf8(dc.getCiteFormat(engine_type, to_utf8(entry_type_)))
1052                               : format_in;
1053
1054         if (format != format_) {
1055                 // clear caches since format changed
1056                 info_.clear();
1057                 info_richtext_.clear();
1058                 format_ = format;
1059         }
1060
1061         if (!richtext && !info_.empty()) {
1062                 info_ = convertLaTeXCommands(processRichtext(info_, false));
1063                 return info_;
1064         }
1065         if (richtext && !info_richtext_.empty())
1066                 return info_richtext_;
1067
1068         if (!is_bibtex_) {
1069                 BibTeXInfo::const_iterator it = find(from_ascii("ref"));
1070                 info_ = it->second;
1071                 return info_;
1072         }
1073
1074         int counter = 0;
1075         info_ = expandFormat(format, xrefs, counter, buf,
1076                 ci, false, false);
1077
1078         if (info_.empty()) {
1079                 // this probably shouldn't happen
1080                 return info_;
1081         }
1082
1083         if (richtext) {
1084                 info_richtext_ = convertLaTeXCommands(processRichtext(info_, true));
1085                 return info_richtext_;
1086         }
1087
1088         info_ = convertLaTeXCommands(processRichtext(info_, false));
1089         return info_;
1090 }
1091
1092
1093 docstring const BibTeXInfo::getLabel(BibTeXInfoList const & xrefs,
1094         Buffer const & buf, docstring const & format,
1095         CiteItem const & ci, bool next, bool second) const
1096 {
1097         docstring loclabel;
1098
1099         int counter = 0;
1100         loclabel = expandFormat(format, xrefs, counter, buf, ci, next, second);
1101
1102         if (!loclabel.empty() && !next) {
1103                 loclabel = processRichtext(loclabel, ci.richtext);
1104                 loclabel = convertLaTeXCommands(loclabel);
1105         }
1106
1107         return loclabel;
1108 }
1109
1110
1111 docstring const & BibTeXInfo::operator[](docstring const & field) const
1112 {
1113         BibTeXInfo::const_iterator it = find(field);
1114         if (it != end())
1115                 return it->second;
1116         static docstring const empty_value = docstring();
1117         return empty_value;
1118 }
1119
1120
1121 docstring const & BibTeXInfo::operator[](string const & field) const
1122 {
1123         return operator[](from_ascii(field));
1124 }
1125
1126
1127 docstring BibTeXInfo::getValueForKey(string const & oldkey, Buffer const & buf,
1128         CiteItem const & ci, BibTeXInfoList const & xrefs, size_t maxsize) const
1129 {
1130         // anything less is pointless
1131         LASSERT(maxsize >= 16, maxsize = 16);
1132         string key = oldkey;
1133         bool cleanit = false;
1134         if (prefixIs(oldkey, "clean:")) {
1135                 key = oldkey.substr(6);
1136                 cleanit = true;
1137         }
1138
1139         docstring ret = operator[](key);
1140         if (ret.empty()) {
1141                 docstring subtype;
1142                 if (contains(key, ':'))
1143                         subtype = from_ascii(token(key, ':', 1));
1144                 // some special keys
1145                 // FIXME: dialog, textbefore and textafter have nothing to do with this
1146                 if (key == "dialog" && ci.context == CiteItem::Dialog)
1147                         ret = from_ascii("x"); // any non-empty string will do
1148                 else if (key == "export" && ci.context == CiteItem::Export)
1149                         ret = from_ascii("x"); // any non-empty string will do
1150                 else if (key == "ifstar" && ci.Starred)
1151                         ret = from_ascii("x"); // any non-empty string will do
1152                 else if (key == "ifqualified" && ci.isQualified)
1153                         ret = from_ascii("x"); // any non-empty string will do
1154                 else if (key == "entrytype")
1155                         ret = entry_type_;
1156                 else if (prefixIs(key, "ifentrytype:")
1157                          && from_ascii(key.substr(12)) == entry_type_)
1158                         ret = from_ascii("x"); // any non-empty string will do
1159                 else if (key == "key")
1160                         ret = bib_key_;
1161                 else if (key == "label")
1162                         ret = label_;
1163                 else if (key == "modifier" && modifier_ != 0)
1164                         ret = modifier_;
1165                 else if (key == "numericallabel")
1166                         ret = cite_number_;
1167                 else if (prefixIs(key, "ifmultiple:")) {
1168                         // Return whether we have multiple authors
1169                         docstring const kind = operator[](subtype);
1170                         if (multipleAuthors(kind))
1171                                 ret = from_ascii("x"); // any non-empty string will do
1172                 }
1173                 else if (prefixIs(key, "abbrvnames:")) {
1174                         // Special key to provide abbreviated name list,
1175                         // with respect to maxcitenames. Suitable for Bibliography
1176                         // beginnings.
1177                         docstring const kind = operator[](subtype);
1178                         ret = getAuthorList(&buf, kind, false, false, true);
1179                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1180                                 ret[0] = uppercase(ret[0]);
1181                 } else if (prefixIs(key, "fullnames:")) {
1182                         // Return a full name list. Suitable for Bibliography
1183                         // beginnings.
1184                         docstring const kind = operator[](subtype);
1185                         ret = getAuthorList(&buf, kind, true, false, true);
1186                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1187                                 ret[0] = uppercase(ret[0]);
1188                 } else if (prefixIs(key, "forceabbrvnames:")) {
1189                         // Special key to provide abbreviated name lists,
1190                         // irrespective of maxcitenames. Suitable for Bibliography
1191                         // beginnings.
1192                         docstring const kind = operator[](subtype);
1193                         ret = getAuthorList(&buf, kind, false, true, true);
1194                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1195                                 ret[0] = uppercase(ret[0]);
1196                 } else if (prefixIs(key, "abbrvbynames:")) {
1197                         // Special key to provide abbreviated name list,
1198                         // with respect to maxcitenames. Suitable for further names inside a
1199                         // bibliography item // (such as "ed. by ...")
1200                         docstring const kind = operator[](subtype);
1201                         ret = getAuthorList(&buf, kind, false, false, true, false);
1202                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1203                                 ret[0] = uppercase(ret[0]);
1204                 } else if (prefixIs(key, "fullbynames:")) {
1205                         // Return a full name list. Suitable for further names inside a
1206                         // bibliography item // (such as "ed. by ...")
1207                         docstring const kind = operator[](subtype);
1208                         ret = getAuthorList(&buf, kind, true, false, true, false);
1209                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1210                                 ret[0] = uppercase(ret[0]);
1211                 } else if (prefixIs(key, "forceabbrvbynames:")) {
1212                         // Special key to provide abbreviated name lists,
1213                         // irrespective of maxcitenames. Suitable for further names inside a
1214                         // bibliography item // (such as "ed. by ...")
1215                         docstring const kind = operator[](subtype);
1216                         ret = getAuthorList(&buf, kind, false, true, true, false);
1217                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1218                                 ret[0] = uppercase(ret[0]);
1219                 } else if (key == "abbrvciteauthor") {
1220                         // Special key to provide abbreviated author or
1221                         // editor names (suitable for citation labels),
1222                         // with respect to maxcitenames.
1223                         ret = getAuthorOrEditorList(&buf, false, false);
1224                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1225                                 ret[0] = uppercase(ret[0]);
1226                 } else if (key == "fullciteauthor") {
1227                         // Return a full author or editor list (for citation labels)
1228                         ret = getAuthorOrEditorList(&buf, true, false);
1229                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1230                                 ret[0] = uppercase(ret[0]);
1231                 } else if (key == "forceabbrvciteauthor") {
1232                         // Special key to provide abbreviated author or
1233                         // editor names (suitable for citation labels),
1234                         // irrespective of maxcitenames.
1235                         ret = getAuthorOrEditorList(&buf, false, true);
1236                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1237                                 ret[0] = uppercase(ret[0]);
1238                 } else if (key == "bibentry") {
1239                         // Special key to provide the full bibliography entry: see getInfo()
1240                         CiteEngineType const engine_type = buf.params().citeEngineType();
1241                         DocumentClass const & dc = buf.params().documentClass();
1242                         docstring const & format =
1243                                 from_utf8(dc.getCiteFormat(engine_type, to_utf8(entry_type_), false));
1244                         int counter = 0;
1245                         ret = expandFormat(format, xrefs, counter, buf, ci, false, false);
1246                 } else if (key == "textbefore")
1247                         ret = ci.textBefore;
1248                 else if (key == "textafter")
1249                         ret = ci.textAfter;
1250                 else if (key == "curpretext") {
1251                         vector<pair<docstring, docstring>> pres = ci.getPretexts();
1252                         vector<pair<docstring, docstring>>::iterator it = pres.begin();
1253                         int numkey = 1;
1254                         for (; it != pres.end() ; ++it) {
1255                                 if ((*it).first == bib_key_ && numkey == num_bib_key_) {
1256                                         ret = (*it).second;
1257                                         pres.erase(it);
1258                                         break;
1259                                 }
1260                                 if ((*it).first == bib_key_)
1261                                         ++numkey;
1262                         }
1263                 } else if (key == "curposttext") {
1264                         vector<pair<docstring, docstring>> posts = ci.getPosttexts();
1265                         vector<pair<docstring, docstring>>::iterator it = posts.begin();
1266                         int numkey = 1;
1267                         for (; it != posts.end() ; ++it) {
1268                                 if ((*it).first == bib_key_ && numkey == num_bib_key_) {
1269                                         ret = (*it).second;
1270                                         posts.erase(it);
1271                                         break;
1272                                 }
1273                                 if ((*it).first == bib_key_)
1274                                         ++numkey;
1275                         }
1276                 } else if (key == "year")
1277                         ret = getYear();
1278         }
1279
1280         // If we have no result, check in the cross-ref'ed entries
1281         if (ret.empty() && !xrefs.empty()) {
1282                 bool const biblatex =
1283                         buf.params().documentClass().citeFramework() == "biblatex";
1284                 // xr is a (reference to a) BibTeXInfo const *
1285                 for (auto const & xr : xrefs) {
1286                         if (!xr)
1287                                 continue;
1288                         // use empty BibTeXInfoList to avoid loops
1289                         BibTeXInfoList xr_dummy;
1290                         ret = xr->getValueForKey(oldkey, buf, ci, xr_dummy, maxsize);
1291                         if (!ret.empty())
1292                                 // success!
1293                                 break;
1294                         // in biblatex, cross-ref'ed titles are mapped
1295                         // to booktitle. Same for subtitle etc.
1296                         if (biblatex && prefixIs(key, "book"))
1297                                 ret = (*xr)[key.substr(4)];
1298                         // likewise, author is maped onto bookauthor
1299                         else if (biblatex && contains(key, ":bookauthor"))
1300                                 ret = xr->getValueForKey(subst(key, "bookauthor", "author"),
1301                                                          buf, ci, xr_dummy, maxsize);
1302                         if (!ret.empty())
1303                                 // success!
1304                                 break;
1305                 }
1306         }
1307
1308         if (cleanit)
1309                 ret = xml::cleanAttr(ret);
1310
1311         // make sure it is not too big
1312         support::truncateWithEllipsis(ret, maxsize);
1313         return ret;
1314 }
1315
1316
1317 //////////////////////////////////////////////////////////////////////
1318 //
1319 // BiblioInfo
1320 //
1321 //////////////////////////////////////////////////////////////////////
1322
1323 namespace {
1324
1325 // A functor for use with sort, leading to case insensitive sorting
1326 bool compareNoCase(const docstring & a, const docstring & b) {
1327         return compare_no_case(a, b) < 0;
1328 }
1329
1330 } // namespace
1331
1332
1333 vector<docstring> const BiblioInfo::getXRefs(BibTeXInfo const & data, bool const nested) const
1334 {
1335         vector<docstring> result;
1336         if (!data.isBibTeX())
1337                 return result;
1338         // Legacy crossref field. This is not nestable.
1339         if (!nested && !data["crossref"].empty()) {
1340                 docstring const xrefkey = data["crossref"];
1341                 result.push_back(xrefkey);
1342                 // However, check for nested xdatas
1343                 BiblioInfo::const_iterator it = find(xrefkey);
1344                 if (it != end()) {
1345                         BibTeXInfo const & xref = it->second;
1346                         vector<docstring> const nxdata = getXRefs(xref, true);
1347                         if (!nxdata.empty())
1348                                 result.insert(result.end(), nxdata.begin(), nxdata.end());
1349                 }
1350         }
1351         // Biblatex's xdata field. Infinitely nestable.
1352         // XData field can consist of a comma-separated list of keys
1353         vector<docstring> const xdatakeys = getVectorFromString(data["xdata"]);
1354         if (!xdatakeys.empty()) {
1355                 for (auto const & xdatakey : xdatakeys) {
1356                         result.push_back(xdatakey);
1357                         BiblioInfo::const_iterator it = find(xdatakey);
1358                         if (it != end()) {
1359                                 BibTeXInfo const & xdata = it->second;
1360                                 vector<docstring> const nxdata = getXRefs(xdata, true);
1361                                 if (!nxdata.empty())
1362                                         result.insert(result.end(), nxdata.begin(), nxdata.end());
1363                         }
1364                 }
1365         }
1366         return result;
1367 }
1368
1369
1370 vector<docstring> const BiblioInfo::getKeys() const
1371 {
1372         vector<docstring> bibkeys;
1373         for (auto const & bi : *this)
1374                 bibkeys.push_back(bi.first);
1375         sort(bibkeys.begin(), bibkeys.end(), &compareNoCase);
1376         return bibkeys;
1377 }
1378
1379
1380 vector<docstring> const BiblioInfo::getFields() const
1381 {
1382         vector<docstring> bibfields;
1383         for (auto const & fn : field_names_)
1384                 bibfields.push_back(fn);
1385         sort(bibfields.begin(), bibfields.end());
1386         return bibfields;
1387 }
1388
1389
1390 vector<docstring> const BiblioInfo::getEntries() const
1391 {
1392         vector<docstring> bibentries;
1393         for (auto const & et : entry_types_)
1394                 bibentries.push_back(et);
1395         sort(bibentries.begin(), bibentries.end());
1396         return bibentries;
1397 }
1398
1399
1400 docstring const BiblioInfo::getAuthorOrEditorList(docstring const & key, Buffer const & buf) const
1401 {
1402         BiblioInfo::const_iterator it = find(key);
1403         if (it == end())
1404                 return docstring();
1405         BibTeXInfo const & data = it->second;
1406         return data.getAuthorOrEditorList(&buf, false);
1407 }
1408
1409
1410 docstring const BiblioInfo::getCiteNumber(docstring const & key) const
1411 {
1412         BiblioInfo::const_iterator it = find(key);
1413         if (it == end())
1414                 return docstring();
1415         BibTeXInfo const & data = it->second;
1416         return data.citeNumber();
1417 }
1418
1419 void BiblioInfo::getLocators(docstring const & key, docstring & doi, docstring & url, docstring & file) const
1420 {
1421         BiblioInfo::const_iterator it = find(key);
1422          if (it == end())
1423                 return;
1424         BibTeXInfo const & data = it->second;
1425         data.getLocators(doi,url,file);
1426 }
1427
1428
1429 docstring const BiblioInfo::getYear(docstring const & key, bool use_modifier) const
1430 {
1431         BiblioInfo::const_iterator it = find(key);
1432         if (it == end())
1433                 return docstring();
1434         BibTeXInfo const & data = it->second;
1435         docstring year = data.getYear();
1436         if (year.empty()) {
1437                 // let's try the crossrefs
1438                 vector<docstring> const xrefs = getXRefs(data);
1439                 if (xrefs.empty())
1440                         // no luck
1441                         return docstring();
1442                 for (docstring const & xref : xrefs) {
1443                         BiblioInfo::const_iterator const xrefit = find(xref);
1444                         if (xrefit == end())
1445                                 continue;
1446                         BibTeXInfo const & xref_data = xrefit->second;
1447                         year = xref_data.getYear();
1448                         if (!year.empty())
1449                                 // success!
1450                                 break;
1451                 }
1452         }
1453         if (use_modifier && data.modifier() != 0)
1454                 year += data.modifier();
1455         return year;
1456 }
1457
1458
1459 docstring const BiblioInfo::getYear(docstring const & key, Buffer const & buf, bool use_modifier) const
1460 {
1461         docstring const year = getYear(key, use_modifier);
1462         if (year.empty())
1463                 return buf.B_("No year");
1464         return year;
1465 }
1466
1467
1468 docstring const BiblioInfo::getInfo(docstring const & key,
1469         Buffer const & buf, CiteItem const & ci, docstring const & format) const
1470 {
1471         BiblioInfo::const_iterator it = find(key);
1472         if (it == end())
1473                 return _("Bibliography entry not found!");
1474         BibTeXInfo const & data = it->second;
1475         BibTeXInfoList xrefptrs;
1476         for (docstring const & xref : getXRefs(data)) {
1477                 BiblioInfo::const_iterator const xrefit = find(xref);
1478                 if (xrefit != end())
1479                         xrefptrs.push_back(&(xrefit->second));
1480         }
1481         return data.getInfo(xrefptrs, buf, ci, format);
1482 }
1483
1484
1485 docstring const BiblioInfo::getLabel(vector<docstring> keys,
1486         Buffer const & buf, string const & style, CiteItem const & ci) const
1487 {
1488         size_t max_size = ci.max_size;
1489         // shorter makes no sense
1490         LASSERT(max_size >= 16, max_size = 16);
1491
1492         // we can't display more than 10 of these, anyway
1493         // but since we truncate in the middle,
1494         // we need to split into two halfs.
1495         bool const too_many_keys = keys.size() > 10;
1496         vector<docstring> lkeys;
1497         if (too_many_keys) {
1498                 lkeys.insert(lkeys.end(), keys.end() - 5, keys.end());
1499                 keys.resize(5);
1500                 keys.insert(keys.end(), lkeys.begin(), lkeys.end());
1501         }
1502
1503         CiteEngineType const engine_type = buf.params().citeEngineType();
1504         DocumentClass const & dc = buf.params().documentClass();
1505         docstring const & format = from_utf8(dc.getCiteFormat(engine_type, style, false, "cite"));
1506         docstring ret = format;
1507         vector<docstring>::const_iterator key = keys.begin();
1508         vector<docstring>::const_iterator ken = keys.end();
1509         vector<docstring> handled_keys;
1510         for (int i = 0; key != ken; ++key, ++i) {
1511                 handled_keys.push_back(*key);
1512                 int n = 0;
1513                 for (auto const & k : handled_keys) {
1514                         if (k == *key)
1515                                 ++n;
1516                 }
1517                 BiblioInfo::const_iterator it = find(*key);
1518                 BibTeXInfo empty_data;
1519                 empty_data.key(*key);
1520                 BibTeXInfo & data = empty_data;
1521                 vector<BibTeXInfo const *> xrefptrs;
1522                 if (it != end()) {
1523                         data = it->second;
1524                         for (docstring const & xref : getXRefs(data)) {
1525                                 BiblioInfo::const_iterator const xrefit = find(xref);
1526                                 if (xrefit != end())
1527                                         xrefptrs.push_back(&(xrefit->second));
1528                         }
1529                 }
1530                 data.numKey(n);
1531                 ret = data.getLabel(xrefptrs, buf, ret, ci, key + 1 != ken, i == 1);
1532         }
1533
1534         support::truncateWithEllipsis(ret, max_size, true);
1535
1536         return ret;
1537 }
1538
1539
1540 bool BiblioInfo::isBibtex(docstring const & key) const
1541 {
1542         docstring key1;
1543         split(key, key1, ',');
1544         BiblioInfo::const_iterator it = find(key1);
1545         if (it == end())
1546                 return false;
1547         return it->second.isBibTeX();
1548 }
1549
1550
1551 BiblioInfo::CiteStringMap const BiblioInfo::getCiteStrings(
1552         vector<docstring> const & keys, vector<CitationStyle> const & styles,
1553         Buffer const & buf, CiteItem const & ci) const
1554 {
1555         if (empty())
1556                 return vector<pair<docstring,docstring>>();
1557
1558         string style;
1559         CiteStringMap csm(styles.size());
1560         for (size_t i = 0; i != csm.size(); ++i) {
1561                 style = styles[i].name;
1562                 csm[i] = make_pair(from_ascii(style), getLabel(keys, buf, style, ci));
1563         }
1564
1565         return csm;
1566 }
1567
1568
1569 void BiblioInfo::mergeBiblioInfo(BiblioInfo const & info)
1570 {
1571         bimap_.insert(info.begin(), info.end());
1572         field_names_.insert(info.field_names_.begin(), info.field_names_.end());
1573         entry_types_.insert(info.entry_types_.begin(), info.entry_types_.end());
1574 }
1575
1576
1577 namespace {
1578
1579 // used in xhtml to sort a list of BibTeXInfo objects
1580 bool lSorter(BibTeXInfo const * lhs, BibTeXInfo const * rhs)
1581 {
1582         docstring const lauth = lhs->getAuthorOrEditorList();
1583         docstring const rauth = rhs->getAuthorOrEditorList();
1584         docstring const lyear = lhs->getYear();
1585         docstring const ryear = rhs->getYear();
1586         docstring const ltitl = lhs->operator[]("title");
1587         docstring const rtitl = rhs->operator[]("title");
1588         return  (lauth < rauth)
1589                 || (lauth == rauth && lyear < ryear)
1590                 || (lauth == rauth && lyear == ryear && ltitl < rtitl);
1591 }
1592
1593 } // namespace
1594
1595
1596 void BiblioInfo::collectCitedEntries(Buffer const & buf)
1597 {
1598         cited_entries_.clear();
1599         // We are going to collect all the citation keys used in the document,
1600         // getting them from the TOC.
1601         // FIXME We may want to collect these differently, in the first case,
1602         // so that we might have them in order of appearance.
1603         set<docstring> citekeys;
1604         Toc const & toc = *buf.tocBackend().toc("citation");
1605         for (auto const & t : toc) {
1606                 if (t.str().empty())
1607                         continue;
1608                 vector<docstring> const keys = getVectorFromString(t.str());
1609                 citekeys.insert(keys.begin(), keys.end());
1610         }
1611         if (citekeys.empty())
1612                 return;
1613
1614         // We have a set of the keys used in this document.
1615         // We will now convert it to a list of the BibTeXInfo objects used in
1616         // this document...
1617         vector<BibTeXInfo const *> bi;
1618         for (auto const & ck : citekeys) {
1619                 BiblioInfo::const_iterator const bt = find(ck);
1620                 if (bt == end() || !bt->second.isBibTeX())
1621                         continue;
1622                 bi.push_back(&(bt->second));
1623         }
1624         // ...and sort it.
1625         sort(bi.begin(), bi.end(), lSorter);
1626
1627         // Now we can write the sorted keys
1628         // b is a BibTeXInfo const *
1629         for (auto const & b : bi)
1630                 cited_entries_.push_back(b->key());
1631 }
1632
1633
1634 void BiblioInfo::makeCitationLabels(Buffer const & buf)
1635 {
1636         collectCitedEntries(buf);
1637         CiteEngineType const engine_type = buf.params().citeEngineType();
1638         bool const numbers = (engine_type & ENGINE_TYPE_NUMERICAL);
1639
1640         int keynumber = 0;
1641         char modifier = 0;
1642         // used to remember the last one we saw
1643         // we'll be comparing entries to see if we need to add
1644         // modifiers, like "1984a"
1645         map<docstring, BibTeXInfo>::iterator last = bimap_.end();
1646
1647         // add letters to years
1648         for (auto const & ce : cited_entries_) {
1649                 map<docstring, BibTeXInfo>::iterator const biit = bimap_.find(ce);
1650                 // this shouldn't happen, but...
1651                 if (biit == bimap_.end())
1652                         // ...fail gracefully, anyway.
1653                         continue;
1654                 BibTeXInfo & entry = biit->second;
1655                 if (numbers) {
1656                         docstring const num = convert<docstring>(++keynumber);
1657                         entry.setCiteNumber(num);
1658                 } else {
1659                         // The first test here is checking whether this is the first
1660                         // time through the loop. If so, then we do not have anything
1661                         // with which to compare.
1662                         if (last != bimap_.end()
1663                             && entry.getAuthorOrEditorList() == last->second.getAuthorOrEditorList()
1664                             // we access the year via getYear() so as to get it from the xref,
1665                             // if we need to do so
1666                             && getYear(entry.key()) == getYear(last->second.key())) {
1667                                 if (modifier == 0) {
1668                                         // so the last one should have been 'a'
1669                                         last->second.setModifier('a');
1670                                         modifier = 'b';
1671                                 } else if (modifier == 'z')
1672                                         modifier = 'A';
1673                                 else
1674                                         modifier++;
1675                         } else {
1676                                 modifier = 0;
1677                         }
1678                         entry.setModifier(modifier);
1679                         // remember the last one
1680                         last = biit;
1681                 }
1682         }
1683         // Set the labels
1684         for (auto const & ce : cited_entries_) {
1685                 map<docstring, BibTeXInfo>::iterator const biit = bimap_.find(ce);
1686                 // this shouldn't happen, but...
1687                 if (biit == bimap_.end())
1688                         // ...fail gracefully, anyway.
1689                         continue;
1690                 BibTeXInfo & entry = biit->second;
1691                 if (numbers) {
1692                         entry.label(entry.citeNumber());
1693                 } else {
1694                         docstring const auth = entry.getAuthorOrEditorList(&buf, false);
1695                         // we do it this way so as to access the xref, if necessary
1696                         // note that this also gives us the modifier
1697                         docstring const year = getYear(ce, buf, true);
1698                         if (!auth.empty() && !year.empty())
1699                                 entry.label(auth + ' ' + year);
1700                         else
1701                                 entry.label(entry.key());
1702                 }
1703         }
1704 }
1705
1706
1707 //////////////////////////////////////////////////////////////////////
1708 //
1709 // CitationStyle
1710 //
1711 //////////////////////////////////////////////////////////////////////
1712
1713
1714 CitationStyle citationStyleFromString(string const & command,
1715                                       BufferParams const & params)
1716 {
1717         CitationStyle cs;
1718         if (command.empty())
1719                 return cs;
1720
1721         string const alias = params.getCiteAlias(command);
1722         string cmd = alias.empty() ? command : alias;
1723         if (isUpperCase(command[0])) {
1724                 cs.forceUpperCase = true;
1725                 cmd[0] = lowercase(cmd[0]);
1726         }
1727
1728         size_t const n = command.size() - 1;
1729         if (command[n] == '*') {
1730                 cs.hasStarredVersion = true;
1731                 if (suffixIs(cmd, '*'))
1732                         cmd = cmd.substr(0, cmd.size() - 1);
1733         }
1734
1735         cs.name = cmd;
1736         return cs;
1737 }
1738
1739
1740 string citationStyleToString(const CitationStyle & cs, bool const latex)
1741 {
1742         string cmd = latex ? cs.cmd : cs.name;
1743         if (cs.forceUpperCase)
1744                 cmd[0] = uppercase(cmd[0]);
1745         if (cs.hasStarredVersion)
1746                 cmd += '*';
1747         return cmd;
1748 }
1749
1750
1751 void authorsToDocBookAuthorGroup(docstring const & authorsString, XMLStream & xs, Buffer const & buf,
1752                                  const std::string type)
1753 {
1754         // This function closely mimics getAuthorList, but produces DocBook instead of text.
1755         // It has been greatly simplified, as the complete list of authors is always produced. No separators are required,
1756         // as the output has a database-like shape.
1757         // constructName has also been merged within, as it becomes really simple and leads to no copy-paste.
1758
1759         if (! type.empty() && (type != "author" && type != "book")) {
1760                 LYXERR0("ERROR! Unexpected author contribution `" << type <<"'.");
1761                 return;
1762         }
1763
1764         if (authorsString.empty()) {
1765                 return;
1766         }
1767
1768         // Split the input list of authors into individual authors.
1769         vector<docstring> const authors = getAuthors(authorsString);
1770
1771         // Retrieve the "et al." variation.
1772         string const etal = buf.params().documentClass().getCiteMacro(buf.params().citeEngineType(), "_etal");
1773
1774         // Output the list of authors.
1775         xs << xml::StartTag("authorgroup");
1776         xs << xml::CR();
1777
1778         auto it = authors.cbegin();
1779         auto en = authors.cend();
1780         for (size_t i = 0; it != en; ++it, ++i) {
1781                 const std::string tag = (type.empty() || type == "author") ? "author" : "othercredit";
1782                 const std::string attr = (type == "book") ? R"(class="other" otherclass="bookauthor")" : "";
1783
1784                 xs << xml::StartTag(tag, attr);
1785                 xs << xml::CR();
1786                 xs << xml::StartTag("personname");
1787                 xs << xml::CR();
1788                 const docstring name = *it;
1789
1790                 // All authors go in a <personname>. If more structure is known, use it; otherwise (just "et al."),
1791                 // print it as such.
1792                 if (name == "others") {
1793                         xs << buf.B_(etal);
1794                 } else {
1795                         name_parts parts = nameParts(name);
1796                         if (! parts.prefix.empty()) {
1797                                 xs << xml::StartTag("honorific");
1798                                 xs << parts.prefix;
1799                                 xs << xml::EndTag("honorific");
1800                                 xs << xml::CR();
1801                         }
1802                         if (! parts.prename.empty()) {
1803                                 xs << xml::StartTag("firstname");
1804                                 xs << parts.prename;
1805                                 xs << xml::EndTag("firstname");
1806                                 xs << xml::CR();
1807                         }
1808                         if (! parts.surname.empty()) {
1809                                 xs << xml::StartTag("surname");
1810                                 xs << parts.surname;
1811                                 xs << xml::EndTag("surname");
1812                                 xs << xml::CR();
1813                         }
1814                         if (! parts.suffix.empty()) {
1815                                 xs << xml::StartTag("othername", "role=\"suffix\"");
1816                                 xs << parts.suffix;
1817                                 xs << xml::EndTag("othername");
1818                                 xs << xml::CR();
1819                         }
1820                 }
1821
1822                 xs << xml::EndTag("personname");
1823                 xs << xml::CR();
1824                 xs << xml::EndTag(tag);
1825                 xs << xml::CR();
1826
1827                 // Could add an affiliation after <personname>, but not stored in BibTeX.
1828         }
1829         xs << xml::EndTag("authorgroup");
1830         xs << xml::CR();
1831 }
1832
1833 } // namespace lyx