src/BiblioInfo.cpp

   1 /**
   2  * \file BiblioInfo.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Angus Leeming
   7  * \author Herbert Voß
   8  * \author Richard Kimberly Heck
   9  * \author Julien Rioux
  10  * \author Jürgen Spitzmüller
  11  *
  12  * Full author contact details are available in file CREDITS.
  13  */
  14
  15 #include <config.h>
  16
  17 #include "BiblioInfo.h"
  18
  19 #include "Buffer.h"
  20 #include "BufferParams.h"
  21 #include "Citation.h"
  22 #include "Encoding.h"
  23 #include "Language.h"
  24 #include "TextClass.h"
  25 #include "TocBackend.h"
  26 #include "xml.h"
  27
  28 #include "support/convert.h"
  29 #include "support/debug.h"
  30 #include "support/docstream.h"
  31 #include "support/FileName.h"
  32 #include "support/gettext.h"
  33 #include "support/lassert.h"
  34 #include "support/lstrings.h"
  35 #include "support/textutils.h"
  36
  37 #include <map>
  38 #include <regex>
  39 #include <set>
  40
  41 using namespace std;
  42 using namespace lyx::support;
  43
  44
  45 namespace lyx {
  46
  47 namespace {
  48
  49 // Remove placeholders from names
  50 docstring renormalize(docstring const & input)
  51 {
  52         docstring res = subst(input, from_ascii("$$space!"), from_ascii(" "));
  53         return subst(res, from_ascii("$$comma!"), from_ascii(","));
  54 }
  55
  56
  57 // Split the surname into prefix ("von-part") and family name
  58 pair<docstring, docstring> parseSurname(docstring const & sname)
  59 {
  60         // Split the surname into its tokens
  61         vector<docstring> pieces = getVectorFromString(sname, from_ascii(" "));
  62         if (pieces.size() < 2)
  63                 return make_pair(docstring(), sname);
  64
  65         // Now we look for pieces that begin with a lower case letter.
  66         // All except for the very last token constitute the "von-part".
  67         docstring prefix;
  68         vector<docstring>::const_iterator it = pieces.begin();
  69         vector<docstring>::const_iterator const en = pieces.end();
  70         bool first = true;
  71         for (; it != en; ++it) {
  72                 if ((*it).empty())
  73                         continue;
  74                 // If this is the last piece, then what we now have is
  75                 // the family name, notwithstanding the casing.
  76                 if (it + 1 == en)
  77                         break;
  78                 char_type const c = (*it)[0];
  79                 // If the piece starts with a upper case char, we assume
  80                 // this is part of the surname.
  81                 if (!isLower(c))
  82                         break;
  83                 // Nothing of the former, so add this piece to the prename
  84                 if (!first)
  85                         prefix += " ";
  86                 else
  87                         first = false;
  88                 prefix += *it;
  89         }
  90
  91         // Reconstruct the family name.
  92         // Note that if we left the loop with because it + 1 == en,
  93         // then this will still do the right thing, i.e., make surname
  94         // just be the last piece.
  95         docstring surname;
  96         first = true;
  97         for (; it != en; ++it) {
  98                 if (!first)
  99                         surname += " ";
 100                 else
 101                         first = false;
 102                 surname += *it;
 103         }
 104         return make_pair(prefix, surname);
 105 }
 106
 107
 108 struct name_parts {
 109         docstring surname;
 110         docstring prename;
 111         docstring suffix;
 112         docstring prefix;
 113 };
 114
 115
 116 // gets the name parts (prename, surname, prefix, suffix) from an author-type string
 117 name_parts nameParts(docstring const & iname)
 118 {
 119         name_parts res;
 120         if (iname.empty())
 121                 return res;
 122
 123         // First we check for goupings (via {...}) and replace blanks and
 124         // commas inside groups with temporary placeholders
 125         docstring name;
 126         int gl = 0;
 127         docstring::const_iterator p = iname.begin();
 128         while (p != iname.end()) {
 129                 // count grouping level
 130                 if (*p == '{')
 131                         ++gl;
 132                 else if (*p == '}')
 133                         --gl;
 134                 // generate string with probable placeholders
 135                 if (*p == ' ' && gl > 0)
 136                         name += from_ascii("$$space!");
 137                 else if (*p == ',' && gl > 0)
 138                         name += from_ascii("$$comma!");
 139                 else
 140                         name += *p;
 141                 ++p;
 142         }
 143
 144         // Now we look for a comma, and take the last name to be everything
 145         // preceding the right-most one, so that we also get the name suffix
 146         // (aka "jr" part).
 147         vector<docstring> pieces = getVectorFromString(name);
 148         if (pieces.size() > 1) {
 149                 // Whether we have a name suffix or not, the prename is
 150                 // always last item
 151                 res.prename = renormalize(pieces.back());
 152                 // The family name, conversely, is always the first item.
 153                 // However, it might contain a prefix (aka "von" part)
 154                 docstring const sname = pieces.front();
 155                 res.prefix = renormalize(parseSurname(sname).first);
 156                 res.surname = renormalize(parseSurname(sname).second);
 157                 // If we have three pieces (the maximum allowed by BibTeX),
 158                 // the second one is the name suffix.
 159                 if (pieces.size() > 2)
 160                         res.suffix = renormalize(pieces.at(1));
 161                 return res;
 162         }
 163
 164         // OK, so now we want to look for the last name.
 165         // Split on spaces, to get various tokens.
 166         pieces = getVectorFromString(name, from_ascii(" "));
 167         // No space: Only a family name given
 168         if (pieces.size() < 2) {
 169                 res.surname = renormalize(pieces.back());
 170                 return res;
 171         }
 172         // If we get two pieces, assume "prename surname"
 173         if (pieces.size() == 2) {
 174                 res.prename = renormalize(pieces.front());
 175                 res.surname = renormalize(pieces.back());
 176                 return res;
 177         }
 178
 179         // More than 3 pieces: A name prefix (aka "von" part) might be included.
 180         // We look for the first piece that begins with a lower case letter
 181         // (which is the name prefix, if it is not the last token) or the last token.
 182         docstring prename;
 183         vector<docstring>::const_iterator it = pieces.begin();
 184         vector<docstring>::const_iterator const en = pieces.end();
 185         bool first = true;
 186         for (; it != en; ++it) {
 187                 if ((*it).empty())
 188                         continue;
 189                 char_type const c = (*it)[0];
 190                 // If the piece starts with a lower case char, we assume
 191                 // this is the name prefix and thus prename is complete.
 192                 if (isLower(c))
 193                         break;
 194                 // Same if this is the last piece, which is always the surname.
 195                 if (it + 1 == en)
 196                         break;
 197                 // Nothing of the former, so add this piece to the prename
 198                 if (!first)
 199                         prename += " ";
 200                 else
 201                         first = false;
 202                 prename += *it;
 203         }
 204
 205         // Now reconstruct the family name and strip the prefix.
 206         // Note that if we left the loop because it + 1 == en,
 207         // then this will still do the right thing, i.e., make surname
 208         // just be the last piece.
 209         docstring surname;
 210         first = true;
 211         for (; it != en; ++it) {
 212                 if (!first)
 213                         surname += " ";
 214                 else
 215                         first = false;
 216                 surname += *it;
 217         }
 218         res.prename = renormalize(prename);
 219         res.prefix = renormalize(parseSurname(surname).first);
 220         res.surname = renormalize(parseSurname(surname).second);
 221         return res;
 222 }
 223
 224
 225 docstring constructName(docstring const & name, string const & scheme)
 226 {
 227         // re-constructs a name from name parts according
 228         // to a given scheme
 229         docstring const prename = nameParts(name).prename;
 230         docstring const surname = nameParts(name).surname;
 231         docstring const prefix = nameParts(name).prefix;
 232         docstring const suffix = nameParts(name).suffix;
 233         string res = scheme;
 234         static regex const reg1("(.*)(\\{%prename%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 235         static regex const reg2("(.*)(\\{%suffix%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 236         static regex const reg3("(.*)(\\{%prefix%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 237         smatch sub;
 238         // Changing the first parameter of regex_match() may corrupt the
 239         // second one. In this case we use the temporary string tmp.
 240         if (regex_match(scheme, sub, reg1)) {
 241                 res = sub.str(1);
 242                 if (!prename.empty())
 243                         res += sub.str(3);
 244                 res += sub.str(5);
 245         }
 246         if (regex_match(res, sub, reg2)) {
 247                 string tmp = sub.str(1);
 248                 if (!suffix.empty())
 249                         tmp += sub.str(3);
 250                 res = tmp + sub.str(5);
 251         }
 252         if (regex_match(res, sub, reg3)) {
 253                 string tmp = sub.str(1);
 254                 if (!prefix.empty())
 255                         tmp += sub.str(3);
 256                 res = tmp + sub.str(5);
 257         }
 258         docstring result = from_ascii(res);
 259         result = subst(result, from_ascii("%prename%"), prename);
 260         result = subst(result, from_ascii("%surname%"), surname);
 261         result = subst(result, from_ascii("%prefix%"), prefix);
 262         result = subst(result, from_ascii("%suffix%"), suffix);
 263         return result;
 264 }
 265
 266
 267 vector<docstring> const getAuthors(docstring const & author)
 268 {
 269         // We check for goupings (via {...}) and only consider " and "
 270         // outside groups as author separator. This is to account
 271         // for cases such as {{Barnes and Noble, Inc.}}, which
 272         // need to be treated as one single family name.
 273         // We use temporary placeholders in order to differentiate the
 274         // diverse " and " cases.
 275
 276         // First, we temporarily replace all ampersands. It is rather unusual
 277         // in author names, but can happen (consider cases such as "C \& A Corp.").
 278         docstring iname = subst(author, from_ascii("&"), from_ascii("$$amp!"));
 279         // Then, we temporarily make all " and " strings to ampersands in order
 280         // to handle them later on a per-char level.
 281         iname = subst(iname, from_ascii(" and "), from_ascii(" & "));
 282         // Now we traverse through the string and replace the "&" by the proper
 283         // output in- and outside groups
 284         docstring name;
 285         int gl = 0;
 286         docstring::const_iterator p = iname.begin();
 287         while (p != iname.end()) {
 288                 // count grouping level
 289                 if (*p == '{')
 290                         ++gl;
 291                 else if (*p == '}')
 292                         --gl;
 293                 // generate string with probable placeholders
 294                 if (*p == '&') {
 295                         if (gl > 0)
 296                                 // Inside groups, we output "and"
 297                                 name += from_ascii("and");
 298                         else
 299                                 // Outside groups, we output a separator
 300                                 name += from_ascii("$$namesep!");
 301                 }
 302                 else
 303                         name += *p;
 304                 ++p;
 305         }
 306
 307         // re-insert the literal ampersands
 308         name = subst(name, from_ascii("$$amp!"), from_ascii("&"));
 309
 310         // Now construct the actual vector
 311         return getVectorFromString(name, from_ascii(" $$namesep! "));
 312 }
 313
 314
 315 bool multipleAuthors(docstring const & author)
 316 {
 317         return getAuthors(author).size() > 1;
 318 }
 319
 320
 321 // converts a string containing LaTeX commands into unicode
 322 // for display.
 323 docstring convertLaTeXCommands(docstring const & str)
 324 {
 325         docstring val = str;
 326         docstring ret;
 327
 328         bool scanning_cmd = false;
 329         bool scanning_math = false;
 330         bool escaped = false; // used to catch \$, etc.
 331         while (!val.empty()) {
 332                 char_type const ch = val[0];
 333
 334                 // if we're scanning math, we output everything until we
 335                 // find an unescaped $, at which point we break out.
 336                 if (scanning_math) {
 337                         if (escaped)
 338                                 escaped = false;
 339                         else if (ch == '\\')
 340                                 escaped = true;
 341                         else if (ch == '$')
 342                                 scanning_math = false;
 343                         ret += ch;
 344                         val = val.substr(1);
 345                         continue;
 346                 }
 347
 348                 // if we're scanning a command name, then we just
 349                 // discard characters until we hit something that
 350                 // isn't alpha.
 351                 if (scanning_cmd) {
 352                         if (isAlphaASCII(ch)) {
 353                                 val = val.substr(1);
 354                                 escaped = false;
 355                                 continue;
 356                         }
 357                         // so we're done with this command.
 358                         // now we fall through and check this character.
 359                         scanning_cmd = false;
 360                 }
 361
 362                 // was the last character a \? If so, then this is something like:
 363                 // \\ or \$, so we'll just output it. That's probably not always right...
 364                 if (escaped) {
 365                         // exception: output \, as THIN SPACE
 366                         if (ch == ',')
 367                                 ret.push_back(0x2009);
 368                         else
 369                                 ret += ch;
 370                         val = val.substr(1);
 371                         escaped = false;
 372                         continue;
 373                 }
 374
 375                 if (ch == '$') {
 376                         ret += ch;
 377                         val = val.substr(1);
 378                         scanning_math = true;
 379                         continue;
 380                 }
 381
 382                 // Change text mode accents in the form
 383                 // {\v a} to \v{a} (see #9340).
 384                 // FIXME: This is a sort of mini-tex2lyx.
 385                 //        Use the real tex2lyx instead!
 386                 static regex const tma_reg("^\\{\\\\[bcCdfGhHkrtuUv]\\s\\w\\}");
 387                 if (regex_search(to_utf8(val), tma_reg)) {
 388                         val = val.substr(1);
 389                         val.replace(2, 1, from_ascii("{"));
 390                         continue;
 391                 }
 392
 393                 // Apart from the above, we just ignore braces
 394                 if (ch == '{' || ch == '}') {
 395                         val = val.substr(1);
 396                         continue;
 397                 }
 398
 399                 // we're going to check things that look like commands, so if
 400                 // this doesn't, just output it.
 401                 if (ch != '\\') {
 402                         ret += ch;
 403                         val = val.substr(1);
 404                         continue;
 405                 }
 406
 407                 // ok, could be a command of some sort
 408                 // let's see if it corresponds to some unicode
 409                 // unicodesymbols has things in the form: \"{u},
 410                 // whereas we may see things like: \"u. So we'll
 411                 // look for that and change it, if necessary.
 412                 // FIXME: This is a sort of mini-tex2lyx.
 413                 //        Use the real tex2lyx instead!
 414                 static regex const reg("^\\\\\\W\\w");
 415                 if (regex_search(to_utf8(val), reg)) {
 416                         val.insert(3, from_ascii("}"));
 417                         val.insert(2, from_ascii("{"));
 418                 }
 419                 bool termination;
 420                 docstring rem;
 421                 docstring const cnvtd = Encodings::fromLaTeXCommand(val,
 422                                 Encodings::TEXT_CMD, termination, rem);
 423                 if (!cnvtd.empty()) {
 424                         // it did, so we'll take that bit and proceed with what's left
 425                         ret += cnvtd;
 426                         val = rem;
 427                         continue;
 428                 }
 429                 // it's a command of some sort
 430                 scanning_cmd = true;
 431                 escaped = true;
 432                 val = val.substr(1);
 433         }
 434         return ret;
 435 }
 436
 437
 438 // Escape '<' and '>' and remove richtext markers (e.g. {!this is richtext!}) from a string.
 439 docstring processRichtext(docstring const & str, bool richtext)
 440 {
 441         docstring val = str;
 442         docstring ret;
 443
 444         bool scanning_rich = false;
 445         while (!val.empty()) {
 446                 char_type const ch = val[0];
 447                 if (ch == '{' && val.size() > 1 && val[1] == '!') {
 448                         // beginning of rich text
 449                         scanning_rich = true;
 450                         val = val.substr(2);
 451                         continue;
 452                 }
 453                 if (scanning_rich && ch == '!' && val.size() > 1 && val[1] == '}') {
 454                         // end of rich text
 455                         scanning_rich = false;
 456                         val = val.substr(2);
 457                         continue;
 458                 }
 459                 if (richtext) {
 460                         if (scanning_rich)
 461                                 ret += ch;
 462                         else {
 463                                 // we need to escape '<' and '>'
 464                                 if (ch == '<')
 465                                         ret += "&lt;";
 466                                 else if (ch == '>')
 467                                         ret += "&gt;";
 468                                 else
 469                                         ret += ch;
 470                         }
 471                 } else if (!scanning_rich /* && !richtext */)
 472                         ret += ch;
 473                 // else the character is discarded, which will happen only if
 474                 // richtext == false and we are scanning rich text
 475                 val = val.substr(1);
 476         }
 477         return ret;
 478 }
 479
 480 } // namespace
 481
 482
 483 //////////////////////////////////////////////////////////////////////
 484 //
 485 // BibTeXInfo
 486 //
 487 //////////////////////////////////////////////////////////////////////
 488
 489 BibTeXInfo::BibTeXInfo(docstring const & key, docstring const & type)
 490         : is_bibtex_(true), bib_key_(key), num_bib_key_(0), entry_type_(type),
 491           info_(), format_(), modifier_(0)
 492 {}
 493
 494
 495
 496 docstring const BibTeXInfo::getAuthorOrEditorList(Buffer const * buf,
 497                                           bool full, bool forceshort) const
 498 {
 499         docstring author = operator[]("author");
 500         if (author.empty())
 501                 author = operator[]("editor");
 502
 503         return getAuthorList(buf, author, full, forceshort);
 504 }
 505
 506
 507 docstring const BibTeXInfo::getAuthorList(Buffer const * buf,
 508                 docstring const & author, bool const full, bool const forceshort,
 509                 bool const allnames, bool const beginning) const
 510 {
 511         // Maxnames treshold depend on engine
 512         size_t maxnames = buf ?
 513                 buf->params().documentClass().max_citenames() : 2;
 514
 515         if (!is_bibtex_) {
 516                 docstring const opt = label();
 517                 if (opt.empty())
 518                         return docstring();
 519
 520                 docstring authors;
 521                 docstring const remainder = trim(split(opt, authors, '('));
 522                 if (remainder.empty())
 523                         // in this case, we didn't find a "(",
 524                         // so we don't have author (year)
 525                         return docstring();
 526                 if (full) {
 527                         // Natbib syntax is "Jones et al.(1990)Jones, Baker, and Williams"
 528                         docstring const fullauthors = trim(rsplit(remainder, ')'));
 529                         if (!fullauthors.empty())
 530                                 return fullauthors;
 531                 }
 532                 return authors;
 533         }
 534
 535         if (author.empty())
 536                 return author;
 537
 538         // OK, we've got some names. Let's format them.
 539         // Try to split the author list
 540         vector<docstring> const authors = getAuthors(author);
 541
 542         docstring retval;
 543
 544         CiteEngineType const engine_type = buf ? buf->params().citeEngineType()
 545                                                : ENGINE_TYPE_DEFAULT;
 546
 547         // These are defined in the styles
 548         string const etal =
 549                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_etal")
 550                     : " et al.";
 551         string const namesep =
 552                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_namesep")
 553                    : ", ";
 554         string const lastnamesep =
 555                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_lastnamesep")
 556                     : ", and ";
 557         string const pairnamesep =
 558                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_pairnamesep")
 559                      : " and ";
 560         string firstnameform =
 561                         buf ? buf->params().documentClass().getCiteMacro(engine_type, "!firstnameform")
 562                              : "{%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}{%prename%[[, %prename%]]}";
 563         if (!beginning)
 564                 firstnameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!firstbynameform")
 565                                              : "%prename% {%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}";
 566         string othernameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!othernameform")
 567                              : "{%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}{%prename%[[, %prename%]]}";
 568         if (!beginning)
 569                 othernameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!otherbynameform")
 570                                              : "%prename% {%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}";
 571         string citenameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!citenameform")
 572                              : "{%prefix%[[%prefix% ]]}%surname%";
 573
 574         // Shorten the list (with et al.) if forceshort is set
 575         // and the list can actually be shortened, else if maxcitenames
 576         // is passed and full is not set.
 577         bool shorten = forceshort && authors.size() > 1;
 578         vector<docstring>::const_iterator it = authors.begin();
 579         vector<docstring>::const_iterator en = authors.end();
 580         for (size_t i = 0; it != en; ++it, ++i) {
 581                 if (i >= maxnames && !full) {
 582                         shorten = true;
 583                         break;
 584                 }
 585                 if (*it == "others") {
 586                         retval += buf ? buf->B_(etal) : from_ascii(etal);
 587                         break;
 588                 }
 589                 if (i > 0 && i == authors.size() - 1) {
 590                         if (authors.size() == 2)
 591                                 retval += buf ? buf->B_(pairnamesep) : from_ascii(pairnamesep);
 592                         else
 593                                 retval += buf ? buf->B_(lastnamesep) : from_ascii(lastnamesep);
 594                 } else if (i > 0)
 595                         retval += buf ? buf->B_(namesep) : from_ascii(namesep);
 596                 if (allnames)
 597                         retval += (i == 0) ? constructName(*it, firstnameform)
 598                                 : constructName(*it, othernameform);
 599                 else
 600                         retval += constructName(*it, citenameform);
 601         }
 602         if (shorten) {
 603                 if (allnames)
 604                         retval = constructName(authors[0], firstnameform) + (buf ? buf->B_(etal) : from_ascii(etal));
 605                 else
 606                         retval = constructName(authors[0], citenameform) + (buf ? buf->B_(etal) : from_ascii(etal));
 607         }
 608
 609         return convertLaTeXCommands(retval);
 610 }
 611
 612
 613 docstring const BibTeXInfo::getYear() const
 614 {
 615         if (is_bibtex_) {
 616                 // first try legacy year field
 617                 docstring year = operator[]("year");
 618                 if (!year.empty())
 619                         return year;
 620                 // now try biblatex's date field
 621                 year = operator[]("date");
 622                 // Format is [-]YYYY-MM-DD*/[-]YYYY-MM-DD*
 623                 // We only want the years.
 624                 static regex const yreg("[-]?([\\d]{4}).*");
 625                 static regex const ereg(".*/[-]?([\\d]{4}).*");
 626                 smatch sm;
 627                 string const date = to_utf8(year);
 628                 if (!regex_match(date, sm, yreg))
 629                         // cannot parse year.
 630                         return docstring();
 631                 year = from_ascii(sm[1]);
 632                 // check for an endyear
 633                 if (regex_match(date, sm, ereg))
 634                         year += char_type(0x2013) + from_ascii(sm[1]);
 635                 return year;
 636         }
 637
 638         docstring const opt = label();
 639         if (opt.empty())
 640                 return docstring();
 641
 642         docstring authors;
 643         docstring tmp = split(opt, authors, '(');
 644         if (tmp.empty())
 645                 // we don't have author (year)
 646                 return docstring();
 647         docstring year;
 648         tmp = split(tmp, year, ')');
 649         return year;
 650 }
 651
 652
 653 void BibTeXInfo::getLocators(docstring & doi, docstring & url, docstring & file) const
 654 {
 655         if (is_bibtex_) {
 656                 // get "doi" entry from citation record
 657                 doi = operator[]("doi");
 658                 if (!doi.empty() && !prefixIs(doi,from_ascii("http")))
 659                         doi = "https://doi.org/" + doi;
 660                 // get "url" entry from citation record
 661                 url = operator[]("url");
 662                 // get "file" entry from citation record
 663                 file = operator[]("file");
 664
 665                 // Jabref case, field has a format:
 666                 // Description:Location:Filetype;Description:Location:Filetype...
 667                 // We will grab only first pdf
 668                 if (!file.empty()) {
 669                         docstring ret, filedest, tmp;
 670                         ret = split(file, tmp, ':');
 671                         tmp = split(ret, filedest, ':');
 672                         //TODO howto deal with relative directories?
 673                         FileName f(to_utf8(filedest));
 674                         if (f.exists())
 675                                 file = "file:///" + filedest;
 676                 }
 677
 678                 // kbibtex case, format:
 679                 // file1.pdf;file2.pdf
 680                 // We will grab only first pdf
 681                 docstring kfile;
 682                 if (file.empty())
 683                         kfile = operator[]("localfile");
 684                 if (!kfile.empty()) {
 685                         docstring filedest, tmp;
 686                         tmp = split(kfile, filedest, ';');
 687                         //TODO howto deal with relative directories?
 688                         FileName f(to_utf8(filedest));
 689                         if (f.exists())
 690                                 file = "file:///" + filedest;
 691                 }
 692
 693                 if (!url.empty())
 694                         return;
 695
 696                 // try biblatex specific fields, see its manual
 697                 // 3.13.7 "Electronic Publishing Informationl"
 698                 docstring eprinttype = operator[]("eprinttype");
 699                 docstring eprint = operator[]("eprint");
 700                 if (eprint.empty())
 701                         return;
 702
 703                 if (eprinttype == "arxiv")
 704                         url = "https://arxiv.org/abs/" + eprint;
 705                 if (eprinttype == "jstor")
 706                         url = "https://www.jstor.org/stable/" + eprint;
 707                 if (eprinttype == "pubmed")
 708                         url = "http://www.ncbi.nlm.nih.gov/pubmed/" + eprint;
 709                 if (eprinttype == "hdl")
 710                         url = "https://hdl.handle.net/" + eprint;
 711                 if (eprinttype == "googlebooks")
 712                         url = "http://books.google.com/books?id=" + eprint;
 713
 714                 return;
 715         }
 716
 717         // Here can be handled the bibliography environment. All one could do
 718         // here is let LyX scan the entry for URL or HRef insets.
 719 }
 720
 721
 722 namespace {
 723
 724 docstring parseOptions(docstring const & format, string & optkey,
 725                     docstring & ifpart, docstring & elsepart);
 726
 727 // Calls parseOptions to deal with an embedded option, such as:
 728 //   {%number%[[, no.~%number%]]}
 729 // which must appear at the start of format. ifelsepart gets the
 730 // whole of the option, and we return what's left after the option.
 731 // we return format if there is an error.
 732 docstring parseEmbeddedOption(docstring const & format, docstring & ifelsepart)
 733 {
 734         LASSERT(format[0] == '{' && format[1] == '%', return format);
 735         string optkey;
 736         docstring ifpart;
 737         docstring elsepart;
 738         docstring const rest = parseOptions(format, optkey, ifpart, elsepart);
 739         if (format == rest) { // parse error
 740                 LYXERR0("ERROR! Couldn't parse `" << format <<"'.");
 741                 return format;
 742         }
 743         LASSERT(rest.size() <= format.size(),
 744                 { ifelsepart = docstring(); return format; });
 745         ifelsepart = format.substr(0, format.size() - rest.size());
 746         return rest;
 747 }
 748
 749
 750 // Gets a "clause" from a format string, where the clause is
 751 // delimited by '[[' and ']]'. Returns what is left after the
 752 // clause is removed, and returns format if there is an error.
 753 docstring getClause(docstring const & format, docstring & clause)
 754 {
 755         docstring fmt = format;
 756         // remove '[['
 757         fmt = fmt.substr(2);
 758         // we'll remove characters from the front of fmt as we
 759         // deal with them
 760         while (!fmt.empty()) {
 761                 if (fmt[0] == ']' && fmt.size() > 1 && fmt[1] == ']') {
 762                         // that's the end
 763                         fmt = fmt.substr(2);
 764                         break;
 765                 }
 766                 // check for an embedded option
 767                 if (fmt[0] == '{' && fmt.size() > 1 && fmt[1] == '%') {
 768                         docstring part;
 769                         docstring const rest = parseEmbeddedOption(fmt, part);
 770                         if (fmt == rest) {
 771                                 LYXERR0("ERROR! Couldn't parse embedded option in `" << format <<"'.");
 772                                 return format;
 773                         }
 774                         clause += part;
 775                         fmt = rest;
 776                 } else { // it's just a normal character
 777                                 clause += fmt[0];
 778                                 fmt = fmt.substr(1);
 779                 }
 780         }
 781         return fmt;
 782 }
 783
 784
 785 // parse an options string, which must appear at the start of the
 786 // format parameter. puts the parsed bits in optkey, ifpart, and
 787 // elsepart and returns what's left after the option is removed.
 788 // if there's an error, it returns format itself.
 789 docstring parseOptions(docstring const & format, string & optkey,
 790                     docstring & ifpart, docstring & elsepart)
 791 {
 792         LASSERT(format[0] == '{' && format[1] == '%', return format);
 793         // strip '{%'
 794         docstring fmt = format.substr(2);
 795         size_t pos = fmt.find('%'); // end of key
 796         if (pos == string::npos) {
 797                 LYXERR0("Error parsing  `" << format <<"'. Can't find end of key.");
 798                 return format;
 799         }
 800         optkey = to_utf8(fmt.substr(0, pos));
 801         fmt = fmt.substr(pos + 1);
 802         // [[format]] should be next
 803         if (fmt[0] != '[' || fmt[1] != '[') {
 804                 LYXERR0("Error parsing  `" << format <<"'. Can't find '[[' after key.");
 805                 return format;
 806         }
 807
 808         docstring curfmt = fmt;
 809         fmt = getClause(curfmt, ifpart);
 810         if (fmt == curfmt) {
 811                 LYXERR0("Error parsing  `" << format <<"'. Couldn't get if clause.");
 812                 return format;
 813         }
 814
 815         if (fmt[0] == '}') // we're done, no else clause
 816                 return fmt.substr(1);
 817
 818         // else part should follow
 819         if (fmt[0] != '[' || fmt[1] != '[') {
 820                 LYXERR0("Error parsing  `" << format <<"'. Can't find else clause.");
 821                 return format;
 822         }
 823
 824         curfmt = fmt;
 825         fmt = getClause(curfmt, elsepart);
 826         // we should be done
 827         if (fmt == curfmt || fmt[0] != '}') {
 828                 LYXERR0("Error parsing  `" << format <<"'. Can't find end of option.");
 829                 return format;
 830         }
 831         return fmt.substr(1);
 832 }
 833
 834
 835 } // namespace
 836
 837 /* FIXME
 838 Bug #9131 revealed an oddity in how we are generating citation information
 839 when more than one key is given. We end up building a longer and longer format
 840 string as we go, which we then have to re-parse, over and over and over again,
 841 rather than generating the information for the individual keys and then putting
 842 all of that together. We do that to deal with the way separators work, from what
 843 I can tell, but it still feels like a hack. Fixing this would require quite a
 844 bit of work, however.
 845 */
 846 docstring BibTeXInfo::expandFormat(docstring const & format,
 847                 BibTeXInfoList const & xrefs, int & counter, Buffer const & buf,
 848                 CiteItem const & ci, bool next, bool second) const
 849 {
 850         // incorrect use of macros could put us in an infinite loop
 851         static int const max_passes = 5000;
 852         // the use of overly large keys can lead to performance problems, due
 853         // to eventual attempts to convert LaTeX macros to unicode. See bug
 854         // #8944. By default, the size is limited to 128 (in CiteItem), but
 855         // for specific purposes (such as XHTML export), it needs to be enlarged
 856         // This is perhaps not the best solution, but it will have to do for now.
 857         size_t const max_keysize = ci.max_key_size;
 858         odocstringstream ret; // return value
 859         string key;
 860         bool scanning_key = false;
 861         bool scanning_rich = false;
 862
 863         CiteEngineType const engine_type = buf.params().citeEngineType();
 864         docstring fmt = format;
 865         // we'll remove characters from the front of fmt as we
 866         // deal with them
 867         while (!fmt.empty()) {
 868                 if (counter > max_passes) {
 869                         LYXERR0("Recursion limit reached while parsing `"
 870                                 << format << "'.");
 871                         return _("ERROR!");
 872                 }
 873
 874                 char_type thischar = fmt[0];
 875                 if (thischar == '%') {
 876                         // beginning or end of key
 877                         if (scanning_key) {
 878                                 // end of key
 879                                 scanning_key = false;
 880                                 // so we replace the key with its value, which may be empty
 881                                 if (key[0] == '!') {
 882                                         // macro
 883                                         string const val =
 884                                                 buf.params().documentClass().getCiteMacro(engine_type, key);
 885                                         fmt = from_utf8(val) + fmt.substr(1);
 886                                         counter += 1;
 887                                         continue;
 888                                 } else if (prefixIs(key, "B_")) {
 889                                         // a translatable bit (to the Buffer language)
 890                                         string const val =
 891                                                 buf.params().documentClass().getCiteMacro(engine_type, key);
 892                                         docstring const trans =
 893                                                 translateIfPossible(from_utf8(val), buf.params().language->code());
 894                                         ret << trans;
 895                                 } else if (key[0] == '_') {
 896                                         // a translatable bit (to the GUI language)
 897                                         string const val =
 898                                                 buf.params().documentClass().getCiteMacro(engine_type, key);
 899                                         docstring const trans =
 900                                                 translateIfPossible(from_utf8(val));
 901                                         ret << trans;
 902                                 } else {
 903                                         docstring const val =
 904                                                 getValueForKey(key, buf, ci, xrefs, max_keysize);
 905                                         if (!scanning_rich)
 906                                                 ret << from_ascii("{!<span class=\"bib-" + key + "\">!}");
 907                                         ret << val;
 908                                         if (!scanning_rich)
 909                                                 ret << from_ascii("{!</span>!}");
 910                                 }
 911                         } else {
 912                                 // beginning of key
 913                                 key.clear();
 914                                 scanning_key = true;
 915                         }
 916                 }
 917                 else if (thischar == '{') {
 918                         // beginning of option?
 919                         if (scanning_key) {
 920                                 LYXERR0("ERROR: Found `{' when scanning key in `" << format << "'.");
 921                                 return _("ERROR!");
 922                         }
 923                         if (fmt.size() > 1) {
 924                                 if (fmt[1] == '%') {
 925                                         // it is the beginning of an optional format
 926                                         string optkey;
 927                                         docstring ifpart;
 928                                         docstring elsepart;
 929                                         docstring const newfmt =
 930                                                 parseOptions(fmt, optkey, ifpart, elsepart);
 931                                         if (newfmt == fmt) // parse error
 932                                                 return _("ERROR!");
 933                                         fmt = newfmt;
 934                                         docstring const val =
 935                                                 getValueForKey(optkey, buf, ci, xrefs);
 936                                         if (optkey == "next" && next)
 937                                                 ret << ifpart; // without expansion
 938                                         else if (optkey == "second" && second) {
 939                                                 int newcounter = 0;
 940                                                 ret << expandFormat(ifpart, xrefs, newcounter, buf,
 941                                                         ci, next);
 942                                         } else if (!val.empty()) {
 943                                                 int newcounter = 0;
 944                                                 ret << expandFormat(ifpart, xrefs, newcounter, buf,
 945                                                         ci, next);
 946                                         } else if (!elsepart.empty()) {
 947                                                 int newcounter = 0;
 948                                                 ret << expandFormat(elsepart, xrefs, newcounter, buf,
 949                                                         ci, next);
 950                                         }
 951                                         // fmt will have been shortened for us already
 952                                         continue;
 953                                 }
 954                                 if (fmt[1] == '!') {
 955                                         // beginning of rich text
 956                                         scanning_rich = true;
 957                                         fmt = fmt.substr(2);
 958                                         ret << from_ascii("{!");
 959                                         continue;
 960                                 }
 961                         }
 962                         // we are here if '{' was not followed by % or !.
 963                         // So it's just a character.
 964                         ret << thischar;
 965                 }
 966                 else if (scanning_rich && thischar == '!'
 967                          && fmt.size() > 1 && fmt[1] == '}') {
 968                         // end of rich text
 969                         scanning_rich = false;
 970                         fmt = fmt.substr(2);
 971                         ret << from_ascii("!}");
 972                         continue;
 973                 }
 974                 else if (scanning_key)
 975                         key += char(thischar);
 976                 else {
 977                         try {
 978                                 ret.put(thischar);
 979                         } catch (EncodingException & /* e */) {
 980                                 LYXERR0("Uncodable character '" << docstring(1, thischar) << " in citation label!");
 981                         }
 982                 }
 983                 fmt = fmt.substr(1);
 984         } // for loop
 985         if (scanning_key) {
 986                 LYXERR0("Never found end of key in `" << format << "'!");
 987                 return _("ERROR!");
 988         }
 989         if (scanning_rich) {
 990                 LYXERR0("Never found end of rich text in `" << format << "'!");
 991                 return _("ERROR!");
 992         }
 993         return ret.str();
 994 }
 995
 996
 997 docstring const & BibTeXInfo::getInfo(BibTeXInfoList const & xrefs,
 998         Buffer const & buf, CiteItem const & ci, docstring const & format_in) const
 999 {
1000         bool const richtext = ci.richtext;
1001
1002         CiteEngineType const engine_type = buf.params().citeEngineType();
1003         DocumentClass const & dc = buf.params().documentClass();
1004         docstring const & format = format_in.empty()?
1005                                 from_utf8(dc.getCiteFormat(engine_type, to_utf8(entry_type_)))
1006                               : format_in;
1007
1008         if (format != format_) {
1009                 // clear caches since format changed
1010                 info_.clear();
1011                 info_richtext_.clear();
1012                 format_ = format;
1013         }
1014
1015         if (!richtext && !info_.empty()) {
1016                 info_ = convertLaTeXCommands(processRichtext(info_, false));
1017                 return info_;
1018         }
1019         if (richtext && !info_richtext_.empty())
1020                 return info_richtext_;
1021
1022         if (!is_bibtex_) {
1023                 BibTeXInfo::const_iterator it = find(from_ascii("ref"));
1024                 info_ = it->second;
1025                 return info_;
1026         }
1027
1028         int counter = 0;
1029         info_ = expandFormat(format, xrefs, counter, buf,
1030                 ci, false, false);
1031
1032         if (info_.empty()) {
1033                 // this probably shouldn't happen
1034                 return info_;
1035         }
1036
1037         if (richtext) {
1038                 info_richtext_ = convertLaTeXCommands(processRichtext(info_, true));
1039                 return info_richtext_;
1040         }
1041
1042         info_ = convertLaTeXCommands(processRichtext(info_, false));
1043         return info_;
1044 }
1045
1046
1047 docstring const BibTeXInfo::getLabel(BibTeXInfoList const & xrefs,
1048         Buffer const & buf, docstring const & format,
1049         CiteItem const & ci, bool next, bool second) const
1050 {
1051         docstring loclabel;
1052
1053         int counter = 0;
1054         loclabel = expandFormat(format, xrefs, counter, buf, ci, next, second);
1055
1056         if (!loclabel.empty() && !next) {
1057                 loclabel = processRichtext(loclabel, ci.richtext);
1058                 loclabel = convertLaTeXCommands(loclabel);
1059         }
1060
1061         return loclabel;
1062 }
1063
1064
1065 docstring const & BibTeXInfo::operator[](docstring const & field) const
1066 {
1067         BibTeXInfo::const_iterator it = find(field);
1068         if (it != end())
1069                 return it->second;
1070         static docstring const empty_value = docstring();
1071         return empty_value;
1072 }
1073
1074
1075 docstring const & BibTeXInfo::operator[](string const & field) const
1076 {
1077         return operator[](from_ascii(field));
1078 }
1079
1080
1081 docstring BibTeXInfo::getValueForKey(string const & oldkey, Buffer const & buf,
1082         CiteItem const & ci, BibTeXInfoList const & xrefs, size_t maxsize) const
1083 {
1084         // anything less is pointless
1085         LASSERT(maxsize >= 16, maxsize = 16);
1086         string key = oldkey;
1087         bool cleanit = false;
1088         if (prefixIs(oldkey, "clean:")) {
1089                 key = oldkey.substr(6);
1090                 cleanit = true;
1091         }
1092
1093         docstring ret = operator[](key);
1094         if (ret.empty() && !xrefs.empty()) {
1095                 // xr is a (reference to a) BibTeXInfo const *
1096                 for (auto const & xr : xrefs) {
1097                         if (xr && !(*xr)[key].empty()) {
1098                                 ret = (*xr)[key];
1099                                 break;
1100                         }
1101                 }
1102         }
1103         if (ret.empty()) {
1104                 // some special keys
1105                 // FIXME: dialog, textbefore and textafter have nothing to do with this
1106                 if (key == "dialog" && ci.context == CiteItem::Dialog)
1107                         ret = from_ascii("x"); // any non-empty string will do
1108                 else if (key == "export" && ci.context == CiteItem::Export)
1109                         ret = from_ascii("x"); // any non-empty string will do
1110                 else if (key == "ifstar" && ci.Starred)
1111                         ret = from_ascii("x"); // any non-empty string will do
1112                 else if (key == "ifqualified" && ci.isQualified)
1113                         ret = from_ascii("x"); // any non-empty string will do
1114                 else if (key == "entrytype")
1115                         ret = entry_type_;
1116                 else if (prefixIs(key, "ifentrytype:")
1117                          && from_ascii(key.substr(12)) == entry_type_)
1118                         ret = from_ascii("x"); // any non-empty string will do
1119                 else if (key == "key")
1120                         ret = bib_key_;
1121                 else if (key == "label")
1122                         ret = label_;
1123                 else if (key == "modifier" && modifier_ != 0)
1124                         ret = modifier_;
1125                 else if (key == "numericallabel")
1126                         ret = cite_number_;
1127                 else if (prefixIs(key, "ifmultiple:")) {
1128                         // Return whether we have multiple authors
1129                         docstring const kind = operator[](from_ascii(key.substr(11)));
1130                         if (multipleAuthors(kind))
1131                                 ret = from_ascii("x"); // any non-empty string will do
1132                 }
1133                 else if (prefixIs(key, "abbrvnames:")) {
1134                         // Special key to provide abbreviated name list,
1135                         // with respect to maxcitenames. Suitable for Bibliography
1136                         // beginnings.
1137                         docstring const kind = operator[](from_ascii(key.substr(11)));
1138                         ret = getAuthorList(&buf, kind, false, false, true);
1139                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1140                                 ret[0] = uppercase(ret[0]);
1141                 } else if (prefixIs(key, "fullnames:")) {
1142                         // Return a full name list. Suitable for Bibliography
1143                         // beginnings.
1144                         docstring const kind = operator[](from_ascii(key.substr(10)));
1145                         ret = getAuthorList(&buf, kind, true, false, true);
1146                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1147                                 ret[0] = uppercase(ret[0]);
1148                 } else if (prefixIs(key, "forceabbrvnames:")) {
1149                         // Special key to provide abbreviated name lists,
1150                         // irrespective of maxcitenames. Suitable for Bibliography
1151                         // beginnings.
1152                         docstring const kind = operator[](from_ascii(key.substr(15)));
1153                         ret = getAuthorList(&buf, kind, false, true, true);
1154                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1155                                 ret[0] = uppercase(ret[0]);
1156                 } else if (prefixIs(key, "abbrvbynames:")) {
1157                         // Special key to provide abbreviated name list,
1158                         // with respect to maxcitenames. Suitable for further names inside a
1159                         // bibliography item // (such as "ed. by ...")
1160                         docstring const kind = operator[](from_ascii(key.substr(11)));
1161                         ret = getAuthorList(&buf, kind, false, false, true, false);
1162                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1163                                 ret[0] = uppercase(ret[0]);
1164                 } else if (prefixIs(key, "fullbynames:")) {
1165                         // Return a full name list. Suitable for further names inside a
1166                         // bibliography item // (such as "ed. by ...")
1167                         docstring const kind = operator[](from_ascii(key.substr(10)));
1168                         ret = getAuthorList(&buf, kind, true, false, true, false);
1169                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1170                                 ret[0] = uppercase(ret[0]);
1171                 } else if (prefixIs(key, "forceabbrvbynames:")) {
1172                         // Special key to provide abbreviated name lists,
1173                         // irrespective of maxcitenames. Suitable for further names inside a
1174                         // bibliography item // (such as "ed. by ...")
1175                         docstring const kind = operator[](from_ascii(key.substr(15)));
1176                         ret = getAuthorList(&buf, kind, false, true, true, false);
1177                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1178                                 ret[0] = uppercase(ret[0]);
1179                 } else if (key == "abbrvciteauthor") {
1180                         // Special key to provide abbreviated author or
1181                         // editor names (suitable for citation labels),
1182                         // with respect to maxcitenames.
1183                         ret = getAuthorOrEditorList(&buf, false, false);
1184                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1185                                 ret[0] = uppercase(ret[0]);
1186                 } else if (key == "fullciteauthor") {
1187                         // Return a full author or editor list (for citation labels)
1188                         ret = getAuthorOrEditorList(&buf, true, false);
1189                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1190                                 ret[0] = uppercase(ret[0]);
1191                 } else if (key == "forceabbrvciteauthor") {
1192                         // Special key to provide abbreviated author or
1193                         // editor names (suitable for citation labels),
1194                         // irrespective of maxcitenames.
1195                         ret = getAuthorOrEditorList(&buf, false, true);
1196                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1197                                 ret[0] = uppercase(ret[0]);
1198                 } else if (key == "bibentry") {
1199                         // Special key to provide the full bibliography entry: see getInfo()
1200                         CiteEngineType const engine_type = buf.params().citeEngineType();
1201                         DocumentClass const & dc = buf.params().documentClass();
1202                         docstring const & format =
1203                                 from_utf8(dc.getCiteFormat(engine_type, to_utf8(entry_type_), false));
1204                         int counter = 0;
1205                         ret = expandFormat(format, xrefs, counter, buf, ci, false, false);
1206                 } else if (key == "textbefore")
1207                         ret = ci.textBefore;
1208                 else if (key == "textafter")
1209                         ret = ci.textAfter;
1210                 else if (key == "curpretext") {
1211                         vector<pair<docstring, docstring>> pres = ci.getPretexts();
1212                         vector<pair<docstring, docstring>>::iterator it = pres.begin();
1213                         int numkey = 1;
1214                         for (; it != pres.end() ; ++it) {
1215                                 if ((*it).first == bib_key_ && numkey == num_bib_key_) {
1216                                         ret = (*it).second;
1217                                         pres.erase(it);
1218                                         break;
1219                                 }
1220                                 if ((*it).first == bib_key_)
1221                                         ++numkey;
1222                         }
1223                 } else if (key == "curposttext") {
1224                         vector<pair<docstring, docstring>> posts = ci.getPosttexts();
1225                         vector<pair<docstring, docstring>>::iterator it = posts.begin();
1226                         int numkey = 1;
1227                         for (; it != posts.end() ; ++it) {
1228                                 if ((*it).first == bib_key_ && numkey == num_bib_key_) {
1229                                         ret = (*it).second;
1230                                         posts.erase(it);
1231                                         break;
1232                                 }
1233                                 if ((*it).first == bib_key_)
1234                                         ++numkey;
1235                         }
1236                 } else if (key == "year")
1237                         ret = getYear();
1238         }
1239
1240         if (cleanit)
1241                 ret = xml::cleanAttr(ret);
1242
1243         // make sure it is not too big
1244         support::truncateWithEllipsis(ret, maxsize);
1245         return ret;
1246 }
1247
1248
1249 //////////////////////////////////////////////////////////////////////
1250 //
1251 // BiblioInfo
1252 //
1253 //////////////////////////////////////////////////////////////////////
1254
1255 namespace {
1256
1257 // A functor for use with sort, leading to case insensitive sorting
1258 bool compareNoCase(const docstring & a, const docstring & b) {
1259         return compare_no_case(a, b) < 0;
1260 }
1261
1262 } // namespace
1263
1264
1265 vector<docstring> const BiblioInfo::getXRefs(BibTeXInfo const & data, bool const nested) const
1266 {
1267         vector<docstring> result;
1268         if (!data.isBibTeX())
1269                 return result;
1270         // Legacy crossref field. This is not nestable.
1271         if (!nested && !data["crossref"].empty()) {
1272                 docstring const xrefkey = data["crossref"];
1273                 result.push_back(xrefkey);
1274                 // However, check for nested xdatas
1275                 BiblioInfo::const_iterator it = find(xrefkey);
1276                 if (it != end()) {
1277                         BibTeXInfo const & xref = it->second;
1278                         vector<docstring> const nxdata = getXRefs(xref, true);
1279                         if (!nxdata.empty())
1280                                 result.insert(result.end(), nxdata.begin(), nxdata.end());
1281                 }
1282         }
1283         // Biblatex's xdata field. Infinitely nestable.
1284         // XData field can consist of a comma-separated list of keys
1285         vector<docstring> const xdatakeys = getVectorFromString(data["xdata"]);
1286         if (!xdatakeys.empty()) {
1287                 for (auto const & xdatakey : xdatakeys) {
1288                         result.push_back(xdatakey);
1289                         BiblioInfo::const_iterator it = find(xdatakey);
1290                         if (it != end()) {
1291                                 BibTeXInfo const & xdata = it->second;
1292                                 vector<docstring> const nxdata = getXRefs(xdata, true);
1293                                 if (!nxdata.empty())
1294                                         result.insert(result.end(), nxdata.begin(), nxdata.end());
1295                         }
1296                 }
1297         }
1298         return result;
1299 }
1300
1301
1302 vector<docstring> const BiblioInfo::getKeys() const
1303 {
1304         vector<docstring> bibkeys;
1305         for (auto const & bi : *this)
1306                 bibkeys.push_back(bi.first);
1307         sort(bibkeys.begin(), bibkeys.end(), &compareNoCase);
1308         return bibkeys;
1309 }
1310
1311
1312 vector<docstring> const BiblioInfo::getFields() const
1313 {
1314         vector<docstring> bibfields;
1315         for (auto const & fn : field_names_)
1316                 bibfields.push_back(fn);
1317         sort(bibfields.begin(), bibfields.end());
1318         return bibfields;
1319 }
1320
1321
1322 vector<docstring> const BiblioInfo::getEntries() const
1323 {
1324         vector<docstring> bibentries;
1325         for (auto const & et : entry_types_)
1326                 bibentries.push_back(et);
1327         sort(bibentries.begin(), bibentries.end());
1328         return bibentries;
1329 }
1330
1331
1332 docstring const BiblioInfo::getAuthorOrEditorList(docstring const & key, Buffer const & buf) const
1333 {
1334         BiblioInfo::const_iterator it = find(key);
1335         if (it == end())
1336                 return docstring();
1337         BibTeXInfo const & data = it->second;
1338         return data.getAuthorOrEditorList(&buf, false);
1339 }
1340
1341
1342 docstring const BiblioInfo::getCiteNumber(docstring const & key) const
1343 {
1344         BiblioInfo::const_iterator it = find(key);
1345         if (it == end())
1346                 return docstring();
1347         BibTeXInfo const & data = it->second;
1348         return data.citeNumber();
1349 }
1350
1351 void BiblioInfo::getLocators(docstring const & key, docstring & doi, docstring & url, docstring & file) const
1352 {
1353         BiblioInfo::const_iterator it = find(key);
1354          if (it == end())
1355                 return;
1356         BibTeXInfo const & data = it->second;
1357         data.getLocators(doi,url,file);
1358 }
1359
1360
1361 docstring const BiblioInfo::getYear(docstring const & key, bool use_modifier) const
1362 {
1363         BiblioInfo::const_iterator it = find(key);
1364         if (it == end())
1365                 return docstring();
1366         BibTeXInfo const & data = it->second;
1367         docstring year = data.getYear();
1368         if (year.empty()) {
1369                 // let's try the crossrefs
1370                 vector<docstring> const xrefs = getXRefs(data);
1371                 if (xrefs.empty())
1372                         // no luck
1373                         return docstring();
1374                 for (docstring const & xref : xrefs) {
1375                         BiblioInfo::const_iterator const xrefit = find(xref);
1376                         if (xrefit == end())
1377                                 continue;
1378                         BibTeXInfo const & xref_data = xrefit->second;
1379                         year = xref_data.getYear();
1380                         if (!year.empty())
1381                                 // success!
1382                                 break;
1383                 }
1384         }
1385         if (use_modifier && data.modifier() != 0)
1386                 year += data.modifier();
1387         return year;
1388 }
1389
1390
1391 docstring const BiblioInfo::getYear(docstring const & key, Buffer const & buf, bool use_modifier) const
1392 {
1393         docstring const year = getYear(key, use_modifier);
1394         if (year.empty())
1395                 return buf.B_("No year");
1396         return year;
1397 }
1398
1399
1400 docstring const BiblioInfo::getInfo(docstring const & key,
1401         Buffer const & buf, CiteItem const & ci, docstring const & format) const
1402 {
1403         BiblioInfo::const_iterator it = find(key);
1404         if (it == end())
1405                 return docstring(_("Bibliography entry not found!"));
1406         BibTeXInfo const & data = it->second;
1407         BibTeXInfoList xrefptrs;
1408         for (docstring const & xref : getXRefs(data)) {
1409                 BiblioInfo::const_iterator const xrefit = find(xref);
1410                 if (xrefit != end())
1411                         xrefptrs.push_back(&(xrefit->second));
1412         }
1413         return data.getInfo(xrefptrs, buf, ci, format);
1414 }
1415
1416
1417 docstring const BiblioInfo::getLabel(vector<docstring> keys,
1418         Buffer const & buf, string const & style, CiteItem const & ci) const
1419 {
1420         size_t max_size = ci.max_size;
1421         // shorter makes no sense
1422         LASSERT(max_size >= 16, max_size = 16);
1423
1424         // we can't display more than 10 of these, anyway
1425         // but since we truncate in the middle,
1426         // we need to split into two halfs.
1427         bool const too_many_keys = keys.size() > 10;
1428         vector<docstring> lkeys;
1429         if (too_many_keys) {
1430                 lkeys.insert(lkeys.end(), keys.end() - 5, keys.end());
1431                 keys.resize(5);
1432                 keys.insert(keys.end(), lkeys.begin(), lkeys.end());
1433         }
1434
1435         CiteEngineType const engine_type = buf.params().citeEngineType();
1436         DocumentClass const & dc = buf.params().documentClass();
1437         docstring const & format = from_utf8(dc.getCiteFormat(engine_type, style, false, "cite"));
1438         docstring ret = format;
1439         vector<docstring>::const_iterator key = keys.begin();
1440         vector<docstring>::const_iterator ken = keys.end();
1441         vector<docstring> handled_keys;
1442         for (int i = 0; key != ken; ++key, ++i) {
1443                 handled_keys.push_back(*key);
1444                 int n = 0;
1445                 for (auto const & k : handled_keys) {
1446                         if (k == *key)
1447                                 ++n;
1448                 }
1449                 BiblioInfo::const_iterator it = find(*key);
1450                 BibTeXInfo empty_data;
1451                 empty_data.key(*key);
1452                 BibTeXInfo & data = empty_data;
1453                 vector<BibTeXInfo const *> xrefptrs;
1454                 if (it != end()) {
1455                         data = it->second;
1456                         for (docstring const & xref : getXRefs(data)) {
1457                                 BiblioInfo::const_iterator const xrefit = find(xref);
1458                                 if (xrefit != end())
1459                                         xrefptrs.push_back(&(xrefit->second));
1460                         }
1461                 }
1462                 data.numKey(n);
1463                 ret = data.getLabel(xrefptrs, buf, ret, ci, key + 1 != ken, i == 1);
1464         }
1465
1466         support::truncateWithEllipsis(ret, max_size, true);
1467
1468         return ret;
1469 }
1470
1471
1472 bool BiblioInfo::isBibtex(docstring const & key) const
1473 {
1474         docstring key1;
1475         split(key, key1, ',');
1476         BiblioInfo::const_iterator it = find(key1);
1477         if (it == end())
1478                 return false;
1479         return it->second.isBibTeX();
1480 }
1481
1482
1483 BiblioInfo::CiteStringMap const BiblioInfo::getCiteStrings(
1484         vector<docstring> const & keys, vector<CitationStyle> const & styles,
1485         Buffer const & buf, CiteItem const & ci) const
1486 {
1487         if (empty())
1488                 return vector<pair<docstring,docstring>>();
1489
1490         string style;
1491         CiteStringMap csm(styles.size());
1492         for (size_t i = 0; i != csm.size(); ++i) {
1493                 style = styles[i].name;
1494                 csm[i] = make_pair(from_ascii(style), getLabel(keys, buf, style, ci));
1495         }
1496
1497         return csm;
1498 }
1499
1500
1501 void BiblioInfo::mergeBiblioInfo(BiblioInfo const & info)
1502 {
1503         bimap_.insert(info.begin(), info.end());
1504         field_names_.insert(info.field_names_.begin(), info.field_names_.end());
1505         entry_types_.insert(info.entry_types_.begin(), info.entry_types_.end());
1506 }
1507
1508
1509 namespace {
1510
1511 // used in xhtml to sort a list of BibTeXInfo objects
1512 bool lSorter(BibTeXInfo const * lhs, BibTeXInfo const * rhs)
1513 {
1514         docstring const lauth = lhs->getAuthorOrEditorList();
1515         docstring const rauth = rhs->getAuthorOrEditorList();
1516         docstring const lyear = lhs->getYear();
1517         docstring const ryear = rhs->getYear();
1518         docstring const ltitl = lhs->operator[]("title");
1519         docstring const rtitl = rhs->operator[]("title");
1520         return  (lauth < rauth)
1521                 || (lauth == rauth && lyear < ryear)
1522                 || (lauth == rauth && lyear == ryear && ltitl < rtitl);
1523 }
1524
1525 } // namespace
1526
1527
1528 void BiblioInfo::collectCitedEntries(Buffer const & buf)
1529 {
1530         cited_entries_.clear();
1531         // We are going to collect all the citation keys used in the document,
1532         // getting them from the TOC.
1533         // FIXME We may want to collect these differently, in the first case,
1534         // so that we might have them in order of appearance.
1535         set<docstring> citekeys;
1536         Toc const & toc = *buf.tocBackend().toc("citation");
1537         for (auto const & t : toc) {
1538                 if (t.str().empty())
1539                         continue;
1540                 vector<docstring> const keys = getVectorFromString(t.str());
1541                 citekeys.insert(keys.begin(), keys.end());
1542         }
1543         if (citekeys.empty())
1544                 return;
1545
1546         // We have a set of the keys used in this document.
1547         // We will now convert it to a list of the BibTeXInfo objects used in
1548         // this document...
1549         vector<BibTeXInfo const *> bi;
1550         for (auto const & ck : citekeys) {
1551                 BiblioInfo::const_iterator const bt = find(ck);
1552                 if (bt == end() || !bt->second.isBibTeX())
1553                         continue;
1554                 bi.push_back(&(bt->second));
1555         }
1556         // ...and sort it.
1557         sort(bi.begin(), bi.end(), lSorter);
1558
1559         // Now we can write the sorted keys
1560         // b is a BibTeXInfo const *
1561         for (auto const & b : bi)
1562                 cited_entries_.push_back(b->key());
1563 }
1564
1565
1566 void BiblioInfo::makeCitationLabels(Buffer const & buf)
1567 {
1568         collectCitedEntries(buf);
1569         CiteEngineType const engine_type = buf.params().citeEngineType();
1570         bool const numbers = (engine_type & ENGINE_TYPE_NUMERICAL);
1571
1572         int keynumber = 0;
1573         char modifier = 0;
1574         // used to remember the last one we saw
1575         // we'll be comparing entries to see if we need to add
1576         // modifiers, like "1984a"
1577         map<docstring, BibTeXInfo>::iterator last = bimap_.end();
1578
1579         // add letters to years
1580         for (auto const & ce : cited_entries_) {
1581                 map<docstring, BibTeXInfo>::iterator const biit = bimap_.find(ce);
1582                 // this shouldn't happen, but...
1583                 if (biit == bimap_.end())
1584                         // ...fail gracefully, anyway.
1585                         continue;
1586                 BibTeXInfo & entry = biit->second;
1587                 if (numbers) {
1588                         docstring const num = convert<docstring>(++keynumber);
1589                         entry.setCiteNumber(num);
1590                 } else {
1591                         // The first test here is checking whether this is the first
1592                         // time through the loop. If so, then we do not have anything
1593                         // with which to compare.
1594                         if (last != bimap_.end()
1595                             && entry.getAuthorOrEditorList() == last->second.getAuthorOrEditorList()
1596                             // we access the year via getYear() so as to get it from the xref,
1597                             // if we need to do so
1598                             && getYear(entry.key()) == getYear(last->second.key())) {
1599                                 if (modifier == 0) {
1600                                         // so the last one should have been 'a'
1601                                         last->second.setModifier('a');
1602                                         modifier = 'b';
1603                                 } else if (modifier == 'z')
1604                                         modifier = 'A';
1605                                 else
1606                                         modifier++;
1607                         } else {
1608                                 modifier = 0;
1609                         }
1610                         entry.setModifier(modifier);
1611                         // remember the last one
1612                         last = biit;
1613                 }
1614         }
1615         // Set the labels
1616         for (auto const & ce : cited_entries_) {
1617                 map<docstring, BibTeXInfo>::iterator const biit = bimap_.find(ce);
1618                 // this shouldn't happen, but...
1619                 if (biit == bimap_.end())
1620                         // ...fail gracefully, anyway.
1621                         continue;
1622                 BibTeXInfo & entry = biit->second;
1623                 if (numbers) {
1624                         entry.label(entry.citeNumber());
1625                 } else {
1626                         docstring const auth = entry.getAuthorOrEditorList(&buf, false);
1627                         // we do it this way so as to access the xref, if necessary
1628                         // note that this also gives us the modifier
1629                         docstring const year = getYear(ce, buf, true);
1630                         if (!auth.empty() && !year.empty())
1631                                 entry.label(auth + ' ' + year);
1632                         else
1633                                 entry.label(entry.key());
1634                 }
1635         }
1636 }
1637
1638
1639 //////////////////////////////////////////////////////////////////////
1640 //
1641 // CitationStyle
1642 //
1643 //////////////////////////////////////////////////////////////////////
1644
1645
1646 CitationStyle citationStyleFromString(string const & command,
1647                                       BufferParams const & params)
1648 {
1649         CitationStyle cs;
1650         if (command.empty())
1651                 return cs;
1652
1653         string const alias = params.getCiteAlias(command);
1654         string cmd = alias.empty() ? command : alias;
1655         if (isUpperCase(command[0])) {
1656                 cs.forceUpperCase = true;
1657                 cmd[0] = lowercase(cmd[0]);
1658         }
1659
1660         size_t const n = command.size() - 1;
1661         if (command[n] == '*') {
1662                 cs.hasStarredVersion = true;
1663                 if (suffixIs(cmd, '*'))
1664                         cmd = cmd.substr(0, cmd.size() - 1);
1665         }
1666
1667         cs.name = cmd;
1668         return cs;
1669 }
1670
1671
1672 string citationStyleToString(const CitationStyle & cs, bool const latex)
1673 {
1674         string cmd = latex ? cs.cmd : cs.name;
1675         if (cs.forceUpperCase)
1676                 cmd[0] = uppercase(cmd[0]);
1677         if (cs.hasStarredVersion)
1678                 cmd += '*';
1679         return cmd;
1680 }
1681
1682
1683 docstring authorsToDocBookAuthorGroup(docstring const & authorsString, XMLStream & xs, Buffer const & buf)
1684 {
1685         // This function closely mimics getAuthorList, but produces DocBook instead of text.
1686         // It has been greatly simplified, as the complete list of authors is always produced. No separators are required,
1687         // as the output has a database-like shape.
1688         // constructName has also been merged within, as it becomes really simple and leads to no copy-paste.
1689
1690         if (authorsString.empty()) {
1691                 return docstring();
1692         }
1693
1694         // Split the input list of authors into individual authors.
1695         vector<docstring> const authors = getAuthors(authorsString);
1696
1697         // Retrieve the "et al." variation.
1698         string const etal = buf.params().documentClass().getCiteMacro(buf.params().citeEngineType(), "_etal");
1699
1700         // Output the list of authors.
1701         xs << xml::StartTag("authorgroup");
1702         xs << xml::CR();
1703
1704         auto it = authors.cbegin();
1705         auto en = authors.cend();
1706         for (size_t i = 0; it != en; ++it, ++i) {
1707                 xs << xml::StartTag("author");
1708                 xs << xml::CR();
1709                 xs << xml::StartTag("personname");
1710                 xs << xml::CR();
1711                 docstring name = *it;
1712
1713                 // All authors go in a <personname>. If more structure is known, use it; otherwise (just "et al."), print it as such.
1714                 if (name == "others") {
1715                         xs << buf.B_(etal);
1716                 } else {
1717                         name_parts parts = nameParts(name);
1718                         if (! parts.prefix.empty()) {
1719                                 xs << xml::StartTag("honorific");
1720                                 xs << parts.prefix;
1721                                 xs << xml::EndTag("honorific");
1722                                 xs << xml::CR();
1723                         }
1724                         if (! parts.prename.empty()) {
1725                                 xs << xml::StartTag("firstname");
1726                                 xs << parts.prename;
1727                                 xs << xml::EndTag("firstname");
1728                                 xs << xml::CR();
1729                         }
1730                         if (! parts.surname.empty()) {
1731                                 xs << xml::StartTag("surname");
1732                                 xs << parts.surname;
1733                                 xs << xml::EndTag("surname");
1734                                 xs << xml::CR();
1735                         }
1736                         if (! parts.suffix.empty()) {
1737                                 xs << xml::StartTag("othername", "role=\"suffix\"");
1738                                 xs << parts.suffix;
1739                                 xs << xml::EndTag("othername");
1740                                 xs << xml::CR();
1741                         }
1742                 }
1743
1744                 xs << xml::EndTag("personname");
1745                 xs << xml::CR();
1746                 xs << xml::EndTag("author");
1747                 xs << xml::CR();
1748
1749                 // Could add an affiliation after <personname>, but not stored in BibTeX.
1750         }
1751         xs << xml::EndTag("authorgroup");
1752         xs << xml::CR();
1753
1754         return docstring();
1755 }
1756
1757 } // namespace lyx