src/BiblioInfo.cpp

   1 /**
   2  * \file BiblioInfo.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Angus Leeming
   7  * \author Herbert Voß
   8  * \author Richard Kimberly Heck
   9  * \author Julien Rioux
  10  * \author Jürgen Spitzmüller
  11  *
  12  * Full author contact details are available in file CREDITS.
  13  */
  14
  15 #include <config.h>
  16
  17 #include "BiblioInfo.h"
  18
  19 #include "Buffer.h"
  20 #include "BufferParams.h"
  21 #include "Citation.h"
  22 #include "Encoding.h"
  23 #include "Language.h"
  24 #include "TextClass.h"
  25 #include "TocBackend.h"
  26 #include "xml.h"
  27
  28 #include "support/convert.h"
  29 #include "support/debug.h"
  30 #include "support/docstream.h"
  31 #include "support/FileName.h"
  32 #include "support/gettext.h"
  33 #include "support/lassert.h"
  34 #include "support/lstrings.h"
  35 #include "support/textutils.h"
  36
  37 #include <map>
  38 #include <regex>
  39 #include <set>
  40
  41 using namespace std;
  42 using namespace lyx::support;
  43
  44
  45 namespace lyx {
  46
  47 namespace {
  48
  49 // Remove placeholders from names
  50 docstring renormalize(docstring const & input)
  51 {
  52         docstring res = subst(input, from_ascii("$$space!"), from_ascii(" "));
  53         return subst(res, from_ascii("$$comma!"), from_ascii(","));
  54 }
  55
  56
  57 // Split the surname into prefix ("von-part") and family name
  58 pair<docstring, docstring> parseSurname(docstring const & sname)
  59 {
  60         // Split the surname into its tokens
  61         vector<docstring> pieces = getVectorFromString(sname, from_ascii(" "));
  62         if (pieces.size() < 2)
  63                 return make_pair(docstring(), sname);
  64
  65         // Now we look for pieces that begin with a lower case letter.
  66         // All except for the very last token constitute the "von-part".
  67         docstring prefix;
  68         vector<docstring>::const_iterator it = pieces.begin();
  69         vector<docstring>::const_iterator const en = pieces.end();
  70         bool first = true;
  71         for (; it != en; ++it) {
  72                 if ((*it).empty())
  73                         continue;
  74                 // If this is the last piece, then what we now have is
  75                 // the family name, notwithstanding the casing.
  76                 if (it + 1 == en)
  77                         break;
  78                 char_type const c = (*it)[0];
  79                 // If the piece starts with a upper case char, we assume
  80                 // this is part of the surname.
  81                 if (!isLower(c))
  82                         break;
  83                 // Nothing of the former, so add this piece to the prename
  84                 if (!first)
  85                         prefix += " ";
  86                 else
  87                         first = false;
  88                 prefix += *it;
  89         }
  90
  91         // Reconstruct the family name.
  92         // Note that if we left the loop with because it + 1 == en,
  93         // then this will still do the right thing, i.e., make surname
  94         // just be the last piece.
  95         docstring surname;
  96         first = true;
  97         for (; it != en; ++it) {
  98                 if (!first)
  99                         surname += " ";
 100                 else
 101                         first = false;
 102                 surname += *it;
 103         }
 104         return make_pair(prefix, surname);
 105 }
 106
 107
// The components of a single name as filled in by nameParts().
struct name_parts {
        // family name, without the prefix
        docstring surname;
        // given name(s)
        docstring prename;
        // name suffix (aka "jr" part)
        docstring suffix;
        // name prefix (aka "von" part)
        docstring prefix;
};
 114
 115
 116 // gets the name parts (prename, surname, prefix, suffix) from an author-type string
 117 name_parts nameParts(docstring const & iname)
 118 {
 119         name_parts res;
 120         if (iname.empty())
 121                 return res;
 122
 123         // First we check for goupings (via {...}) and replace blanks and
 124         // commas inside groups with temporary placeholders
 125         docstring name;
 126         int gl = 0;
 127         docstring::const_iterator p = iname.begin();
 128         while (p != iname.end()) {
 129                 // count grouping level
 130                 if (*p == '{')
 131                         ++gl;
 132                 else if (*p == '}')
 133                         --gl;
 134                 // generate string with probable placeholders
 135                 if (*p == ' ' && gl > 0)
 136                         name += from_ascii("$$space!");
 137                 else if (*p == ',' && gl > 0)
 138                         name += from_ascii("$$comma!");
 139                 else
 140                         name += *p;
 141                 ++p;
 142         }
 143
 144         // Now we look for a comma, and take the last name to be everything
 145         // preceding the right-most one, so that we also get the name suffix
 146         // (aka "jr" part).
 147         vector<docstring> pieces = getVectorFromString(name);
 148         if (pieces.size() > 1) {
 149                 // Whether we have a name suffix or not, the prename is
 150                 // always last item
 151                 res.prename = renormalize(pieces.back());
 152                 // The family name, conversely, is always the first item.
 153                 // However, it might contain a prefix (aka "von" part)
 154                 docstring const sname = pieces.front();
 155                 res.prefix = renormalize(parseSurname(sname).first);
 156                 res.surname = renormalize(parseSurname(sname).second);
 157                 // If we have three pieces (the maximum allowed by BibTeX),
 158                 // the second one is the name suffix.
 159                 if (pieces.size() > 2)
 160                         res.suffix = renormalize(pieces.at(1));
 161                 return res;
 162         }
 163
 164         // OK, so now we want to look for the last name.
 165         // Split on spaces, to get various tokens.
 166         pieces = getVectorFromString(name, from_ascii(" "));
 167         // No space: Only a family name given
 168         if (pieces.size() < 2) {
 169                 res.surname = renormalize(pieces.back());
 170                 return res;
 171         }
 172         // If we get two pieces, assume "prename surname"
 173         if (pieces.size() == 2) {
 174                 res.prename = renormalize(pieces.front());
 175                 res.surname = renormalize(pieces.back());
 176                 return res;
 177         }
 178
 179         // More than 3 pieces: A name prefix (aka "von" part) might be included.
 180         // We look for the first piece that begins with a lower case letter
 181         // (which is the name prefix, if it is not the last token) or the last token.
 182         docstring prename;
 183         vector<docstring>::const_iterator it = pieces.begin();
 184         vector<docstring>::const_iterator const en = pieces.end();
 185         bool first = true;
 186         for (; it != en; ++it) {
 187                 if ((*it).empty())
 188                         continue;
 189                 char_type const c = (*it)[0];
 190                 // If the piece starts with a lower case char, we assume
 191                 // this is the name prefix and thus prename is complete.
 192                 if (isLower(c))
 193                         break;
 194                 // Same if this is the last piece, which is always the surname.
 195                 if (it + 1 == en)
 196                         break;
 197                 // Nothing of the former, so add this piece to the prename
 198                 if (!first)
 199                         prename += " ";
 200                 else
 201                         first = false;
 202                 prename += *it;
 203         }
 204
 205         // Now reconstruct the family name and strip the prefix.
 206         // Note that if we left the loop because it + 1 == en,
 207         // then this will still do the right thing, i.e., make surname
 208         // just be the last piece.
 209         docstring surname;
 210         first = true;
 211         for (; it != en; ++it) {
 212                 if (!first)
 213                         surname += " ";
 214                 else
 215                         first = false;
 216                 surname += *it;
 217         }
 218         res.prename = renormalize(prename);
 219         res.prefix = renormalize(parseSurname(surname).first);
 220         res.surname = renormalize(parseSurname(surname).second);
 221         return res;
 222 }
 223
 224
 225 docstring constructName(docstring const & name, string const & scheme)
 226 {
 227         // re-constructs a name from name parts according
 228         // to a given scheme
 229         docstring const prename = nameParts(name).prename;
 230         docstring const surname = nameParts(name).surname;
 231         docstring const prefix = nameParts(name).prefix;
 232         docstring const suffix = nameParts(name).suffix;
 233         string res = scheme;
 234         static regex const reg1("(.*)(\\{%prename%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 235         static regex const reg2("(.*)(\\{%suffix%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 236         static regex const reg3("(.*)(\\{%prefix%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 237         smatch sub;
 238         // Changing the first parameter of regex_match() may corrupt the
 239         // second one. In this case we use the temporary string tmp.
 240         if (regex_match(scheme, sub, reg1)) {
 241                 res = sub.str(1);
 242                 if (!prename.empty())
 243                         res += sub.str(3);
 244                 res += sub.str(5);
 245         }
 246         if (regex_match(res, sub, reg2)) {
 247                 string tmp = sub.str(1);
 248                 if (!suffix.empty())
 249                         tmp += sub.str(3);
 250                 res = tmp + sub.str(5);
 251         }
 252         if (regex_match(res, sub, reg3)) {
 253                 string tmp = sub.str(1);
 254                 if (!prefix.empty())
 255                         tmp += sub.str(3);
 256                 res = tmp + sub.str(5);
 257         }
 258         docstring result = from_ascii(res);
 259         result = subst(result, from_ascii("%prename%"), prename);
 260         result = subst(result, from_ascii("%surname%"), surname);
 261         result = subst(result, from_ascii("%prefix%"), prefix);
 262         result = subst(result, from_ascii("%suffix%"), suffix);
 263         return result;
 264 }
 265
 266
 267 vector<docstring> const getAuthors(docstring const & author)
 268 {
 269         // We check for goupings (via {...}) and only consider " and "
 270         // outside groups as author separator. This is to account
 271         // for cases such as {{Barnes and Noble, Inc.}}, which
 272         // need to be treated as one single family name.
 273         // We use temporary placeholders in order to differentiate the
 274         // diverse " and " cases.
 275
 276         // First, we temporarily replace all ampersands. It is rather unusual
 277         // in author names, but can happen (consider cases such as "C \& A Corp.").
 278         docstring iname = subst(author, from_ascii("&"), from_ascii("$$amp!"));
 279         // Then, we temporarily make all " and " strings to ampersands in order
 280         // to handle them later on a per-char level. Note that arbitrary casing
 281         // ("And", "AND", "aNd", ...) is allowed in bibtex (#10465).
 282         static regex const and_reg("(.* )([aA][nN][dD])( .*)");
 283         smatch sub;
 284         string res = to_utf8(iname);
 285         while (regex_match(res, sub, and_reg))
 286                 res = sub.str(1) + "&" + sub.str(3);
 287         iname = from_utf8(res);
 288         // Now we traverse through the string and replace the "&" by the proper
 289         // output in- and outside groups
 290         docstring name;
 291         int gl = 0;
 292         docstring::const_iterator p = iname.begin();
 293         while (p != iname.end()) {
 294                 // count grouping level
 295                 if (*p == '{')
 296                         ++gl;
 297                 else if (*p == '}')
 298                         --gl;
 299                 // generate string with probable placeholders
 300                 if (*p == '&') {
 301                         if (gl > 0)
 302                                 // Inside groups, we output "and"
 303                                 name += from_ascii("and");
 304                         else
 305                                 // Outside groups, we output a separator
 306                                 name += from_ascii("$$namesep!");
 307                 }
 308                 else
 309                         name += *p;
 310                 ++p;
 311         }
 312
 313         // re-insert the literal ampersands
 314         name = subst(name, from_ascii("$$amp!"), from_ascii("&"));
 315
 316         // Now construct the actual vector
 317         return getVectorFromString(name, from_ascii(" $$namesep! "));
 318 }
 319
 320
 321 bool multipleAuthors(docstring const & author)
 322 {
 323         return getAuthors(author).size() > 1;
 324 }
 325
 326
 327 // converts a string containing LaTeX commands into unicode
 328 // for display.
 329 docstring convertLaTeXCommands(docstring const & str)
 330 {
 331         docstring val = str;
 332         docstring ret;
 333
 334         bool scanning_cmd = false;
 335         bool scanning_math = false;
 336         bool escaped = false; // used to catch \$, etc.
 337         while (!val.empty()) {
 338                 char_type const ch = val[0];
 339
 340                 // if we're scanning math, we output everything until we
 341                 // find an unescaped $, at which point we break out.
 342                 if (scanning_math) {
 343                         if (escaped)
 344                                 escaped = false;
 345                         else if (ch == '\\')
 346                                 escaped = true;
 347                         else if (ch == '$')
 348                                 scanning_math = false;
 349                         ret += ch;
 350                         val = val.substr(1);
 351                         continue;
 352                 }
 353
 354                 // if we're scanning a command name, then we just
 355                 // discard characters until we hit something that
 356                 // isn't alpha.
 357                 if (scanning_cmd) {
 358                         if (isAlphaASCII(ch)) {
 359                                 val = val.substr(1);
 360                                 escaped = false;
 361                                 continue;
 362                         }
 363                         // so we're done with this command.
 364                         // now we fall through and check this character.
 365                         scanning_cmd = false;
 366                 }
 367
 368                 // was the last character a \? If so, then this is something like:
 369                 // \\ or \$, so we'll just output it. That's probably not always right...
 370                 if (escaped) {
 371                         // exception: output \, as THIN SPACE
 372                         if (ch == ',')
 373                                 ret.push_back(0x2009);
 374                         else
 375                                 ret += ch;
 376                         val = val.substr(1);
 377                         escaped = false;
 378                         continue;
 379                 }
 380
 381                 if (ch == '$') {
 382                         ret += ch;
 383                         val = val.substr(1);
 384                         scanning_math = true;
 385                         continue;
 386                 }
 387
 388                 // Change text mode accents in the form
 389                 // {\v a} to \v{a} (see #9340).
 390                 // FIXME: This is a sort of mini-tex2lyx.
 391                 //        Use the real tex2lyx instead!
 392                 static regex const tma_reg("^\\{\\\\[bcCdfGhHkrtuUv]\\s\\w\\}");
 393                 if (regex_search(to_utf8(val), tma_reg)) {
 394                         val = val.substr(1);
 395                         val.replace(2, 1, from_ascii("{"));
 396                         continue;
 397                 }
 398
 399                 // Apart from the above, we just ignore braces
 400                 if (ch == '{' || ch == '}') {
 401                         val = val.substr(1);
 402                         continue;
 403                 }
 404
 405                 // we're going to check things that look like commands, so if
 406                 // this doesn't, just output it.
 407                 if (ch != '\\') {
 408                         ret += ch;
 409                         val = val.substr(1);
 410                         continue;
 411                 }
 412
 413                 // ok, could be a command of some sort
 414                 // let's see if it corresponds to some unicode
 415                 // unicodesymbols has things in the form: \"{u},
 416                 // whereas we may see things like: \"u. So we'll
 417                 // look for that and change it, if necessary.
 418                 // FIXME: This is a sort of mini-tex2lyx.
 419                 //        Use the real tex2lyx instead!
 420                 static regex const reg("^\\\\\\W\\w");
 421                 if (regex_search(to_utf8(val), reg)) {
 422                         val.insert(3, from_ascii("}"));
 423                         val.insert(2, from_ascii("{"));
 424                 }
 425                 bool termination;
 426                 docstring rem;
 427                 docstring const cnvtd = Encodings::fromLaTeXCommand(val,
 428                                 Encodings::TEXT_CMD, termination, rem);
 429                 if (!cnvtd.empty()) {
 430                         // it did, so we'll take that bit and proceed with what's left
 431                         ret += cnvtd;
 432                         val = rem;
 433                         continue;
 434                 }
 435                 // it's a command of some sort
 436                 scanning_cmd = true;
 437                 escaped = true;
 438                 val = val.substr(1);
 439         }
 440         return ret;
 441 }
 442
 443
 444 // Escape '<' and '>' and remove richtext markers (e.g. {!this is richtext!}) from a string.
 445 docstring processRichtext(docstring const & str, bool richtext)
 446 {
 447         docstring val = str;
 448         docstring ret;
 449
 450         bool scanning_rich = false;
 451         while (!val.empty()) {
 452                 char_type const ch = val[0];
 453                 if (ch == '{' && val.size() > 1 && val[1] == '!') {
 454                         // beginning of rich text
 455                         scanning_rich = true;
 456                         val = val.substr(2);
 457                         continue;
 458                 }
 459                 if (scanning_rich && ch == '!' && val.size() > 1 && val[1] == '}') {
 460                         // end of rich text
 461                         scanning_rich = false;
 462                         val = val.substr(2);
 463                         continue;
 464                 }
 465                 if (richtext) {
 466                         if (scanning_rich)
 467                                 ret += ch;
 468                         else {
 469                                 // we need to escape '<' and '>'
 470                                 if (ch == '<')
 471                                         ret += "&lt;";
 472                                 else if (ch == '>')
 473                                         ret += "&gt;";
 474                                 else
 475                                         ret += ch;
 476                         }
 477                 } else if (!scanning_rich /* && !richtext */)
 478                         ret += ch;
 479                 // else the character is discarded, which will happen only if
 480                 // richtext == false and we are scanning rich text
 481                 val = val.substr(1);
 482         }
 483         return ret;
 484 }
 485
 486 } // namespace
 487
 488
 489 //////////////////////////////////////////////////////////////////////
 490 //
 491 // BibTeXInfo
 492 //
 493 //////////////////////////////////////////////////////////////////////
 494
 495 BibTeXInfo::BibTeXInfo(docstring const & key, docstring const & type)
 496         : is_bibtex_(true), bib_key_(key), num_bib_key_(0), entry_type_(type),
 497           info_(), format_(), modifier_(0)
 498 {}
 499
 500
 501
 502 docstring const BibTeXInfo::getAuthorOrEditorList(Buffer const * buf,
 503                                           bool full, bool forceshort) const
 504 {
 505         docstring author = operator[]("author");
 506         if (author.empty())
 507                 author = operator[]("editor");
 508
 509         return getAuthorList(buf, author, full, forceshort);
 510 }
 511
 512
 513 docstring const BibTeXInfo::getAuthorList(Buffer const * buf,
 514                 docstring const & author, bool const full, bool const forceshort,
 515                 bool const allnames, bool const beginning) const
 516 {
 517         // Maxnames treshold depend on engine
 518         size_t maxnames = buf ?
 519                 buf->params().documentClass().max_citenames() : 2;
 520
 521         if (!is_bibtex_) {
 522                 docstring const opt = label();
 523                 if (opt.empty())
 524                         return docstring();
 525
 526                 docstring authors;
 527                 docstring const remainder = trim(split(opt, authors, '('));
 528                 if (remainder.empty())
 529                         // in this case, we didn't find a "(",
 530                         // so we don't have author (year)
 531                         return docstring();
 532                 if (full) {
 533                         // Natbib syntax is "Jones et al.(1990)Jones, Baker, and Williams"
 534                         docstring const fullauthors = trim(rsplit(remainder, ')'));
 535                         if (!fullauthors.empty())
 536                                 return fullauthors;
 537                 }
 538                 return authors;
 539         }
 540
 541         if (author.empty())
 542                 return author;
 543
 544         // OK, we've got some names. Let's format them.
 545         // Try to split the author list
 546         vector<docstring> const authors = getAuthors(author);
 547
 548         docstring retval;
 549
 550         CiteEngineType const engine_type = buf ? buf->params().citeEngineType()
 551                                                : ENGINE_TYPE_DEFAULT;
 552
 553         // These are defined in the styles
 554         string const etal =
 555                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_etal")
 556                     : " et al.";
 557         string const namesep =
 558                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_namesep")
 559                    : ", ";
 560         string const lastnamesep =
 561                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_lastnamesep")
 562                     : ", and ";
 563         string const pairnamesep =
 564                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_pairnamesep")
 565                      : " and ";
 566         string firstnameform =
 567                         buf ? buf->params().documentClass().getCiteMacro(engine_type, "!firstnameform")
 568                              : "{%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}{%prename%[[, %prename%]]}";
 569         if (!beginning)
 570                 firstnameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!firstbynameform")
 571                                              : "%prename% {%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}";
 572         string othernameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!othernameform")
 573                              : "{%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}{%prename%[[, %prename%]]}";
 574         if (!beginning)
 575                 othernameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!otherbynameform")
 576                                              : "%prename% {%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}";
 577         string citenameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!citenameform")
 578                              : "{%prefix%[[%prefix% ]]}%surname%";
 579
 580         // Shorten the list (with et al.) if forceshort is set
 581         // and the list can actually be shortened, else if maxcitenames
 582         // is passed and full is not set.
 583         bool shorten = forceshort && authors.size() > 1;
 584         vector<docstring>::const_iterator it = authors.begin();
 585         vector<docstring>::const_iterator en = authors.end();
 586         for (size_t i = 0; it != en; ++it, ++i) {
 587                 if (i >= maxnames && !full) {
 588                         shorten = true;
 589                         break;
 590                 }
 591                 if (*it == "others") {
 592                         retval += buf ? buf->B_(etal) : from_ascii(etal);
 593                         break;
 594                 }
 595                 if (i > 0 && i == authors.size() - 1) {
 596                         if (authors.size() == 2)
 597                                 retval += buf ? buf->B_(pairnamesep) : from_ascii(pairnamesep);
 598                         else
 599                                 retval += buf ? buf->B_(lastnamesep) : from_ascii(lastnamesep);
 600                 } else if (i > 0)
 601                         retval += buf ? buf->B_(namesep) : from_ascii(namesep);
 602                 if (allnames)
 603                         retval += (i == 0) ? constructName(*it, firstnameform)
 604                                 : constructName(*it, othernameform);
 605                 else
 606                         retval += constructName(*it, citenameform);
 607         }
 608         if (shorten) {
 609                 if (allnames)
 610                         retval = constructName(authors[0], firstnameform) + (buf ? buf->B_(etal) : from_ascii(etal));
 611                 else
 612                         retval = constructName(authors[0], citenameform) + (buf ? buf->B_(etal) : from_ascii(etal));
 613         }
 614
 615         return convertLaTeXCommands(retval);
 616 }
 617
 618
 619 docstring const BibTeXInfo::getYear() const
 620 {
 621         if (is_bibtex_) {
 622                 // first try legacy year field
 623                 docstring year = operator[]("year");
 624                 if (!year.empty())
 625                         return year;
 626                 // now try biblatex's date field
 627                 year = operator[]("date");
 628                 // Format is [-]YYYY-MM-DD*/[-]YYYY-MM-DD*
 629                 // We only want the years.
 630                 static regex const yreg("[-]?([\\d]{4}).*");
 631                 static regex const ereg(".*/[-]?([\\d]{4}).*");
 632                 smatch sm;
 633                 string const date = to_utf8(year);
 634                 if (!regex_match(date, sm, yreg))
 635                         // cannot parse year.
 636                         return docstring();
 637                 year = from_ascii(sm[1]);
 638                 // check for an endyear
 639                 if (regex_match(date, sm, ereg))
 640                         year += char_type(0x2013) + from_ascii(sm[1]);
 641                 return year;
 642         }
 643
 644         docstring const opt = label();
 645         if (opt.empty())
 646                 return docstring();
 647
 648         docstring authors;
 649         docstring tmp = split(opt, authors, '(');
 650         if (tmp.empty())
 651                 // we don't have author (year)
 652                 return docstring();
 653         docstring year;
 654         tmp = split(tmp, year, ')');
 655         return year;
 656 }
 657
 658
 659 void BibTeXInfo::getLocators(docstring & doi, docstring & url, docstring & file) const
 660 {
 661         if (is_bibtex_) {
 662                 // get "doi" entry from citation record
 663                 doi = operator[]("doi");
 664                 if (!doi.empty() && !prefixIs(doi,from_ascii("http")))
 665                         doi = "https://doi.org/" + doi;
 666                 // get "url" entry from citation record
 667                 url = operator[]("url");
 668                 // get "file" entry from citation record
 669                 file = operator[]("file");
 670
 671                 // Jabref case, field has a format:
 672                 // Description:Location:Filetype;Description:Location:Filetype...
 673                 // We will grab only first pdf
 674                 if (!file.empty()) {
 675                         docstring ret, filedest, tmp;
 676                         ret = split(file, tmp, ':');
 677                         tmp = split(ret, filedest, ':');
 678                         //TODO howto deal with relative directories?
 679                         FileName f(to_utf8(filedest));
 680                         if (f.exists())
 681                                 file = "file:///" + filedest;
 682                 }
 683
 684                 // kbibtex case, format:
 685                 // file1.pdf;file2.pdf
 686                 // We will grab only first pdf
 687                 docstring kfile;
 688                 if (file.empty())
 689                         kfile = operator[]("localfile");
 690                 if (!kfile.empty()) {
 691                         docstring filedest, tmp;
 692                         tmp = split(kfile, filedest, ';');
 693                         //TODO howto deal with relative directories?
 694                         FileName f(to_utf8(filedest));
 695                         if (f.exists())
 696                                 file = "file:///" + filedest;
 697                 }
 698
 699                 if (!url.empty())
 700                         return;
 701
 702                 // try biblatex specific fields, see its manual
 703                 // 3.13.7 "Electronic Publishing Informationl"
 704                 docstring eprinttype = operator[]("eprinttype");
 705                 docstring eprint = operator[]("eprint");
 706                 if (eprint.empty())
 707                         return;
 708
 709                 if (eprinttype == "arxiv")
 710                         url = "https://arxiv.org/abs/" + eprint;
 711                 if (eprinttype == "jstor")
 712                         url = "https://www.jstor.org/stable/" + eprint;
 713                 if (eprinttype == "pubmed")
 714                         url = "http://www.ncbi.nlm.nih.gov/pubmed/" + eprint;
 715                 if (eprinttype == "hdl")
 716                         url = "https://hdl.handle.net/" + eprint;
 717                 if (eprinttype == "googlebooks")
 718                         url = "http://books.google.com/books?id=" + eprint;
 719
 720                 return;
 721         }
 722
 723         // Here can be handled the bibliography environment. All one could do
 724         // here is let LyX scan the entry for URL or HRef insets.
 725 }
 726
 727
 728 namespace {
 729
// Forward declaration. parseOptions, getClause and parseEmbeddedOption
// call one another (parseOptions -> getClause -> parseEmbeddedOption ->
// parseOptions), so parseOptions has to be declared before its definition.
docstring parseOptions(docstring const & format, string & optkey,
                    docstring & ifpart, docstring & elsepart);
 732
 733 // Calls parseOptions to deal with an embedded option, such as:
 734 //   {%number%[[, no.~%number%]]}
 735 // which must appear at the start of format. ifelsepart gets the
 736 // whole of the option, and we return what's left after the option.
 737 // we return format if there is an error.
 738 docstring parseEmbeddedOption(docstring const & format, docstring & ifelsepart)
 739 {
 740         LASSERT(format[0] == '{' && format[1] == '%', return format);
 741         string optkey;
 742         docstring ifpart;
 743         docstring elsepart;
 744         docstring const rest = parseOptions(format, optkey, ifpart, elsepart);
 745         if (format == rest) { // parse error
 746                 LYXERR0("ERROR! Couldn't parse `" << format <<"'.");
 747                 return format;
 748         }
 749         LASSERT(rest.size() <= format.size(),
 750                 { ifelsepart = docstring(); return format; });
 751         ifelsepart = format.substr(0, format.size() - rest.size());
 752         return rest;
 753 }
 754
 755
 756 // Gets a "clause" from a format string, where the clause is
 757 // delimited by '[[' and ']]'. Returns what is left after the
 758 // clause is removed, and returns format if there is an error.
 759 docstring getClause(docstring const & format, docstring & clause)
 760 {
 761         docstring fmt = format;
 762         // remove '[['
 763         fmt = fmt.substr(2);
 764         // we'll remove characters from the front of fmt as we
 765         // deal with them
 766         while (!fmt.empty()) {
 767                 if (fmt[0] == ']' && fmt.size() > 1 && fmt[1] == ']') {
 768                         // that's the end
 769                         fmt = fmt.substr(2);
 770                         break;
 771                 }
 772                 // check for an embedded option
 773                 if (fmt[0] == '{' && fmt.size() > 1 && fmt[1] == '%') {
 774                         docstring part;
 775                         docstring const rest = parseEmbeddedOption(fmt, part);
 776                         if (fmt == rest) {
 777                                 LYXERR0("ERROR! Couldn't parse embedded option in `" << format <<"'.");
 778                                 return format;
 779                         }
 780                         clause += part;
 781                         fmt = rest;
 782                 } else { // it's just a normal character
 783                                 clause += fmt[0];
 784                                 fmt = fmt.substr(1);
 785                 }
 786         }
 787         return fmt;
 788 }
 789
 790
 791 // parse an options string, which must appear at the start of the
 792 // format parameter. puts the parsed bits in optkey, ifpart, and
 793 // elsepart and returns what's left after the option is removed.
 794 // if there's an error, it returns format itself.
 795 docstring parseOptions(docstring const & format, string & optkey,
 796                     docstring & ifpart, docstring & elsepart)
 797 {
 798         LASSERT(format[0] == '{' && format[1] == '%', return format);
 799         // strip '{%'
 800         docstring fmt = format.substr(2);
 801         size_t pos = fmt.find('%'); // end of key
 802         if (pos == string::npos) {
 803                 LYXERR0("Error parsing  `" << format <<"'. Can't find end of key.");
 804                 return format;
 805         }
 806         optkey = to_utf8(fmt.substr(0, pos));
 807         fmt = fmt.substr(pos + 1);
 808         // [[format]] should be next
 809         if (fmt[0] != '[' || fmt[1] != '[') {
 810                 LYXERR0("Error parsing  `" << format <<"'. Can't find '[[' after key.");
 811                 return format;
 812         }
 813
 814         docstring curfmt = fmt;
 815         fmt = getClause(curfmt, ifpart);
 816         if (fmt == curfmt) {
 817                 LYXERR0("Error parsing  `" << format <<"'. Couldn't get if clause.");
 818                 return format;
 819         }
 820
 821         if (fmt[0] == '}') // we're done, no else clause
 822                 return fmt.substr(1);
 823
 824         // else part should follow
 825         if (fmt[0] != '[' || fmt[1] != '[') {
 826                 LYXERR0("Error parsing  `" << format <<"'. Can't find else clause.");
 827                 return format;
 828         }
 829
 830         curfmt = fmt;
 831         fmt = getClause(curfmt, elsepart);
 832         // we should be done
 833         if (fmt == curfmt || fmt[0] != '}') {
 834                 LYXERR0("Error parsing  `" << format <<"'. Can't find end of option.");
 835                 return format;
 836         }
 837         return fmt.substr(1);
 838 }
 839
 840
 841 } // namespace
 842
 843 /* FIXME
 844 Bug #9131 revealed an oddity in how we are generating citation information
 845 when more than one key is given. We end up building a longer and longer format
 846 string as we go, which we then have to re-parse, over and over and over again,
 847 rather than generating the information for the individual keys and then putting
 848 all of that together. We do that to deal with the way separators work, from what
 849 I can tell, but it still feels like a hack. Fixing this would require quite a
 850 bit of work, however.
 851 */
// Expands a citation format string for this entry and returns the result.
//
// The format is consumed from the front. The following constructs are
// recognized:
//   %key%   replaced by a value. Keys starting with '!' are macros whose
//           definition is put back at the front of the format and parsed
//           again; keys starting with "B_" or '_' are looked up as macros
//           and translated; any other key is resolved by getValueForKey().
//   {%key%[[if]][[else]]}
//           an option, split up by parseOptions(); see below for how the
//           clause to be used is chosen.
//   {! ... !}
//           rich text; the markers are copied to the output.
// Everything else is copied to the output unchanged.
//
// \param format   the format string to expand
// \param xrefs    cross-referenced entries, handed on to getValueForKey()
// \param counter  incremented for each macro expansion; once it exceeds
//                 max_passes the expansion is aborted
// \param buf      supplies the cite engine type, the document class and
//                 the language
// \param ci       citation context; supplies max_key_size here and is
//                 handed on to getValueForKey()
// \param next     if true, the if-part of a {%next%[[...]]} option is
//                 output without being expanded
// \param second   if true, the if-part of a {%second%[[...]]} option is
//                 expanded
// \return the expanded string, or the translated string "ERROR!" if the
//         format could not be parsed or the pass limit was exceeded.
docstring BibTeXInfo::expandFormat(docstring const & format,
                BibTeXInfoList const & xrefs, int & counter, Buffer const & buf,
                CiteItem const & ci, bool next, bool second) const
{
        // incorrect use of macros could put us in an infinite loop
        static int const max_passes = 5000;
        // the use of overly large keys can lead to performance problems, due
        // to eventual attempts to convert LaTeX macros to unicode. See bug
        // #8944. By default, the size is limited to 128 (in CiteItem), but
        // for specific purposes (such as XHTML export), it needs to be enlarged
        // This is perhaps not the best solution, but it will have to do for now.
        size_t const max_keysize = ci.max_key_size;
        odocstringstream ret; // return value
        // the key currently being collected between two '%'
        string key;
        // true while we are between the two '%' of a key
        bool scanning_key = false;
        // true while we are between '{!' and '!}'
        bool scanning_rich = false;

        CiteEngineType const engine_type = buf.params().citeEngineType();
        docstring fmt = format;
        // we'll remove characters from the front of fmt as we
        // deal with them
        while (!fmt.empty()) {
                if (counter > max_passes) {
                        LYXERR0("Recursion limit reached while parsing `"
                                << format << "'.");
                        return _("ERROR!");
                }

                char_type thischar = fmt[0];
                if (thischar == '%') {
                        // beginning or end of key
                        if (scanning_key) {
                                // end of key
                                scanning_key = false;
                                // so we replace the key with its value, which may be empty
                                if (key[0] == '!') {
                                        // macro: its definition replaces the closing '%'
                                        // at the front of fmt and is parsed in turn
                                        string const val =
                                                buf.params().documentClass().getCiteMacro(engine_type, key);
                                        fmt = from_utf8(val) + fmt.substr(1);
                                        counter += 1;
                                        continue;
                                } else if (prefixIs(key, "B_")) {
                                        // a translatable bit (to the Buffer language)
                                        string const val =
                                                buf.params().documentClass().getCiteMacro(engine_type, key);
                                        docstring const trans =
                                                translateIfPossible(from_utf8(val), buf.params().language->code());
                                        ret << trans;
                                } else if (key[0] == '_') {
                                        // a translatable bit (to the GUI language)
                                        string const val =
                                                buf.params().documentClass().getCiteMacro(engine_type, key);
                                        docstring const trans =
                                                translateIfPossible(from_utf8(val));
                                        ret << trans;
                                } else {
                                        // an ordinary key; outside of rich text its value
                                        // is wrapped in a rich text span named after the key
                                        docstring const val =
                                                getValueForKey(key, buf, ci, xrefs, max_keysize);
                                        if (!scanning_rich)
                                                ret << from_ascii("{!<span class=\"bib-" + key + "\">!}");
                                        ret << val;
                                        if (!scanning_rich)
                                                ret << from_ascii("{!</span>!}");
                                }
                        } else {
                                // beginning of key
                                key.clear();
                                scanning_key = true;
                        }
                }
                else if (thischar == '{') {
                        // beginning of option?
                        if (scanning_key) {
                                LYXERR0("ERROR: Found `{' when scanning key in `" << format << "'.");
                                return _("ERROR!");
                        }
                        if (fmt.size() > 1) {
                                if (fmt[1] == '%') {
                                        // it is the beginning of an optional format
                                        string optkey;
                                        docstring ifpart;
                                        docstring elsepart;
                                        docstring const newfmt =
                                                parseOptions(fmt, optkey, ifpart, elsepart);
                                        if (newfmt == fmt) // parse error
                                                return _("ERROR!");
                                        fmt = newfmt;
                                        docstring const val =
                                                getValueForKey(optkey, buf, ci, xrefs);
                                        // The clauses are expanded with a counter of their
                                        // own. The `second' argument is not passed on, so
                                        // these calls rely on its default value.
                                        if (optkey == "next" && next)
                                                ret << ifpart; // without expansion
                                        else if (optkey == "second" && second) {
                                                int newcounter = 0;
                                                ret << expandFormat(ifpart, xrefs, newcounter, buf,
                                                        ci, next);
                                        } else if (!val.empty()) {
                                                int newcounter = 0;
                                                ret << expandFormat(ifpart, xrefs, newcounter, buf,
                                                        ci, next);
                                        } else if (!elsepart.empty()) {
                                                int newcounter = 0;
                                                ret << expandFormat(elsepart, xrefs, newcounter, buf,
                                                        ci, next);
                                        }
                                        // fmt will have been shortened for us already
                                        continue;
                                }
                                if (fmt[1] == '!') {
                                        // beginning of rich text
                                        scanning_rich = true;
                                        fmt = fmt.substr(2);
                                        ret << from_ascii("{!");
                                        continue;
                                }
                        }
                        // we are here if '{' was not followed by % or !.
                        // So it's just a character.
                        ret << thischar;
                }
                else if (scanning_rich && thischar == '!'
                         && fmt.size() > 1 && fmt[1] == '}') {
                        // end of rich text
                        scanning_rich = false;
                        fmt = fmt.substr(2);
                        ret << from_ascii("!}");
                        continue;
                }
                else if (scanning_key)
                        // one more character of the key name
                        key += char(thischar);
                else {
                        // a literal character; if it cannot be encoded it is
                        // reported and otherwise skipped
                        try {
                                ret.put(thischar);
                        } catch (EncodingException & /* e */) {
                                LYXERR0("Uncodable character '" << docstring(1, thischar) << " in citation label!");
                        }
                }
                // drop the character we have just dealt with
                fmt = fmt.substr(1);
        } // while loop
        if (scanning_key) {
                LYXERR0("Never found end of key in `" << format << "'!");
                return _("ERROR!");
        }
        if (scanning_rich) {
                LYXERR0("Never found end of rich text in `" << format << "'!");
                return _("ERROR!");
        }
        return ret.str();
}
1001
1002
1003 docstring const & BibTeXInfo::getInfo(BibTeXInfoList const & xrefs,
1004         Buffer const & buf, CiteItem const & ci, docstring const & format_in) const
1005 {
1006         bool const richtext = ci.richtext;
1007
1008         CiteEngineType const engine_type = buf.params().citeEngineType();
1009         DocumentClass const & dc = buf.params().documentClass();
1010         docstring const & format = format_in.empty()?
1011                                 from_utf8(dc.getCiteFormat(engine_type, to_utf8(entry_type_)))
1012                               : format_in;
1013
1014         if (format != format_) {
1015                 // clear caches since format changed
1016                 info_.clear();
1017                 info_richtext_.clear();
1018                 format_ = format;
1019         }
1020
1021         if (!richtext && !info_.empty()) {
1022                 info_ = convertLaTeXCommands(processRichtext(info_, false));
1023                 return info_;
1024         }
1025         if (richtext && !info_richtext_.empty())
1026                 return info_richtext_;
1027
1028         if (!is_bibtex_) {
1029                 BibTeXInfo::const_iterator it = find(from_ascii("ref"));
1030                 info_ = it->second;
1031                 return info_;
1032         }
1033
1034         int counter = 0;
1035         info_ = expandFormat(format, xrefs, counter, buf,
1036                 ci, false, false);
1037
1038         if (info_.empty()) {
1039                 // this probably shouldn't happen
1040                 return info_;
1041         }
1042
1043         if (richtext) {
1044                 info_richtext_ = convertLaTeXCommands(processRichtext(info_, true));
1045                 return info_richtext_;
1046         }
1047
1048         info_ = convertLaTeXCommands(processRichtext(info_, false));
1049         return info_;
1050 }
1051
1052
1053 docstring const BibTeXInfo::getLabel(BibTeXInfoList const & xrefs,
1054         Buffer const & buf, docstring const & format,
1055         CiteItem const & ci, bool next, bool second) const
1056 {
1057         docstring loclabel;
1058
1059         int counter = 0;
1060         loclabel = expandFormat(format, xrefs, counter, buf, ci, next, second);
1061
1062         if (!loclabel.empty() && !next) {
1063                 loclabel = processRichtext(loclabel, ci.richtext);
1064                 loclabel = convertLaTeXCommands(loclabel);
1065         }
1066
1067         return loclabel;
1068 }
1069
1070
1071 docstring const & BibTeXInfo::operator[](docstring const & field) const
1072 {
1073         BibTeXInfo::const_iterator it = find(field);
1074         if (it != end())
1075                 return it->second;
1076         static docstring const empty_value = docstring();
1077         return empty_value;
1078 }
1079
1080
1081 docstring const & BibTeXInfo::operator[](string const & field) const
1082 {
1083         return operator[](from_ascii(field));
1084 }
1085
1086
1087 docstring BibTeXInfo::getValueForKey(string const & oldkey, Buffer const & buf,
1088         CiteItem const & ci, BibTeXInfoList const & xrefs, size_t maxsize) const
1089 {
1090         // anything less is pointless
1091         LASSERT(maxsize >= 16, maxsize = 16);
1092         string key = oldkey;
1093         bool cleanit = false;
1094         if (prefixIs(oldkey, "clean:")) {
1095                 key = oldkey.substr(6);
1096                 cleanit = true;
1097         }
1098
1099         docstring ret = operator[](key);
1100         if (ret.empty() && !xrefs.empty()) {
1101                 // xr is a (reference to a) BibTeXInfo const *
1102                 for (auto const & xr : xrefs) {
1103                         if (xr && !(*xr)[key].empty()) {
1104                                 ret = (*xr)[key];
1105                                 break;
1106                         }
1107                 }
1108         }
1109         if (ret.empty()) {
1110                 // some special keys
1111                 // FIXME: dialog, textbefore and textafter have nothing to do with this
1112                 if (key == "dialog" && ci.context == CiteItem::Dialog)
1113                         ret = from_ascii("x"); // any non-empty string will do
1114                 else if (key == "export" && ci.context == CiteItem::Export)
1115                         ret = from_ascii("x"); // any non-empty string will do
1116                 else if (key == "ifstar" && ci.Starred)
1117                         ret = from_ascii("x"); // any non-empty string will do
1118                 else if (key == "ifqualified" && ci.isQualified)
1119                         ret = from_ascii("x"); // any non-empty string will do
1120                 else if (key == "entrytype")
1121                         ret = entry_type_;
1122                 else if (prefixIs(key, "ifentrytype:")
1123                          && from_ascii(key.substr(12)) == entry_type_)
1124                         ret = from_ascii("x"); // any non-empty string will do
1125                 else if (key == "key")
1126                         ret = bib_key_;
1127                 else if (key == "label")
1128                         ret = label_;
1129                 else if (key == "modifier" && modifier_ != 0)
1130                         ret = modifier_;
1131                 else if (key == "numericallabel")
1132                         ret = cite_number_;
1133                 else if (prefixIs(key, "ifmultiple:")) {
1134                         // Return whether we have multiple authors
1135                         docstring const kind = operator[](from_ascii(key.substr(11)));
1136                         if (multipleAuthors(kind))
1137                                 ret = from_ascii("x"); // any non-empty string will do
1138                 }
1139                 else if (prefixIs(key, "abbrvnames:")) {
1140                         // Special key to provide abbreviated name list,
1141                         // with respect to maxcitenames. Suitable for Bibliography
1142                         // beginnings.
1143                         docstring const kind = operator[](from_ascii(key.substr(11)));
1144                         ret = getAuthorList(&buf, kind, false, false, true);
1145                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1146                                 ret[0] = uppercase(ret[0]);
1147                 } else if (prefixIs(key, "fullnames:")) {
1148                         // Return a full name list. Suitable for Bibliography
1149                         // beginnings.
1150                         docstring const kind = operator[](from_ascii(key.substr(10)));
1151                         ret = getAuthorList(&buf, kind, true, false, true);
1152                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1153                                 ret[0] = uppercase(ret[0]);
1154                 } else if (prefixIs(key, "forceabbrvnames:")) {
1155                         // Special key to provide abbreviated name lists,
1156                         // irrespective of maxcitenames. Suitable for Bibliography
1157                         // beginnings.
1158                         docstring const kind = operator[](from_ascii(key.substr(15)));
1159                         ret = getAuthorList(&buf, kind, false, true, true);
1160                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1161                                 ret[0] = uppercase(ret[0]);
1162                 } else if (prefixIs(key, "abbrvbynames:")) {
1163                         // Special key to provide abbreviated name list,
1164                         // with respect to maxcitenames. Suitable for further names inside a
1165                         // bibliography item // (such as "ed. by ...")
1166                         docstring const kind = operator[](from_ascii(key.substr(11)));
1167                         ret = getAuthorList(&buf, kind, false, false, true, false);
1168                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1169                                 ret[0] = uppercase(ret[0]);
1170                 } else if (prefixIs(key, "fullbynames:")) {
1171                         // Return a full name list. Suitable for further names inside a
1172                         // bibliography item // (such as "ed. by ...")
1173                         docstring const kind = operator[](from_ascii(key.substr(10)));
1174                         ret = getAuthorList(&buf, kind, true, false, true, false);
1175                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1176                                 ret[0] = uppercase(ret[0]);
1177                 } else if (prefixIs(key, "forceabbrvbynames:")) {
1178                         // Special key to provide abbreviated name lists,
1179                         // irrespective of maxcitenames. Suitable for further names inside a
1180                         // bibliography item // (such as "ed. by ...")
1181                         docstring const kind = operator[](from_ascii(key.substr(15)));
1182                         ret = getAuthorList(&buf, kind, false, true, true, false);
1183                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1184                                 ret[0] = uppercase(ret[0]);
1185                 } else if (key == "abbrvciteauthor") {
1186                         // Special key to provide abbreviated author or
1187                         // editor names (suitable for citation labels),
1188                         // with respect to maxcitenames.
1189                         ret = getAuthorOrEditorList(&buf, false, false);
1190                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1191                                 ret[0] = uppercase(ret[0]);
1192                 } else if (key == "fullciteauthor") {
1193                         // Return a full author or editor list (for citation labels)
1194                         ret = getAuthorOrEditorList(&buf, true, false);
1195                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1196                                 ret[0] = uppercase(ret[0]);
1197                 } else if (key == "forceabbrvciteauthor") {
1198                         // Special key to provide abbreviated author or
1199                         // editor names (suitable for citation labels),
1200                         // irrespective of maxcitenames.
1201                         ret = getAuthorOrEditorList(&buf, false, true);
1202                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1203                                 ret[0] = uppercase(ret[0]);
1204                 } else if (key == "bibentry") {
1205                         // Special key to provide the full bibliography entry: see getInfo()
1206                         CiteEngineType const engine_type = buf.params().citeEngineType();
1207                         DocumentClass const & dc = buf.params().documentClass();
1208                         docstring const & format =
1209                                 from_utf8(dc.getCiteFormat(engine_type, to_utf8(entry_type_), false));
1210                         int counter = 0;
1211                         ret = expandFormat(format, xrefs, counter, buf, ci, false, false);
1212                 } else if (key == "textbefore")
1213                         ret = ci.textBefore;
1214                 else if (key == "textafter")
1215                         ret = ci.textAfter;
1216                 else if (key == "curpretext") {
1217                         vector<pair<docstring, docstring>> pres = ci.getPretexts();
1218                         vector<pair<docstring, docstring>>::iterator it = pres.begin();
1219                         int numkey = 1;
1220                         for (; it != pres.end() ; ++it) {
1221                                 if ((*it).first == bib_key_ && numkey == num_bib_key_) {
1222                                         ret = (*it).second;
1223                                         pres.erase(it);
1224                                         break;
1225                                 }
1226                                 if ((*it).first == bib_key_)
1227                                         ++numkey;
1228                         }
1229                 } else if (key == "curposttext") {
1230                         vector<pair<docstring, docstring>> posts = ci.getPosttexts();
1231                         vector<pair<docstring, docstring>>::iterator it = posts.begin();
1232                         int numkey = 1;
1233                         for (; it != posts.end() ; ++it) {
1234                                 if ((*it).first == bib_key_ && numkey == num_bib_key_) {
1235                                         ret = (*it).second;
1236                                         posts.erase(it);
1237                                         break;
1238                                 }
1239                                 if ((*it).first == bib_key_)
1240                                         ++numkey;
1241                         }
1242                 } else if (key == "year")
1243                         ret = getYear();
1244         }
1245
1246         if (cleanit)
1247                 ret = xml::cleanAttr(ret);
1248
1249         // make sure it is not too big
1250         support::truncateWithEllipsis(ret, maxsize);
1251         return ret;
1252 }
1253
1254
1255 //////////////////////////////////////////////////////////////////////
1256 //
1257 // BiblioInfo
1258 //
1259 //////////////////////////////////////////////////////////////////////
1260
1261 namespace {
1262
1263 // A functor for use with sort, leading to case insensitive sorting
1264 bool compareNoCase(const docstring & a, const docstring & b) {
1265         return compare_no_case(a, b) < 0;
1266 }
1267
1268 } // namespace
1269
1270
1271 vector<docstring> const BiblioInfo::getXRefs(BibTeXInfo const & data, bool const nested) const
1272 {
1273         vector<docstring> result;
1274         if (!data.isBibTeX())
1275                 return result;
1276         // Legacy crossref field. This is not nestable.
1277         if (!nested && !data["crossref"].empty()) {
1278                 docstring const xrefkey = data["crossref"];
1279                 result.push_back(xrefkey);
1280                 // However, check for nested xdatas
1281                 BiblioInfo::const_iterator it = find(xrefkey);
1282                 if (it != end()) {
1283                         BibTeXInfo const & xref = it->second;
1284                         vector<docstring> const nxdata = getXRefs(xref, true);
1285                         if (!nxdata.empty())
1286                                 result.insert(result.end(), nxdata.begin(), nxdata.end());
1287                 }
1288         }
1289         // Biblatex's xdata field. Infinitely nestable.
1290         // XData field can consist of a comma-separated list of keys
1291         vector<docstring> const xdatakeys = getVectorFromString(data["xdata"]);
1292         if (!xdatakeys.empty()) {
1293                 for (auto const & xdatakey : xdatakeys) {
1294                         result.push_back(xdatakey);
1295                         BiblioInfo::const_iterator it = find(xdatakey);
1296                         if (it != end()) {
1297                                 BibTeXInfo const & xdata = it->second;
1298                                 vector<docstring> const nxdata = getXRefs(xdata, true);
1299                                 if (!nxdata.empty())
1300                                         result.insert(result.end(), nxdata.begin(), nxdata.end());
1301                         }
1302                 }
1303         }
1304         return result;
1305 }
1306
1307
1308 vector<docstring> const BiblioInfo::getKeys() const
1309 {
1310         vector<docstring> bibkeys;
1311         for (auto const & bi : *this)
1312                 bibkeys.push_back(bi.first);
1313         sort(bibkeys.begin(), bibkeys.end(), &compareNoCase);
1314         return bibkeys;
1315 }
1316
1317
1318 vector<docstring> const BiblioInfo::getFields() const
1319 {
1320         vector<docstring> bibfields;
1321         for (auto const & fn : field_names_)
1322                 bibfields.push_back(fn);
1323         sort(bibfields.begin(), bibfields.end());
1324         return bibfields;
1325 }
1326
1327
1328 vector<docstring> const BiblioInfo::getEntries() const
1329 {
1330         vector<docstring> bibentries;
1331         for (auto const & et : entry_types_)
1332                 bibentries.push_back(et);
1333         sort(bibentries.begin(), bibentries.end());
1334         return bibentries;
1335 }
1336
1337
1338 docstring const BiblioInfo::getAuthorOrEditorList(docstring const & key, Buffer const & buf) const
1339 {
1340         BiblioInfo::const_iterator it = find(key);
1341         if (it == end())
1342                 return docstring();
1343         BibTeXInfo const & data = it->second;
1344         return data.getAuthorOrEditorList(&buf, false);
1345 }
1346
1347
1348 docstring const BiblioInfo::getCiteNumber(docstring const & key) const
1349 {
1350         BiblioInfo::const_iterator it = find(key);
1351         if (it == end())
1352                 return docstring();
1353         BibTeXInfo const & data = it->second;
1354         return data.citeNumber();
1355 }
1356
1357 void BiblioInfo::getLocators(docstring const & key, docstring & doi, docstring & url, docstring & file) const
1358 {
1359         BiblioInfo::const_iterator it = find(key);
1360          if (it == end())
1361                 return;
1362         BibTeXInfo const & data = it->second;
1363         data.getLocators(doi,url,file);
1364 }
1365
1366
1367 docstring const BiblioInfo::getYear(docstring const & key, bool use_modifier) const
1368 {
1369         BiblioInfo::const_iterator it = find(key);
1370         if (it == end())
1371                 return docstring();
1372         BibTeXInfo const & data = it->second;
1373         docstring year = data.getYear();
1374         if (year.empty()) {
1375                 // let's try the crossrefs
1376                 vector<docstring> const xrefs = getXRefs(data);
1377                 if (xrefs.empty())
1378                         // no luck
1379                         return docstring();
1380                 for (docstring const & xref : xrefs) {
1381                         BiblioInfo::const_iterator const xrefit = find(xref);
1382                         if (xrefit == end())
1383                                 continue;
1384                         BibTeXInfo const & xref_data = xrefit->second;
1385                         year = xref_data.getYear();
1386                         if (!year.empty())
1387                                 // success!
1388                                 break;
1389                 }
1390         }
1391         if (use_modifier && data.modifier() != 0)
1392                 year += data.modifier();
1393         return year;
1394 }
1395
1396
1397 docstring const BiblioInfo::getYear(docstring const & key, Buffer const & buf, bool use_modifier) const
1398 {
1399         docstring const year = getYear(key, use_modifier);
1400         if (year.empty())
1401                 return buf.B_("No year");
1402         return year;
1403 }
1404
1405
1406 docstring const BiblioInfo::getInfo(docstring const & key,
1407         Buffer const & buf, CiteItem const & ci, docstring const & format) const
1408 {
1409         BiblioInfo::const_iterator it = find(key);
1410         if (it == end())
1411                 return _("Bibliography entry not found!");
1412         BibTeXInfo const & data = it->second;
1413         BibTeXInfoList xrefptrs;
1414         for (docstring const & xref : getXRefs(data)) {
1415                 BiblioInfo::const_iterator const xrefit = find(xref);
1416                 if (xrefit != end())
1417                         xrefptrs.push_back(&(xrefit->second));
1418         }
1419         return data.getInfo(xrefptrs, buf, ci, format);
1420 }
1421
1422
1423 docstring const BiblioInfo::getLabel(vector<docstring> keys,
1424         Buffer const & buf, string const & style, CiteItem const & ci) const
1425 {
1426         size_t max_size = ci.max_size;
1427         // shorter makes no sense
1428         LASSERT(max_size >= 16, max_size = 16);
1429
1430         // we can't display more than 10 of these, anyway
1431         // but since we truncate in the middle,
1432         // we need to split into two halfs.
1433         bool const too_many_keys = keys.size() > 10;
1434         vector<docstring> lkeys;
1435         if (too_many_keys) {
1436                 lkeys.insert(lkeys.end(), keys.end() - 5, keys.end());
1437                 keys.resize(5);
1438                 keys.insert(keys.end(), lkeys.begin(), lkeys.end());
1439         }
1440
1441         CiteEngineType const engine_type = buf.params().citeEngineType();
1442         DocumentClass const & dc = buf.params().documentClass();
1443         docstring const & format = from_utf8(dc.getCiteFormat(engine_type, style, false, "cite"));
1444         docstring ret = format;
1445         vector<docstring>::const_iterator key = keys.begin();
1446         vector<docstring>::const_iterator ken = keys.end();
1447         vector<docstring> handled_keys;
1448         for (int i = 0; key != ken; ++key, ++i) {
1449                 handled_keys.push_back(*key);
1450                 int n = 0;
1451                 for (auto const & k : handled_keys) {
1452                         if (k == *key)
1453                                 ++n;
1454                 }
1455                 BiblioInfo::const_iterator it = find(*key);
1456                 BibTeXInfo empty_data;
1457                 empty_data.key(*key);
1458                 BibTeXInfo & data = empty_data;
1459                 vector<BibTeXInfo const *> xrefptrs;
1460                 if (it != end()) {
1461                         data = it->second;
1462                         for (docstring const & xref : getXRefs(data)) {
1463                                 BiblioInfo::const_iterator const xrefit = find(xref);
1464                                 if (xrefit != end())
1465                                         xrefptrs.push_back(&(xrefit->second));
1466                         }
1467                 }
1468                 data.numKey(n);
1469                 ret = data.getLabel(xrefptrs, buf, ret, ci, key + 1 != ken, i == 1);
1470         }
1471
1472         support::truncateWithEllipsis(ret, max_size, true);
1473
1474         return ret;
1475 }
1476
1477
1478 bool BiblioInfo::isBibtex(docstring const & key) const
1479 {
1480         docstring key1;
1481         split(key, key1, ',');
1482         BiblioInfo::const_iterator it = find(key1);
1483         if (it == end())
1484                 return false;
1485         return it->second.isBibTeX();
1486 }
1487
1488
1489 BiblioInfo::CiteStringMap const BiblioInfo::getCiteStrings(
1490         vector<docstring> const & keys, vector<CitationStyle> const & styles,
1491         Buffer const & buf, CiteItem const & ci) const
1492 {
1493         if (empty())
1494                 return vector<pair<docstring,docstring>>();
1495
1496         string style;
1497         CiteStringMap csm(styles.size());
1498         for (size_t i = 0; i != csm.size(); ++i) {
1499                 style = styles[i].name;
1500                 csm[i] = make_pair(from_ascii(style), getLabel(keys, buf, style, ci));
1501         }
1502
1503         return csm;
1504 }
1505
1506
1507 void BiblioInfo::mergeBiblioInfo(BiblioInfo const & info)
1508 {
1509         bimap_.insert(info.begin(), info.end());
1510         field_names_.insert(info.field_names_.begin(), info.field_names_.end());
1511         entry_types_.insert(info.entry_types_.begin(), info.entry_types_.end());
1512 }
1513
1514
1515 namespace {
1516
1517 // used in xhtml to sort a list of BibTeXInfo objects
1518 bool lSorter(BibTeXInfo const * lhs, BibTeXInfo const * rhs)
1519 {
1520         docstring const lauth = lhs->getAuthorOrEditorList();
1521         docstring const rauth = rhs->getAuthorOrEditorList();
1522         docstring const lyear = lhs->getYear();
1523         docstring const ryear = rhs->getYear();
1524         docstring const ltitl = lhs->operator[]("title");
1525         docstring const rtitl = rhs->operator[]("title");
1526         return  (lauth < rauth)
1527                 || (lauth == rauth && lyear < ryear)
1528                 || (lauth == rauth && lyear == ryear && ltitl < rtitl);
1529 }
1530
1531 } // namespace
1532
1533
1534 void BiblioInfo::collectCitedEntries(Buffer const & buf)
1535 {
1536         cited_entries_.clear();
1537         // We are going to collect all the citation keys used in the document,
1538         // getting them from the TOC.
1539         // FIXME We may want to collect these differently, in the first case,
1540         // so that we might have them in order of appearance.
1541         set<docstring> citekeys;
1542         Toc const & toc = *buf.tocBackend().toc("citation");
1543         for (auto const & t : toc) {
1544                 if (t.str().empty())
1545                         continue;
1546                 vector<docstring> const keys = getVectorFromString(t.str());
1547                 citekeys.insert(keys.begin(), keys.end());
1548         }
1549         if (citekeys.empty())
1550                 return;
1551
1552         // We have a set of the keys used in this document.
1553         // We will now convert it to a list of the BibTeXInfo objects used in
1554         // this document...
1555         vector<BibTeXInfo const *> bi;
1556         for (auto const & ck : citekeys) {
1557                 BiblioInfo::const_iterator const bt = find(ck);
1558                 if (bt == end() || !bt->second.isBibTeX())
1559                         continue;
1560                 bi.push_back(&(bt->second));
1561         }
1562         // ...and sort it.
1563         sort(bi.begin(), bi.end(), lSorter);
1564
1565         // Now we can write the sorted keys
1566         // b is a BibTeXInfo const *
1567         for (auto const & b : bi)
1568                 cited_entries_.push_back(b->key());
1569 }
1570
1571
1572 void BiblioInfo::makeCitationLabels(Buffer const & buf)
1573 {
1574         collectCitedEntries(buf);
1575         CiteEngineType const engine_type = buf.params().citeEngineType();
1576         bool const numbers = (engine_type & ENGINE_TYPE_NUMERICAL);
1577
1578         int keynumber = 0;
1579         char modifier = 0;
1580         // used to remember the last one we saw
1581         // we'll be comparing entries to see if we need to add
1582         // modifiers, like "1984a"
1583         map<docstring, BibTeXInfo>::iterator last = bimap_.end();
1584
1585         // add letters to years
1586         for (auto const & ce : cited_entries_) {
1587                 map<docstring, BibTeXInfo>::iterator const biit = bimap_.find(ce);
1588                 // this shouldn't happen, but...
1589                 if (biit == bimap_.end())
1590                         // ...fail gracefully, anyway.
1591                         continue;
1592                 BibTeXInfo & entry = biit->second;
1593                 if (numbers) {
1594                         docstring const num = convert<docstring>(++keynumber);
1595                         entry.setCiteNumber(num);
1596                 } else {
1597                         // The first test here is checking whether this is the first
1598                         // time through the loop. If so, then we do not have anything
1599                         // with which to compare.
1600                         if (last != bimap_.end()
1601                             && entry.getAuthorOrEditorList() == last->second.getAuthorOrEditorList()
1602                             // we access the year via getYear() so as to get it from the xref,
1603                             // if we need to do so
1604                             && getYear(entry.key()) == getYear(last->second.key())) {
1605                                 if (modifier == 0) {
1606                                         // so the last one should have been 'a'
1607                                         last->second.setModifier('a');
1608                                         modifier = 'b';
1609                                 } else if (modifier == 'z')
1610                                         modifier = 'A';
1611                                 else
1612                                         modifier++;
1613                         } else {
1614                                 modifier = 0;
1615                         }
1616                         entry.setModifier(modifier);
1617                         // remember the last one
1618                         last = biit;
1619                 }
1620         }
1621         // Set the labels
1622         for (auto const & ce : cited_entries_) {
1623                 map<docstring, BibTeXInfo>::iterator const biit = bimap_.find(ce);
1624                 // this shouldn't happen, but...
1625                 if (biit == bimap_.end())
1626                         // ...fail gracefully, anyway.
1627                         continue;
1628                 BibTeXInfo & entry = biit->second;
1629                 if (numbers) {
1630                         entry.label(entry.citeNumber());
1631                 } else {
1632                         docstring const auth = entry.getAuthorOrEditorList(&buf, false);
1633                         // we do it this way so as to access the xref, if necessary
1634                         // note that this also gives us the modifier
1635                         docstring const year = getYear(ce, buf, true);
1636                         if (!auth.empty() && !year.empty())
1637                                 entry.label(auth + ' ' + year);
1638                         else
1639                                 entry.label(entry.key());
1640                 }
1641         }
1642 }
1643
1644
1645 //////////////////////////////////////////////////////////////////////
1646 //
1647 // CitationStyle
1648 //
1649 //////////////////////////////////////////////////////////////////////
1650
1651
1652 CitationStyle citationStyleFromString(string const & command,
1653                                       BufferParams const & params)
1654 {
1655         CitationStyle cs;
1656         if (command.empty())
1657                 return cs;
1658
1659         string const alias = params.getCiteAlias(command);
1660         string cmd = alias.empty() ? command : alias;
1661         if (isUpperCase(command[0])) {
1662                 cs.forceUpperCase = true;
1663                 cmd[0] = lowercase(cmd[0]);
1664         }
1665
1666         size_t const n = command.size() - 1;
1667         if (command[n] == '*') {
1668                 cs.hasStarredVersion = true;
1669                 if (suffixIs(cmd, '*'))
1670                         cmd = cmd.substr(0, cmd.size() - 1);
1671         }
1672
1673         cs.name = cmd;
1674         return cs;
1675 }
1676
1677
1678 string citationStyleToString(const CitationStyle & cs, bool const latex)
1679 {
1680         string cmd = latex ? cs.cmd : cs.name;
1681         if (cs.forceUpperCase)
1682                 cmd[0] = uppercase(cmd[0]);
1683         if (cs.hasStarredVersion)
1684                 cmd += '*';
1685         return cmd;
1686 }
1687
1688
1689 docstring authorsToDocBookAuthorGroup(docstring const & authorsString, XMLStream & xs, Buffer const & buf)
1690 {
1691         // This function closely mimics getAuthorList, but produces DocBook instead of text.
1692         // It has been greatly simplified, as the complete list of authors is always produced. No separators are required,
1693         // as the output has a database-like shape.
1694         // constructName has also been merged within, as it becomes really simple and leads to no copy-paste.
1695
1696         if (authorsString.empty()) {
1697                 return docstring();
1698         }
1699
1700         // Split the input list of authors into individual authors.
1701         vector<docstring> const authors = getAuthors(authorsString);
1702
1703         // Retrieve the "et al." variation.
1704         string const etal = buf.params().documentClass().getCiteMacro(buf.params().citeEngineType(), "_etal");
1705
1706         // Output the list of authors.
1707         xs << xml::StartTag("authorgroup");
1708         xs << xml::CR();
1709
1710         auto it = authors.cbegin();
1711         auto en = authors.cend();
1712         for (size_t i = 0; it != en; ++it, ++i) {
1713                 xs << xml::StartTag("author");
1714                 xs << xml::CR();
1715                 xs << xml::StartTag("personname");
1716                 xs << xml::CR();
1717                 docstring name = *it;
1718
1719                 // All authors go in a <personname>. If more structure is known, use it; otherwise (just "et al."), print it as such.
1720                 if (name == "others") {
1721                         xs << buf.B_(etal);
1722                 } else {
1723                         name_parts parts = nameParts(name);
1724                         if (! parts.prefix.empty()) {
1725                                 xs << xml::StartTag("honorific");
1726                                 xs << parts.prefix;
1727                                 xs << xml::EndTag("honorific");
1728                                 xs << xml::CR();
1729                         }
1730                         if (! parts.prename.empty()) {
1731                                 xs << xml::StartTag("firstname");
1732                                 xs << parts.prename;
1733                                 xs << xml::EndTag("firstname");
1734                                 xs << xml::CR();
1735                         }
1736                         if (! parts.surname.empty()) {
1737                                 xs << xml::StartTag("surname");
1738                                 xs << parts.surname;
1739                                 xs << xml::EndTag("surname");
1740                                 xs << xml::CR();
1741                         }
1742                         if (! parts.suffix.empty()) {
1743                                 xs << xml::StartTag("othername", "role=\"suffix\"");
1744                                 xs << parts.suffix;
1745                                 xs << xml::EndTag("othername");
1746                                 xs << xml::CR();
1747                         }
1748                 }
1749
1750                 xs << xml::EndTag("personname");
1751                 xs << xml::CR();
1752                 xs << xml::EndTag("author");
1753                 xs << xml::CR();
1754
1755                 // Could add an affiliation after <personname>, but not stored in BibTeX.
1756         }
1757         xs << xml::EndTag("authorgroup");
1758         xs << xml::CR();
1759
1760         return docstring();
1761 }
1762
1763 } // namespace lyx