src/BiblioInfo.cpp

   1 /**
   2  * \file BiblioInfo.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Angus Leeming
   7  * \author Herbert Voß
   8  * \author Richard Heck
   9  * \author Julien Rioux
  10  * \author Jürgen Spitzmüller
  11  *
  12  * Full author contact details are available in file CREDITS.
  13  */
  14
  15 #include <config.h>
  16
  17 #include "BiblioInfo.h"
  18 #include "Buffer.h"
  19 #include "BufferParams.h"
  20 #include "Citation.h"
  21 #include "Encoding.h"
  22 #include "Language.h"
  23 #include "xml.h"
  24 #include "TextClass.h"
  25 #include "TocBackend.h"
  26
  27 #include "support/convert.h"
  28 #include "support/debug.h"
  29 #include "support/docstream.h"
  30 #include "support/FileName.h"
  31 #include "support/gettext.h"
  32 #include "support/lassert.h"
  33 #include "support/lstrings.h"
  34 #include "support/regex.h"
  35 #include "support/textutils.h"
  36
  37 #include <map>
  38 #include <set>
  39
  40 using namespace std;
  41 using namespace lyx::support;
  42
  43
  44 namespace lyx {
  45
  46 namespace {
  47
  48 // Remove placeholders from names
  49 docstring renormalize(docstring const & input)
  50 {
  51         docstring res = subst(input, from_ascii("$$space!"), from_ascii(" "));
  52         return subst(res, from_ascii("$$comma!"), from_ascii(","));
  53 }
  54
  55
  56 // Split the surname into prefix ("von-part") and family name
  57 pair<docstring, docstring> parseSurname(docstring const & sname)
  58 {
  59         // Split the surname into its tokens
  60         vector<docstring> pieces = getVectorFromString(sname, from_ascii(" "));
  61         if (pieces.size() < 2)
  62                 return make_pair(docstring(), sname);
  63
  64         // Now we look for pieces that begin with a lower case letter.
  65         // All except for the very last token constitute the "von-part".
  66         docstring prefix;
  67         vector<docstring>::const_iterator it = pieces.begin();
  68         vector<docstring>::const_iterator const en = pieces.end();
  69         bool first = true;
  70         for (; it != en; ++it) {
  71                 if ((*it).empty())
  72                         continue;
  73                 // If this is the last piece, then what we now have is
  74                 // the family name, notwithstanding the casing.
  75                 if (it + 1 == en)
  76                         break;
  77                 char_type const c = (*it)[0];
  78                 // If the piece starts with a upper case char, we assume
  79                 // this is part of the surname.
  80                 if (!isLower(c))
  81                         break;
  82                 // Nothing of the former, so add this piece to the prename
  83                 if (!first)
  84                         prefix += " ";
  85                 else
  86                         first = false;
  87                 prefix += *it;
  88         }
  89
  90         // Reconstruct the family name.
  91         // Note that if we left the loop with because it + 1 == en,
  92         // then this will still do the right thing, i.e., make surname
  93         // just be the last piece.
  94         docstring surname;
  95         first = true;
  96         for (; it != en; ++it) {
  97                 if (!first)
  98                         surname += " ";
  99                 else
 100                         first = false;
 101                 surname += *it;
 102         }
 103         return make_pair(prefix, surname);
 104 }
 105
 106
 107 struct name_parts {
 108         docstring surname;
 109         docstring prename;
 110         docstring suffix;
 111         docstring prefix;
 112 };
 113
 114
 115 // gets the name parts (prename, surname, prefix, suffix) from an author-type string
 116 name_parts nameParts(docstring const & iname)
 117 {
 118         name_parts res;
 119         if (iname.empty())
 120                 return res;
 121
 122         // First we check for goupings (via {...}) and replace blanks and
 123         // commas inside groups with temporary placeholders
 124         docstring name;
 125         int gl = 0;
 126         docstring::const_iterator p = iname.begin();
 127         while (p != iname.end()) {
 128                 // count grouping level
 129                 if (*p == '{')
 130                         ++gl;
 131                 else if (*p == '}')
 132                         --gl;
 133                 // generate string with probable placeholders
 134                 if (*p == ' ' && gl > 0)
 135                         name += from_ascii("$$space!");
 136                 else if (*p == ',' && gl > 0)
 137                         name += from_ascii("$$comma!");
 138                 else
 139                         name += *p;
 140                 ++p;
 141         }
 142
 143         // Now we look for a comma, and take the last name to be everything
 144         // preceding the right-most one, so that we also get the name suffix
 145         // (aka "jr" part).
 146         vector<docstring> pieces = getVectorFromString(name);
 147         if (pieces.size() > 1) {
 148                 // Whether we have a name suffix or not, the prename is
 149                 // always last item
 150                 res.prename = renormalize(pieces.back());
 151                 // The family name, conversely, is always the first item.
 152                 // However, it might contain a prefix (aka "von" part)
 153                 docstring const sname = pieces.front();
 154                 res.prefix = renormalize(parseSurname(sname).first);
 155                 res.surname = renormalize(parseSurname(sname).second);
 156                 // If we have three pieces (the maximum allowed by BibTeX),
 157                 // the second one is the name suffix.
 158                 if (pieces.size() > 2)
 159                         res.suffix = renormalize(pieces.at(1));
 160                 return res;
 161         }
 162
 163         // OK, so now we want to look for the last name.
 164         // Split on spaces, to get various tokens.
 165         pieces = getVectorFromString(name, from_ascii(" "));
 166         // No space: Only a family name given
 167         if (pieces.size() < 2) {
 168                 res.surname = renormalize(pieces.back());
 169                 return res;
 170         }
 171         // If we get two pieces, assume "prename surname"
 172         if (pieces.size() == 2) {
 173                 res.prename = renormalize(pieces.front());
 174                 res.surname = renormalize(pieces.back());
 175                 return res;
 176         }
 177
 178         // More than 3 pieces: A name prefix (aka "von" part) might be included.
 179         // We look for the first piece that begins with a lower case letter
 180         // (which is the name prefix, if it is not the last token) or the last token.
 181         docstring prename;
 182         vector<docstring>::const_iterator it = pieces.begin();
 183         vector<docstring>::const_iterator const en = pieces.end();
 184         bool first = true;
 185         for (; it != en; ++it) {
 186                 if ((*it).empty())
 187                         continue;
 188                 char_type const c = (*it)[0];
 189                 // If the piece starts with a lower case char, we assume
 190                 // this is the name prefix and thus prename is complete.
 191                 if (isLower(c))
 192                         break;
 193                 // Same if this is the last piece, which is always the surname.
 194                 if (it + 1 == en)
 195                         break;
 196                 // Nothing of the former, so add this piece to the prename
 197                 if (!first)
 198                         prename += " ";
 199                 else
 200                         first = false;
 201                 prename += *it;
 202         }
 203
 204         // Now reconstruct the family name and strip the prefix.
 205         // Note that if we left the loop because it + 1 == en,
 206         // then this will still do the right thing, i.e., make surname
 207         // just be the last piece.
 208         docstring surname;
 209         first = true;
 210         for (; it != en; ++it) {
 211                 if (!first)
 212                         surname += " ";
 213                 else
 214                         first = false;
 215                 surname += *it;
 216         }
 217         res.prename = renormalize(prename);
 218         res.prefix = renormalize(parseSurname(surname).first);
 219         res.surname = renormalize(parseSurname(surname).second);
 220         return res;
 221 }
 222
 223
 224 docstring constructName(docstring const & name, string const & scheme)
 225 {
 226         // re-constructs a name from name parts according
 227         // to a given scheme
 228         docstring const prename = nameParts(name).prename;
 229         docstring const surname = nameParts(name).surname;
 230         docstring const prefix = nameParts(name).prefix;
 231         docstring const suffix = nameParts(name).suffix;
 232         string res = scheme;
 233         static regex const reg1("(.*)(\\{%prename%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 234         static regex const reg2("(.*)(\\{%suffix%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 235         static regex const reg3("(.*)(\\{%prefix%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 236         smatch sub;
 237         // Changing the first parameter of regex_match() may corrupt the
 238         // second one. In this case we use the temporary string tmp.
 239         if (regex_match(scheme, sub, reg1)) {
 240                 res = sub.str(1);
 241                 if (!prename.empty())
 242                         res += sub.str(3);
 243                 res += sub.str(5);
 244         }
 245         if (regex_match(res, sub, reg2)) {
 246                 string tmp = sub.str(1);
 247                 if (!suffix.empty())
 248                         tmp += sub.str(3);
 249                 res = tmp + sub.str(5);
 250         }
 251         if (regex_match(res, sub, reg3)) {
 252                 string tmp = sub.str(1);
 253                 if (!prefix.empty())
 254                         tmp += sub.str(3);
 255                 res = tmp + sub.str(5);
 256         }
 257         docstring result = from_ascii(res);
 258         result = subst(result, from_ascii("%prename%"), prename);
 259         result = subst(result, from_ascii("%surname%"), surname);
 260         result = subst(result, from_ascii("%prefix%"), prefix);
 261         result = subst(result, from_ascii("%suffix%"), suffix);
 262         return result;
 263 }
 264
 265
 266 vector<docstring> const getAuthors(docstring const & author)
 267 {
 268         // We check for goupings (via {...}) and only consider " and "
 269         // outside groups as author separator. This is to account
 270         // for cases such as {{Barnes and Noble, Inc.}}, which
 271         // need to be treated as one single family name.
 272         // We use temporary placeholders in order to differentiate the
 273         // diverse " and " cases.
 274
 275         // First, we temporarily replace all ampersands. It is rather unusual
 276         // in author names, but can happen (consider cases such as "C \& A Corp.").
 277         docstring iname = subst(author, from_ascii("&"), from_ascii("$$amp!"));
 278         // Then, we temporarily make all " and " strings to ampersands in order
 279         // to handle them later on a per-char level.
 280         iname = subst(iname, from_ascii(" and "), from_ascii(" & "));
 281         // Now we traverse through the string and replace the "&" by the proper
 282         // output in- and outside groups
 283         docstring name;
 284         int gl = 0;
 285         docstring::const_iterator p = iname.begin();
 286         while (p != iname.end()) {
 287                 // count grouping level
 288                 if (*p == '{')
 289                         ++gl;
 290                 else if (*p == '}')
 291                         --gl;
 292                 // generate string with probable placeholders
 293                 if (*p == '&') {
 294                         if (gl > 0)
 295                                 // Inside groups, we output "and"
 296                                 name += from_ascii("and");
 297                         else
 298                                 // Outside groups, we output a separator
 299                                 name += from_ascii("$$namesep!");
 300                 }
 301                 else
 302                         name += *p;
 303                 ++p;
 304         }
 305
 306         // re-insert the literal ampersands
 307         name = subst(name, from_ascii("$$amp!"), from_ascii("&"));
 308
 309         // Now construct the actual vector
 310         return getVectorFromString(name, from_ascii(" $$namesep! "));
 311 }
 312
 313
 314 bool multipleAuthors(docstring const & author)
 315 {
 316         return getAuthors(author).size() > 1;
 317 }
 318
 319
 320 // converts a string containing LaTeX commands into unicode
 321 // for display.
 322 docstring convertLaTeXCommands(docstring const & str)
 323 {
 324         docstring val = str;
 325         docstring ret;
 326
 327         bool scanning_cmd = false;
 328         bool scanning_math = false;
 329         bool escaped = false; // used to catch \$, etc.
 330         while (!val.empty()) {
 331                 char_type const ch = val[0];
 332
 333                 // if we're scanning math, we output everything until we
 334                 // find an unescaped $, at which point we break out.
 335                 if (scanning_math) {
 336                         if (escaped)
 337                                 escaped = false;
 338                         else if (ch == '\\')
 339                                 escaped = true;
 340                         else if (ch == '$')
 341                                 scanning_math = false;
 342                         ret += ch;
 343                         val = val.substr(1);
 344                         continue;
 345                 }
 346
 347                 // if we're scanning a command name, then we just
 348                 // discard characters until we hit something that
 349                 // isn't alpha.
 350                 if (scanning_cmd) {
 351                         if (isAlphaASCII(ch)) {
 352                                 val = val.substr(1);
 353                                 escaped = false;
 354                                 continue;
 355                         }
 356                         // so we're done with this command.
 357                         // now we fall through and check this character.
 358                         scanning_cmd = false;
 359                 }
 360
 361                 // was the last character a \? If so, then this is something like:
 362                 // \\ or \$, so we'll just output it. That's probably not always right...
 363                 if (escaped) {
 364                         // exception: output \, as THIN SPACE
 365                         if (ch == ',')
 366                                 ret.push_back(0x2009);
 367                         else
 368                                 ret += ch;
 369                         val = val.substr(1);
 370                         escaped = false;
 371                         continue;
 372                 }
 373
 374                 if (ch == '$') {
 375                         ret += ch;
 376                         val = val.substr(1);
 377                         scanning_math = true;
 378                         continue;
 379                 }
 380
 381                 // Change text mode accents in the form
 382                 // {\v a} to \v{a} (see #9340).
 383                 // FIXME: This is a sort of mini-tex2lyx.
 384                 //        Use the real tex2lyx instead!
 385                 static lyx::regex const tma_reg("^\\{\\\\[bcCdfGhHkrtuUv]\\s\\w\\}");
 386                 if (lyx::regex_search(to_utf8(val), tma_reg)) {
 387                         val = val.substr(1);
 388                         val.replace(2, 1, from_ascii("{"));
 389                         continue;
 390                 }
 391
 392                 // Apart from the above, we just ignore braces
 393                 if (ch == '{' || ch == '}') {
 394                         val = val.substr(1);
 395                         continue;
 396                 }
 397
 398                 // we're going to check things that look like commands, so if
 399                 // this doesn't, just output it.
 400                 if (ch != '\\') {
 401                         ret += ch;
 402                         val = val.substr(1);
 403                         continue;
 404                 }
 405
 406                 // ok, could be a command of some sort
 407                 // let's see if it corresponds to some unicode
 408                 // unicodesymbols has things in the form: \"{u},
 409                 // whereas we may see things like: \"u. So we'll
 410                 // look for that and change it, if necessary.
 411                 // FIXME: This is a sort of mini-tex2lyx.
 412                 //        Use the real tex2lyx instead!
 413                 static lyx::regex const reg("^\\\\\\W\\w");
 414                 if (lyx::regex_search(to_utf8(val), reg)) {
 415                         val.insert(3, from_ascii("}"));
 416                         val.insert(2, from_ascii("{"));
 417                 }
 418                 bool termination;
 419                 docstring rem;
 420                 docstring const cnvtd = Encodings::fromLaTeXCommand(val,
 421                                 Encodings::TEXT_CMD, termination, rem);
 422                 if (!cnvtd.empty()) {
 423                         // it did, so we'll take that bit and proceed with what's left
 424                         ret += cnvtd;
 425                         val = rem;
 426                         continue;
 427                 }
 428                 // it's a command of some sort
 429                 scanning_cmd = true;
 430                 escaped = true;
 431                 val = val.substr(1);
 432         }
 433         return ret;
 434 }
 435
 436
 437 // Escape '<' and '>' and remove richtext markers (e.g. {!this is richtext!}) from a string.
 438 docstring processRichtext(docstring const & str, bool richtext)
 439 {
 440         docstring val = str;
 441         docstring ret;
 442
 443         bool scanning_rich = false;
 444         while (!val.empty()) {
 445                 char_type const ch = val[0];
 446                 if (ch == '{' && val.size() > 1 && val[1] == '!') {
 447                         // beginning of rich text
 448                         scanning_rich = true;
 449                         val = val.substr(2);
 450                         continue;
 451                 }
 452                 if (scanning_rich && ch == '!' && val.size() > 1 && val[1] == '}') {
 453                         // end of rich text
 454                         scanning_rich = false;
 455                         val = val.substr(2);
 456                         continue;
 457                 }
 458                 if (richtext) {
 459                         if (scanning_rich)
 460                                 ret += ch;
 461                         else {
 462                                 // we need to escape '<' and '>'
 463                                 if (ch == '<')
 464                                         ret += "&lt;";
 465                                 else if (ch == '>')
 466                                         ret += "&gt;";
 467                                 else
 468                                         ret += ch;
 469                         }
 470                 } else if (!scanning_rich /* && !richtext */)
 471                         ret += ch;
 472                 // else the character is discarded, which will happen only if
 473                 // richtext == false and we are scanning rich text
 474                 val = val.substr(1);
 475         }
 476         return ret;
 477 }
 478
 479 } // namespace
 480
 481
 482 //////////////////////////////////////////////////////////////////////
 483 //
 484 // BibTeXInfo
 485 //
 486 //////////////////////////////////////////////////////////////////////
 487
 488 BibTeXInfo::BibTeXInfo(docstring const & key, docstring const & type)
 489         : is_bibtex_(true), bib_key_(key), num_bib_key_(0), entry_type_(type),
 490           info_(), format_(), modifier_(0)
 491 {}
 492
 493
 494
 495 docstring const BibTeXInfo::getAuthorOrEditorList(Buffer const * buf,
 496                                           bool full, bool forceshort) const
 497 {
 498         docstring author = operator[]("author");
 499         if (author.empty())
 500                 author = operator[]("editor");
 501
 502         return getAuthorList(buf, author, full, forceshort);
 503 }
 504
 505
 506 docstring const BibTeXInfo::getAuthorList(Buffer const * buf,
 507                 docstring const & author, bool const full, bool const forceshort,
 508                 bool const allnames, bool const beginning) const
 509 {
 510         // Maxnames treshold depend on engine
 511         size_t maxnames = buf ?
 512                 buf->params().documentClass().max_citenames() : 2;
 513
 514         if (!is_bibtex_) {
 515                 docstring const opt = label();
 516                 if (opt.empty())
 517                         return docstring();
 518
 519                 docstring authors;
 520                 docstring const remainder = trim(split(opt, authors, '('));
 521                 if (remainder.empty())
 522                         // in this case, we didn't find a "(",
 523                         // so we don't have author (year)
 524                         return docstring();
 525                 if (full) {
 526                         // Natbib syntax is "Jones et al.(1990)Jones, Baker, and Williams"
 527                         docstring const fullauthors = trim(rsplit(remainder, ')'));
 528                         if (!fullauthors.empty())
 529                                 return fullauthors;
 530                 }
 531                 return authors;
 532         }
 533
 534         if (author.empty())
 535                 return author;
 536
 537         // OK, we've got some names. Let's format them.
 538         // Try to split the author list
 539         vector<docstring> const authors = getAuthors(author);
 540
 541         docstring retval;
 542
 543         CiteEngineType const engine_type = buf ? buf->params().citeEngineType()
 544                                                : ENGINE_TYPE_DEFAULT;
 545
 546         // These are defined in the styles
 547         string const etal =
 548                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_etal")
 549                     : " et al.";
 550         string const namesep =
 551                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_namesep")
 552                    : ", ";
 553         string const lastnamesep =
 554                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_lastnamesep")
 555                     : ", and ";
 556         string const pairnamesep =
 557                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_pairnamesep")
 558                      : " and ";
 559         string firstnameform =
 560                         buf ? buf->params().documentClass().getCiteMacro(engine_type, "!firstnameform")
 561                              : "{%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}{%prename%[[, %prename%]]}";
 562         if (!beginning)
 563                 firstnameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!firstbynameform")
 564                                              : "%prename% {%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}";
 565         string othernameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!othernameform")
 566                              : "{%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}{%prename%[[, %prename%]]}";
 567         if (!beginning)
 568                 othernameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!otherbynameform")
 569                                              : "%prename% {%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}";
 570         string citenameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!citenameform")
 571                              : "{%prefix%[[%prefix% ]]}%surname%";
 572
 573         // Shorten the list (with et al.) if forceshort is set
 574         // and the list can actually be shortened, else if maxcitenames
 575         // is passed and full is not set.
 576         bool shorten = forceshort && authors.size() > 1;
 577         vector<docstring>::const_iterator it = authors.begin();
 578         vector<docstring>::const_iterator en = authors.end();
 579         for (size_t i = 0; it != en; ++it, ++i) {
 580                 if (i >= maxnames && !full) {
 581                         shorten = true;
 582                         break;
 583                 }
 584                 if (*it == "others") {
 585                         retval += buf ? buf->B_(etal) : from_ascii(etal);
 586                         break;
 587                 }
 588                 if (i > 0 && i == authors.size() - 1) {
 589                         if (authors.size() == 2)
 590                                 retval += buf ? buf->B_(pairnamesep) : from_ascii(pairnamesep);
 591                         else
 592                                 retval += buf ? buf->B_(lastnamesep) : from_ascii(lastnamesep);
 593                 } else if (i > 0)
 594                         retval += buf ? buf->B_(namesep) : from_ascii(namesep);
 595                 if (allnames)
 596                         retval += (i == 0) ? constructName(*it, firstnameform)
 597                                 : constructName(*it, othernameform);
 598                 else
 599                         retval += constructName(*it, citenameform);
 600         }
 601         if (shorten) {
 602                 if (allnames)
 603                         retval = constructName(authors[0], firstnameform) + (buf ? buf->B_(etal) : from_ascii(etal));
 604                 else
 605                         retval = constructName(authors[0], citenameform) + (buf ? buf->B_(etal) : from_ascii(etal));
 606         }
 607
 608         return convertLaTeXCommands(retval);
 609 }
 610
 611
 612 docstring const BibTeXInfo::getYear() const
 613 {
 614         if (is_bibtex_) {
 615                 // first try legacy year field
 616                 docstring year = operator[]("year");
 617                 if (!year.empty())
 618                         return year;
 619                 // now try biblatex's date field
 620                 year = operator[]("date");
 621                 // Format is [-]YYYY-MM-DD*/[-]YYYY-MM-DD*
 622                 // We only want the years.
 623                 static regex const yreg("[-]?([\\d]{4}).*");
 624                 static regex const ereg(".*/[-]?([\\d]{4}).*");
 625                 smatch sm;
 626                 string const date = to_utf8(year);
 627                 if (!regex_match(date, sm, yreg))
 628                         // cannot parse year.
 629                         return docstring();
 630                 year = from_ascii(sm[1]);
 631                 // check for an endyear
 632                 if (regex_match(date, sm, ereg))
 633                         year += char_type(0x2013) + from_ascii(sm[1]);
 634                 return year;
 635         }
 636
 637         docstring const opt = label();
 638         if (opt.empty())
 639                 return docstring();
 640
 641         docstring authors;
 642         docstring tmp = split(opt, authors, '(');
 643         if (tmp.empty())
 644                 // we don't have author (year)
 645                 return docstring();
 646         docstring year;
 647         tmp = split(tmp, year, ')');
 648         return year;
 649 }
 650
 651
 652 void BibTeXInfo::getLocators(docstring & doi, docstring & url, docstring & file) const
 653 {
 654         if (is_bibtex_) {
 655                 // get "doi" entry from citation record
 656                 doi = operator[]("doi");
 657                 if (!doi.empty() && !prefixIs(doi,from_ascii("http")))
 658                         doi = "https://doi.org/" + doi;
 659                 // get "url" entry from citation record
 660                 url = operator[]("url");
 661                 // get "file" entry from citation record
 662                 file = operator[]("file");
 663
 664                 // Jabref case, field has a format:
 665                 // Description:Location:Filetype;Description:Location:Filetype...
 666                 // We will grab only first pdf
 667                 if (!file.empty()) {
 668                         docstring ret, filedest, tmp;
 669                         ret = split(file, tmp, ':');
 670                         tmp = split(ret, filedest, ':');
 671                         //TODO howto deal with relative directories?
 672                         FileName f(to_utf8(filedest));
 673                         if (f.exists())
 674                                 file = "file:///" + filedest;
 675                 }
 676
 677                 // kbibtex case, format:
 678                 // file1.pdf;file2.pdf
 679                 // We will grab only first pdf
 680                 docstring kfile;
 681                 if (file.empty())
 682                         kfile = operator[]("localfile");
 683                 if (!kfile.empty()) {
 684                         docstring filedest, tmp;
 685                         tmp = split(kfile, filedest, ';');
 686                         //TODO howto deal with relative directories?
 687                         FileName f(to_utf8(filedest));
 688                         if (f.exists())
 689                                 file = "file:///" + filedest;
 690                 }
 691
 692                 if (!url.empty())
 693                         return;
 694
 695                 // try biblatex specific fields, see its manual
 696                 // 3.13.7 "Electronic Publishing Informationl"
 697                 docstring eprinttype = operator[]("eprinttype");
 698                 docstring eprint = operator[]("eprint");
 699                 if (eprint.empty())
 700                         return;
 701
 702                 if (eprinttype == "arxiv")
 703                         url = "https://arxiv.org/abs/" + eprint;
 704                 if (eprinttype == "jstor")
 705                         url = "https://www.jstor.org/stable/" + eprint;
 706                 if (eprinttype == "pubmed")
 707                         url = "http://www.ncbi.nlm.nih.gov/pubmed/" + eprint;
 708                 if (eprinttype == "hdl")
 709                         url = "https://hdl.handle.net/" + eprint;
 710                 if (eprinttype == "googlebooks")
 711                         url = "http://books.google.com/books?id=" + eprint;
 712
 713                 return;
 714         }
 715
 716         // Here can be handled the bibliography environment. All one could do
 717         // here is let LyX scan the entry for URL or HRef insets.
 718 }
 719
 720
 721 namespace {
 722
 723 docstring parseOptions(docstring const & format, string & optkey,
 724                     docstring & ifpart, docstring & elsepart);
 725
 726 // Calls parseOptions to deal with an embedded option, such as:
 727 //   {%number%[[, no.~%number%]]}
 728 // which must appear at the start of format. ifelsepart gets the
 729 // whole of the option, and we return what's left after the option.
 730 // we return format if there is an error.
 731 docstring parseEmbeddedOption(docstring const & format, docstring & ifelsepart)
 732 {
 733         LASSERT(format[0] == '{' && format[1] == '%', return format);
 734         string optkey;
 735         docstring ifpart;
 736         docstring elsepart;
 737         docstring const rest = parseOptions(format, optkey, ifpart, elsepart);
 738         if (format == rest) { // parse error
 739                 LYXERR0("ERROR! Couldn't parse `" << format <<"'.");
 740                 return format;
 741         }
 742         LASSERT(rest.size() <= format.size(),
 743                 { ifelsepart = docstring(); return format; });
 744         ifelsepart = format.substr(0, format.size() - rest.size());
 745         return rest;
 746 }
 747
 748
 749 // Gets a "clause" from a format string, where the clause is
 750 // delimited by '[[' and ']]'. Returns what is left after the
 751 // clause is removed, and returns format if there is an error.
 752 docstring getClause(docstring const & format, docstring & clause)
 753 {
 754         docstring fmt = format;
 755         // remove '[['
 756         fmt = fmt.substr(2);
 757         // we'll remove characters from the front of fmt as we
 758         // deal with them
 759         while (!fmt.empty()) {
 760                 if (fmt[0] == ']' && fmt.size() > 1 && fmt[1] == ']') {
 761                         // that's the end
 762                         fmt = fmt.substr(2);
 763                         break;
 764                 }
 765                 // check for an embedded option
 766                 if (fmt[0] == '{' && fmt.size() > 1 && fmt[1] == '%') {
 767                         docstring part;
 768                         docstring const rest = parseEmbeddedOption(fmt, part);
 769                         if (fmt == rest) {
 770                                 LYXERR0("ERROR! Couldn't parse embedded option in `" << format <<"'.");
 771                                 return format;
 772                         }
 773                         clause += part;
 774                         fmt = rest;
 775                 } else { // it's just a normal character
 776                                 clause += fmt[0];
 777                                 fmt = fmt.substr(1);
 778                 }
 779         }
 780         return fmt;
 781 }
 782
 783
 784 // parse an options string, which must appear at the start of the
 785 // format parameter. puts the parsed bits in optkey, ifpart, and
 786 // elsepart and returns what's left after the option is removed.
 787 // if there's an error, it returns format itself.
 788 docstring parseOptions(docstring const & format, string & optkey,
 789                     docstring & ifpart, docstring & elsepart)
 790 {
 791         LASSERT(format[0] == '{' && format[1] == '%', return format);
 792         // strip '{%'
 793         docstring fmt = format.substr(2);
 794         size_t pos = fmt.find('%'); // end of key
 795         if (pos == string::npos) {
 796                 LYXERR0("Error parsing  `" << format <<"'. Can't find end of key.");
 797                 return format;
 798         }
 799         optkey = to_utf8(fmt.substr(0, pos));
 800         fmt = fmt.substr(pos + 1);
 801         // [[format]] should be next
 802         if (fmt[0] != '[' || fmt[1] != '[') {
 803                 LYXERR0("Error parsing  `" << format <<"'. Can't find '[[' after key.");
 804                 return format;
 805         }
 806
 807         docstring curfmt = fmt;
 808         fmt = getClause(curfmt, ifpart);
 809         if (fmt == curfmt) {
 810                 LYXERR0("Error parsing  `" << format <<"'. Couldn't get if clause.");
 811                 return format;
 812         }
 813
 814         if (fmt[0] == '}') // we're done, no else clause
 815                 return fmt.substr(1);
 816
 817         // else part should follow
 818         if (fmt[0] != '[' || fmt[1] != '[') {
 819                 LYXERR0("Error parsing  `" << format <<"'. Can't find else clause.");
 820                 return format;
 821         }
 822
 823         curfmt = fmt;
 824         fmt = getClause(curfmt, elsepart);
 825         // we should be done
 826         if (fmt == curfmt || fmt[0] != '}') {
 827                 LYXERR0("Error parsing  `" << format <<"'. Can't find end of option.");
 828                 return format;
 829         }
 830         return fmt.substr(1);
 831 }
 832
 833
 834 } // namespace
 835
 836 /* FIXME
 837 Bug #9131 revealed an oddity in how we are generating citation information
 838 when more than one key is given. We end up building a longer and longer format
 839 string as we go, which we then have to re-parse, over and over and over again,
 840 rather than generating the information for the individual keys and then putting
 841 all of that together. We do that to deal with the way separators work, from what
 842 I can tell, but it still feels like a hack. Fixing this would require quite a
 843 bit of work, however.
 844 */
 845 docstring BibTeXInfo::expandFormat(docstring const & format,
 846                 BibTeXInfoList const & xrefs, int & counter, Buffer const & buf,
 847                 CiteItem const & ci, bool next, bool second) const
 848 {
 849         // incorrect use of macros could put us in an infinite loop
 850         static int const max_passes = 5000;
 851         // the use of overly large keys can lead to performance problems, due
 852         // to eventual attempts to convert LaTeX macros to unicode. See bug
 853         // #8944. By default, the size is limited to 128 (in CiteItem), but
 854         // for specific purposes (such as XHTML export), it needs to be enlarged
 855         // This is perhaps not the best solution, but it will have to do for now.
 856         size_t const max_keysize = ci.max_key_size;
 857         odocstringstream ret; // return value
 858         string key;
 859         bool scanning_key = false;
 860         bool scanning_rich = false;
 861
 862         CiteEngineType const engine_type = buf.params().citeEngineType();
 863         docstring fmt = format;
 864         // we'll remove characters from the front of fmt as we
 865         // deal with them
 866         while (!fmt.empty()) {
 867                 if (counter > max_passes) {
 868                         LYXERR0("Recursion limit reached while parsing `"
 869                                 << format << "'.");
 870                         return _("ERROR!");
 871                 }
 872
 873                 char_type thischar = fmt[0];
 874                 if (thischar == '%') {
 875                         // beginning or end of key
 876                         if (scanning_key) {
 877                                 // end of key
 878                                 scanning_key = false;
 879                                 // so we replace the key with its value, which may be empty
 880                                 if (key[0] == '!') {
 881                                         // macro
 882                                         string const val =
 883                                                 buf.params().documentClass().getCiteMacro(engine_type, key);
 884                                         fmt = from_utf8(val) + fmt.substr(1);
 885                                         counter += 1;
 886                                         continue;
 887                                 } else if (prefixIs(key, "B_")) {
 888                                         // a translatable bit (to the Buffer language)
 889                                         string const val =
 890                                                 buf.params().documentClass().getCiteMacro(engine_type, key);
 891                                         docstring const trans =
 892                                                 translateIfPossible(from_utf8(val), buf.params().language->code());
 893                                         ret << trans;
 894                                 } else if (key[0] == '_') {
 895                                         // a translatable bit (to the GUI language)
 896                                         string const val =
 897                                                 buf.params().documentClass().getCiteMacro(engine_type, key);
 898                                         docstring const trans =
 899                                                 translateIfPossible(from_utf8(val));
 900                                         ret << trans;
 901                                 } else {
 902                                         docstring const val =
 903                                                 getValueForKey(key, buf, ci, xrefs, max_keysize);
 904                                         if (!scanning_rich)
 905                                                 ret << from_ascii("{!<span class=\"bib-" + key + "\">!}");
 906                                         ret << val;
 907                                         if (!scanning_rich)
 908                                                 ret << from_ascii("{!</span>!}");
 909                                 }
 910                         } else {
 911                                 // beginning of key
 912                                 key.clear();
 913                                 scanning_key = true;
 914                         }
 915                 }
 916                 else if (thischar == '{') {
 917                         // beginning of option?
 918                         if (scanning_key) {
 919                                 LYXERR0("ERROR: Found `{' when scanning key in `" << format << "'.");
 920                                 return _("ERROR!");
 921                         }
 922                         if (fmt.size() > 1) {
 923                                 if (fmt[1] == '%') {
 924                                         // it is the beginning of an optional format
 925                                         string optkey;
 926                                         docstring ifpart;
 927                                         docstring elsepart;
 928                                         docstring const newfmt =
 929                                                 parseOptions(fmt, optkey, ifpart, elsepart);
 930                                         if (newfmt == fmt) // parse error
 931                                                 return _("ERROR!");
 932                                         fmt = newfmt;
 933                                         docstring const val =
 934                                                 getValueForKey(optkey, buf, ci, xrefs);
 935                                         if (optkey == "next" && next)
 936                                                 ret << ifpart; // without expansion
 937                                         else if (optkey == "second" && second) {
 938                                                 int newcounter = 0;
 939                                                 ret << expandFormat(ifpart, xrefs, newcounter, buf,
 940                                                         ci, next);
 941                                         } else if (!val.empty()) {
 942                                                 int newcounter = 0;
 943                                                 ret << expandFormat(ifpart, xrefs, newcounter, buf,
 944                                                         ci, next);
 945                                         } else if (!elsepart.empty()) {
 946                                                 int newcounter = 0;
 947                                                 ret << expandFormat(elsepart, xrefs, newcounter, buf,
 948                                                         ci, next);
 949                                         }
 950                                         // fmt will have been shortened for us already
 951                                         continue;
 952                                 }
 953                                 if (fmt[1] == '!') {
 954                                         // beginning of rich text
 955                                         scanning_rich = true;
 956                                         fmt = fmt.substr(2);
 957                                         ret << from_ascii("{!");
 958                                         continue;
 959                                 }
 960                         }
 961                         // we are here if '{' was not followed by % or !.
 962                         // So it's just a character.
 963                         ret << thischar;
 964                 }
 965                 else if (scanning_rich && thischar == '!'
 966                          && fmt.size() > 1 && fmt[1] == '}') {
 967                         // end of rich text
 968                         scanning_rich = false;
 969                         fmt = fmt.substr(2);
 970                         ret << from_ascii("!}");
 971                         continue;
 972                 }
 973                 else if (scanning_key)
 974                         key += char(thischar);
 975                 else {
 976                         try {
 977                                 ret.put(thischar);
 978                         } catch (EncodingException & /* e */) {
 979                                 LYXERR0("Uncodable character '" << docstring(1, thischar) << " in citation label!");
 980                         }
 981                 }
 982                 fmt = fmt.substr(1);
 983         } // for loop
 984         if (scanning_key) {
 985                 LYXERR0("Never found end of key in `" << format << "'!");
 986                 return _("ERROR!");
 987         }
 988         if (scanning_rich) {
 989                 LYXERR0("Never found end of rich text in `" << format << "'!");
 990                 return _("ERROR!");
 991         }
 992         return ret.str();
 993 }
 994
 995
 996 docstring const & BibTeXInfo::getInfo(BibTeXInfoList const & xrefs,
 997         Buffer const & buf, CiteItem const & ci, docstring const & format_in) const
 998 {
 999         bool const richtext = ci.richtext;
1000
1001         CiteEngineType const engine_type = buf.params().citeEngineType();
1002         DocumentClass const & dc = buf.params().documentClass();
1003         docstring const & format = format_in.empty()?
1004                                 from_utf8(dc.getCiteFormat(engine_type, to_utf8(entry_type_)))
1005                               : format_in;
1006
1007         if (format != format_) {
1008                 // clear caches since format changed
1009                 info_.clear();
1010                 info_richtext_.clear();
1011                 format_ = format;
1012         }
1013
1014         if (!richtext && !info_.empty()) {
1015                 info_ = convertLaTeXCommands(processRichtext(info_, false));
1016                 return info_;
1017         }
1018         if (richtext && !info_richtext_.empty())
1019                 return info_richtext_;
1020
1021         if (!is_bibtex_) {
1022                 BibTeXInfo::const_iterator it = find(from_ascii("ref"));
1023                 info_ = it->second;
1024                 return info_;
1025         }
1026
1027         int counter = 0;
1028         info_ = expandFormat(format, xrefs, counter, buf,
1029                 ci, false, false);
1030
1031         if (info_.empty()) {
1032                 // this probably shouldn't happen
1033                 return info_;
1034         }
1035
1036         if (richtext) {
1037                 info_richtext_ = convertLaTeXCommands(processRichtext(info_, true));
1038                 return info_richtext_;
1039         }
1040
1041         info_ = convertLaTeXCommands(processRichtext(info_, false));
1042         return info_;
1043 }
1044
1045
1046 docstring const BibTeXInfo::getLabel(BibTeXInfoList const & xrefs,
1047         Buffer const & buf, docstring const & format,
1048         CiteItem const & ci, bool next, bool second) const
1049 {
1050         docstring loclabel;
1051
1052         int counter = 0;
1053         loclabel = expandFormat(format, xrefs, counter, buf, ci, next, second);
1054
1055         if (!loclabel.empty() && !next) {
1056                 loclabel = processRichtext(loclabel, ci.richtext);
1057                 loclabel = convertLaTeXCommands(loclabel);
1058         }
1059
1060         return loclabel;
1061 }
1062
1063
1064 docstring const & BibTeXInfo::operator[](docstring const & field) const
1065 {
1066         BibTeXInfo::const_iterator it = find(field);
1067         if (it != end())
1068                 return it->second;
1069         static docstring const empty_value = docstring();
1070         return empty_value;
1071 }
1072
1073
1074 docstring const & BibTeXInfo::operator[](string const & field) const
1075 {
1076         return operator[](from_ascii(field));
1077 }
1078
1079
1080 docstring BibTeXInfo::getValueForKey(string const & oldkey, Buffer const & buf,
1081         CiteItem const & ci, BibTeXInfoList const & xrefs, size_t maxsize) const
1082 {
1083         // anything less is pointless
1084         LASSERT(maxsize >= 16, maxsize = 16);
1085         string key = oldkey;
1086         bool cleanit = false;
1087         if (prefixIs(oldkey, "clean:")) {
1088                 key = oldkey.substr(6);
1089                 cleanit = true;
1090         }
1091
1092         docstring ret = operator[](key);
1093         if (ret.empty() && !xrefs.empty()) {
1094                 // xr is a (reference to a) BibTeXInfo const *
1095                 for (auto const & xr : xrefs) {
1096                         if (xr && !(*xr)[key].empty()) {
1097                                 ret = (*xr)[key];
1098                                 break;
1099                         }
1100                 }
1101         }
1102         if (ret.empty()) {
1103                 // some special keys
1104                 // FIXME: dialog, textbefore and textafter have nothing to do with this
1105                 if (key == "dialog" && ci.context == CiteItem::Dialog)
1106                         ret = from_ascii("x"); // any non-empty string will do
1107                 else if (key == "export" && ci.context == CiteItem::Export)
1108                         ret = from_ascii("x"); // any non-empty string will do
1109                 else if (key == "ifstar" && ci.Starred)
1110                         ret = from_ascii("x"); // any non-empty string will do
1111                 else if (key == "ifqualified" && ci.isQualified)
1112                         ret = from_ascii("x"); // any non-empty string will do
1113                 else if (key == "entrytype")
1114                         ret = entry_type_;
1115                 else if (prefixIs(key, "ifentrytype:")
1116                          && from_ascii(key.substr(12)) == entry_type_)
1117                         ret = from_ascii("x"); // any non-empty string will do
1118                 else if (key == "key")
1119                         ret = bib_key_;
1120                 else if (key == "label")
1121                         ret = label_;
1122                 else if (key == "modifier" && modifier_ != 0)
1123                         ret = modifier_;
1124                 else if (key == "numericallabel")
1125                         ret = cite_number_;
1126                 else if (prefixIs(key, "ifmultiple:")) {
1127                         // Return whether we have multiple authors
1128                         docstring const kind = operator[](from_ascii(key.substr(11)));
1129                         if (multipleAuthors(kind))
1130                                 ret = from_ascii("x"); // any non-empty string will do
1131                 }
1132                 else if (prefixIs(key, "abbrvnames:")) {
1133                         // Special key to provide abbreviated name list,
1134                         // with respect to maxcitenames. Suitable for Bibliography
1135                         // beginnings.
1136                         docstring const kind = operator[](from_ascii(key.substr(11)));
1137                         ret = getAuthorList(&buf, kind, false, false, true);
1138                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1139                                 ret[0] = uppercase(ret[0]);
1140                 } else if (prefixIs(key, "fullnames:")) {
1141                         // Return a full name list. Suitable for Bibliography
1142                         // beginnings.
1143                         docstring const kind = operator[](from_ascii(key.substr(10)));
1144                         ret = getAuthorList(&buf, kind, true, false, true);
1145                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1146                                 ret[0] = uppercase(ret[0]);
1147                 } else if (prefixIs(key, "forceabbrvnames:")) {
1148                         // Special key to provide abbreviated name lists,
1149                         // irrespective of maxcitenames. Suitable for Bibliography
1150                         // beginnings.
1151                         docstring const kind = operator[](from_ascii(key.substr(15)));
1152                         ret = getAuthorList(&buf, kind, false, true, true);
1153                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1154                                 ret[0] = uppercase(ret[0]);
1155                 } else if (prefixIs(key, "abbrvbynames:")) {
1156                         // Special key to provide abbreviated name list,
1157                         // with respect to maxcitenames. Suitable for further names inside a
1158                         // bibliography item // (such as "ed. by ...")
1159                         docstring const kind = operator[](from_ascii(key.substr(11)));
1160                         ret = getAuthorList(&buf, kind, false, false, true, false);
1161                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1162                                 ret[0] = uppercase(ret[0]);
1163                 } else if (prefixIs(key, "fullbynames:")) {
1164                         // Return a full name list. Suitable for further names inside a
1165                         // bibliography item // (such as "ed. by ...")
1166                         docstring const kind = operator[](from_ascii(key.substr(10)));
1167                         ret = getAuthorList(&buf, kind, true, false, true, false);
1168                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1169                                 ret[0] = uppercase(ret[0]);
1170                 } else if (prefixIs(key, "forceabbrvbynames:")) {
1171                         // Special key to provide abbreviated name lists,
1172                         // irrespective of maxcitenames. Suitable for further names inside a
1173                         // bibliography item // (such as "ed. by ...")
1174                         docstring const kind = operator[](from_ascii(key.substr(15)));
1175                         ret = getAuthorList(&buf, kind, false, true, true, false);
1176                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1177                                 ret[0] = uppercase(ret[0]);
1178                 } else if (key == "abbrvciteauthor") {
1179                         // Special key to provide abbreviated author or
1180                         // editor names (suitable for citation labels),
1181                         // with respect to maxcitenames.
1182                         ret = getAuthorOrEditorList(&buf, false, false);
1183                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1184                                 ret[0] = uppercase(ret[0]);
1185                 } else if (key == "fullciteauthor") {
1186                         // Return a full author or editor list (for citation labels)
1187                         ret = getAuthorOrEditorList(&buf, true, false);
1188                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1189                                 ret[0] = uppercase(ret[0]);
1190                 } else if (key == "forceabbrvciteauthor") {
1191                         // Special key to provide abbreviated author or
1192                         // editor names (suitable for citation labels),
1193                         // irrespective of maxcitenames.
1194                         ret = getAuthorOrEditorList(&buf, false, true);
1195                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1196                                 ret[0] = uppercase(ret[0]);
1197                 } else if (key == "bibentry") {
1198                         // Special key to provide the full bibliography entry: see getInfo()
1199                         CiteEngineType const engine_type = buf.params().citeEngineType();
1200                         DocumentClass const & dc = buf.params().documentClass();
1201                         docstring const & format =
1202                                 from_utf8(dc.getCiteFormat(engine_type, to_utf8(entry_type_), false));
1203                         int counter = 0;
1204                         ret = expandFormat(format, xrefs, counter, buf, ci, false, false);
1205                 } else if (key == "textbefore")
1206                         ret = ci.textBefore;
1207                 else if (key == "textafter")
1208                         ret = ci.textAfter;
1209                 else if (key == "curpretext") {
1210                         vector<pair<docstring, docstring>> pres = ci.getPretexts();
1211                         vector<pair<docstring, docstring>>::iterator it = pres.begin();
1212                         int numkey = 1;
1213                         for (; it != pres.end() ; ++it) {
1214                                 if ((*it).first == bib_key_ && numkey == num_bib_key_) {
1215                                         ret = (*it).second;
1216                                         pres.erase(it);
1217                                         break;
1218                                 }
1219                                 if ((*it).first == bib_key_)
1220                                         ++numkey;
1221                         }
1222                 } else if (key == "curposttext") {
1223                         vector<pair<docstring, docstring>> posts = ci.getPosttexts();
1224                         vector<pair<docstring, docstring>>::iterator it = posts.begin();
1225                         int numkey = 1;
1226                         for (; it != posts.end() ; ++it) {
1227                                 if ((*it).first == bib_key_ && numkey == num_bib_key_) {
1228                                         ret = (*it).second;
1229                                         posts.erase(it);
1230                                         break;
1231                                 }
1232                                 if ((*it).first == bib_key_)
1233                                         ++numkey;
1234                         }
1235                 } else if (key == "year")
1236                         ret = getYear();
1237         }
1238
1239         if (cleanit)
1240                 ret = xml::cleanAttr(ret);
1241
1242         // make sure it is not too big
1243         support::truncateWithEllipsis(ret, maxsize);
1244         return ret;
1245 }
1246
1247
1248 //////////////////////////////////////////////////////////////////////
1249 //
1250 // BiblioInfo
1251 //
1252 //////////////////////////////////////////////////////////////////////
1253
1254 namespace {
1255
1256 // A functor for use with sort, leading to case insensitive sorting
1257 bool compareNoCase(const docstring & a, const docstring & b) {
1258         return compare_no_case(a, b) < 0;
1259 }
1260
1261 } // namespace
1262
1263
1264 vector<docstring> const BiblioInfo::getXRefs(BibTeXInfo const & data, bool const nested) const
1265 {
1266         vector<docstring> result;
1267         if (!data.isBibTeX())
1268                 return result;
1269         // Legacy crossref field. This is not nestable.
1270         if (!nested && !data["crossref"].empty()) {
1271                 docstring const xrefkey = data["crossref"];
1272                 result.push_back(xrefkey);
1273                 // However, check for nested xdatas
1274                 BiblioInfo::const_iterator it = find(xrefkey);
1275                 if (it != end()) {
1276                         BibTeXInfo const & xref = it->second;
1277                         vector<docstring> const nxdata = getXRefs(xref, true);
1278                         if (!nxdata.empty())
1279                                 result.insert(result.end(), nxdata.begin(), nxdata.end());
1280                 }
1281         }
1282         // Biblatex's xdata field. Infinitely nestable.
1283         // XData field can consist of a comma-separated list of keys
1284         vector<docstring> const xdatakeys = getVectorFromString(data["xdata"]);
1285         if (!xdatakeys.empty()) {
1286                 for (auto const & xdatakey : xdatakeys) {
1287                         result.push_back(xdatakey);
1288                         BiblioInfo::const_iterator it = find(xdatakey);
1289                         if (it != end()) {
1290                                 BibTeXInfo const & xdata = it->second;
1291                                 vector<docstring> const nxdata = getXRefs(xdata, true);
1292                                 if (!nxdata.empty())
1293                                         result.insert(result.end(), nxdata.begin(), nxdata.end());
1294                         }
1295                 }
1296         }
1297         return result;
1298 }
1299
1300
1301 vector<docstring> const BiblioInfo::getKeys() const
1302 {
1303         vector<docstring> bibkeys;
1304         for (auto const & bi : *this)
1305                 bibkeys.push_back(bi.first);
1306         sort(bibkeys.begin(), bibkeys.end(), &compareNoCase);
1307         return bibkeys;
1308 }
1309
1310
1311 vector<docstring> const BiblioInfo::getFields() const
1312 {
1313         vector<docstring> bibfields;
1314         for (auto const & fn : field_names_)
1315                 bibfields.push_back(fn);
1316         sort(bibfields.begin(), bibfields.end());
1317         return bibfields;
1318 }
1319
1320
1321 vector<docstring> const BiblioInfo::getEntries() const
1322 {
1323         vector<docstring> bibentries;
1324         for (auto const & et : entry_types_)
1325                 bibentries.push_back(et);
1326         sort(bibentries.begin(), bibentries.end());
1327         return bibentries;
1328 }
1329
1330
1331 docstring const BiblioInfo::getAuthorOrEditorList(docstring const & key, Buffer const & buf) const
1332 {
1333         BiblioInfo::const_iterator it = find(key);
1334         if (it == end())
1335                 return docstring();
1336         BibTeXInfo const & data = it->second;
1337         return data.getAuthorOrEditorList(&buf, false);
1338 }
1339
1340
1341 docstring const BiblioInfo::getCiteNumber(docstring const & key) const
1342 {
1343         BiblioInfo::const_iterator it = find(key);
1344         if (it == end())
1345                 return docstring();
1346         BibTeXInfo const & data = it->second;
1347         return data.citeNumber();
1348 }
1349
1350 void BiblioInfo::getLocators(docstring const & key, docstring & doi, docstring & url, docstring & file) const
1351 {
1352         BiblioInfo::const_iterator it = find(key);
1353          if (it == end())
1354                 return;
1355         BibTeXInfo const & data = it->second;
1356         data.getLocators(doi,url,file);
1357 }
1358
1359
1360 docstring const BiblioInfo::getYear(docstring const & key, bool use_modifier) const
1361 {
1362         BiblioInfo::const_iterator it = find(key);
1363         if (it == end())
1364                 return docstring();
1365         BibTeXInfo const & data = it->second;
1366         docstring year = data.getYear();
1367         if (year.empty()) {
1368                 // let's try the crossrefs
1369                 vector<docstring> const xrefs = getXRefs(data);
1370                 if (xrefs.empty())
1371                         // no luck
1372                         return docstring();
1373                 for (docstring const & xref : xrefs) {
1374                         BiblioInfo::const_iterator const xrefit = find(xref);
1375                         if (xrefit == end())
1376                                 continue;
1377                         BibTeXInfo const & xref_data = xrefit->second;
1378                         year = xref_data.getYear();
1379                         if (!year.empty())
1380                                 // success!
1381                                 break;
1382                 }
1383         }
1384         if (use_modifier && data.modifier() != 0)
1385                 year += data.modifier();
1386         return year;
1387 }
1388
1389
1390 docstring const BiblioInfo::getYear(docstring const & key, Buffer const & buf, bool use_modifier) const
1391 {
1392         docstring const year = getYear(key, use_modifier);
1393         if (year.empty())
1394                 return buf.B_("No year");
1395         return year;
1396 }
1397
1398
1399 docstring const BiblioInfo::getInfo(docstring const & key,
1400         Buffer const & buf, CiteItem const & ci, docstring const & format) const
1401 {
1402         BiblioInfo::const_iterator it = find(key);
1403         if (it == end())
1404                 return docstring(_("Bibliography entry not found!"));
1405         BibTeXInfo const & data = it->second;
1406         BibTeXInfoList xrefptrs;
1407         for (docstring const & xref : getXRefs(data)) {
1408                 BiblioInfo::const_iterator const xrefit = find(xref);
1409                 if (xrefit != end())
1410                         xrefptrs.push_back(&(xrefit->second));
1411         }
1412         return data.getInfo(xrefptrs, buf, ci, format);
1413 }
1414
1415
1416 docstring const BiblioInfo::getLabel(vector<docstring> keys,
1417         Buffer const & buf, string const & style, CiteItem const & ci) const
1418 {
1419         size_t max_size = ci.max_size;
1420         // shorter makes no sense
1421         LASSERT(max_size >= 16, max_size = 16);
1422
1423         // we can't display more than 10 of these, anyway
1424         // but since we truncate in the middle,
1425         // we need to split into two halfs.
1426         bool const too_many_keys = keys.size() > 10;
1427         vector<docstring> lkeys;
1428         if (too_many_keys) {
1429                 lkeys.insert(lkeys.end(), keys.end() - 5, keys.end());
1430                 keys.resize(5);
1431                 keys.insert(keys.end(), lkeys.begin(), lkeys.end());
1432         }
1433
1434         CiteEngineType const engine_type = buf.params().citeEngineType();
1435         DocumentClass const & dc = buf.params().documentClass();
1436         docstring const & format = from_utf8(dc.getCiteFormat(engine_type, style, false, "cite"));
1437         docstring ret = format;
1438         vector<docstring>::const_iterator key = keys.begin();
1439         vector<docstring>::const_iterator ken = keys.end();
1440         vector<docstring> handled_keys;
1441         for (int i = 0; key != ken; ++key, ++i) {
1442                 handled_keys.push_back(*key);
1443                 int n = 0;
1444                 for (auto const & k : handled_keys) {
1445                         if (k == *key)
1446                                 ++n;
1447                 }
1448                 BiblioInfo::const_iterator it = find(*key);
1449                 BibTeXInfo empty_data;
1450                 empty_data.key(*key);
1451                 BibTeXInfo & data = empty_data;
1452                 vector<BibTeXInfo const *> xrefptrs;
1453                 if (it != end()) {
1454                         data = it->second;
1455                         for (docstring const & xref : getXRefs(data)) {
1456                                 BiblioInfo::const_iterator const xrefit = find(xref);
1457                                 if (xrefit != end())
1458                                         xrefptrs.push_back(&(xrefit->second));
1459                         }
1460                 }
1461                 data.numKey(n);
1462                 ret = data.getLabel(xrefptrs, buf, ret, ci, key + 1 != ken, i == 1);
1463         }
1464
1465         support::truncateWithEllipsis(ret, max_size, true);
1466
1467         return ret;
1468 }
1469
1470
1471 bool BiblioInfo::isBibtex(docstring const & key) const
1472 {
1473         docstring key1;
1474         split(key, key1, ',');
1475         BiblioInfo::const_iterator it = find(key1);
1476         if (it == end())
1477                 return false;
1478         return it->second.isBibTeX();
1479 }
1480
1481
1482 BiblioInfo::CiteStringMap const BiblioInfo::getCiteStrings(
1483         vector<docstring> const & keys, vector<CitationStyle> const & styles,
1484         Buffer const & buf, CiteItem const & ci) const
1485 {
1486         if (empty())
1487                 return vector<pair<docstring,docstring>>();
1488
1489         string style;
1490         CiteStringMap csm(styles.size());
1491         for (size_t i = 0; i != csm.size(); ++i) {
1492                 style = styles[i].name;
1493                 csm[i] = make_pair(from_ascii(style), getLabel(keys, buf, style, ci));
1494         }
1495
1496         return csm;
1497 }
1498
1499
1500 void BiblioInfo::mergeBiblioInfo(BiblioInfo const & info)
1501 {
1502         bimap_.insert(info.begin(), info.end());
1503         field_names_.insert(info.field_names_.begin(), info.field_names_.end());
1504         entry_types_.insert(info.entry_types_.begin(), info.entry_types_.end());
1505 }
1506
1507
1508 namespace {
1509
1510 // used in xhtml to sort a list of BibTeXInfo objects
1511 bool lSorter(BibTeXInfo const * lhs, BibTeXInfo const * rhs)
1512 {
1513         docstring const lauth = lhs->getAuthorOrEditorList();
1514         docstring const rauth = rhs->getAuthorOrEditorList();
1515         docstring const lyear = lhs->getYear();
1516         docstring const ryear = rhs->getYear();
1517         docstring const ltitl = lhs->operator[]("title");
1518         docstring const rtitl = rhs->operator[]("title");
1519         return  (lauth < rauth)
1520                 || (lauth == rauth && lyear < ryear)
1521                 || (lauth == rauth && lyear == ryear && ltitl < rtitl);
1522 }
1523
1524 } // namespace
1525
1526
1527 void BiblioInfo::collectCitedEntries(Buffer const & buf)
1528 {
1529         cited_entries_.clear();
1530         // We are going to collect all the citation keys used in the document,
1531         // getting them from the TOC.
1532         // FIXME We may want to collect these differently, in the first case,
1533         // so that we might have them in order of appearance.
1534         set<docstring> citekeys;
1535         Toc const & toc = *buf.tocBackend().toc("citation");
1536         for (auto const & t : toc) {
1537                 if (t.str().empty())
1538                         continue;
1539                 vector<docstring> const keys = getVectorFromString(t.str());
1540                 citekeys.insert(keys.begin(), keys.end());
1541         }
1542         if (citekeys.empty())
1543                 return;
1544
1545         // We have a set of the keys used in this document.
1546         // We will now convert it to a list of the BibTeXInfo objects used in
1547         // this document...
1548         vector<BibTeXInfo const *> bi;
1549         for (auto const & ck : citekeys) {
1550                 BiblioInfo::const_iterator const bt = find(ck);
1551                 if (bt == end() || !bt->second.isBibTeX())
1552                         continue;
1553                 bi.push_back(&(bt->second));
1554         }
1555         // ...and sort it.
1556         sort(bi.begin(), bi.end(), lSorter);
1557
1558         // Now we can write the sorted keys
1559         // b is a BibTeXInfo const *
1560         for (auto const & b : bi)
1561                 cited_entries_.push_back(b->key());
1562 }
1563
1564
1565 void BiblioInfo::makeCitationLabels(Buffer const & buf)
1566 {
1567         collectCitedEntries(buf);
1568         CiteEngineType const engine_type = buf.params().citeEngineType();
1569         bool const numbers = (engine_type & ENGINE_TYPE_NUMERICAL);
1570
1571         int keynumber = 0;
1572         char modifier = 0;
1573         // used to remember the last one we saw
1574         // we'll be comparing entries to see if we need to add
1575         // modifiers, like "1984a"
1576         map<docstring, BibTeXInfo>::iterator last = bimap_.end();
1577
1578         // add letters to years
1579         for (auto const & ce : cited_entries_) {
1580                 map<docstring, BibTeXInfo>::iterator const biit = bimap_.find(ce);
1581                 // this shouldn't happen, but...
1582                 if (biit == bimap_.end())
1583                         // ...fail gracefully, anyway.
1584                         continue;
1585                 BibTeXInfo & entry = biit->second;
1586                 if (numbers) {
1587                         docstring const num = convert<docstring>(++keynumber);
1588                         entry.setCiteNumber(num);
1589                 } else {
1590                         // The first test here is checking whether this is the first
1591                         // time through the loop. If so, then we do not have anything
1592                         // with which to compare.
1593                         if (last != bimap_.end()
1594                             && entry.getAuthorOrEditorList() == last->second.getAuthorOrEditorList()
1595                             // we access the year via getYear() so as to get it from the xref,
1596                             // if we need to do so
1597                             && getYear(entry.key()) == getYear(last->second.key())) {
1598                                 if (modifier == 0) {
1599                                         // so the last one should have been 'a'
1600                                         last->second.setModifier('a');
1601                                         modifier = 'b';
1602                                 } else if (modifier == 'z')
1603                                         modifier = 'A';
1604                                 else
1605                                         modifier++;
1606                         } else {
1607                                 modifier = 0;
1608                         }
1609                         entry.setModifier(modifier);
1610                         // remember the last one
1611                         last = biit;
1612                 }
1613         }
1614         // Set the labels
1615         for (auto const & ce : cited_entries_) {
1616                 map<docstring, BibTeXInfo>::iterator const biit = bimap_.find(ce);
1617                 // this shouldn't happen, but...
1618                 if (biit == bimap_.end())
1619                         // ...fail gracefully, anyway.
1620                         continue;
1621                 BibTeXInfo & entry = biit->second;
1622                 if (numbers) {
1623                         entry.label(entry.citeNumber());
1624                 } else {
1625                         docstring const auth = entry.getAuthorOrEditorList(&buf, false);
1626                         // we do it this way so as to access the xref, if necessary
1627                         // note that this also gives us the modifier
1628                         docstring const year = getYear(ce, buf, true);
1629                         if (!auth.empty() && !year.empty())
1630                                 entry.label(auth + ' ' + year);
1631                         else
1632                                 entry.label(entry.key());
1633                 }
1634         }
1635 }
1636
1637
1638 //////////////////////////////////////////////////////////////////////
1639 //
1640 // CitationStyle
1641 //
1642 //////////////////////////////////////////////////////////////////////
1643
1644
1645 CitationStyle citationStyleFromString(string const & command,
1646                                       BufferParams const & params)
1647 {
1648         CitationStyle cs;
1649         if (command.empty())
1650                 return cs;
1651
1652         string const alias = params.getCiteAlias(command);
1653         string cmd = alias.empty() ? command : alias;
1654         if (isUpperCase(command[0])) {
1655                 cs.forceUpperCase = true;
1656                 cmd[0] = lowercase(cmd[0]);
1657         }
1658
1659         size_t const n = command.size() - 1;
1660         if (command[n] == '*') {
1661                 cs.hasStarredVersion = true;
1662                 if (suffixIs(cmd, '*'))
1663                         cmd = cmd.substr(0, cmd.size() - 1);
1664         }
1665
1666         cs.name = cmd;
1667         return cs;
1668 }
1669
1670
1671 string citationStyleToString(const CitationStyle & cs, bool const latex)
1672 {
1673         string cmd = latex ? cs.cmd : cs.name;
1674         if (cs.forceUpperCase)
1675                 cmd[0] = uppercase(cmd[0]);
1676         if (cs.hasStarredVersion)
1677                 cmd += '*';
1678         return cmd;
1679 }
1680
1681
1682 docstring authorsToDocBookAuthorGroup(docstring const & authorsString, XMLStream & xs, Buffer const & buf)
1683 {
1684         // This function closely mimics getAuthorList, but produces DocBook instead of text.
1685         // It has been greatly simplified, as the complete list of authors is always produced. No separators are required,
1686         // as the output has a database-like shape.
1687         // constructName has also been merged within, as it becomes really simple and leads to no copy-paste.
1688
1689         if (authorsString.empty()) {
1690                 return docstring();
1691         }
1692
1693         // Split the input list of authors into individual authors.
1694         vector<docstring> const authors = getAuthors(authorsString);
1695
1696         // Retrieve the "et al." variation.
1697         string const etal = buf.params().documentClass().getCiteMacro(buf.params().citeEngineType(), "_etal");
1698
1699         // Output the list of authors.
1700         xs << xml::StartTag("authorgroup");
1701         xs << xml::CR();
1702
1703         auto it = authors.cbegin();
1704         auto en = authors.cend();
1705         for (size_t i = 0; it != en; ++it, ++i) {
1706                 xs << xml::StartTag("author");
1707                 xs << xml::CR();
1708                 xs << xml::StartTag("personname");
1709                 xs << xml::CR();
1710                 docstring name = *it;
1711
1712                 // All authors go in a <personname>. If more structure is known, use it; otherwise (just "et al."), print it as such.
1713                 if (name == "others") {
1714                         xs << buf.B_(etal);
1715                 } else {
1716                         name_parts parts = nameParts(name);
1717                         if (! parts.prefix.empty()) {
1718                                 xs << xml::StartTag("honorific");
1719                                 xs << parts.prefix;
1720                                 xs << xml::EndTag("honorific");
1721                                 xs << xml::CR();
1722                         }
1723                         if (! parts.prename.empty()) {
1724                                 xs << xml::StartTag("firstname");
1725                                 xs << parts.prename;
1726                                 xs << xml::EndTag("firstname");
1727                                 xs << xml::CR();
1728                         }
1729                         if (! parts.surname.empty()) {
1730                                 xs << xml::StartTag("surname");
1731                                 xs << parts.surname;
1732                                 xs << xml::EndTag("surname");
1733                                 xs << xml::CR();
1734                         }
1735                         if (! parts.suffix.empty()) {
1736                                 xs << xml::StartTag("othername", "role=\"suffix\"");
1737                                 xs << parts.suffix;
1738                                 xs << xml::EndTag("othername");
1739                                 xs << xml::CR();
1740                         }
1741                 }
1742
1743                 xs << xml::EndTag("personname");
1744                 xs << xml::CR();
1745                 xs << xml::EndTag("author");
1746                 xs << xml::CR();
1747
1748                 // Could add an affiliation after <personname>, but not stored in BibTeX.
1749         }
1750         xs << xml::EndTag("authorgroup");
1751         xs << xml::CR();
1752
1753         return docstring();
1754 }
1755
1756 } // namespace lyx