src/BiblioInfo.cpp

   1 /**
   2  * \file BiblioInfo.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Angus Leeming
   7  * \author Herbert Voß
   8  * \author Richard Heck
   9  * \author Julien Rioux
  10  * \author Jürgen Spitzmüller
  11  *
  12  * Full author contact details are available in file CREDITS.
  13  */
  14
  15 #include <config.h>
  16
  17 #include "BiblioInfo.h"
  18 #include "Buffer.h"
  19 #include "BufferParams.h"
  20 #include "buffer_funcs.h"
  21 #include "Citation.h"
  22 #include "Encoding.h"
  23 #include "InsetIterator.h"
  24 #include "Language.h"
  25 #include "xml.h"
  26 #include "Paragraph.h"
  27 #include "TextClass.h"
  28 #include "TocBackend.h"
  29
  30 #include "support/convert.h"
  31 #include "support/debug.h"
  32 #include "support/docstream.h"
  33 #include "support/gettext.h"
  34 #include "support/lassert.h"
  35 #include "support/lstrings.h"
  36 #include "support/regex.h"
  37 #include "support/textutils.h"
  38
  39 #include <map>
  40 #include <set>
  41
  42 using namespace std;
  43 using namespace lyx::support;
  44
  45
  46 namespace lyx {
  47
  48 namespace {
  49
  50 // Remove placeholders from names
  51 docstring renormalize(docstring const & input)
  52 {
  53         docstring res = subst(input, from_ascii("$$space!"), from_ascii(" "));
  54         return subst(res, from_ascii("$$comma!"), from_ascii(","));
  55 }
  56
  57
  58 // Split the surname into prefix ("von-part") and family name
  59 pair<docstring, docstring> parseSurname(docstring const & sname)
  60 {
  61         // Split the surname into its tokens
  62         vector<docstring> pieces = getVectorFromString(sname, from_ascii(" "));
  63         if (pieces.size() < 2)
  64                 return make_pair(docstring(), sname);
  65
  66         // Now we look for pieces that begin with a lower case letter.
  67         // All except for the very last token constitute the "von-part".
  68         docstring prefix;
  69         vector<docstring>::const_iterator it = pieces.begin();
  70         vector<docstring>::const_iterator const en = pieces.end();
  71         bool first = true;
  72         for (; it != en; ++it) {
  73                 if ((*it).empty())
  74                         continue;
  75                 // If this is the last piece, then what we now have is
  76                 // the family name, notwithstanding the casing.
  77                 if (it + 1 == en)
  78                         break;
  79                 char_type const c = (*it)[0];
  80                 // If the piece starts with a upper case char, we assume
  81                 // this is part of the surname.
  82                 if (!isLower(c))
  83                         break;
  84                 // Nothing of the former, so add this piece to the prename
  85                 if (!first)
  86                         prefix += " ";
  87                 else
  88                         first = false;
  89                 prefix += *it;
  90         }
  91
  92         // Reconstruct the family name.
  93         // Note that if we left the loop with because it + 1 == en,
  94         // then this will still do the right thing, i.e., make surname
  95         // just be the last piece.
  96         docstring surname;
  97         first = true;
  98         for (; it != en; ++it) {
  99                 if (!first)
 100                         surname += " ";
 101                 else
 102                         first = false;
 103                 surname += *it;
 104         }
 105         return make_pair(prefix, surname);
 106 }
 107
 108
// The components of a single personal name, as filled in by nameParts().
// Members that do not occur in the parsed name are left empty.
struct name_parts {
        // family name, without the prefix
        docstring surname;
        // given name(s)
        docstring prename;
        // name suffix (aka "jr" part)
        docstring suffix;
        // name prefix (aka "von" part)
        docstring prefix;
};
 115
 116
 117 // gets the name parts (prename, surname, prefix, suffix) from an author-type string
 118 name_parts nameParts(docstring const & iname)
 119 {
 120         name_parts res;
 121         if (iname.empty())
 122                 return res;
 123
 124         // First we check for goupings (via {...}) and replace blanks and
 125         // commas inside groups with temporary placeholders
 126         docstring name;
 127         int gl = 0;
 128         docstring::const_iterator p = iname.begin();
 129         while (p != iname.end()) {
 130                 // count grouping level
 131                 if (*p == '{')
 132                         ++gl;
 133                 else if (*p == '}')
 134                         --gl;
 135                 // generate string with probable placeholders
 136                 if (*p == ' ' && gl > 0)
 137                         name += from_ascii("$$space!");
 138                 else if (*p == ',' && gl > 0)
 139                         name += from_ascii("$$comma!");
 140                 else
 141                         name += *p;
 142                 ++p;
 143         }
 144
 145         // Now we look for a comma, and take the last name to be everything
 146         // preceding the right-most one, so that we also get the name suffix
 147         // (aka "jr" part).
 148         vector<docstring> pieces = getVectorFromString(name);
 149         if (pieces.size() > 1) {
 150                 // Whether we have a name suffix or not, the prename is
 151                 // always last item
 152                 res.prename = renormalize(pieces.back());
 153                 // The family name, conversely, is always the first item.
 154                 // However, it might contain a prefix (aka "von" part)
 155                 docstring const sname = pieces.front();
 156                 res.prefix = renormalize(parseSurname(sname).first);
 157                 res.surname = renormalize(parseSurname(sname).second);
 158                 // If we have three pieces (the maximum allowed by BibTeX),
 159                 // the second one is the name suffix.
 160                 if (pieces.size() > 2)
 161                         res.suffix = renormalize(pieces.at(1));
 162                 return res;
 163         }
 164
 165         // OK, so now we want to look for the last name.
 166         // Split on spaces, to get various tokens.
 167         pieces = getVectorFromString(name, from_ascii(" "));
 168         // No space: Only a family name given
 169         if (pieces.size() < 2) {
 170                 res.surname = renormalize(pieces.back());
 171                 return res;
 172         }
 173         // If we get two pieces, assume "prename surname"
 174         if (pieces.size() == 2) {
 175                 res.prename = renormalize(pieces.front());
 176                 res.surname = renormalize(pieces.back());
 177                 return res;
 178         }
 179
 180         // More than 3 pieces: A name prefix (aka "von" part) might be included.
 181         // We look for the first piece that begins with a lower case letter
 182         // (which is the name prefix, if it is not the last token) or the last token.
 183         docstring prename;
 184         vector<docstring>::const_iterator it = pieces.begin();
 185         vector<docstring>::const_iterator const en = pieces.end();
 186         bool first = true;
 187         for (; it != en; ++it) {
 188                 if ((*it).empty())
 189                         continue;
 190                 char_type const c = (*it)[0];
 191                 // If the piece starts with a lower case char, we assume
 192                 // this is the name prefix and thus prename is complete.
 193                 if (isLower(c))
 194                         break;
 195                 // Same if this is the last piece, which is always the surname.
 196                 if (it + 1 == en)
 197                         break;
 198                 // Nothing of the former, so add this piece to the prename
 199                 if (!first)
 200                         prename += " ";
 201                 else
 202                         first = false;
 203                 prename += *it;
 204         }
 205
 206         // Now reconstruct the family name and strip the prefix.
 207         // Note that if we left the loop because it + 1 == en,
 208         // then this will still do the right thing, i.e., make surname
 209         // just be the last piece.
 210         docstring surname;
 211         first = true;
 212         for (; it != en; ++it) {
 213                 if (!first)
 214                         surname += " ";
 215                 else
 216                         first = false;
 217                 surname += *it;
 218         }
 219         res.prename = renormalize(prename);
 220         res.prefix = renormalize(parseSurname(surname).first);
 221         res.surname = renormalize(parseSurname(surname).second);
 222         return res;
 223 }
 224
 225
 226 docstring constructName(docstring const & name, string const & scheme)
 227 {
 228         // re-constructs a name from name parts according
 229         // to a given scheme
 230         docstring const prename = nameParts(name).prename;
 231         docstring const surname = nameParts(name).surname;
 232         docstring const prefix = nameParts(name).prefix;
 233         docstring const suffix = nameParts(name).suffix;
 234         string res = scheme;
 235         static regex const reg1("(.*)(\\{%prename%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 236         static regex const reg2("(.*)(\\{%suffix%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 237         static regex const reg3("(.*)(\\{%prefix%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 238         smatch sub;
 239         // Changing the first parameter of regex_match() may corrupt the
 240         // second one. In this case we use the temporary string tmp.
 241         if (regex_match(scheme, sub, reg1)) {
 242                 res = sub.str(1);
 243                 if (!prename.empty())
 244                         res += sub.str(3);
 245                 res += sub.str(5);
 246         }
 247         if (regex_match(res, sub, reg2)) {
 248                 string tmp = sub.str(1);
 249                 if (!suffix.empty())
 250                         tmp += sub.str(3);
 251                 res = tmp + sub.str(5);
 252         }
 253         if (regex_match(res, sub, reg3)) {
 254                 string tmp = sub.str(1);
 255                 if (!prefix.empty())
 256                         tmp += sub.str(3);
 257                 res = tmp + sub.str(5);
 258         }
 259         docstring result = from_ascii(res);
 260         result = subst(result, from_ascii("%prename%"), prename);
 261         result = subst(result, from_ascii("%surname%"), surname);
 262         result = subst(result, from_ascii("%prefix%"), prefix);
 263         result = subst(result, from_ascii("%suffix%"), suffix);
 264         return result;
 265 }
 266
 267
 268 vector<docstring> const getAuthors(docstring const & author)
 269 {
 270         // We check for goupings (via {...}) and only consider " and "
 271         // outside groups as author separator. This is to account
 272         // for cases such as {{Barnes and Noble, Inc.}}, which
 273         // need to be treated as one single family name.
 274         // We use temporary placeholders in order to differentiate the
 275         // diverse " and " cases.
 276
 277         // First, we temporarily replace all ampersands. It is rather unusual
 278         // in author names, but can happen (consider cases such as "C \& A Corp.").
 279         docstring iname = subst(author, from_ascii("&"), from_ascii("$$amp!"));
 280         // Then, we temporarily make all " and " strings to ampersands in order
 281         // to handle them later on a per-char level.
 282         iname = subst(iname, from_ascii(" and "), from_ascii(" & "));
 283         // Now we traverse through the string and replace the "&" by the proper
 284         // output in- and outside groups
 285         docstring name;
 286         int gl = 0;
 287         docstring::const_iterator p = iname.begin();
 288         while (p != iname.end()) {
 289                 // count grouping level
 290                 if (*p == '{')
 291                         ++gl;
 292                 else if (*p == '}')
 293                         --gl;
 294                 // generate string with probable placeholders
 295                 if (*p == '&') {
 296                         if (gl > 0)
 297                                 // Inside groups, we output "and"
 298                                 name += from_ascii("and");
 299                         else
 300                                 // Outside groups, we output a separator
 301                                 name += from_ascii("$$namesep!");
 302                 }
 303                 else
 304                         name += *p;
 305                 ++p;
 306         }
 307
 308         // re-insert the literal ampersands
 309         name = subst(name, from_ascii("$$amp!"), from_ascii("&"));
 310
 311         // Now construct the actual vector
 312         return getVectorFromString(name, from_ascii(" $$namesep! "));
 313 }
 314
 315
 316 bool multipleAuthors(docstring const & author)
 317 {
 318         return getAuthors(author).size() > 1;
 319 }
 320
 321
// Converts a string containing LaTeX commands into unicode
// for display.
// The input is consumed from the front, one step per loop iteration:
//  - text between unescaped '$' is copied verbatim (delimiters included),
//  - braces are dropped,
//  - commands that Encodings::fromLaTeXCommand() can convert are
//    replaced by the conversion result,
//  - for any other command, the backslash and the letters of the
//    command name are dropped; a non-letter directly following the
//    backslash is output as is ("\," yields a THIN SPACE).
// NOTE(review): every step copies the remainder (val.substr()) and the
// regex checks convert the whole remainder to UTF-8, so the cost grows
// quadratically with the length of the input.
docstring convertLaTeXCommands(docstring const & str)
{
        // what is left to be processed
        docstring val = str;
        // the converted output
        docstring ret;

        // we are inside the name of a command that could not be converted
        bool scanning_cmd = false;
        // we are between two unescaped '$'
        bool scanning_math = false;
        bool escaped = false; // used to catch \$, etc.
        while (!val.empty()) {
                char_type const ch = val[0];

                // if we're scanning math, we output everything until we
                // find an unescaped $, at which point we leave math mode
                // (the closing $ itself is still output).
                if (scanning_math) {
                        if (escaped)
                                escaped = false;
                        else if (ch == '\\')
                                escaped = true;
                        else if (ch == '$')
                                scanning_math = false;
                        ret += ch;
                        val = val.substr(1);
                        continue;
                }

                // if we're scanning a command name, then we just
                // discard characters until we hit something that
                // isn't alpha.
                if (scanning_cmd) {
                        if (isAlphaASCII(ch)) {
                                val = val.substr(1);
                                // at least one letter followed the backslash,
                                // so this is not an escaped character
                                escaped = false;
                                continue;
                        }
                        // so we're done with this command.
                        // now we fall through and check this character.
                        scanning_cmd = false;
                }

                // was the last character a \? If so, then this is something like:
                // \\ or \$, so we'll just output it. That's probably not always right...
                if (escaped) {
                        // exception: output \, as THIN SPACE
                        if (ch == ',')
                                ret.push_back(0x2009);
                        else
                                ret += ch;
                        val = val.substr(1);
                        escaped = false;
                        continue;
                }

                // an unescaped $ starts math mode
                if (ch == '$') {
                        ret += ch;
                        val = val.substr(1);
                        scanning_math = true;
                        continue;
                }

                // Change text mode accents in the form
                // {\v a} to \v{a} (see #9340).
                // FIXME: This is a sort of mini-tex2lyx.
                //        Use the real tex2lyx instead!
                static lyx::regex const tma_reg("^\\{\\\\[bcCdfGhHkrtuUv]\\s\\w\\}");
                if (lyx::regex_search(to_utf8(val), tma_reg)) {
                        // drop the opening brace ...
                        val = val.substr(1);
                        // ... and put a brace in place of the whitespace
                        // after the accent command
                        val.replace(2, 1, from_ascii("{"));
                        continue;
                }

                // Apart from the above, we just ignore braces
                if (ch == '{' || ch == '}') {
                        val = val.substr(1);
                        continue;
                }

                // we're going to check things that look like commands, so if
                // this doesn't, just output it.
                if (ch != '\\') {
                        ret += ch;
                        val = val.substr(1);
                        continue;
                }

                // ok, could be a command of some sort
                // let's see if it corresponds to some unicode
                // unicodesymbols has things in the form: \"{u},
                // whereas we may see things like: \"u. So we'll
                // look for that and change it, if necessary.
                // FIXME: This is a sort of mini-tex2lyx.
                //        Use the real tex2lyx instead!
                static lyx::regex const reg("^\\\\\\W\\w");
                if (lyx::regex_search(to_utf8(val), reg)) {
                        // insert the closing brace first, so that the
                        // position of the opening one is not shifted
                        val.insert(3, from_ascii("}"));
                        val.insert(2, from_ascii("{"));
                }
                // out parameters of fromLaTeXCommand(); only rem is used here
                bool termination;
                docstring rem;
                docstring const cnvtd = Encodings::fromLaTeXCommand(val,
                                Encodings::TEXT_CMD, termination, rem);
                if (!cnvtd.empty()) {
                        // it did, so we'll take that bit and proceed with what's left
                        ret += cnvtd;
                        val = rem;
                        continue;
                }
                // it's a command of some sort that could not be converted:
                // drop the backslash and let the next iterations deal
                // with what follows it
                scanning_cmd = true;
                escaped = true;
                val = val.substr(1);
        }
        return ret;
}
 437
 438
 439 // Escape '<' and '>' and remove richtext markers (e.g. {!this is richtext!}) from a string.
 440 docstring processRichtext(docstring const & str, bool richtext)
 441 {
 442         docstring val = str;
 443         docstring ret;
 444
 445         bool scanning_rich = false;
 446         while (!val.empty()) {
 447                 char_type const ch = val[0];
 448                 if (ch == '{' && val.size() > 1 && val[1] == '!') {
 449                         // beginning of rich text
 450                         scanning_rich = true;
 451                         val = val.substr(2);
 452                         continue;
 453                 }
 454                 if (scanning_rich && ch == '!' && val.size() > 1 && val[1] == '}') {
 455                         // end of rich text
 456                         scanning_rich = false;
 457                         val = val.substr(2);
 458                         continue;
 459                 }
 460                 if (richtext) {
 461                         if (scanning_rich)
 462                                 ret += ch;
 463                         else {
 464                                 // we need to escape '<' and '>'
 465                                 if (ch == '<')
 466                                         ret += "&lt;";
 467                                 else if (ch == '>')
 468                                         ret += "&gt;";
 469                                 else
 470                                         ret += ch;
 471                         }
 472                 } else if (!scanning_rich /* && !richtext */)
 473                         ret += ch;
 474                 // else the character is discarded, which will happen only if
 475                 // richtext == false and we are scanning rich text
 476                 val = val.substr(1);
 477         }
 478         return ret;
 479 }
 480
 481 } // namespace
 482
 483
 484 //////////////////////////////////////////////////////////////////////
 485 //
 486 // BibTeXInfo
 487 //
 488 //////////////////////////////////////////////////////////////////////
 489
// Constructs an entry with the given key and entry type. The entry is
// flagged as a BibTeX entry (is_bibtex_ == true); the numerical key
// and the modifier start out as 0, info_ and format_ are
// value-initialized.
BibTeXInfo::BibTeXInfo(docstring const & key, docstring const & type)
        : is_bibtex_(true), bib_key_(key), num_bib_key_(0), entry_type_(type),
          info_(), format_(), modifier_(0)
{}
 494
 495
 496
 497 docstring const BibTeXInfo::getAuthorOrEditorList(Buffer const * buf,
 498                                           bool full, bool forceshort) const
 499 {
 500         docstring author = operator[]("author");
 501         if (author.empty())
 502                 author = operator[]("editor");
 503
 504         return getAuthorList(buf, author, full, forceshort);
 505 }
 506
 507
 508 docstring const BibTeXInfo::getAuthorList(Buffer const * buf,
 509                 docstring const & author, bool const full, bool const forceshort,
 510                 bool const allnames, bool const beginning) const
 511 {
 512         // Maxnames treshold depend on engine
 513         size_t maxnames = buf ?
 514                 buf->params().documentClass().max_citenames() : 2;
 515
 516         if (!is_bibtex_) {
 517                 docstring const opt = label();
 518                 if (opt.empty())
 519                         return docstring();
 520
 521                 docstring authors;
 522                 docstring const remainder = trim(split(opt, authors, '('));
 523                 if (remainder.empty())
 524                         // in this case, we didn't find a "(",
 525                         // so we don't have author (year)
 526                         return docstring();
 527                 if (full) {
 528                         // Natbib syntax is "Jones et al.(1990)Jones, Baker, and Williams"
 529                         docstring const fullauthors = trim(rsplit(remainder, ')'));
 530                         if (!fullauthors.empty())
 531                                 return fullauthors;
 532                 }
 533                 return authors;
 534         }
 535
 536         if (author.empty())
 537                 return author;
 538
 539         // OK, we've got some names. Let's format them.
 540         // Try to split the author list
 541         vector<docstring> const authors = getAuthors(author);
 542
 543         docstring retval;
 544
 545         CiteEngineType const engine_type = buf ? buf->params().citeEngineType()
 546                                                : ENGINE_TYPE_DEFAULT;
 547
 548         // These are defined in the styles
 549         string const etal =
 550                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_etal")
 551                     : " et al.";
 552         string const namesep =
 553                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_namesep")
 554                    : ", ";
 555         string const lastnamesep =
 556                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_lastnamesep")
 557                     : ", and ";
 558         string const pairnamesep =
 559                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_pairnamesep")
 560                      : " and ";
 561         string firstnameform =
 562                         buf ? buf->params().documentClass().getCiteMacro(engine_type, "!firstnameform")
 563                              : "{%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}{%prename%[[, %prename%]]}";
 564         if (!beginning)
 565                 firstnameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!firstbynameform")
 566                                              : "%prename% {%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}";
 567         string othernameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!othernameform")
 568                              : "{%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}{%prename%[[, %prename%]]}";
 569         if (!beginning)
 570                 othernameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!otherbynameform")
 571                                              : "%prename% {%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}";
 572         string citenameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!citenameform")
 573                              : "{%prefix%[[%prefix% ]]}%surname%";
 574
 575         // Shorten the list (with et al.) if forceshort is set
 576         // and the list can actually be shortened, else if maxcitenames
 577         // is passed and full is not set.
 578         bool shorten = forceshort && authors.size() > 1;
 579         vector<docstring>::const_iterator it = authors.begin();
 580         vector<docstring>::const_iterator en = authors.end();
 581         for (size_t i = 0; it != en; ++it, ++i) {
 582                 if (i >= maxnames && !full) {
 583                         shorten = true;
 584                         break;
 585                 }
 586                 if (*it == "others") {
 587                         retval += buf ? buf->B_(etal) : from_ascii(etal);
 588                         break;
 589                 }
 590                 if (i > 0 && i == authors.size() - 1) {
 591                         if (authors.size() == 2)
 592                                 retval += buf ? buf->B_(pairnamesep) : from_ascii(pairnamesep);
 593                         else
 594                                 retval += buf ? buf->B_(lastnamesep) : from_ascii(lastnamesep);
 595                 } else if (i > 0)
 596                         retval += buf ? buf->B_(namesep) : from_ascii(namesep);
 597                 if (allnames)
 598                         retval += (i == 0) ? constructName(*it, firstnameform)
 599                                 : constructName(*it, othernameform);
 600                 else
 601                         retval += constructName(*it, citenameform);
 602         }
 603         if (shorten) {
 604                 if (allnames)
 605                         retval = constructName(authors[0], firstnameform) + (buf ? buf->B_(etal) : from_ascii(etal));
 606                 else
 607                         retval = constructName(authors[0], citenameform) + (buf ? buf->B_(etal) : from_ascii(etal));
 608         }
 609
 610         return convertLaTeXCommands(retval);
 611 }
 612
 613
 614 docstring const BibTeXInfo::getYear() const
 615 {
 616         if (is_bibtex_) {
 617                 // first try legacy year field
 618                 docstring year = operator[]("year");
 619                 if (!year.empty())
 620                         return year;
 621                 // now try biblatex's date field
 622                 year = operator[]("date");
 623                 // Format is [-]YYYY-MM-DD*/[-]YYYY-MM-DD*
 624                 // We only want the years.
 625                 static regex const yreg("[-]?([\\d]{4}).*");
 626                 static regex const ereg(".*/[-]?([\\d]{4}).*");
 627                 smatch sm;
 628                 string const date = to_utf8(year);
 629                 if (!regex_match(date, sm, yreg))
 630                         // cannot parse year.
 631                         return docstring();
 632                 year = from_ascii(sm[1]);
 633                 // check for an endyear
 634                 if (regex_match(date, sm, ereg))
 635                         year += char_type(0x2013) + from_ascii(sm[1]);
 636                 return year;
 637         }
 638
 639         docstring const opt = label();
 640         if (opt.empty())
 641                 return docstring();
 642
 643         docstring authors;
 644         docstring tmp = split(opt, authors, '(');
 645         if (tmp.empty())
 646                 // we don't have author (year)
 647                 return docstring();
 648         docstring year;
 649         tmp = split(tmp, year, ')');
 650         return year;
 651 }
 652
 653
 654 void BibTeXInfo::getLocators(docstring & doi, docstring & url, docstring & file) const
 655 {
 656         if (is_bibtex_) {
 657                 // get "doi" entry from citation record
 658                 doi = operator[]("doi");
 659                 if (!doi.empty() && !prefixIs(doi,from_ascii("http")))
 660                         doi = "https://doi.org/" + doi;
 661                 // get "url" entry from citation record
 662                 url = operator[]("url");
 663                 // get "file" entry from citation record
 664                 file = operator[]("file");
 665
 666                 // Jabref case, field has a format:
 667                 // Description:Location:Filetype;Description:Location:Filetype...
 668                 // We will grab only first pdf
 669                 if (!file.empty()) {
 670                         docstring ret, filedest, tmp;
 671                         ret = split(file, tmp, ':');
 672                         tmp = split(ret, filedest, ':');
 673                         //TODO howto deal with relative directories?
 674                         FileName f(to_utf8(filedest));
 675                         if (f.exists())
 676                                 file = "file:///" + filedest;
 677                 }
 678
 679                 // kbibtex case, format:
 680                 // file1.pdf;file2.pdf
 681                 // We will grab only first pdf
 682                 docstring kfile;
 683                 if (file.empty())
 684                         kfile = operator[]("localfile");
 685                 if (!kfile.empty()) {
 686                         docstring filedest, tmp;
 687                         tmp = split(kfile, filedest, ';');
 688                         //TODO howto deal with relative directories?
 689                         FileName f(to_utf8(filedest));
 690                         if (f.exists())
 691                                 file = "file:///" + filedest;
 692                 }
 693
 694                 if (!url.empty())
 695                         return;
 696
 697                 // try biblatex specific fields, see its manual
 698                 // 3.13.7 "Electronic Publishing Informationl"
 699                 docstring eprinttype = operator[]("eprinttype");
 700                 docstring eprint = operator[]("eprint");
 701                 if (eprint.empty())
 702                         return;
 703
 704                 if (eprinttype == "arxiv")
 705                         url = "https://arxiv.org/abs/" + eprint;
 706                 if (eprinttype == "jstor")
 707                         url = "https://www.jstor.org/stable/" + eprint;
 708                 if (eprinttype == "pubmed")
 709                         url = "http://www.ncbi.nlm.nih.gov/pubmed/" + eprint;
 710                 if (eprinttype == "hdl")
 711                         url = "https://hdl.handle.net/" + eprint;
 712                 if (eprinttype == "googlebooks")
 713                         url = "http://books.google.com/books?id=" + eprint;
 714
 715                 return;
 716         }
 717
 718         // Here can be handled the bibliography environment. All one could do
 719         // here is let LyX scan the entry for URL or HRef insets.
 720 }
 721
 722
 723 namespace {
 724
// Forward declaration. parseOptions is defined further down in this
// namespace, but parseEmbeddedOption already needs to call it (and
// parseOptions in turn reaches parseEmbeddedOption through getClause).
docstring parseOptions(docstring const & format, string & optkey,
                    docstring & ifpart, docstring & elsepart);
 727
 728 // Calls parseOptions to deal with an embedded option, such as:
 729 //   {%number%[[, no.~%number%]]}
 730 // which must appear at the start of format. ifelsepart gets the
 731 // whole of the option, and we return what's left after the option.
 732 // we return format if there is an error.
 733 docstring parseEmbeddedOption(docstring const & format, docstring & ifelsepart)
 734 {
 735         LASSERT(format[0] == '{' && format[1] == '%', return format);
 736         string optkey;
 737         docstring ifpart;
 738         docstring elsepart;
 739         docstring const rest = parseOptions(format, optkey, ifpart, elsepart);
 740         if (format == rest) { // parse error
 741                 LYXERR0("ERROR! Couldn't parse `" << format <<"'.");
 742                 return format;
 743         }
 744         LASSERT(rest.size() <= format.size(),
 745                 { ifelsepart = docstring(); return format; });
 746         ifelsepart = format.substr(0, format.size() - rest.size());
 747         return rest;
 748 }
 749
 750
 751 // Gets a "clause" from a format string, where the clause is
 752 // delimited by '[[' and ']]'. Returns what is left after the
 753 // clause is removed, and returns format if there is an error.
 754 docstring getClause(docstring const & format, docstring & clause)
 755 {
 756         docstring fmt = format;
 757         // remove '[['
 758         fmt = fmt.substr(2);
 759         // we'll remove characters from the front of fmt as we
 760         // deal with them
 761         while (!fmt.empty()) {
 762                 if (fmt[0] == ']' && fmt.size() > 1 && fmt[1] == ']') {
 763                         // that's the end
 764                         fmt = fmt.substr(2);
 765                         break;
 766                 }
 767                 // check for an embedded option
 768                 if (fmt[0] == '{' && fmt.size() > 1 && fmt[1] == '%') {
 769                         docstring part;
 770                         docstring const rest = parseEmbeddedOption(fmt, part);
 771                         if (fmt == rest) {
 772                                 LYXERR0("ERROR! Couldn't parse embedded option in `" << format <<"'.");
 773                                 return format;
 774                         }
 775                         clause += part;
 776                         fmt = rest;
 777                 } else { // it's just a normal character
 778                                 clause += fmt[0];
 779                                 fmt = fmt.substr(1);
 780                 }
 781         }
 782         return fmt;
 783 }
 784
 785
 786 // parse an options string, which must appear at the start of the
 787 // format parameter. puts the parsed bits in optkey, ifpart, and
 788 // elsepart and returns what's left after the option is removed.
 789 // if there's an error, it returns format itself.
 790 docstring parseOptions(docstring const & format, string & optkey,
 791                     docstring & ifpart, docstring & elsepart)
 792 {
 793         LASSERT(format[0] == '{' && format[1] == '%', return format);
 794         // strip '{%'
 795         docstring fmt = format.substr(2);
 796         size_t pos = fmt.find('%'); // end of key
 797         if (pos == string::npos) {
 798                 LYXERR0("Error parsing  `" << format <<"'. Can't find end of key.");
 799                 return format;
 800         }
 801         optkey = to_utf8(fmt.substr(0, pos));
 802         fmt = fmt.substr(pos + 1);
 803         // [[format]] should be next
 804         if (fmt[0] != '[' || fmt[1] != '[') {
 805                 LYXERR0("Error parsing  `" << format <<"'. Can't find '[[' after key.");
 806                 return format;
 807         }
 808
 809         docstring curfmt = fmt;
 810         fmt = getClause(curfmt, ifpart);
 811         if (fmt == curfmt) {
 812                 LYXERR0("Error parsing  `" << format <<"'. Couldn't get if clause.");
 813                 return format;
 814         }
 815
 816         if (fmt[0] == '}') // we're done, no else clause
 817                 return fmt.substr(1);
 818
 819         // else part should follow
 820         if (fmt[0] != '[' || fmt[1] != '[') {
 821                 LYXERR0("Error parsing  `" << format <<"'. Can't find else clause.");
 822                 return format;
 823         }
 824
 825         curfmt = fmt;
 826         fmt = getClause(curfmt, elsepart);
 827         // we should be done
 828         if (fmt == curfmt || fmt[0] != '}') {
 829                 LYXERR0("Error parsing  `" << format <<"'. Can't find end of option.");
 830                 return format;
 831         }
 832         return fmt.substr(1);
 833 }
 834
 835
 836 } // namespace
 837
/* FIXME
Bug #9131 revealed an oddity in how we are generating citation information
when more than one key is given. We end up building a longer and longer format
string as we go, which we then have to re-parse, over and over and over again,
rather than generating the information for the individual keys and then putting
all of that together. We do that to deal with the way separators work, from what
I can tell, but it still feels like a hack. Fixing this would require quite a
bit of work, however.
*/
// Expands a citation format string for this entry and returns the result.
// The format is consumed from the front, character by character:
//  - %key% is replaced by a value. Keys starting with '!' are macros: their
//    definition (getCiteMacro) is spliced back into the format and scanned
//    again. Keys starting with "B_" or '_' are looked up via getCiteMacro
//    and translated (to the buffer language and the GUI language, resp.).
//    Any other key is resolved with getValueForKey() and, unless we are
//    inside of a rich text section, wrapped in
//    {!<span class="bib-KEY">!} ... {!</span>!}.
//  - {%key%[[if]][[else]]} is a conditional, parsed by parseOptions().
//  - {! ... !} delimits rich text; the delimiters are passed through.
//  - everything else is copied literally.
// \param counter counts the macro expansions performed so far; once it
//        exceeds max_passes, the expansion is aborted.
// \param next, second only matter for the conditional keys "next" and
//        "second" (see below).
// On any error, a message is logged and the translated string "ERROR!" is
// returned.
docstring BibTeXInfo::expandFormat(docstring const & format,
                BibTeXInfoList const & xrefs, int & counter, Buffer const & buf,
                CiteItem const & ci, bool next, bool second) const
{
        // incorrect use of macros could put us in an infinite loop
        static int const max_passes = 5000;
        // the use of overly large keys can lead to performance problems, due
        // to eventual attempts to convert LaTeX macros to unicode. See bug
        // #8944. By default, the size is limited to 128 (in CiteItem), but
        // for specific purposes (such as XHTML export), it needs to be enlarged
        // This is perhaps not the best solution, but it will have to do for now.
        size_t const max_keysize = ci.max_key_size;
        odocstringstream ret; // return value
        string key;
        // true while we are between the two '%' of a key
        bool scanning_key = false;
        // true while we are between '{!' and '!}'
        bool scanning_rich = false;

        CiteEngineType const engine_type = buf.params().citeEngineType();
        docstring fmt = format;
        // we'll remove characters from the front of fmt as we
        // deal with them
        while (!fmt.empty()) {
                if (counter > max_passes) {
                        LYXERR0("Recursion limit reached while parsing `"
                                << format << "'.");
                        return _("ERROR!");
                }

                char_type thischar = fmt[0];
                if (thischar == '%') {
                        // beginning or end of key
                        if (scanning_key) {
                                // end of key
                                scanning_key = false;
                                // so we replace the key with its value, which may be empty
                                if (key[0] == '!') {
                                        // macro
                                        string const val =
                                                buf.params().documentClass().getCiteMacro(engine_type, key);
                                        // put the definition in front of what is left
                                        // (minus the closing '%') and scan it again
                                        fmt = from_utf8(val) + fmt.substr(1);
                                        counter += 1;
                                        continue;
                                } else if (prefixIs(key, "B_")) {
                                        // a translatable bit (to the Buffer language)
                                        string const val =
                                                buf.params().documentClass().getCiteMacro(engine_type, key);
                                        docstring const trans =
                                                translateIfPossible(from_utf8(val), buf.params().language->code());
                                        ret << trans;
                                } else if (key[0] == '_') {
                                        // a translatable bit (to the GUI language)
                                        string const val =
                                                buf.params().documentClass().getCiteMacro(engine_type, key);
                                        docstring const trans =
                                                translateIfPossible(from_utf8(val));
                                        ret << trans;
                                } else {
                                        // an ordinary key: output its value, marked up with
                                        // a span unless we are within rich text already
                                        docstring const val =
                                                getValueForKey(key, buf, ci, xrefs, max_keysize);
                                        if (!scanning_rich)
                                                ret << from_ascii("{!<span class=\"bib-" + key + "\">!}");
                                        ret << val;
                                        if (!scanning_rich)
                                                ret << from_ascii("{!</span>!}");
                                }
                        } else {
                                // beginning of key
                                key.clear();
                                scanning_key = true;
                        }
                }
                else if (thischar == '{') {
                        // beginning of option?
                        if (scanning_key) {
                                LYXERR0("ERROR: Found `{' when scanning key in `" << format << "'.");
                                return _("ERROR!");
                        }
                        if (fmt.size() > 1) {
                                if (fmt[1] == '%') {
                                        // it is the beginning of an optional format
                                        string optkey;
                                        docstring ifpart;
                                        docstring elsepart;
                                        docstring const newfmt =
                                                parseOptions(fmt, optkey, ifpart, elsepart);
                                        if (newfmt == fmt) // parse error
                                                return _("ERROR!");
                                        fmt = newfmt;
                                        // no size limit is passed here, so the default
                                        // from the declaration applies
                                        docstring const val =
                                                getValueForKey(optkey, buf, ci, xrefs);
                                        // The nested expansions below start with a fresh
                                        // counter and do not pass on `second'.
                                        if (optkey == "next" && next)
                                                ret << ifpart; // without expansion
                                        else if (optkey == "second" && second) {
                                                int newcounter = 0;
                                                ret << expandFormat(ifpart, xrefs, newcounter, buf,
                                                        ci, next);
                                        } else if (!val.empty()) {
                                                int newcounter = 0;
                                                ret << expandFormat(ifpart, xrefs, newcounter, buf,
                                                        ci, next);
                                        } else if (!elsepart.empty()) {
                                                int newcounter = 0;
                                                ret << expandFormat(elsepart, xrefs, newcounter, buf,
                                                        ci, next);
                                        }
                                        // fmt will have been shortened for us already
                                        continue;
                                }
                                if (fmt[1] == '!') {
                                        // beginning of rich text
                                        scanning_rich = true;
                                        fmt = fmt.substr(2);
                                        ret << from_ascii("{!");
                                        continue;
                                }
                        }
                        // we are here if '{' was not followed by % or !.
                        // So it's just a character.
                        ret << thischar;
                }
                else if (scanning_rich && thischar == '!'
                         && fmt.size() > 1 && fmt[1] == '}') {
                        // end of rich text
                        scanning_rich = false;
                        fmt = fmt.substr(2);
                        ret << from_ascii("!}");
                        continue;
                }
                else if (scanning_key)
                        // collect the name of the key (narrowed to char)
                        key += char(thischar);
                else {
                        // a literal character; if it cannot be encoded, it is
                        // reported and skipped
                        try {
                                ret.put(thischar);
                        } catch (EncodingException & /* e */) {
                                LYXERR0("Uncodable character '" << docstring(1, thischar) << " in citation label!");
                        }
                }
                // done with this character (the branches that have shortened
                // fmt themselves do not get here, they `continue')
                fmt = fmt.substr(1);
        } // while (!fmt.empty())
        if (scanning_key) {
                LYXERR0("Never found end of key in `" << format << "'!");
                return _("ERROR!");
        }
        if (scanning_rich) {
                LYXERR0("Never found end of rich text in `" << format << "'!");
                return _("ERROR!");
        }
        return ret.str();
}
 996
 997
 998 docstring const & BibTeXInfo::getInfo(BibTeXInfoList const & xrefs,
 999         Buffer const & buf, CiteItem const & ci, docstring const & format_in) const
1000 {
1001         bool const richtext = ci.richtext;
1002
1003         CiteEngineType const engine_type = buf.params().citeEngineType();
1004         DocumentClass const & dc = buf.params().documentClass();
1005         docstring const & format = format_in.empty()?
1006                                 from_utf8(dc.getCiteFormat(engine_type, to_utf8(entry_type_)))
1007                               : format_in;
1008
1009         if (format != format_) {
1010                 // clear caches since format changed
1011                 info_.clear();
1012                 info_richtext_.clear();
1013                 format_ = format;
1014         }
1015
1016         if (!richtext && !info_.empty()) {
1017                 info_ = convertLaTeXCommands(processRichtext(info_, false));
1018                 return info_;
1019         }
1020         if (richtext && !info_richtext_.empty())
1021                 return info_richtext_;
1022
1023         if (!is_bibtex_) {
1024                 BibTeXInfo::const_iterator it = find(from_ascii("ref"));
1025                 info_ = it->second;
1026                 return info_;
1027         }
1028
1029         int counter = 0;
1030         info_ = expandFormat(format, xrefs, counter, buf,
1031                 ci, false, false);
1032
1033         if (info_.empty()) {
1034                 // this probably shouldn't happen
1035                 return info_;
1036         }
1037
1038         if (richtext) {
1039                 info_richtext_ = convertLaTeXCommands(processRichtext(info_, true));
1040                 return info_richtext_;
1041         }
1042
1043         info_ = convertLaTeXCommands(processRichtext(info_, false));
1044         return info_;
1045 }
1046
1047
1048 docstring const BibTeXInfo::getLabel(BibTeXInfoList const xrefs,
1049         Buffer const & buf, docstring const & format,
1050         CiteItem const & ci, bool next, bool second) const
1051 {
1052         docstring loclabel;
1053
1054         int counter = 0;
1055         loclabel = expandFormat(format, xrefs, counter, buf, ci, next, second);
1056
1057         if (!loclabel.empty() && !next) {
1058                 loclabel = processRichtext(loclabel, ci.richtext);
1059                 loclabel = convertLaTeXCommands(loclabel);
1060         }
1061
1062         return loclabel;
1063 }
1064
1065
1066 docstring const & BibTeXInfo::operator[](docstring const & field) const
1067 {
1068         BibTeXInfo::const_iterator it = find(field);
1069         if (it != end())
1070                 return it->second;
1071         static docstring const empty_value = docstring();
1072         return empty_value;
1073 }
1074
1075
1076 docstring const & BibTeXInfo::operator[](string const & field) const
1077 {
1078         return operator[](from_ascii(field));
1079 }
1080
1081
1082 docstring BibTeXInfo::getValueForKey(string const & oldkey, Buffer const & buf,
1083         CiteItem const & ci, BibTeXInfoList const & xrefs, size_t maxsize) const
1084 {
1085         // anything less is pointless
1086         LASSERT(maxsize >= 16, maxsize = 16);
1087         string key = oldkey;
1088         bool cleanit = false;
1089         if (prefixIs(oldkey, "clean:")) {
1090                 key = oldkey.substr(6);
1091                 cleanit = true;
1092         }
1093
1094         docstring ret = operator[](key);
1095         if (ret.empty() && !xrefs.empty()) {
1096                 // xr is a (reference to a) BibTeXInfo const *
1097                 for (auto const & xr : xrefs) {
1098                         if (xr && !(*xr)[key].empty()) {
1099                                 ret = (*xr)[key];
1100                                 break;
1101                         }
1102                 }
1103         }
1104         if (ret.empty()) {
1105                 // some special keys
1106                 // FIXME: dialog, textbefore and textafter have nothing to do with this
1107                 if (key == "dialog" && ci.context == CiteItem::Dialog)
1108                         ret = from_ascii("x"); // any non-empty string will do
1109                 else if (key == "export" && ci.context == CiteItem::Export)
1110                         ret = from_ascii("x"); // any non-empty string will do
1111                 else if (key == "ifstar" && ci.Starred)
1112                         ret = from_ascii("x"); // any non-empty string will do
1113                 else if (key == "ifqualified" && ci.isQualified)
1114                         ret = from_ascii("x"); // any non-empty string will do
1115                 else if (key == "entrytype")
1116                         ret = entry_type_;
1117                 else if (prefixIs(key, "ifentrytype:")
1118                          && from_ascii(key.substr(12)) == entry_type_)
1119                         ret = from_ascii("x"); // any non-empty string will do
1120                 else if (key == "key")
1121                         ret = bib_key_;
1122                 else if (key == "label")
1123                         ret = label_;
1124                 else if (key == "modifier" && modifier_ != 0)
1125                         ret = modifier_;
1126                 else if (key == "numericallabel")
1127                         ret = cite_number_;
1128                 else if (prefixIs(key, "ifmultiple:")) {
1129                         // Return whether we have multiple authors
1130                         docstring const kind = operator[](from_ascii(key.substr(11)));
1131                         if (multipleAuthors(kind))
1132                                 ret = from_ascii("x"); // any non-empty string will do
1133                 }
1134                 else if (prefixIs(key, "abbrvnames:")) {
1135                         // Special key to provide abbreviated name list,
1136                         // with respect to maxcitenames. Suitable for Bibliography
1137                         // beginnings.
1138                         docstring const kind = operator[](from_ascii(key.substr(11)));
1139                         ret = getAuthorList(&buf, kind, false, false, true);
1140                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1141                                 ret[0] = uppercase(ret[0]);
1142                 } else if (prefixIs(key, "fullnames:")) {
1143                         // Return a full name list. Suitable for Bibliography
1144                         // beginnings.
1145                         docstring const kind = operator[](from_ascii(key.substr(10)));
1146                         ret = getAuthorList(&buf, kind, true, false, true);
1147                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1148                                 ret[0] = uppercase(ret[0]);
1149                 } else if (prefixIs(key, "forceabbrvnames:")) {
1150                         // Special key to provide abbreviated name lists,
1151                         // irrespective of maxcitenames. Suitable for Bibliography
1152                         // beginnings.
1153                         docstring const kind = operator[](from_ascii(key.substr(15)));
1154                         ret = getAuthorList(&buf, kind, false, true, true);
1155                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1156                                 ret[0] = uppercase(ret[0]);
1157                 } else if (prefixIs(key, "abbrvbynames:")) {
1158                         // Special key to provide abbreviated name list,
1159                         // with respect to maxcitenames. Suitable for further names inside a
1160                         // bibliography item // (such as "ed. by ...")
1161                         docstring const kind = operator[](from_ascii(key.substr(11)));
1162                         ret = getAuthorList(&buf, kind, false, false, true, false);
1163                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1164                                 ret[0] = uppercase(ret[0]);
1165                 } else if (prefixIs(key, "fullbynames:")) {
1166                         // Return a full name list. Suitable for further names inside a
1167                         // bibliography item // (such as "ed. by ...")
1168                         docstring const kind = operator[](from_ascii(key.substr(10)));
1169                         ret = getAuthorList(&buf, kind, true, false, true, false);
1170                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1171                                 ret[0] = uppercase(ret[0]);
1172                 } else if (prefixIs(key, "forceabbrvbynames:")) {
1173                         // Special key to provide abbreviated name lists,
1174                         // irrespective of maxcitenames. Suitable for further names inside a
1175                         // bibliography item // (such as "ed. by ...")
1176                         docstring const kind = operator[](from_ascii(key.substr(15)));
1177                         ret = getAuthorList(&buf, kind, false, true, true, false);
1178                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1179                                 ret[0] = uppercase(ret[0]);
1180                 } else if (key == "abbrvciteauthor") {
1181                         // Special key to provide abbreviated author or
1182                         // editor names (suitable for citation labels),
1183                         // with respect to maxcitenames.
1184                         ret = getAuthorOrEditorList(&buf, false, false);
1185                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1186                                 ret[0] = uppercase(ret[0]);
1187                 } else if (key == "fullciteauthor") {
1188                         // Return a full author or editor list (for citation labels)
1189                         ret = getAuthorOrEditorList(&buf, true, false);
1190                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1191                                 ret[0] = uppercase(ret[0]);
1192                 } else if (key == "forceabbrvciteauthor") {
1193                         // Special key to provide abbreviated author or
1194                         // editor names (suitable for citation labels),
1195                         // irrespective of maxcitenames.
1196                         ret = getAuthorOrEditorList(&buf, false, true);
1197                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1198                                 ret[0] = uppercase(ret[0]);
1199                 } else if (key == "bibentry") {
1200                         // Special key to provide the full bibliography entry: see getInfo()
1201                         CiteEngineType const engine_type = buf.params().citeEngineType();
1202                         DocumentClass const & dc = buf.params().documentClass();
1203                         docstring const & format =
1204                                 from_utf8(dc.getCiteFormat(engine_type, to_utf8(entry_type_), false));
1205                         int counter = 0;
1206                         ret = expandFormat(format, xrefs, counter, buf, ci, false, false);
1207                 } else if (key == "textbefore")
1208                         ret = ci.textBefore;
1209                 else if (key == "textafter")
1210                         ret = ci.textAfter;
1211                 else if (key == "curpretext") {
1212                         vector<pair<docstring, docstring>> pres = ci.getPretexts();
1213                         vector<pair<docstring, docstring>>::iterator it = pres.begin();
1214                         int numkey = 1;
1215                         for (; it != pres.end() ; ++it) {
1216                                 if ((*it).first == bib_key_ && numkey == num_bib_key_) {
1217                                         ret = (*it).second;
1218                                         pres.erase(it);
1219                                         break;
1220                                 }
1221                                 if ((*it).first == bib_key_)
1222                                         ++numkey;
1223                         }
1224                 } else if (key == "curposttext") {
1225                         vector<pair<docstring, docstring>> posts = ci.getPosttexts();
1226                         vector<pair<docstring, docstring>>::iterator it = posts.begin();
1227                         int numkey = 1;
1228                         for (; it != posts.end() ; ++it) {
1229                                 if ((*it).first == bib_key_ && numkey == num_bib_key_) {
1230                                         ret = (*it).second;
1231                                         posts.erase(it);
1232                                         break;
1233                                 }
1234                                 if ((*it).first == bib_key_)
1235                                         ++numkey;
1236                         }
1237                 } else if (key == "year")
1238                         ret = getYear();
1239         }
1240
1241         if (cleanit)
1242                 ret = xml::cleanAttr(ret);
1243
1244         // make sure it is not too big
1245         support::truncateWithEllipsis(ret, maxsize);
1246         return ret;
1247 }
1248
1249
1250 //////////////////////////////////////////////////////////////////////
1251 //
1252 // BiblioInfo
1253 //
1254 //////////////////////////////////////////////////////////////////////
1255
1256 namespace {
1257
1258 // A functor for use with sort, leading to case insensitive sorting
1259 bool compareNoCase(const docstring & a, const docstring & b) {
1260         return compare_no_case(a, b) < 0;
1261 }
1262
1263 } // namespace
1264
1265
1266 vector<docstring> const BiblioInfo::getXRefs(BibTeXInfo const & data, bool const nested) const
1267 {
1268         vector<docstring> result;
1269         if (!data.isBibTeX())
1270                 return result;
1271         // Legacy crossref field. This is not nestable.
1272         if (!nested && !data["crossref"].empty()) {
1273                 docstring const xrefkey = data["crossref"];
1274                 result.push_back(xrefkey);
1275                 // However, check for nested xdatas
1276                 BiblioInfo::const_iterator it = find(xrefkey);
1277                 if (it != end()) {
1278                         BibTeXInfo const & xref = it->second;
1279                         vector<docstring> const nxdata = getXRefs(xref, true);
1280                         if (!nxdata.empty())
1281                                 result.insert(result.end(), nxdata.begin(), nxdata.end());
1282                 }
1283         }
1284         // Biblatex's xdata field. Infinitely nestable.
1285         // XData field can consist of a comma-separated list of keys
1286         vector<docstring> const xdatakeys = getVectorFromString(data["xdata"]);
1287         if (!xdatakeys.empty()) {
1288                 for (auto const & xdatakey : xdatakeys) {
1289                         result.push_back(xdatakey);
1290                         BiblioInfo::const_iterator it = find(xdatakey);
1291                         if (it != end()) {
1292                                 BibTeXInfo const & xdata = it->second;
1293                                 vector<docstring> const nxdata = getXRefs(xdata, true);
1294                                 if (!nxdata.empty())
1295                                         result.insert(result.end(), nxdata.begin(), nxdata.end());
1296                         }
1297                 }
1298         }
1299         return result;
1300 }
1301
1302
1303 vector<docstring> const BiblioInfo::getKeys() const
1304 {
1305         vector<docstring> bibkeys;
1306         for (auto const & bi : *this)
1307                 bibkeys.push_back(bi.first);
1308         sort(bibkeys.begin(), bibkeys.end(), &compareNoCase);
1309         return bibkeys;
1310 }
1311
1312
1313 vector<docstring> const BiblioInfo::getFields() const
1314 {
1315         vector<docstring> bibfields;
1316         for (auto const & fn : field_names_)
1317                 bibfields.push_back(fn);
1318         sort(bibfields.begin(), bibfields.end());
1319         return bibfields;
1320 }
1321
1322
1323 vector<docstring> const BiblioInfo::getEntries() const
1324 {
1325         vector<docstring> bibentries;
1326         for (auto const & et : entry_types_)
1327                 bibentries.push_back(et);
1328         sort(bibentries.begin(), bibentries.end());
1329         return bibentries;
1330 }
1331
1332
1333 docstring const BiblioInfo::getAuthorOrEditorList(docstring const & key, Buffer const & buf) const
1334 {
1335         BiblioInfo::const_iterator it = find(key);
1336         if (it == end())
1337                 return docstring();
1338         BibTeXInfo const & data = it->second;
1339         return data.getAuthorOrEditorList(&buf, false);
1340 }
1341
1342
1343 docstring const BiblioInfo::getCiteNumber(docstring const & key) const
1344 {
1345         BiblioInfo::const_iterator it = find(key);
1346         if (it == end())
1347                 return docstring();
1348         BibTeXInfo const & data = it->second;
1349         return data.citeNumber();
1350 }
1351
1352 void BiblioInfo::getLocators(docstring const & key, docstring & doi, docstring & url, docstring & file) const
1353 {
1354         BiblioInfo::const_iterator it = find(key);
1355          if (it == end())
1356                 return;
1357         BibTeXInfo const & data = it->second;
1358         data.getLocators(doi,url,file);
1359 }
1360
1361
1362 docstring const BiblioInfo::getYear(docstring const & key, bool use_modifier) const
1363 {
1364         BiblioInfo::const_iterator it = find(key);
1365         if (it == end())
1366                 return docstring();
1367         BibTeXInfo const & data = it->second;
1368         docstring year = data.getYear();
1369         if (year.empty()) {
1370                 // let's try the crossrefs
1371                 vector<docstring> const xrefs = getXRefs(data);
1372                 if (xrefs.empty())
1373                         // no luck
1374                         return docstring();
1375                 for (docstring const & xref : xrefs) {
1376                         BiblioInfo::const_iterator const xrefit = find(xref);
1377                         if (xrefit == end())
1378                                 continue;
1379                         BibTeXInfo const & xref_data = xrefit->second;
1380                         year = xref_data.getYear();
1381                         if (!year.empty())
1382                                 // success!
1383                                 break;
1384                 }
1385         }
1386         if (use_modifier && data.modifier() != 0)
1387                 year += data.modifier();
1388         return year;
1389 }
1390
1391
1392 docstring const BiblioInfo::getYear(docstring const & key, Buffer const & buf, bool use_modifier) const
1393 {
1394         docstring const year = getYear(key, use_modifier);
1395         if (year.empty())
1396                 return buf.B_("No year");
1397         return year;
1398 }
1399
1400
1401 docstring const BiblioInfo::getInfo(docstring const & key,
1402         Buffer const & buf, CiteItem const & ci, docstring const & format) const
1403 {
1404         BiblioInfo::const_iterator it = find(key);
1405         if (it == end())
1406                 return docstring(_("Bibliography entry not found!"));
1407         BibTeXInfo const & data = it->second;
1408         BibTeXInfoList xrefptrs;
1409         for (docstring const & xref : getXRefs(data)) {
1410                 BiblioInfo::const_iterator const xrefit = find(xref);
1411                 if (xrefit != end())
1412                         xrefptrs.push_back(&(xrefit->second));
1413         }
1414         return data.getInfo(xrefptrs, buf, ci, format);
1415 }
1416
1417
1418 docstring const BiblioInfo::getLabel(vector<docstring> keys,
1419         Buffer const & buf, string const & style, CiteItem const & ci) const
1420 {
1421         size_t max_size = ci.max_size;
1422         // shorter makes no sense
1423         LASSERT(max_size >= 16, max_size = 16);
1424
1425         // we can't display more than 10 of these, anyway
1426         // but since we truncate in the middle,
1427         // we need to split into two halfs.
1428         bool const too_many_keys = keys.size() > 10;
1429         vector<docstring> lkeys;
1430         if (too_many_keys) {
1431                 lkeys.insert(lkeys.end(), keys.end() - 5, keys.end());
1432                 keys.resize(5);
1433                 keys.insert(keys.end(), lkeys.begin(), lkeys.end());
1434         }
1435
1436         CiteEngineType const engine_type = buf.params().citeEngineType();
1437         DocumentClass const & dc = buf.params().documentClass();
1438         docstring const & format = from_utf8(dc.getCiteFormat(engine_type, style, false, "cite"));
1439         docstring ret = format;
1440         vector<docstring>::const_iterator key = keys.begin();
1441         vector<docstring>::const_iterator ken = keys.end();
1442         vector<docstring> handled_keys;
1443         for (int i = 0; key != ken; ++key, ++i) {
1444                 handled_keys.push_back(*key);
1445                 int n = 0;
1446                 for (auto const & k : handled_keys) {
1447                         if (k == *key)
1448                                 ++n;
1449                 }
1450                 BiblioInfo::const_iterator it = find(*key);
1451                 BibTeXInfo empty_data;
1452                 empty_data.key(*key);
1453                 BibTeXInfo & data = empty_data;
1454                 vector<BibTeXInfo const *> xrefptrs;
1455                 if (it != end()) {
1456                         data = it->second;
1457                         for (docstring const & xref : getXRefs(data)) {
1458                                 BiblioInfo::const_iterator const xrefit = find(xref);
1459                                 if (xrefit != end())
1460                                         xrefptrs.push_back(&(xrefit->second));
1461                         }
1462                 }
1463                 data.numKey(n);
1464                 ret = data.getLabel(xrefptrs, buf, ret, ci, key + 1 != ken, i == 1);
1465         }
1466
1467         support::truncateWithEllipsis(ret, max_size, true);
1468
1469         return ret;
1470 }
1471
1472
1473 bool BiblioInfo::isBibtex(docstring const & key) const
1474 {
1475         docstring key1;
1476         split(key, key1, ',');
1477         BiblioInfo::const_iterator it = find(key1);
1478         if (it == end())
1479                 return false;
1480         return it->second.isBibTeX();
1481 }
1482
1483
1484 BiblioInfo::CiteStringMap const BiblioInfo::getCiteStrings(
1485         vector<docstring> const & keys, vector<CitationStyle> const & styles,
1486         Buffer const & buf, CiteItem const & ci) const
1487 {
1488         if (empty())
1489                 return vector<pair<docstring,docstring>>();
1490
1491         string style;
1492         CiteStringMap csm(styles.size());
1493         for (size_t i = 0; i != csm.size(); ++i) {
1494                 style = styles[i].name;
1495                 csm[i] = make_pair(from_ascii(style), getLabel(keys, buf, style, ci));
1496         }
1497
1498         return csm;
1499 }
1500
1501
1502 void BiblioInfo::mergeBiblioInfo(BiblioInfo const & info)
1503 {
1504         bimap_.insert(info.begin(), info.end());
1505         field_names_.insert(info.field_names_.begin(), info.field_names_.end());
1506         entry_types_.insert(info.entry_types_.begin(), info.entry_types_.end());
1507 }
1508
1509
1510 namespace {
1511
1512 // used in xhtml to sort a list of BibTeXInfo objects
1513 bool lSorter(BibTeXInfo const * lhs, BibTeXInfo const * rhs)
1514 {
1515         docstring const lauth = lhs->getAuthorOrEditorList();
1516         docstring const rauth = rhs->getAuthorOrEditorList();
1517         docstring const lyear = lhs->getYear();
1518         docstring const ryear = rhs->getYear();
1519         docstring const ltitl = lhs->operator[]("title");
1520         docstring const rtitl = rhs->operator[]("title");
1521         return  (lauth < rauth)
1522                 || (lauth == rauth && lyear < ryear)
1523                 || (lauth == rauth && lyear == ryear && ltitl < rtitl);
1524 }
1525
1526 } // namespace
1527
1528
1529 void BiblioInfo::collectCitedEntries(Buffer const & buf)
1530 {
1531         cited_entries_.clear();
1532         // We are going to collect all the citation keys used in the document,
1533         // getting them from the TOC.
1534         // FIXME We may want to collect these differently, in the first case,
1535         // so that we might have them in order of appearance.
1536         set<docstring> citekeys;
1537         Toc const & toc = *buf.tocBackend().toc("citation");
1538         for (auto const & t : toc) {
1539                 if (t.str().empty())
1540                         continue;
1541                 vector<docstring> const keys = getVectorFromString(t.str());
1542                 citekeys.insert(keys.begin(), keys.end());
1543         }
1544         if (citekeys.empty())
1545                 return;
1546
1547         // We have a set of the keys used in this document.
1548         // We will now convert it to a list of the BibTeXInfo objects used in
1549         // this document...
1550         vector<BibTeXInfo const *> bi;
1551         for (auto const & ck : citekeys) {
1552                 BiblioInfo::const_iterator const bt = find(ck);
1553                 if (bt == end() || !bt->second.isBibTeX())
1554                         continue;
1555                 bi.push_back(&(bt->second));
1556         }
1557         // ...and sort it.
1558         sort(bi.begin(), bi.end(), lSorter);
1559
1560         // Now we can write the sorted keys
1561         // b is a BibTeXInfo const *
1562         for (auto const & b : bi)
1563                 cited_entries_.push_back(b->key());
1564 }
1565
1566
1567 void BiblioInfo::makeCitationLabels(Buffer const & buf)
1568 {
1569         collectCitedEntries(buf);
1570         CiteEngineType const engine_type = buf.params().citeEngineType();
1571         bool const numbers = (engine_type & ENGINE_TYPE_NUMERICAL);
1572
1573         int keynumber = 0;
1574         char modifier = 0;
1575         // used to remember the last one we saw
1576         // we'll be comparing entries to see if we need to add
1577         // modifiers, like "1984a"
1578         map<docstring, BibTeXInfo>::iterator last = bimap_.end();
1579
1580         // add letters to years
1581         for (auto const & ce : cited_entries_) {
1582                 map<docstring, BibTeXInfo>::iterator const biit = bimap_.find(ce);
1583                 // this shouldn't happen, but...
1584                 if (biit == bimap_.end())
1585                         // ...fail gracefully, anyway.
1586                         continue;
1587                 BibTeXInfo & entry = biit->second;
1588                 if (numbers) {
1589                         docstring const num = convert<docstring>(++keynumber);
1590                         entry.setCiteNumber(num);
1591                 } else {
1592                         // The first test here is checking whether this is the first
1593                         // time through the loop. If so, then we do not have anything
1594                         // with which to compare.
1595                         if (last != bimap_.end()
1596                             && entry.getAuthorOrEditorList() == last->second.getAuthorOrEditorList()
1597                             // we access the year via getYear() so as to get it from the xref,
1598                             // if we need to do so
1599                             && getYear(entry.key()) == getYear(last->second.key())) {
1600                                 if (modifier == 0) {
1601                                         // so the last one should have been 'a'
1602                                         last->second.setModifier('a');
1603                                         modifier = 'b';
1604                                 } else if (modifier == 'z')
1605                                         modifier = 'A';
1606                                 else
1607                                         modifier++;
1608                         } else {
1609                                 modifier = 0;
1610                         }
1611                         entry.setModifier(modifier);
1612                         // remember the last one
1613                         last = biit;
1614                 }
1615         }
1616         // Set the labels
1617         for (auto const & ce : cited_entries_) {
1618                 map<docstring, BibTeXInfo>::iterator const biit = bimap_.find(ce);
1619                 // this shouldn't happen, but...
1620                 if (biit == bimap_.end())
1621                         // ...fail gracefully, anyway.
1622                         continue;
1623                 BibTeXInfo & entry = biit->second;
1624                 if (numbers) {
1625                         entry.label(entry.citeNumber());
1626                 } else {
1627                         docstring const auth = entry.getAuthorOrEditorList(&buf, false);
1628                         // we do it this way so as to access the xref, if necessary
1629                         // note that this also gives us the modifier
1630                         docstring const year = getYear(ce, buf, true);
1631                         if (!auth.empty() && !year.empty())
1632                                 entry.label(auth + ' ' + year);
1633                         else
1634                                 entry.label(entry.key());
1635                 }
1636         }
1637 }
1638
1639
1640 //////////////////////////////////////////////////////////////////////
1641 //
1642 // CitationStyle
1643 //
1644 //////////////////////////////////////////////////////////////////////
1645
1646
1647 CitationStyle citationStyleFromString(string const & command,
1648                                       BufferParams const & params)
1649 {
1650         CitationStyle cs;
1651         if (command.empty())
1652                 return cs;
1653
1654         string const alias = params.getCiteAlias(command);
1655         string cmd = alias.empty() ? command : alias;
1656         if (isUpperCase(command[0])) {
1657                 cs.forceUpperCase = true;
1658                 cmd[0] = lowercase(cmd[0]);
1659         }
1660
1661         size_t const n = command.size() - 1;
1662         if (command[n] == '*') {
1663                 cs.hasStarredVersion = true;
1664                 if (suffixIs(cmd, '*'))
1665                         cmd = cmd.substr(0, cmd.size() - 1);
1666         }
1667
1668         cs.name = cmd;
1669         return cs;
1670 }
1671
1672
1673 string citationStyleToString(const CitationStyle & cs, bool const latex)
1674 {
1675         string cmd = latex ? cs.cmd : cs.name;
1676         if (cs.forceUpperCase)
1677                 cmd[0] = uppercase(cmd[0]);
1678         if (cs.hasStarredVersion)
1679                 cmd += '*';
1680         return cmd;
1681 }
1682
1683
1684 docstring authorsToDocBookAuthorGroup(docstring const & authorsString, XMLStream & xs, Buffer const & buf)
1685 {
1686         // This function closely mimics getAuthorList, but produces DocBook instead of text.
1687         // It has been greatly simplified, as the complete list of authors is always produced. No separators are required,
1688         // as the output has a database-like shape.
1689         // constructName has also been merged within, as it becomes really simple and leads to no copy-paste.
1690
1691         if (authorsString.empty()) {
1692                 return docstring();
1693         }
1694
1695         // Split the input list of authors into individual authors.
1696         vector<docstring> const authors = getAuthors(authorsString);
1697
1698         // Retrieve the "et al." variation.
1699         string const etal = buf.params().documentClass().getCiteMacro(buf.params().citeEngineType(), "_etal");
1700
1701         // Output the list of authors.
1702         xs << xml::StartTag("authorgroup");
1703         xs << xml::CR();
1704
1705         auto it = authors.cbegin();
1706         auto en = authors.cend();
1707         for (size_t i = 0; it != en; ++it, ++i) {
1708                 xs << xml::StartTag("author");
1709                 xs << xml::CR();
1710                 xs << xml::StartTag("personname");
1711                 xs << xml::CR();
1712                 docstring name = *it;
1713
1714                 // All authors go in a <personname>. If more structure is known, use it; otherwise (just "et al."), print it as such.
1715                 if (name == "others") {
1716                         xs << buf.B_(etal);
1717                 } else {
1718                         name_parts parts = nameParts(name);
1719                         if (! parts.prefix.empty()) {
1720                                 xs << xml::StartTag("honorific");
1721                                 xs << parts.prefix;
1722                                 xs << xml::EndTag("honorific");
1723                                 xs << xml::CR();
1724                         }
1725                         if (! parts.prename.empty()) {
1726                                 xs << xml::StartTag("firstname");
1727                                 xs << parts.prename;
1728                                 xs << xml::EndTag("firstname");
1729                                 xs << xml::CR();
1730                         }
1731                         if (! parts.surname.empty()) {
1732                                 xs << xml::StartTag("surname");
1733                                 xs << parts.surname;
1734                                 xs << xml::EndTag("surname");
1735                                 xs << xml::CR();
1736                         }
1737                         if (! parts.suffix.empty()) {
1738                                 xs << xml::StartTag("othername", "role=\"suffix\"");
1739                                 xs << parts.suffix;
1740                                 xs << xml::EndTag("othername");
1741                                 xs << xml::CR();
1742                         }
1743                 }
1744
1745                 xs << xml::EndTag("personname");
1746                 xs << xml::CR();
1747                 xs << xml::EndTag("author");
1748                 xs << xml::CR();
1749
1750                 // Could add an affiliation after <personname>, but not stored in BibTeX.
1751         }
1752         xs << xml::EndTag("authorgroup");
1753         xs << xml::CR();
1754
1755         return docstring();
1756 }
1757
1758 } // namespace lyx