src/BiblioInfo.cpp

   1 /**
   2  * \file BiblioInfo.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Angus Leeming
   7  * \author Herbert Voß
   8  * \author Richard Heck
   9  * \author Julien Rioux
  10  * \author Jürgen Spitzmüller
  11  *
  12  * Full author contact details are available in file CREDITS.
  13  */
  14
  15 #include <config.h>
  16
  17 #include "BiblioInfo.h"
  18 #include "Buffer.h"
  19 #include "BufferParams.h"
  20 #include "buffer_funcs.h"
  21 #include "Citation.h"
  22 #include "Encoding.h"
  23 #include "InsetIterator.h"
  24 #include "Language.h"
  25 #include "xml.h"
  26 #include "Paragraph.h"
  27 #include "TextClass.h"
  28 #include "TocBackend.h"
  29
  30 #include "support/convert.h"
  31 #include "support/debug.h"
  32 #include "support/docstream.h"
  33 #include "support/FileName.h"
  34 #include "support/gettext.h"
  35 #include "support/lassert.h"
  36 #include "support/lstrings.h"
  37 #include "support/regex.h"
  38 #include "support/textutils.h"
  39
  40 #include <map>
  41 #include <set>
  42
  43 using namespace std;
  44 using namespace lyx::support;
  45
  46
  47 namespace lyx {
  48
  49 namespace {
  50
  51 // Remove placeholders from names
  52 docstring renormalize(docstring const & input)
  53 {
  54         docstring res = subst(input, from_ascii("$$space!"), from_ascii(" "));
  55         return subst(res, from_ascii("$$comma!"), from_ascii(","));
  56 }
  57
  58
  59 // Split the surname into prefix ("von-part") and family name
  60 pair<docstring, docstring> parseSurname(docstring const & sname)
  61 {
  62         // Split the surname into its tokens
  63         vector<docstring> pieces = getVectorFromString(sname, from_ascii(" "));
  64         if (pieces.size() < 2)
  65                 return make_pair(docstring(), sname);
  66
  67         // Now we look for pieces that begin with a lower case letter.
  68         // All except for the very last token constitute the "von-part".
  69         docstring prefix;
  70         vector<docstring>::const_iterator it = pieces.begin();
  71         vector<docstring>::const_iterator const en = pieces.end();
  72         bool first = true;
  73         for (; it != en; ++it) {
  74                 if ((*it).empty())
  75                         continue;
  76                 // If this is the last piece, then what we now have is
  77                 // the family name, notwithstanding the casing.
  78                 if (it + 1 == en)
  79                         break;
  80                 char_type const c = (*it)[0];
  81                 // If the piece starts with a upper case char, we assume
  82                 // this is part of the surname.
  83                 if (!isLower(c))
  84                         break;
  85                 // Nothing of the former, so add this piece to the prename
  86                 if (!first)
  87                         prefix += " ";
  88                 else
  89                         first = false;
  90                 prefix += *it;
  91         }
  92
  93         // Reconstruct the family name.
  94         // Note that if we left the loop with because it + 1 == en,
  95         // then this will still do the right thing, i.e., make surname
  96         // just be the last piece.
  97         docstring surname;
  98         first = true;
  99         for (; it != en; ++it) {
 100                 if (!first)
 101                         surname += " ";
 102                 else
 103                         first = false;
 104                 surname += *it;
 105         }
 106         return make_pair(prefix, surname);
 107 }
 108
 109
// The components of a single author-type name, as filled in by nameParts().
// Parts that are not present in the name are left empty.
struct name_parts {
	// family name, without the prefix
	docstring surname;
	// given name(s)
	docstring prename;
	// name suffix (aka "jr" part)
	docstring suffix;
	// name prefix (aka "von" part)
	docstring prefix;
};
 116
 117
 118 // gets the name parts (prename, surname, prefix, suffix) from an author-type string
 119 name_parts nameParts(docstring const & iname)
 120 {
 121         name_parts res;
 122         if (iname.empty())
 123                 return res;
 124
 125         // First we check for goupings (via {...}) and replace blanks and
 126         // commas inside groups with temporary placeholders
 127         docstring name;
 128         int gl = 0;
 129         docstring::const_iterator p = iname.begin();
 130         while (p != iname.end()) {
 131                 // count grouping level
 132                 if (*p == '{')
 133                         ++gl;
 134                 else if (*p == '}')
 135                         --gl;
 136                 // generate string with probable placeholders
 137                 if (*p == ' ' && gl > 0)
 138                         name += from_ascii("$$space!");
 139                 else if (*p == ',' && gl > 0)
 140                         name += from_ascii("$$comma!");
 141                 else
 142                         name += *p;
 143                 ++p;
 144         }
 145
 146         // Now we look for a comma, and take the last name to be everything
 147         // preceding the right-most one, so that we also get the name suffix
 148         // (aka "jr" part).
 149         vector<docstring> pieces = getVectorFromString(name);
 150         if (pieces.size() > 1) {
 151                 // Whether we have a name suffix or not, the prename is
 152                 // always last item
 153                 res.prename = renormalize(pieces.back());
 154                 // The family name, conversely, is always the first item.
 155                 // However, it might contain a prefix (aka "von" part)
 156                 docstring const sname = pieces.front();
 157                 res.prefix = renormalize(parseSurname(sname).first);
 158                 res.surname = renormalize(parseSurname(sname).second);
 159                 // If we have three pieces (the maximum allowed by BibTeX),
 160                 // the second one is the name suffix.
 161                 if (pieces.size() > 2)
 162                         res.suffix = renormalize(pieces.at(1));
 163                 return res;
 164         }
 165
 166         // OK, so now we want to look for the last name.
 167         // Split on spaces, to get various tokens.
 168         pieces = getVectorFromString(name, from_ascii(" "));
 169         // No space: Only a family name given
 170         if (pieces.size() < 2) {
 171                 res.surname = renormalize(pieces.back());
 172                 return res;
 173         }
 174         // If we get two pieces, assume "prename surname"
 175         if (pieces.size() == 2) {
 176                 res.prename = renormalize(pieces.front());
 177                 res.surname = renormalize(pieces.back());
 178                 return res;
 179         }
 180
 181         // More than 3 pieces: A name prefix (aka "von" part) might be included.
 182         // We look for the first piece that begins with a lower case letter
 183         // (which is the name prefix, if it is not the last token) or the last token.
 184         docstring prename;
 185         vector<docstring>::const_iterator it = pieces.begin();
 186         vector<docstring>::const_iterator const en = pieces.end();
 187         bool first = true;
 188         for (; it != en; ++it) {
 189                 if ((*it).empty())
 190                         continue;
 191                 char_type const c = (*it)[0];
 192                 // If the piece starts with a lower case char, we assume
 193                 // this is the name prefix and thus prename is complete.
 194                 if (isLower(c))
 195                         break;
 196                 // Same if this is the last piece, which is always the surname.
 197                 if (it + 1 == en)
 198                         break;
 199                 // Nothing of the former, so add this piece to the prename
 200                 if (!first)
 201                         prename += " ";
 202                 else
 203                         first = false;
 204                 prename += *it;
 205         }
 206
 207         // Now reconstruct the family name and strip the prefix.
 208         // Note that if we left the loop because it + 1 == en,
 209         // then this will still do the right thing, i.e., make surname
 210         // just be the last piece.
 211         docstring surname;
 212         first = true;
 213         for (; it != en; ++it) {
 214                 if (!first)
 215                         surname += " ";
 216                 else
 217                         first = false;
 218                 surname += *it;
 219         }
 220         res.prename = renormalize(prename);
 221         res.prefix = renormalize(parseSurname(surname).first);
 222         res.surname = renormalize(parseSurname(surname).second);
 223         return res;
 224 }
 225
 226
 227 docstring constructName(docstring const & name, string const & scheme)
 228 {
 229         // re-constructs a name from name parts according
 230         // to a given scheme
 231         docstring const prename = nameParts(name).prename;
 232         docstring const surname = nameParts(name).surname;
 233         docstring const prefix = nameParts(name).prefix;
 234         docstring const suffix = nameParts(name).suffix;
 235         string res = scheme;
 236         static regex const reg1("(.*)(\\{%prename%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 237         static regex const reg2("(.*)(\\{%suffix%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 238         static regex const reg3("(.*)(\\{%prefix%\\[\\[)([^\\]]+)(\\]\\]\\})(.*)");
 239         smatch sub;
 240         // Changing the first parameter of regex_match() may corrupt the
 241         // second one. In this case we use the temporary string tmp.
 242         if (regex_match(scheme, sub, reg1)) {
 243                 res = sub.str(1);
 244                 if (!prename.empty())
 245                         res += sub.str(3);
 246                 res += sub.str(5);
 247         }
 248         if (regex_match(res, sub, reg2)) {
 249                 string tmp = sub.str(1);
 250                 if (!suffix.empty())
 251                         tmp += sub.str(3);
 252                 res = tmp + sub.str(5);
 253         }
 254         if (regex_match(res, sub, reg3)) {
 255                 string tmp = sub.str(1);
 256                 if (!prefix.empty())
 257                         tmp += sub.str(3);
 258                 res = tmp + sub.str(5);
 259         }
 260         docstring result = from_ascii(res);
 261         result = subst(result, from_ascii("%prename%"), prename);
 262         result = subst(result, from_ascii("%surname%"), surname);
 263         result = subst(result, from_ascii("%prefix%"), prefix);
 264         result = subst(result, from_ascii("%suffix%"), suffix);
 265         return result;
 266 }
 267
 268
 269 vector<docstring> const getAuthors(docstring const & author)
 270 {
 271         // We check for goupings (via {...}) and only consider " and "
 272         // outside groups as author separator. This is to account
 273         // for cases such as {{Barnes and Noble, Inc.}}, which
 274         // need to be treated as one single family name.
 275         // We use temporary placeholders in order to differentiate the
 276         // diverse " and " cases.
 277
 278         // First, we temporarily replace all ampersands. It is rather unusual
 279         // in author names, but can happen (consider cases such as "C \& A Corp.").
 280         docstring iname = subst(author, from_ascii("&"), from_ascii("$$amp!"));
 281         // Then, we temporarily make all " and " strings to ampersands in order
 282         // to handle them later on a per-char level.
 283         iname = subst(iname, from_ascii(" and "), from_ascii(" & "));
 284         // Now we traverse through the string and replace the "&" by the proper
 285         // output in- and outside groups
 286         docstring name;
 287         int gl = 0;
 288         docstring::const_iterator p = iname.begin();
 289         while (p != iname.end()) {
 290                 // count grouping level
 291                 if (*p == '{')
 292                         ++gl;
 293                 else if (*p == '}')
 294                         --gl;
 295                 // generate string with probable placeholders
 296                 if (*p == '&') {
 297                         if (gl > 0)
 298                                 // Inside groups, we output "and"
 299                                 name += from_ascii("and");
 300                         else
 301                                 // Outside groups, we output a separator
 302                                 name += from_ascii("$$namesep!");
 303                 }
 304                 else
 305                         name += *p;
 306                 ++p;
 307         }
 308
 309         // re-insert the literal ampersands
 310         name = subst(name, from_ascii("$$amp!"), from_ascii("&"));
 311
 312         // Now construct the actual vector
 313         return getVectorFromString(name, from_ascii(" $$namesep! "));
 314 }
 315
 316
 317 bool multipleAuthors(docstring const & author)
 318 {
 319         return getAuthors(author).size() > 1;
 320 }
 321
 322
// Converts a string containing LaTeX commands into unicode
// for display.
// The input is consumed from the front, one character (or one
// recognized command) at a time:
//   - math ($...$) is copied verbatim, including the delimiters,
//   - braces are dropped,
//   - commands known to Encodings::fromLaTeXCommand() are replaced
//     by what that function returns,
//   - of any other command, the backslash and the (ASCII) letters of
//     the command name are dropped,
//   - a backslash followed by a non-letter outputs that character,
//     except for "\," which outputs a THIN SPACE.
docstring convertLaTeXCommands(docstring const & str)
{
	// what is left to be processed
	docstring val = str;
	// what we have produced so far
	docstring ret;

	// true while we are skipping the name of an unrecognized command
	bool scanning_cmd = false;
	// true while we are between two unescaped $
	bool scanning_math = false;
	bool escaped = false; // used to catch \$, etc.
	while (!val.empty()) {
		char_type const ch = val[0];

		// if we're scanning math, we output everything until we
		// find an unescaped $, at which point we leave math mode.
		// The closing $ itself is output as well.
		if (scanning_math) {
			if (escaped)
				escaped = false;
			else if (ch == '\\')
				escaped = true;
			else if (ch == '$')
				scanning_math = false;
			ret += ch;
			val = val.substr(1);
			continue;
		}

		// if we're scanning a command name, then we just
		// discard characters until we hit something that
		// isn't alpha.
		if (scanning_cmd) {
			if (isAlphaASCII(ch)) {
				val = val.substr(1);
				// we have seen at least one letter, so this is not
				// a one-character escape such as \$
				escaped = false;
				continue;
			}
			// so we're done with this command.
			// now we fall through and check this character.
			scanning_cmd = false;
		}

		// was the last character a \? If so, then this is something like:
		// \\ or \$, so we'll just output it. That's probably not always right...
		if (escaped) {
			// exception: output \, as THIN SPACE
			if (ch == ',')
				ret.push_back(0x2009);
			else
				ret += ch;
			val = val.substr(1);
			escaped = false;
			continue;
		}

		// an unescaped $ starts math mode; the $ is kept
		if (ch == '$') {
			ret += ch;
			val = val.substr(1);
			scanning_math = true;
			continue;
		}

		// Change text mode accents in the form
		// {\v a} to \v{a} (see #9340).
		// FIXME: This is a sort of mini-tex2lyx.
		//        Use the real tex2lyx instead!
		static lyx::regex const tma_reg("^\\{\\\\[bcCdfGhHkrtuUv]\\s\\w\\}");
		if (lyx::regex_search(to_utf8(val), tma_reg)) {
			// drop the opening brace ...
			val = val.substr(1);
			// ... and put a new one in place of the whitespace after
			// the accent command. The closing brace stays where it is.
			val.replace(2, 1, from_ascii("{"));
			continue;
		}

		// Apart from the above, we just ignore braces
		if (ch == '{' || ch == '}') {
			val = val.substr(1);
			continue;
		}

		// we're going to check things that look like commands, so if
		// this doesn't, just output it.
		if (ch != '\\') {
			ret += ch;
			val = val.substr(1);
			continue;
		}

		// ok, could be a command of some sort
		// let's see if it corresponds to some unicode
		// unicodesymbols has things in the form: \"{u},
		// whereas we may see things like: \"u. So we'll
		// look for that and change it, if necessary.
		// FIXME: This is a sort of mini-tex2lyx.
		//        Use the real tex2lyx instead!
		static lyx::regex const reg("^\\\\\\W\\w");
		if (lyx::regex_search(to_utf8(val), reg)) {
			// The closing brace is inserted first, so that the position
			// of the opening one is not shifted: \"u becomes \"{u}
			val.insert(3, from_ascii("}"));
			val.insert(2, from_ascii("{"));
		}
		// both are set by fromLaTeXCommand(); termination is not used here
		bool termination;
		docstring rem;
		docstring const cnvtd = Encodings::fromLaTeXCommand(val,
				Encodings::TEXT_CMD, termination, rem);
		if (!cnvtd.empty()) {
			// it did, so we'll take that bit and proceed with what's left
			ret += cnvtd;
			val = rem;
			continue;
		}
		// it's a command of some sort that we do not know: drop the
		// backslash and let the next rounds deal with what follows
		// (see scanning_cmd and escaped above).
		scanning_cmd = true;
		escaped = true;
		val = val.substr(1);
	}
	return ret;
}
 438
 439
 440 // Escape '<' and '>' and remove richtext markers (e.g. {!this is richtext!}) from a string.
 441 docstring processRichtext(docstring const & str, bool richtext)
 442 {
 443         docstring val = str;
 444         docstring ret;
 445
 446         bool scanning_rich = false;
 447         while (!val.empty()) {
 448                 char_type const ch = val[0];
 449                 if (ch == '{' && val.size() > 1 && val[1] == '!') {
 450                         // beginning of rich text
 451                         scanning_rich = true;
 452                         val = val.substr(2);
 453                         continue;
 454                 }
 455                 if (scanning_rich && ch == '!' && val.size() > 1 && val[1] == '}') {
 456                         // end of rich text
 457                         scanning_rich = false;
 458                         val = val.substr(2);
 459                         continue;
 460                 }
 461                 if (richtext) {
 462                         if (scanning_rich)
 463                                 ret += ch;
 464                         else {
 465                                 // we need to escape '<' and '>'
 466                                 if (ch == '<')
 467                                         ret += "&lt;";
 468                                 else if (ch == '>')
 469                                         ret += "&gt;";
 470                                 else
 471                                         ret += ch;
 472                         }
 473                 } else if (!scanning_rich /* && !richtext */)
 474                         ret += ch;
 475                 // else the character is discarded, which will happen only if
 476                 // richtext == false and we are scanning rich text
 477                 val = val.substr(1);
 478         }
 479         return ret;
 480 }
 481
 482 } // namespace
 483
 484
 485 //////////////////////////////////////////////////////////////////////
 486 //
 487 // BibTeXInfo
 488 //
 489 //////////////////////////////////////////////////////////////////////
 490
 491 BibTeXInfo::BibTeXInfo(docstring const & key, docstring const & type)
 492         : is_bibtex_(true), bib_key_(key), num_bib_key_(0), entry_type_(type),
 493           info_(), format_(), modifier_(0)
 494 {}
 495
 496
 497
 498 docstring const BibTeXInfo::getAuthorOrEditorList(Buffer const * buf,
 499                                           bool full, bool forceshort) const
 500 {
 501         docstring author = operator[]("author");
 502         if (author.empty())
 503                 author = operator[]("editor");
 504
 505         return getAuthorList(buf, author, full, forceshort);
 506 }
 507
 508
 509 docstring const BibTeXInfo::getAuthorList(Buffer const * buf,
 510                 docstring const & author, bool const full, bool const forceshort,
 511                 bool const allnames, bool const beginning) const
 512 {
 513         // Maxnames treshold depend on engine
 514         size_t maxnames = buf ?
 515                 buf->params().documentClass().max_citenames() : 2;
 516
 517         if (!is_bibtex_) {
 518                 docstring const opt = label();
 519                 if (opt.empty())
 520                         return docstring();
 521
 522                 docstring authors;
 523                 docstring const remainder = trim(split(opt, authors, '('));
 524                 if (remainder.empty())
 525                         // in this case, we didn't find a "(",
 526                         // so we don't have author (year)
 527                         return docstring();
 528                 if (full) {
 529                         // Natbib syntax is "Jones et al.(1990)Jones, Baker, and Williams"
 530                         docstring const fullauthors = trim(rsplit(remainder, ')'));
 531                         if (!fullauthors.empty())
 532                                 return fullauthors;
 533                 }
 534                 return authors;
 535         }
 536
 537         if (author.empty())
 538                 return author;
 539
 540         // OK, we've got some names. Let's format them.
 541         // Try to split the author list
 542         vector<docstring> const authors = getAuthors(author);
 543
 544         docstring retval;
 545
 546         CiteEngineType const engine_type = buf ? buf->params().citeEngineType()
 547                                                : ENGINE_TYPE_DEFAULT;
 548
 549         // These are defined in the styles
 550         string const etal =
 551                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_etal")
 552                     : " et al.";
 553         string const namesep =
 554                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_namesep")
 555                    : ", ";
 556         string const lastnamesep =
 557                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_lastnamesep")
 558                     : ", and ";
 559         string const pairnamesep =
 560                 buf ? buf->params().documentClass().getCiteMacro(engine_type, "B_pairnamesep")
 561                      : " and ";
 562         string firstnameform =
 563                         buf ? buf->params().documentClass().getCiteMacro(engine_type, "!firstnameform")
 564                              : "{%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}{%prename%[[, %prename%]]}";
 565         if (!beginning)
 566                 firstnameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!firstbynameform")
 567                                              : "%prename% {%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}";
 568         string othernameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!othernameform")
 569                              : "{%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}{%prename%[[, %prename%]]}";
 570         if (!beginning)
 571                 othernameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!otherbynameform")
 572                                              : "%prename% {%prefix%[[%prefix% ]]}%surname%{%suffix%[[, %suffix%]]}";
 573         string citenameform = buf ? buf->params().documentClass().getCiteMacro(engine_type, "!citenameform")
 574                              : "{%prefix%[[%prefix% ]]}%surname%";
 575
 576         // Shorten the list (with et al.) if forceshort is set
 577         // and the list can actually be shortened, else if maxcitenames
 578         // is passed and full is not set.
 579         bool shorten = forceshort && authors.size() > 1;
 580         vector<docstring>::const_iterator it = authors.begin();
 581         vector<docstring>::const_iterator en = authors.end();
 582         for (size_t i = 0; it != en; ++it, ++i) {
 583                 if (i >= maxnames && !full) {
 584                         shorten = true;
 585                         break;
 586                 }
 587                 if (*it == "others") {
 588                         retval += buf ? buf->B_(etal) : from_ascii(etal);
 589                         break;
 590                 }
 591                 if (i > 0 && i == authors.size() - 1) {
 592                         if (authors.size() == 2)
 593                                 retval += buf ? buf->B_(pairnamesep) : from_ascii(pairnamesep);
 594                         else
 595                                 retval += buf ? buf->B_(lastnamesep) : from_ascii(lastnamesep);
 596                 } else if (i > 0)
 597                         retval += buf ? buf->B_(namesep) : from_ascii(namesep);
 598                 if (allnames)
 599                         retval += (i == 0) ? constructName(*it, firstnameform)
 600                                 : constructName(*it, othernameform);
 601                 else
 602                         retval += constructName(*it, citenameform);
 603         }
 604         if (shorten) {
 605                 if (allnames)
 606                         retval = constructName(authors[0], firstnameform) + (buf ? buf->B_(etal) : from_ascii(etal));
 607                 else
 608                         retval = constructName(authors[0], citenameform) + (buf ? buf->B_(etal) : from_ascii(etal));
 609         }
 610
 611         return convertLaTeXCommands(retval);
 612 }
 613
 614
 615 docstring const BibTeXInfo::getYear() const
 616 {
 617         if (is_bibtex_) {
 618                 // first try legacy year field
 619                 docstring year = operator[]("year");
 620                 if (!year.empty())
 621                         return year;
 622                 // now try biblatex's date field
 623                 year = operator[]("date");
 624                 // Format is [-]YYYY-MM-DD*/[-]YYYY-MM-DD*
 625                 // We only want the years.
 626                 static regex const yreg("[-]?([\\d]{4}).*");
 627                 static regex const ereg(".*/[-]?([\\d]{4}).*");
 628                 smatch sm;
 629                 string const date = to_utf8(year);
 630                 if (!regex_match(date, sm, yreg))
 631                         // cannot parse year.
 632                         return docstring();
 633                 year = from_ascii(sm[1]);
 634                 // check for an endyear
 635                 if (regex_match(date, sm, ereg))
 636                         year += char_type(0x2013) + from_ascii(sm[1]);
 637                 return year;
 638         }
 639
 640         docstring const opt = label();
 641         if (opt.empty())
 642                 return docstring();
 643
 644         docstring authors;
 645         docstring tmp = split(opt, authors, '(');
 646         if (tmp.empty())
 647                 // we don't have author (year)
 648                 return docstring();
 649         docstring year;
 650         tmp = split(tmp, year, ')');
 651         return year;
 652 }
 653
 654
 655 void BibTeXInfo::getLocators(docstring & doi, docstring & url, docstring & file) const
 656 {
 657         if (is_bibtex_) {
 658                 // get "doi" entry from citation record
 659                 doi = operator[]("doi");
 660                 if (!doi.empty() && !prefixIs(doi,from_ascii("http")))
 661                         doi = "https://doi.org/" + doi;
 662                 // get "url" entry from citation record
 663                 url = operator[]("url");
 664                 // get "file" entry from citation record
 665                 file = operator[]("file");
 666
 667                 // Jabref case, field has a format:
 668                 // Description:Location:Filetype;Description:Location:Filetype...
 669                 // We will grab only first pdf
 670                 if (!file.empty()) {
 671                         docstring ret, filedest, tmp;
 672                         ret = split(file, tmp, ':');
 673                         tmp = split(ret, filedest, ':');
 674                         //TODO howto deal with relative directories?
 675                         FileName f(to_utf8(filedest));
 676                         if (f.exists())
 677                                 file = "file:///" + filedest;
 678                 }
 679
 680                 // kbibtex case, format:
 681                 // file1.pdf;file2.pdf
 682                 // We will grab only first pdf
 683                 docstring kfile;
 684                 if (file.empty())
 685                         kfile = operator[]("localfile");
 686                 if (!kfile.empty()) {
 687                         docstring filedest, tmp;
 688                         tmp = split(kfile, filedest, ';');
 689                         //TODO howto deal with relative directories?
 690                         FileName f(to_utf8(filedest));
 691                         if (f.exists())
 692                                 file = "file:///" + filedest;
 693                 }
 694
 695                 if (!url.empty())
 696                         return;
 697
 698                 // try biblatex specific fields, see its manual
 699                 // 3.13.7 "Electronic Publishing Informationl"
 700                 docstring eprinttype = operator[]("eprinttype");
 701                 docstring eprint = operator[]("eprint");
 702                 if (eprint.empty())
 703                         return;
 704
 705                 if (eprinttype == "arxiv")
 706                         url = "https://arxiv.org/abs/" + eprint;
 707                 if (eprinttype == "jstor")
 708                         url = "https://www.jstor.org/stable/" + eprint;
 709                 if (eprinttype == "pubmed")
 710                         url = "http://www.ncbi.nlm.nih.gov/pubmed/" + eprint;
 711                 if (eprinttype == "hdl")
 712                         url = "https://hdl.handle.net/" + eprint;
 713                 if (eprinttype == "googlebooks")
 714                         url = "http://books.google.com/books?id=" + eprint;
 715
 716                 return;
 717         }
 718
 719         // Here can be handled the bibliography environment. All one could do
 720         // here is let LyX scan the entry for URL or HRef insets.
 721 }
 722
 723
 724 namespace {
 725
// Forward declaration: parseEmbeddedOption() below calls parseOptions(),
// which is only defined further down in this namespace.
docstring parseOptions(docstring const & format, string & optkey,
                    docstring & ifpart, docstring & elsepart);
 728
 729 // Calls parseOptions to deal with an embedded option, such as:
 730 //   {%number%[[, no.~%number%]]}
 731 // which must appear at the start of format. ifelsepart gets the
 732 // whole of the option, and we return what's left after the option.
 733 // we return format if there is an error.
 734 docstring parseEmbeddedOption(docstring const & format, docstring & ifelsepart)
 735 {
 736         LASSERT(format[0] == '{' && format[1] == '%', return format);
 737         string optkey;
 738         docstring ifpart;
 739         docstring elsepart;
 740         docstring const rest = parseOptions(format, optkey, ifpart, elsepart);
 741         if (format == rest) { // parse error
 742                 LYXERR0("ERROR! Couldn't parse `" << format <<"'.");
 743                 return format;
 744         }
 745         LASSERT(rest.size() <= format.size(),
 746                 { ifelsepart = docstring(); return format; });
 747         ifelsepart = format.substr(0, format.size() - rest.size());
 748         return rest;
 749 }
 750
 751
 752 // Gets a "clause" from a format string, where the clause is
 753 // delimited by '[[' and ']]'. Returns what is left after the
 754 // clause is removed, and returns format if there is an error.
 755 docstring getClause(docstring const & format, docstring & clause)
 756 {
 757         docstring fmt = format;
 758         // remove '[['
 759         fmt = fmt.substr(2);
 760         // we'll remove characters from the front of fmt as we
 761         // deal with them
 762         while (!fmt.empty()) {
 763                 if (fmt[0] == ']' && fmt.size() > 1 && fmt[1] == ']') {
 764                         // that's the end
 765                         fmt = fmt.substr(2);
 766                         break;
 767                 }
 768                 // check for an embedded option
 769                 if (fmt[0] == '{' && fmt.size() > 1 && fmt[1] == '%') {
 770                         docstring part;
 771                         docstring const rest = parseEmbeddedOption(fmt, part);
 772                         if (fmt == rest) {
 773                                 LYXERR0("ERROR! Couldn't parse embedded option in `" << format <<"'.");
 774                                 return format;
 775                         }
 776                         clause += part;
 777                         fmt = rest;
 778                 } else { // it's just a normal character
 779                                 clause += fmt[0];
 780                                 fmt = fmt.substr(1);
 781                 }
 782         }
 783         return fmt;
 784 }
 785
 786
 787 // parse an options string, which must appear at the start of the
 788 // format parameter. puts the parsed bits in optkey, ifpart, and
 789 // elsepart and returns what's left after the option is removed.
 790 // if there's an error, it returns format itself.
 791 docstring parseOptions(docstring const & format, string & optkey,
 792                     docstring & ifpart, docstring & elsepart)
 793 {
 794         LASSERT(format[0] == '{' && format[1] == '%', return format);
 795         // strip '{%'
 796         docstring fmt = format.substr(2);
 797         size_t pos = fmt.find('%'); // end of key
 798         if (pos == string::npos) {
 799                 LYXERR0("Error parsing  `" << format <<"'. Can't find end of key.");
 800                 return format;
 801         }
 802         optkey = to_utf8(fmt.substr(0, pos));
 803         fmt = fmt.substr(pos + 1);
 804         // [[format]] should be next
 805         if (fmt[0] != '[' || fmt[1] != '[') {
 806                 LYXERR0("Error parsing  `" << format <<"'. Can't find '[[' after key.");
 807                 return format;
 808         }
 809
 810         docstring curfmt = fmt;
 811         fmt = getClause(curfmt, ifpart);
 812         if (fmt == curfmt) {
 813                 LYXERR0("Error parsing  `" << format <<"'. Couldn't get if clause.");
 814                 return format;
 815         }
 816
 817         if (fmt[0] == '}') // we're done, no else clause
 818                 return fmt.substr(1);
 819
 820         // else part should follow
 821         if (fmt[0] != '[' || fmt[1] != '[') {
 822                 LYXERR0("Error parsing  `" << format <<"'. Can't find else clause.");
 823                 return format;
 824         }
 825
 826         curfmt = fmt;
 827         fmt = getClause(curfmt, elsepart);
 828         // we should be done
 829         if (fmt == curfmt || fmt[0] != '}') {
 830                 LYXERR0("Error parsing  `" << format <<"'. Can't find end of option.");
 831                 return format;
 832         }
 833         return fmt.substr(1);
 834 }
 835
 836
 837 } // namespace
 838
 839 /* FIXME
 840 Bug #9131 revealed an oddity in how we are generating citation information
 841 when more than one key is given. We end up building a longer and longer format
 842 string as we go, which we then have to re-parse, over and over and over again,
 843 rather than generating the information for the individual keys and then putting
 844 all of that together. We do that to deal with the way separators work, from what
 845 I can tell, but it still feels like a hack. Fixing this would require quite a
 846 bit of work, however.
 847 */
// Expands the citation format string \p format for this entry and
// returns the result. The format is consumed from the front, one
// construct at a time:
//  - %key% is replaced according to the kind of key:
//      * a key starting with '!' is a macro: its definition is fetched
//        with getCiteMacro() and put back in front of the remaining
//        format, to be scanned again; \p counter is incremented;
//      * a key starting with "B_" or '_' is fetched with getCiteMacro()
//        and passed through translateIfPossible() (with the buffer
//        language code for "B_", without a language code for '_');
//      * any other key is resolved with getValueForKey() and, unless we
//        are inside rich text, wrapped into {!<span class="bib-KEY">!}
//        and {!</span>!}.
//  - {%key%[[ifpart]][[elsepart]]} is a conditional, see parseOptions().
//    The keys "next" and "second" are decided by the parameters \p next
//    and \p second; all other keys by whether getValueForKey() returns
//    a non-empty value.
//  - {! ... !} marks rich text; the markers are copied to the output.
//  - everything else is copied to the output unchanged.
// \p counter is compared against max_passes in order to stop runaway
// macro expansion. On any error, the translated string "ERROR!" is
// returned.
docstring BibTeXInfo::expandFormat(docstring const & format,
                BibTeXInfoList const & xrefs, int & counter, Buffer const & buf,
                CiteItem const & ci, bool next, bool second) const
{
        // incorrect use of macros could put us in an infinite loop
        static int const max_passes = 5000;
        // the use of overly large keys can lead to performance problems, due
        // to eventual attempts to convert LaTeX macros to unicode. See bug
        // #8944. By default, the size is limited to 128 (in CiteItem), but
        // for specific purposes (such as XHTML export), it needs to be enlarged
        // This is perhaps not the best solution, but it will have to do for now.
        size_t const max_keysize = ci.max_key_size;
        odocstringstream ret; // return value
        string key;
        // true while we are between the two '%' of a %key%
        bool scanning_key = false;
        // true while we are between '{!' and '!}'
        bool scanning_rich = false;

        CiteEngineType const engine_type = buf.params().citeEngineType();
        docstring fmt = format;
        // we'll remove characters from the front of fmt as we
        // deal with them
        while (!fmt.empty()) {
                if (counter > max_passes) {
                        LYXERR0("Recursion limit reached while parsing `"
                                << format << "'.");
                        return _("ERROR!");
                }

                char_type thischar = fmt[0];
                if (thischar == '%') {
                        // beginning or end of key
                        if (scanning_key) {
                                // end of key
                                scanning_key = false;
                                // so we replace the key with its value, which may be empty
                                if (key[0] == '!') {
                                        // macro
                                        string const val =
                                                buf.params().documentClass().getCiteMacro(engine_type, key);
                                        // drop the closing '%' and put the macro definition
                                        // in its place, so that it gets scanned in turn
                                        fmt = from_utf8(val) + fmt.substr(1);
                                        counter += 1;
                                        continue;
                                } else if (prefixIs(key, "B_")) {
                                        // a translatable bit (to the Buffer language)
                                        string const val =
                                                buf.params().documentClass().getCiteMacro(engine_type, key);
                                        docstring const trans =
                                                translateIfPossible(from_utf8(val), buf.params().language->code());
                                        ret << trans;
                                } else if (key[0] == '_') {
                                        // a translatable bit (to the GUI language)
                                        string const val =
                                                buf.params().documentClass().getCiteMacro(engine_type, key);
                                        docstring const trans =
                                                translateIfPossible(from_utf8(val));
                                        ret << trans;
                                } else {
                                        // an ordinary key: output its value, marked up with
                                        // a span (in rich text markers) unless we already
                                        // are inside rich text
                                        docstring const val =
                                                getValueForKey(key, buf, ci, xrefs, max_keysize);
                                        if (!scanning_rich)
                                                ret << from_ascii("{!<span class=\"bib-" + key + "\">!}");
                                        ret << val;
                                        if (!scanning_rich)
                                                ret << from_ascii("{!</span>!}");
                                }
                        } else {
                                // beginning of key
                                key.clear();
                                scanning_key = true;
                        }
                }
                else if (thischar == '{') {
                        // beginning of option?
                        if (scanning_key) {
                                LYXERR0("ERROR: Found `{' when scanning key in `" << format << "'.");
                                return _("ERROR!");
                        }
                        if (fmt.size() > 1) {
                                if (fmt[1] == '%') {
                                        // it is the beginning of an optional format
                                        string optkey;
                                        docstring ifpart;
                                        docstring elsepart;
                                        docstring const newfmt =
                                                parseOptions(fmt, optkey, ifpart, elsepart);
                                        if (newfmt == fmt) // parse error
                                                return _("ERROR!");
                                        fmt = newfmt;
                                        docstring const val =
                                                getValueForKey(optkey, buf, ci, xrefs);
                                        // The clauses are expanded with a pass counter of
                                        // their own. Note that the recursive calls do not
                                        // pass on `second', so the declared default is used.
                                        if (optkey == "next" && next)
                                                ret << ifpart; // without expansion
                                        else if (optkey == "second" && second) {
                                                int newcounter = 0;
                                                ret << expandFormat(ifpart, xrefs, newcounter, buf,
                                                        ci, next);
                                        } else if (!val.empty()) {
                                                int newcounter = 0;
                                                ret << expandFormat(ifpart, xrefs, newcounter, buf,
                                                        ci, next);
                                        } else if (!elsepart.empty()) {
                                                int newcounter = 0;
                                                ret << expandFormat(elsepart, xrefs, newcounter, buf,
                                                        ci, next);
                                        }
                                        // fmt will have been shortened for us already
                                        continue;
                                }
                                if (fmt[1] == '!') {
                                        // beginning of rich text
                                        scanning_rich = true;
                                        fmt = fmt.substr(2);
                                        ret << from_ascii("{!");
                                        continue;
                                }
                        }
                        // we are here if '{' was not followed by % or !.
                        // So it's just a character.
                        ret << thischar;
                }
                else if (scanning_rich && thischar == '!'
                         && fmt.size() > 1 && fmt[1] == '}') {
                        // end of rich text
                        scanning_rich = false;
                        fmt = fmt.substr(2);
                        ret << from_ascii("!}");
                        continue;
                }
                else if (scanning_key)
                        // inside %...%: collect the name of the key
                        key += char(thischar);
                else {
                        // ordinary character: copy it to the output
                        try {
                                ret.put(thischar);
                        } catch (EncodingException & /* e */) {
                                LYXERR0("Uncodable character '" << docstring(1, thischar) << " in citation label!");
                        }
                }
                // done with this character
                fmt = fmt.substr(1);
        } // while loop
        if (scanning_key) {
                LYXERR0("Never found end of key in `" << format << "'!");
                return _("ERROR!");
        }
        if (scanning_rich) {
                LYXERR0("Never found end of rich text in `" << format << "'!");
                return _("ERROR!");
        }
        return ret.str();
}
 997
 998
 999 docstring const & BibTeXInfo::getInfo(BibTeXInfoList const & xrefs,
1000         Buffer const & buf, CiteItem const & ci, docstring const & format_in) const
1001 {
1002         bool const richtext = ci.richtext;
1003
1004         CiteEngineType const engine_type = buf.params().citeEngineType();
1005         DocumentClass const & dc = buf.params().documentClass();
1006         docstring const & format = format_in.empty()?
1007                                 from_utf8(dc.getCiteFormat(engine_type, to_utf8(entry_type_)))
1008                               : format_in;
1009
1010         if (format != format_) {
1011                 // clear caches since format changed
1012                 info_.clear();
1013                 info_richtext_.clear();
1014                 format_ = format;
1015         }
1016
1017         if (!richtext && !info_.empty()) {
1018                 info_ = convertLaTeXCommands(processRichtext(info_, false));
1019                 return info_;
1020         }
1021         if (richtext && !info_richtext_.empty())
1022                 return info_richtext_;
1023
1024         if (!is_bibtex_) {
1025                 BibTeXInfo::const_iterator it = find(from_ascii("ref"));
1026                 info_ = it->second;
1027                 return info_;
1028         }
1029
1030         int counter = 0;
1031         info_ = expandFormat(format, xrefs, counter, buf,
1032                 ci, false, false);
1033
1034         if (info_.empty()) {
1035                 // this probably shouldn't happen
1036                 return info_;
1037         }
1038
1039         if (richtext) {
1040                 info_richtext_ = convertLaTeXCommands(processRichtext(info_, true));
1041                 return info_richtext_;
1042         }
1043
1044         info_ = convertLaTeXCommands(processRichtext(info_, false));
1045         return info_;
1046 }
1047
1048
1049 docstring const BibTeXInfo::getLabel(BibTeXInfoList const & xrefs,
1050         Buffer const & buf, docstring const & format,
1051         CiteItem const & ci, bool next, bool second) const
1052 {
1053         docstring loclabel;
1054
1055         int counter = 0;
1056         loclabel = expandFormat(format, xrefs, counter, buf, ci, next, second);
1057
1058         if (!loclabel.empty() && !next) {
1059                 loclabel = processRichtext(loclabel, ci.richtext);
1060                 loclabel = convertLaTeXCommands(loclabel);
1061         }
1062
1063         return loclabel;
1064 }
1065
1066
1067 docstring const & BibTeXInfo::operator[](docstring const & field) const
1068 {
1069         BibTeXInfo::const_iterator it = find(field);
1070         if (it != end())
1071                 return it->second;
1072         static docstring const empty_value = docstring();
1073         return empty_value;
1074 }
1075
1076
1077 docstring const & BibTeXInfo::operator[](string const & field) const
1078 {
1079         return operator[](from_ascii(field));
1080 }
1081
1082
1083 docstring BibTeXInfo::getValueForKey(string const & oldkey, Buffer const & buf,
1084         CiteItem const & ci, BibTeXInfoList const & xrefs, size_t maxsize) const
1085 {
1086         // anything less is pointless
1087         LASSERT(maxsize >= 16, maxsize = 16);
1088         string key = oldkey;
1089         bool cleanit = false;
1090         if (prefixIs(oldkey, "clean:")) {
1091                 key = oldkey.substr(6);
1092                 cleanit = true;
1093         }
1094
1095         docstring ret = operator[](key);
1096         if (ret.empty() && !xrefs.empty()) {
1097                 // xr is a (reference to a) BibTeXInfo const *
1098                 for (auto const & xr : xrefs) {
1099                         if (xr && !(*xr)[key].empty()) {
1100                                 ret = (*xr)[key];
1101                                 break;
1102                         }
1103                 }
1104         }
1105         if (ret.empty()) {
1106                 // some special keys
1107                 // FIXME: dialog, textbefore and textafter have nothing to do with this
1108                 if (key == "dialog" && ci.context == CiteItem::Dialog)
1109                         ret = from_ascii("x"); // any non-empty string will do
1110                 else if (key == "export" && ci.context == CiteItem::Export)
1111                         ret = from_ascii("x"); // any non-empty string will do
1112                 else if (key == "ifstar" && ci.Starred)
1113                         ret = from_ascii("x"); // any non-empty string will do
1114                 else if (key == "ifqualified" && ci.isQualified)
1115                         ret = from_ascii("x"); // any non-empty string will do
1116                 else if (key == "entrytype")
1117                         ret = entry_type_;
1118                 else if (prefixIs(key, "ifentrytype:")
1119                          && from_ascii(key.substr(12)) == entry_type_)
1120                         ret = from_ascii("x"); // any non-empty string will do
1121                 else if (key == "key")
1122                         ret = bib_key_;
1123                 else if (key == "label")
1124                         ret = label_;
1125                 else if (key == "modifier" && modifier_ != 0)
1126                         ret = modifier_;
1127                 else if (key == "numericallabel")
1128                         ret = cite_number_;
1129                 else if (prefixIs(key, "ifmultiple:")) {
1130                         // Return whether we have multiple authors
1131                         docstring const kind = operator[](from_ascii(key.substr(11)));
1132                         if (multipleAuthors(kind))
1133                                 ret = from_ascii("x"); // any non-empty string will do
1134                 }
1135                 else if (prefixIs(key, "abbrvnames:")) {
1136                         // Special key to provide abbreviated name list,
1137                         // with respect to maxcitenames. Suitable for Bibliography
1138                         // beginnings.
1139                         docstring const kind = operator[](from_ascii(key.substr(11)));
1140                         ret = getAuthorList(&buf, kind, false, false, true);
1141                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1142                                 ret[0] = uppercase(ret[0]);
1143                 } else if (prefixIs(key, "fullnames:")) {
1144                         // Return a full name list. Suitable for Bibliography
1145                         // beginnings.
1146                         docstring const kind = operator[](from_ascii(key.substr(10)));
1147                         ret = getAuthorList(&buf, kind, true, false, true);
1148                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1149                                 ret[0] = uppercase(ret[0]);
1150                 } else if (prefixIs(key, "forceabbrvnames:")) {
1151                         // Special key to provide abbreviated name lists,
1152                         // irrespective of maxcitenames. Suitable for Bibliography
1153                         // beginnings.
1154                         docstring const kind = operator[](from_ascii(key.substr(15)));
1155                         ret = getAuthorList(&buf, kind, false, true, true);
1156                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1157                                 ret[0] = uppercase(ret[0]);
1158                 } else if (prefixIs(key, "abbrvbynames:")) {
1159                         // Special key to provide abbreviated name list,
1160                         // with respect to maxcitenames. Suitable for further names inside a
1161                         // bibliography item // (such as "ed. by ...")
1162                         docstring const kind = operator[](from_ascii(key.substr(11)));
1163                         ret = getAuthorList(&buf, kind, false, false, true, false);
1164                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1165                                 ret[0] = uppercase(ret[0]);
1166                 } else if (prefixIs(key, "fullbynames:")) {
1167                         // Return a full name list. Suitable for further names inside a
1168                         // bibliography item // (such as "ed. by ...")
1169                         docstring const kind = operator[](from_ascii(key.substr(10)));
1170                         ret = getAuthorList(&buf, kind, true, false, true, false);
1171                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1172                                 ret[0] = uppercase(ret[0]);
1173                 } else if (prefixIs(key, "forceabbrvbynames:")) {
1174                         // Special key to provide abbreviated name lists,
1175                         // irrespective of maxcitenames. Suitable for further names inside a
1176                         // bibliography item // (such as "ed. by ...")
1177                         docstring const kind = operator[](from_ascii(key.substr(15)));
1178                         ret = getAuthorList(&buf, kind, false, true, true, false);
1179                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1180                                 ret[0] = uppercase(ret[0]);
1181                 } else if (key == "abbrvciteauthor") {
1182                         // Special key to provide abbreviated author or
1183                         // editor names (suitable for citation labels),
1184                         // with respect to maxcitenames.
1185                         ret = getAuthorOrEditorList(&buf, false, false);
1186                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1187                                 ret[0] = uppercase(ret[0]);
1188                 } else if (key == "fullciteauthor") {
1189                         // Return a full author or editor list (for citation labels)
1190                         ret = getAuthorOrEditorList(&buf, true, false);
1191                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1192                                 ret[0] = uppercase(ret[0]);
1193                 } else if (key == "forceabbrvciteauthor") {
1194                         // Special key to provide abbreviated author or
1195                         // editor names (suitable for citation labels),
1196                         // irrespective of maxcitenames.
1197                         ret = getAuthorOrEditorList(&buf, false, true);
1198                         if (ci.forceUpperCase && isLowerCase(ret[0]))
1199                                 ret[0] = uppercase(ret[0]);
1200                 } else if (key == "bibentry") {
1201                         // Special key to provide the full bibliography entry: see getInfo()
1202                         CiteEngineType const engine_type = buf.params().citeEngineType();
1203                         DocumentClass const & dc = buf.params().documentClass();
1204                         docstring const & format =
1205                                 from_utf8(dc.getCiteFormat(engine_type, to_utf8(entry_type_), false));
1206                         int counter = 0;
1207                         ret = expandFormat(format, xrefs, counter, buf, ci, false, false);
1208                 } else if (key == "textbefore")
1209                         ret = ci.textBefore;
1210                 else if (key == "textafter")
1211                         ret = ci.textAfter;
1212                 else if (key == "curpretext") {
1213                         vector<pair<docstring, docstring>> pres = ci.getPretexts();
1214                         vector<pair<docstring, docstring>>::iterator it = pres.begin();
1215                         int numkey = 1;
1216                         for (; it != pres.end() ; ++it) {
1217                                 if ((*it).first == bib_key_ && numkey == num_bib_key_) {
1218                                         ret = (*it).second;
1219                                         pres.erase(it);
1220                                         break;
1221                                 }
1222                                 if ((*it).first == bib_key_)
1223                                         ++numkey;
1224                         }
1225                 } else if (key == "curposttext") {
1226                         vector<pair<docstring, docstring>> posts = ci.getPosttexts();
1227                         vector<pair<docstring, docstring>>::iterator it = posts.begin();
1228                         int numkey = 1;
1229                         for (; it != posts.end() ; ++it) {
1230                                 if ((*it).first == bib_key_ && numkey == num_bib_key_) {
1231                                         ret = (*it).second;
1232                                         posts.erase(it);
1233                                         break;
1234                                 }
1235                                 if ((*it).first == bib_key_)
1236                                         ++numkey;
1237                         }
1238                 } else if (key == "year")
1239                         ret = getYear();
1240         }
1241
1242         if (cleanit)
1243                 ret = xml::cleanAttr(ret);
1244
1245         // make sure it is not too big
1246         support::truncateWithEllipsis(ret, maxsize);
1247         return ret;
1248 }
1249
1250
1251 //////////////////////////////////////////////////////////////////////
1252 //
1253 // BiblioInfo
1254 //
1255 //////////////////////////////////////////////////////////////////////
1256
1257 namespace {
1258
1259 // A functor for use with sort, leading to case insensitive sorting
1260 bool compareNoCase(const docstring & a, const docstring & b) {
1261         return compare_no_case(a, b) < 0;
1262 }
1263
1264 } // namespace
1265
1266
1267 vector<docstring> const BiblioInfo::getXRefs(BibTeXInfo const & data, bool const nested) const
1268 {
1269         vector<docstring> result;
1270         if (!data.isBibTeX())
1271                 return result;
1272         // Legacy crossref field. This is not nestable.
1273         if (!nested && !data["crossref"].empty()) {
1274                 docstring const xrefkey = data["crossref"];
1275                 result.push_back(xrefkey);
1276                 // However, check for nested xdatas
1277                 BiblioInfo::const_iterator it = find(xrefkey);
1278                 if (it != end()) {
1279                         BibTeXInfo const & xref = it->second;
1280                         vector<docstring> const nxdata = getXRefs(xref, true);
1281                         if (!nxdata.empty())
1282                                 result.insert(result.end(), nxdata.begin(), nxdata.end());
1283                 }
1284         }
1285         // Biblatex's xdata field. Infinitely nestable.
1286         // XData field can consist of a comma-separated list of keys
1287         vector<docstring> const xdatakeys = getVectorFromString(data["xdata"]);
1288         if (!xdatakeys.empty()) {
1289                 for (auto const & xdatakey : xdatakeys) {
1290                         result.push_back(xdatakey);
1291                         BiblioInfo::const_iterator it = find(xdatakey);
1292                         if (it != end()) {
1293                                 BibTeXInfo const & xdata = it->second;
1294                                 vector<docstring> const nxdata = getXRefs(xdata, true);
1295                                 if (!nxdata.empty())
1296                                         result.insert(result.end(), nxdata.begin(), nxdata.end());
1297                         }
1298                 }
1299         }
1300         return result;
1301 }
1302
1303
1304 vector<docstring> const BiblioInfo::getKeys() const
1305 {
1306         vector<docstring> bibkeys;
1307         for (auto const & bi : *this)
1308                 bibkeys.push_back(bi.first);
1309         sort(bibkeys.begin(), bibkeys.end(), &compareNoCase);
1310         return bibkeys;
1311 }
1312
1313
1314 vector<docstring> const BiblioInfo::getFields() const
1315 {
1316         vector<docstring> bibfields;
1317         for (auto const & fn : field_names_)
1318                 bibfields.push_back(fn);
1319         sort(bibfields.begin(), bibfields.end());
1320         return bibfields;
1321 }
1322
1323
1324 vector<docstring> const BiblioInfo::getEntries() const
1325 {
1326         vector<docstring> bibentries;
1327         for (auto const & et : entry_types_)
1328                 bibentries.push_back(et);
1329         sort(bibentries.begin(), bibentries.end());
1330         return bibentries;
1331 }
1332
1333
1334 docstring const BiblioInfo::getAuthorOrEditorList(docstring const & key, Buffer const & buf) const
1335 {
1336         BiblioInfo::const_iterator it = find(key);
1337         if (it == end())
1338                 return docstring();
1339         BibTeXInfo const & data = it->second;
1340         return data.getAuthorOrEditorList(&buf, false);
1341 }
1342
1343
1344 docstring const BiblioInfo::getCiteNumber(docstring const & key) const
1345 {
1346         BiblioInfo::const_iterator it = find(key);
1347         if (it == end())
1348                 return docstring();
1349         BibTeXInfo const & data = it->second;
1350         return data.citeNumber();
1351 }
1352
1353 void BiblioInfo::getLocators(docstring const & key, docstring & doi, docstring & url, docstring & file) const
1354 {
1355         BiblioInfo::const_iterator it = find(key);
1356          if (it == end())
1357                 return;
1358         BibTeXInfo const & data = it->second;
1359         data.getLocators(doi,url,file);
1360 }
1361
1362
1363 docstring const BiblioInfo::getYear(docstring const & key, bool use_modifier) const
1364 {
1365         BiblioInfo::const_iterator it = find(key);
1366         if (it == end())
1367                 return docstring();
1368         BibTeXInfo const & data = it->second;
1369         docstring year = data.getYear();
1370         if (year.empty()) {
1371                 // let's try the crossrefs
1372                 vector<docstring> const xrefs = getXRefs(data);
1373                 if (xrefs.empty())
1374                         // no luck
1375                         return docstring();
1376                 for (docstring const & xref : xrefs) {
1377                         BiblioInfo::const_iterator const xrefit = find(xref);
1378                         if (xrefit == end())
1379                                 continue;
1380                         BibTeXInfo const & xref_data = xrefit->second;
1381                         year = xref_data.getYear();
1382                         if (!year.empty())
1383                                 // success!
1384                                 break;
1385                 }
1386         }
1387         if (use_modifier && data.modifier() != 0)
1388                 year += data.modifier();
1389         return year;
1390 }
1391
1392
1393 docstring const BiblioInfo::getYear(docstring const & key, Buffer const & buf, bool use_modifier) const
1394 {
1395         docstring const year = getYear(key, use_modifier);
1396         if (year.empty())
1397                 return buf.B_("No year");
1398         return year;
1399 }
1400
1401
1402 docstring const BiblioInfo::getInfo(docstring const & key,
1403         Buffer const & buf, CiteItem const & ci, docstring const & format) const
1404 {
1405         BiblioInfo::const_iterator it = find(key);
1406         if (it == end())
1407                 return docstring(_("Bibliography entry not found!"));
1408         BibTeXInfo const & data = it->second;
1409         BibTeXInfoList xrefptrs;
1410         for (docstring const & xref : getXRefs(data)) {
1411                 BiblioInfo::const_iterator const xrefit = find(xref);
1412                 if (xrefit != end())
1413                         xrefptrs.push_back(&(xrefit->second));
1414         }
1415         return data.getInfo(xrefptrs, buf, ci, format);
1416 }
1417
1418
1419 docstring const BiblioInfo::getLabel(vector<docstring> keys,
1420         Buffer const & buf, string const & style, CiteItem const & ci) const
1421 {
1422         size_t max_size = ci.max_size;
1423         // shorter makes no sense
1424         LASSERT(max_size >= 16, max_size = 16);
1425
1426         // we can't display more than 10 of these, anyway
1427         // but since we truncate in the middle,
1428         // we need to split into two halfs.
1429         bool const too_many_keys = keys.size() > 10;
1430         vector<docstring> lkeys;
1431         if (too_many_keys) {
1432                 lkeys.insert(lkeys.end(), keys.end() - 5, keys.end());
1433                 keys.resize(5);
1434                 keys.insert(keys.end(), lkeys.begin(), lkeys.end());
1435         }
1436
1437         CiteEngineType const engine_type = buf.params().citeEngineType();
1438         DocumentClass const & dc = buf.params().documentClass();
1439         docstring const & format = from_utf8(dc.getCiteFormat(engine_type, style, false, "cite"));
1440         docstring ret = format;
1441         vector<docstring>::const_iterator key = keys.begin();
1442         vector<docstring>::const_iterator ken = keys.end();
1443         vector<docstring> handled_keys;
1444         for (int i = 0; key != ken; ++key, ++i) {
1445                 handled_keys.push_back(*key);
1446                 int n = 0;
1447                 for (auto const & k : handled_keys) {
1448                         if (k == *key)
1449                                 ++n;
1450                 }
1451                 BiblioInfo::const_iterator it = find(*key);
1452                 BibTeXInfo empty_data;
1453                 empty_data.key(*key);
1454                 BibTeXInfo & data = empty_data;
1455                 vector<BibTeXInfo const *> xrefptrs;
1456                 if (it != end()) {
1457                         data = it->second;
1458                         for (docstring const & xref : getXRefs(data)) {
1459                                 BiblioInfo::const_iterator const xrefit = find(xref);
1460                                 if (xrefit != end())
1461                                         xrefptrs.push_back(&(xrefit->second));
1462                         }
1463                 }
1464                 data.numKey(n);
1465                 ret = data.getLabel(xrefptrs, buf, ret, ci, key + 1 != ken, i == 1);
1466         }
1467
1468         support::truncateWithEllipsis(ret, max_size, true);
1469
1470         return ret;
1471 }
1472
1473
1474 bool BiblioInfo::isBibtex(docstring const & key) const
1475 {
1476         docstring key1;
1477         split(key, key1, ',');
1478         BiblioInfo::const_iterator it = find(key1);
1479         if (it == end())
1480                 return false;
1481         return it->second.isBibTeX();
1482 }
1483
1484
1485 BiblioInfo::CiteStringMap const BiblioInfo::getCiteStrings(
1486         vector<docstring> const & keys, vector<CitationStyle> const & styles,
1487         Buffer const & buf, CiteItem const & ci) const
1488 {
1489         if (empty())
1490                 return vector<pair<docstring,docstring>>();
1491
1492         string style;
1493         CiteStringMap csm(styles.size());
1494         for (size_t i = 0; i != csm.size(); ++i) {
1495                 style = styles[i].name;
1496                 csm[i] = make_pair(from_ascii(style), getLabel(keys, buf, style, ci));
1497         }
1498
1499         return csm;
1500 }
1501
1502
1503 void BiblioInfo::mergeBiblioInfo(BiblioInfo const & info)
1504 {
1505         bimap_.insert(info.begin(), info.end());
1506         field_names_.insert(info.field_names_.begin(), info.field_names_.end());
1507         entry_types_.insert(info.entry_types_.begin(), info.entry_types_.end());
1508 }
1509
1510
1511 namespace {
1512
1513 // used in xhtml to sort a list of BibTeXInfo objects
1514 bool lSorter(BibTeXInfo const * lhs, BibTeXInfo const * rhs)
1515 {
1516         docstring const lauth = lhs->getAuthorOrEditorList();
1517         docstring const rauth = rhs->getAuthorOrEditorList();
1518         docstring const lyear = lhs->getYear();
1519         docstring const ryear = rhs->getYear();
1520         docstring const ltitl = lhs->operator[]("title");
1521         docstring const rtitl = rhs->operator[]("title");
1522         return  (lauth < rauth)
1523                 || (lauth == rauth && lyear < ryear)
1524                 || (lauth == rauth && lyear == ryear && ltitl < rtitl);
1525 }
1526
1527 } // namespace
1528
1529
1530 void BiblioInfo::collectCitedEntries(Buffer const & buf)
1531 {
1532         cited_entries_.clear();
1533         // We are going to collect all the citation keys used in the document,
1534         // getting them from the TOC.
1535         // FIXME We may want to collect these differently, in the first case,
1536         // so that we might have them in order of appearance.
1537         set<docstring> citekeys;
1538         Toc const & toc = *buf.tocBackend().toc("citation");
1539         for (auto const & t : toc) {
1540                 if (t.str().empty())
1541                         continue;
1542                 vector<docstring> const keys = getVectorFromString(t.str());
1543                 citekeys.insert(keys.begin(), keys.end());
1544         }
1545         if (citekeys.empty())
1546                 return;
1547
1548         // We have a set of the keys used in this document.
1549         // We will now convert it to a list of the BibTeXInfo objects used in
1550         // this document...
1551         vector<BibTeXInfo const *> bi;
1552         for (auto const & ck : citekeys) {
1553                 BiblioInfo::const_iterator const bt = find(ck);
1554                 if (bt == end() || !bt->second.isBibTeX())
1555                         continue;
1556                 bi.push_back(&(bt->second));
1557         }
1558         // ...and sort it.
1559         sort(bi.begin(), bi.end(), lSorter);
1560
1561         // Now we can write the sorted keys
1562         // b is a BibTeXInfo const *
1563         for (auto const & b : bi)
1564                 cited_entries_.push_back(b->key());
1565 }
1566
1567
1568 void BiblioInfo::makeCitationLabels(Buffer const & buf)
1569 {
1570         collectCitedEntries(buf);
1571         CiteEngineType const engine_type = buf.params().citeEngineType();
1572         bool const numbers = (engine_type & ENGINE_TYPE_NUMERICAL);
1573
1574         int keynumber = 0;
1575         char modifier = 0;
1576         // used to remember the last one we saw
1577         // we'll be comparing entries to see if we need to add
1578         // modifiers, like "1984a"
1579         map<docstring, BibTeXInfo>::iterator last = bimap_.end();
1580
1581         // add letters to years
1582         for (auto const & ce : cited_entries_) {
1583                 map<docstring, BibTeXInfo>::iterator const biit = bimap_.find(ce);
1584                 // this shouldn't happen, but...
1585                 if (biit == bimap_.end())
1586                         // ...fail gracefully, anyway.
1587                         continue;
1588                 BibTeXInfo & entry = biit->second;
1589                 if (numbers) {
1590                         docstring const num = convert<docstring>(++keynumber);
1591                         entry.setCiteNumber(num);
1592                 } else {
1593                         // The first test here is checking whether this is the first
1594                         // time through the loop. If so, then we do not have anything
1595                         // with which to compare.
1596                         if (last != bimap_.end()
1597                             && entry.getAuthorOrEditorList() == last->second.getAuthorOrEditorList()
1598                             // we access the year via getYear() so as to get it from the xref,
1599                             // if we need to do so
1600                             && getYear(entry.key()) == getYear(last->second.key())) {
1601                                 if (modifier == 0) {
1602                                         // so the last one should have been 'a'
1603                                         last->second.setModifier('a');
1604                                         modifier = 'b';
1605                                 } else if (modifier == 'z')
1606                                         modifier = 'A';
1607                                 else
1608                                         modifier++;
1609                         } else {
1610                                 modifier = 0;
1611                         }
1612                         entry.setModifier(modifier);
1613                         // remember the last one
1614                         last = biit;
1615                 }
1616         }
1617         // Set the labels
1618         for (auto const & ce : cited_entries_) {
1619                 map<docstring, BibTeXInfo>::iterator const biit = bimap_.find(ce);
1620                 // this shouldn't happen, but...
1621                 if (biit == bimap_.end())
1622                         // ...fail gracefully, anyway.
1623                         continue;
1624                 BibTeXInfo & entry = biit->second;
1625                 if (numbers) {
1626                         entry.label(entry.citeNumber());
1627                 } else {
1628                         docstring const auth = entry.getAuthorOrEditorList(&buf, false);
1629                         // we do it this way so as to access the xref, if necessary
1630                         // note that this also gives us the modifier
1631                         docstring const year = getYear(ce, buf, true);
1632                         if (!auth.empty() && !year.empty())
1633                                 entry.label(auth + ' ' + year);
1634                         else
1635                                 entry.label(entry.key());
1636                 }
1637         }
1638 }
1639
1640
1641 //////////////////////////////////////////////////////////////////////
1642 //
1643 // CitationStyle
1644 //
1645 //////////////////////////////////////////////////////////////////////
1646
1647
1648 CitationStyle citationStyleFromString(string const & command,
1649                                       BufferParams const & params)
1650 {
1651         CitationStyle cs;
1652         if (command.empty())
1653                 return cs;
1654
1655         string const alias = params.getCiteAlias(command);
1656         string cmd = alias.empty() ? command : alias;
1657         if (isUpperCase(command[0])) {
1658                 cs.forceUpperCase = true;
1659                 cmd[0] = lowercase(cmd[0]);
1660         }
1661
1662         size_t const n = command.size() - 1;
1663         if (command[n] == '*') {
1664                 cs.hasStarredVersion = true;
1665                 if (suffixIs(cmd, '*'))
1666                         cmd = cmd.substr(0, cmd.size() - 1);
1667         }
1668
1669         cs.name = cmd;
1670         return cs;
1671 }
1672
1673
1674 string citationStyleToString(const CitationStyle & cs, bool const latex)
1675 {
1676         string cmd = latex ? cs.cmd : cs.name;
1677         if (cs.forceUpperCase)
1678                 cmd[0] = uppercase(cmd[0]);
1679         if (cs.hasStarredVersion)
1680                 cmd += '*';
1681         return cmd;
1682 }
1683
1684
1685 docstring authorsToDocBookAuthorGroup(docstring const & authorsString, XMLStream & xs, Buffer const & buf)
1686 {
1687         // This function closely mimics getAuthorList, but produces DocBook instead of text.
1688         // It has been greatly simplified, as the complete list of authors is always produced. No separators are required,
1689         // as the output has a database-like shape.
1690         // constructName has also been merged within, as it becomes really simple and leads to no copy-paste.
1691
1692         if (authorsString.empty()) {
1693                 return docstring();
1694         }
1695
1696         // Split the input list of authors into individual authors.
1697         vector<docstring> const authors = getAuthors(authorsString);
1698
1699         // Retrieve the "et al." variation.
1700         string const etal = buf.params().documentClass().getCiteMacro(buf.params().citeEngineType(), "_etal");
1701
1702         // Output the list of authors.
1703         xs << xml::StartTag("authorgroup");
1704         xs << xml::CR();
1705
1706         auto it = authors.cbegin();
1707         auto en = authors.cend();
1708         for (size_t i = 0; it != en; ++it, ++i) {
1709                 xs << xml::StartTag("author");
1710                 xs << xml::CR();
1711                 xs << xml::StartTag("personname");
1712                 xs << xml::CR();
1713                 docstring name = *it;
1714
1715                 // All authors go in a <personname>. If more structure is known, use it; otherwise (just "et al."), print it as such.
1716                 if (name == "others") {
1717                         xs << buf.B_(etal);
1718                 } else {
1719                         name_parts parts = nameParts(name);
1720                         if (! parts.prefix.empty()) {
1721                                 xs << xml::StartTag("honorific");
1722                                 xs << parts.prefix;
1723                                 xs << xml::EndTag("honorific");
1724                                 xs << xml::CR();
1725                         }
1726                         if (! parts.prename.empty()) {
1727                                 xs << xml::StartTag("firstname");
1728                                 xs << parts.prename;
1729                                 xs << xml::EndTag("firstname");
1730                                 xs << xml::CR();
1731                         }
1732                         if (! parts.surname.empty()) {
1733                                 xs << xml::StartTag("surname");
1734                                 xs << parts.surname;
1735                                 xs << xml::EndTag("surname");
1736                                 xs << xml::CR();
1737                         }
1738                         if (! parts.suffix.empty()) {
1739                                 xs << xml::StartTag("othername", "role=\"suffix\"");
1740                                 xs << parts.suffix;
1741                                 xs << xml::EndTag("othername");
1742                                 xs << xml::CR();
1743                         }
1744                 }
1745
1746                 xs << xml::EndTag("personname");
1747                 xs << xml::CR();
1748                 xs << xml::EndTag("author");
1749                 xs << xml::CR();
1750
1751                 // Could add an affiliation after <personname>, but not stored in BibTeX.
1752         }
1753         xs << xml::EndTag("authorgroup");
1754         xs << xml::CR();
1755
1756         return docstring();
1757 }
1758
1759 } // namespace lyx