]> git.lyx.org Git - lyx.git/blob - src/insets/insetbibtex.C
Fix bug 1826 + A more precise parser for bibtex files based on the description found...
[lyx.git] / src / insets / insetbibtex.C
1 /**
2  * \file insetbibtex.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author Alejandro Aguilar Sierra
7  *
8  * Full author contact details are available in file CREDITS.
9  */
10
#include <config.h>

#include "insetbibtex.h"

#include "buffer.h"
#include "bufferparams.h"
#include "dispatchresult.h"
#include "debug.h"
#include "encoding.h"
#include "funcrequest.h"
#include "gettext.h"
#include "LaTeXFeatures.h"
#include "metricsinfo.h"
#include "outputparams.h"

#include "frontends/Alert.h"

#include "support/filetools.h"
#include "support/lstrings.h"
#include "support/lyxlib.h"
#include "support/os.h"
#include "support/path.h"
#include "support/textutils.h"

#include <boost/tokenizer.hpp>

#include <limits>
36
37
38 namespace lyx {
39
40 using support::absolutePath;
41 using support::ascii_lowercase;
42 using support::changeExtension;
43 using support::contains;
44 using support::copy;
45 using support::DocFileName;
46 using support::FileName;
47 using support::findtexfile;
48 using support::isFileReadable;
49 using support::latex_path;
50 using support::ltrim;
51 using support::makeAbsPath;
52 using support::makeRelPath;
53 using support::Path;
54 using support::prefixIs;
55 using support::removeExtension;
56 using support::rtrim;
57 using support::split;
58 using support::subst;
59 using support::tokenPos;
60 using support::trim;
61 using support::lowercase;
62
63 namespace Alert = frontend::Alert;
64 namespace os = support::os;
65
66 using std::endl;
67 using std::getline;
68 using std::string;
69 using std::ostream;
70 using std::pair;
71 using std::vector;
72 using std::map;
73
74
75 InsetBibtex::InsetBibtex(InsetCommandParams const & p)
76         : InsetCommand(p, "bibtex")
77 {}
78
79
80 std::auto_ptr<InsetBase> InsetBibtex::doClone() const
81 {
82         return std::auto_ptr<InsetBase>(new InsetBibtex(*this));
83 }
84
85
86 void InsetBibtex::doDispatch(LCursor & cur, FuncRequest & cmd)
87 {
88         switch (cmd.action) {
89
90         case LFUN_INSET_MODIFY: {
91                 InsetCommandParams p("bibtex");
92                 InsetCommandMailer::string2params("bibtex", to_utf8(cmd.argument()), p);
93                 if (!p.getCmdName().empty()) {
94                         setParams(p);
95                         cur.buffer().updateBibfilesCache();
96                 } else
97                         cur.noUpdate();
98                 break;
99         }
100
101         default:
102                 InsetCommand::doDispatch(cur, cmd);
103                 break;
104         }
105 }
106
107
108 docstring const InsetBibtex::getScreenLabel(Buffer const &) const
109 {
110         return _("BibTeX Generated Bibliography");
111 }
112
113
114 namespace {
115
116 string normalize_name(Buffer const & buffer, OutputParams const & runparams,
117                       string const & name, string const & ext)
118 {
119         string const fname = makeAbsPath(name, buffer.filePath()).absFilename();
120         if (absolutePath(name) || !isFileReadable(FileName(fname + ext)))
121                 return name;
122         else if (!runparams.nice)
123                 return fname;
124         else
125                 // FIXME UNICODE
126                 return to_utf8(makeRelPath(from_utf8(fname),
127                                            from_utf8(buffer.getMasterBuffer()->filePath())));
128 }
129
130 }
131
132
133 int InsetBibtex::latex(Buffer const & buffer, odocstream & os,
134                        OutputParams const & runparams) const
135 {
136         // the sequence of the commands:
137         // 1. \bibliographystyle{style}
138         // 2. \addcontentsline{...} - if option bibtotoc set
139         // 3. \bibliography{database}
140         // and with bibtopic:
141         // 1. \bibliographystyle{style}
142         // 2. \begin{btSect}{database}
143         // 3. \btPrint{Cited|NotCited|All}
144         // 4. \end{btSect}
145
146         // Database(s)
147         // If we are processing the LaTeX file in a temp directory then
148         // copy the .bib databases to this temp directory, mangling their
149         // names in the process. Store this mangled name in the list of
150         // all databases.
151         // (We need to do all this because BibTeX *really*, *really*
152         // can't handle "files with spaces" and Windows users tend to
153         // use such filenames.)
154         // Otherwise, store the (maybe absolute) path to the original,
155         // unmangled database name.
156         typedef boost::char_separator<char_type> Separator;
157         typedef boost::tokenizer<Separator, docstring::const_iterator, docstring> Tokenizer;
158
159         Separator const separator(from_ascii(",").c_str());
160         // The tokenizer must not be called with temporary strings, since
161         // it does not make a copy and uses iterators of the string further
162         // down. getParam returns a reference, so this is OK.
163         Tokenizer const tokens(getParam("bibfiles"), separator);
164         Tokenizer::const_iterator const begin = tokens.begin();
165         Tokenizer::const_iterator const end = tokens.end();
166
167         odocstringstream dbs;
168         for (Tokenizer::const_iterator it = begin; it != end; ++it) {
169                 docstring const input = trim(*it);
170                 // FIXME UNICODE
171                 string utf8input(to_utf8(input));
172                 string database =
173                         normalize_name(buffer, runparams, utf8input, ".bib");
174                 FileName const try_in_file(makeAbsPath(database + ".bib", buffer.filePath()));
175                 bool const not_from_texmf = isFileReadable(try_in_file);
176
177                 if (!runparams.inComment && !runparams.dryrun && !runparams.nice &&
178                     not_from_texmf) {
179
180                         // mangledFilename() needs the extension
181                         DocFileName const in_file = DocFileName(try_in_file);
182                         database = removeExtension(in_file.mangledFilename());
183                         FileName const out_file(makeAbsPath(database + ".bib",
184                                         buffer.getMasterBuffer()->temppath()));
185
186                         bool const success = copy(in_file, out_file);
187                         if (!success) {
188                                 lyxerr << "Failed to copy '" << in_file
189                                        << "' to '" << out_file << "'"
190                                        << endl;
191                         }
192                 }
193
194                 if (it != begin)
195                         dbs << ',';
196                 // FIXME UNICODE
197                 dbs << from_utf8(latex_path(database));
198         }
199         docstring const db_out = dbs.str();
200
201         // Post this warning only once.
202         static bool warned_about_spaces = false;
203         if (!warned_about_spaces &&
204             runparams.nice && db_out.find(' ') != docstring::npos) {
205                 warned_about_spaces = true;
206
207                 Alert::warning(_("Export Warning!"),
208                                _("There are spaces in the paths to your BibTeX databases.\n"
209                                               "BibTeX will be unable to find them."));
210
211         }
212
213         // Style-Options
214         string style = to_utf8(getParam("options")); // maybe empty! and with bibtotoc
215         string bibtotoc;
216         if (prefixIs(style, "bibtotoc")) {
217                 bibtotoc = "bibtotoc";
218                 if (contains(style, ',')) {
219                         style = split(style, bibtotoc, ',');
220                 }
221         }
222
223         // line count
224         int nlines = 0;
225
226         if (!style.empty()) {
227                 string base =
228                         normalize_name(buffer, runparams, style, ".bst");
229                 FileName const try_in_file(makeAbsPath(base + ".bst", buffer.filePath()));
230                 bool const not_from_texmf = isFileReadable(try_in_file);
231                 // If this style does not come from texmf and we are not
232                 // exporting to .tex copy it to the tmp directory.
233                 // This prevents problems with spaces and 8bit charcaters
234                 // in the file name.
235                 if (!runparams.inComment && !runparams.dryrun && !runparams.nice &&
236                     not_from_texmf) {
237                         // use new style name
238                         DocFileName const in_file = DocFileName(try_in_file);
239                         base = removeExtension(in_file.mangledFilename());
240                         FileName const out_file(makeAbsPath(base + ".bst",
241                                         buffer.getMasterBuffer()->temppath()));
242                         bool const success = copy(in_file, out_file);
243                         if (!success) {
244                                 lyxerr << "Failed to copy '" << in_file
245                                        << "' to '" << out_file << "'"
246                                        << endl;
247                         }
248                 }
249                 // FIXME UNICODE
250                 os << "\\bibliographystyle{"
251                    << from_utf8(latex_path(normalize_name(buffer, runparams, base, ".bst")))
252                    << "}\n";
253                 nlines += 1;
254         }
255
256         // Post this warning only once.
257         static bool warned_about_bst_spaces = false;
258         if (!warned_about_bst_spaces && runparams.nice && contains(style, ' ')) {
259                 warned_about_bst_spaces = true;
260                 Alert::warning(_("Export Warning!"),
261                                _("There are spaces in the path to your BibTeX style file.\n"
262                                               "BibTeX will be unable to find it."));
263         }
264
265         if (!db_out.empty() && buffer.params().use_bibtopic){
266                 os << "\\begin{btSect}{" << db_out << "}\n";
267                 docstring btprint = getParam("btprint");
268                 if (btprint.empty())
269                         // default
270                         btprint = from_ascii("btPrintCited");
271                 os << "\\" << btprint << "\n"
272                    << "\\end{btSect}\n";
273                 nlines += 3;
274         }
275
276         // bibtotoc-Option
277         if (!bibtotoc.empty() && !buffer.params().use_bibtopic) {
278                 // maybe a problem when a textclass has no "art" as
279                 // part of its name, because it's than book.
280                 // For the "official" lyx-layouts it's no problem to support
281                 // all well
282                 if (!contains(buffer.params().getLyXTextClass().name(),
283                               "art")) {
284                         if (buffer.params().sides == LyXTextClass::OneSide) {
285                                 // oneside
286                                 os << "\\clearpage";
287                         } else {
288                                 // twoside
289                                 os << "\\cleardoublepage";
290                         }
291
292                         // bookclass
293                         os << "\\addcontentsline{toc}{chapter}{\\bibname}";
294
295                 } else {
296                         // article class
297                         os << "\\addcontentsline{toc}{section}{\\refname}";
298                 }
299         }
300
301         if (!db_out.empty() && !buffer.params().use_bibtopic){
302                 os << "\\bibliography{" << db_out << "}\n";
303                 nlines += 1;
304         }
305
306         return nlines;
307 }
308
309
310 vector<FileName> const InsetBibtex::getFiles(Buffer const & buffer) const
311 {
312         Path p(buffer.filePath());
313
314         vector<FileName> vec;
315
316         string tmp;
317         // FIXME UNICODE
318         string bibfiles = to_utf8(getParam("bibfiles"));
319         bibfiles = split(bibfiles, tmp, ',');
320         while (!tmp.empty()) {
321                 FileName const file = findtexfile(changeExtension(tmp, "bib"), "bib");
322                 LYXERR(Debug::LATEX) << "Bibfile: " << file << endl;
323
324                 // If we didn't find a matching file name just fail silently
325                 if (!file.empty())
326                         vec.push_back(file);
327
328                 // Get next file name
329                 bibfiles = split(bibfiles, tmp, ',');
330         }
331
332         return vec;
333 }
334
335 namespace {
336
337         // methods for parsing bibtex files
338
339         typedef map<docstring, docstring> VarMap;
340
341         /// remove whitespace characters, optionally a single comma, 
342         /// and further whitespace characters from the stream.
343         /// @return true if a comma was found, false otherwise
344         ///
345         bool removeWSAndComma(idocfstream & ifs) {
346                 char_type ch;
347
348                 if (!ifs) 
349                         return false;
350
351                 // skip whitespace
352                 do {
353                         ifs.get(ch);
354                 } while (ifs && isSpace(ch));
355
356                 if (!ifs) 
357                         return false;
358
359                 if (ch != ',') {
360                         ifs.putback(ch);
361                         return false;
362                 }
363
364                 // skip whitespace
365                 do {
366                         ifs.get(ch);
367                 } while (ifs && isSpace(ch));
368
369                 if (ifs) {
370                         ifs.putback(ch);
371                 }
372
373                 return true;
374         }
375
376         /// remove whitespace characters, read characer sequence
377         /// not containing whitespace characters or characters in
378         /// delimChars, and remove further whitespace characters.
379         ///
380         /// @return true if a string of length > 0 could be read.
381         /// 
382         bool readTypeOrKey(docstring & val, idocfstream & ifs, docstring const & delimChars) {
383
384                 char_type ch;
385
386                 val.clear();
387
388                 if (!ifs) 
389                         return false;
390
391                 // skip whitespace
392                 do {
393                         ifs.get(ch);
394                 } while (ifs && isSpace(ch));
395
396                 if (!ifs) 
397                         return false;
398
399                 // read value 
400                 while (ifs && !isSpace(ch) && delimChars.find(ch) == docstring::npos) {
401                         val += lowercase(ch);
402                         ifs.get(ch);
403                 }
404
405                 // skip whitespace
406                 while (ifs && isSpace(ch)) {
407                         ifs.get(ch);
408                 }
409
410                 if (ifs) {
411                         ifs.putback(ch);
412                 }
413
414                 return val.length() > 0;
415         }
416
417         /// read subsequent bibtex values that are delimited with a #-character.
418         /// Concatenate all parts and replace names with the associated string in 
419         /// the variable strings.
420         /// @return true if reading was successfull (all single parts were delimited
421         /// correctly)
422         bool readValue(docstring & val, idocfstream & ifs, const VarMap & strings) {
423
424                 char_type ch;
425
426                 val.clear();
427
428                 if (!ifs) 
429                         return false;
430
431                 do {
432                         // skip whitespace
433                         do {
434                                 ifs.get(ch);
435                         } while (ifs && isSpace(ch));
436
437                         if (!ifs)
438                                 return false;
439
440                         // check for field type
441                         if (isDigit(ch)) {
442
443                                 // read integer value
444                                 do {
445                                         val += ch;
446                                         ifs.get(ch);
447                                 } while (ifs && isDigit(ch));
448
449                                 if (!ifs)
450                                         return false;
451
452                         } else if (ch == '"' || ch == '{') {
453
454                                 // read delimited text - set end delimiter
455                                 char_type delim = ch == '"'? '"': '}';
456
457                                 // inside this delimited text braces must match.
458                                 // Thus we can have a closing delimiter only
459                                 // when nestLevel == 0
460                                 int nestLevel = 0;
461
462                                 ifs.get(ch);
463                                 while (ifs && (nestLevel > 0 || ch != delim)) {
464                                         val += ch;
465                                         
466                                         // update nesting level
467                                         switch (ch) {
468                                                 case '{':
469                                                         ++nestLevel;
470                                                         break;
471                                                 case '}':
472                                                         --nestLevel;
473                                                         if (nestLevel < 0) return false;
474                                                         break;
475                                         }
476
477                                         ifs.get(ch);
478                                 }
479
480                                 if (!ifs)
481                                         return false;
482
483                                 ifs.get(ch);
484
485                                 if (!ifs)
486                                         return false;
487
488                         } else {
489
490                                 // reading a string name
491                                 docstring strName;
492
493                                 while (ifs && !isSpace(ch) && ch != '#' && ch != ',' && ch != '}' && ch != ')') {
494                                         strName += lowercase(ch);
495                                         ifs.get(ch);
496                                 }
497
498                                 if (!ifs)
499                                         return false;
500
501                                 // replace the string with its assigned value or
502                                 // discard it if it's not assigned
503                                 if (strName.length()) {
504                                         VarMap::const_iterator pos = strings.find(strName);
505                                         if (pos != strings.end()) {
506                                                 val += pos->second;
507                                         }
508                                 }
509                         }
510
511                         // skip WS
512                         while (ifs && isSpace(ch)) {
513                                 ifs.get(ch);
514                         }
515
516                         if (!ifs)
517                                 return false;
518
519                         // continue reading next value on concatenate with '#'
520                 } while (ch == '#');  
521
522                 ifs.putback(ch);
523
524                 return true;
525         }
526 }
527
528
529 // This method returns a comma separated list of Bibtex entries
530 void InsetBibtex::fillWithBibKeys(Buffer const & buffer,
531                 std::vector<std::pair<string, docstring> > & keys) const
532 {
533         vector<FileName> const files = getFiles(buffer);
534         for (vector<FileName>::const_iterator it = files.begin();
535              it != files.end(); ++ it) {
536             // This bibtex parser is a first step to parse bibtex files
537                 // more precisely. 
538                 // 
539                 // - it reads the whole bibtex entry and does a syntax check
540                 //   (matching delimiters, missing commas,...
541                 // - it recovers from errors starting with the next @-character
542                 // - it reads @string definitions and replaces them in the 
543                 //   field values.
544                 // - it accepts more characters in keys or value names than 
545                 //   bibtex does.
546                 //
547                 // TODOS:
548                 // - the entries are split into name = value pairs by the 
549                 //   parser. These have to be merged again because of the 
550                 //   way lyx treats the entries ( pair<...>(...) ). The citation
551                 //   mechanism in lyx should be changed such that it can use
552                 //   the split entries.
553                 // - messages on parsing errors can be generated.
554                 //
555
556                 // Officially bibtex does only support ASCII, but in practice
557                 // you can use the encoding of the main document as long as
558                 // some elements like keys and names are pure ASCII. Therefore
559                 // we convert the file from the buffer encoding.
560                 // We don't restrict keys to ASCII in LyX, since our own
561                 // InsetBibitem can generate non-ASCII keys, and nonstandard
562                 // 8bit clean bibtex forks exist.
563                 idocfstream ifs(it->toFilesystemEncoding().c_str(),
564                                 std::ios_base::in,
565                                 buffer.params().encoding().iconvName());
566                 
567                 char_type ch;
568                 VarMap strings;
569
570                 while (ifs) {
571
572                         ifs.get(ch);
573                         if (!ifs) 
574                                 break;
575
576                         if (ch != '@') 
577                                 continue;
578
579                         docstring entryType;
580
581                         if (!readTypeOrKey(entryType, ifs, from_ascii("{(")) || !ifs)
582                                 continue;
583
584                         if (entryType == from_ascii("comment")) {
585
586                                 ifs.ignore(std::numeric_limits<int>::max(), '\n');
587                                 continue;
588                         } 
589
590                         // check entry delimiter
591                         char_type entryDelim;
592
593                         ifs.get(ch);
594                         if (!ifs) 
595                                 break;
596
597                         if (ch == '(') entryDelim = ')';
598                         else if (ch == '{') entryDelim = ')';
599                         else {
600                                 // invalid entry delimiter
601                                 ifs.putback(ch);
602                                 continue;
603                         }
604
605                         // process the entry
606                         if (entryType == from_ascii("string")) {
607
608                                 // read string and add it to the strings map 
609                                 // (or replace it's old value)
610                                 docstring name;
611                                 docstring value;
612
613                                 if (!readTypeOrKey(name, ifs, from_ascii("#=}),")) || !ifs)
614                                         continue;
615
616                                 ifs.get(ch);
617                                 if (!ifs || ch != '=')
618                                         continue;
619
620                                 if (!readValue(value, ifs, strings))
621                                         continue;
622
623                                 strings[name] = value;
624
625                         } else if (entryType == from_ascii("preamble")) {
626
627                                 // preamble definitions are discarded. 
628                                 // can they be of any use in lyx?
629                                 docstring value;
630
631                                 if (!readValue(value, ifs, strings))
632                                         continue;
633
634                         } else {
635
636                                 // Citation entry. Read the key and all name = value pairs
637                                 docstring key;
638                                 docstring fields;
639                                 docstring name;
640                                 docstring value;
641                                 docstring commaNewline;
642
643                                 if (!readTypeOrKey(key, ifs, from_ascii(",})")) || !ifs)
644                                         continue;
645
646                                 // now we have a key, so we will add an entry 
647                                 // (even if it's empty, as bibtex does)
648                                 // 
649                                 // all items must be separated by a comma. If
650                                 // it is missing the scanning of this entry is
651                                 // stopped and the next is searched.
652                                 bool readNext = removeWSAndComma(ifs);
653
654                                 while (ifs && readNext) {
655
656                                         // read field name
657                                         if (!readTypeOrKey(name, ifs, from_ascii("=}),")) || !ifs)
658                                                 break;
659
660                                         // next char must be an equal sign
661                                         ifs.get(ch);
662                                         if (!ifs)
663                                                 break;
664                                         if (ch != '=') {
665                                                 ifs.putback(ch);
666                                                 break;
667                                         }
668
669                                         // read field value
670                                         if (!readValue(value, ifs, strings)) 
671                                                 break;
672
673                                         // append field to the total entry string.
674                                         //
675                                         // TODO: Here is where the fields can be put in 
676                                         //       a more intelligent structure that preserves
677                                         //           the already known parts.
678                                         fields += commaNewline;
679                                         fields += name + from_ascii(" = {") + value + '}';
680
681                                         if (!commaNewline.length()) 
682                                                 commaNewline = from_ascii(",\n"); 
683
684                                         readNext = removeWSAndComma(ifs);
685                                 }
686
687                                 // add the new entry
688                                 keys.push_back(pair<string, docstring>(
689                                 to_utf8(key), fields));
690                         }
691
692                 } //< searching '@'
693
694         } //< for loop over files
695 }
696
697
698
699 bool InsetBibtex::addDatabase(string const & db)
700 {
701         // FIXME UNICODE
702         string bibfiles(to_utf8(getParam("bibfiles")));
703         if (tokenPos(bibfiles, ',', db) == -1) {
704                 if (!bibfiles.empty())
705                         bibfiles += ',';
706                 setParam("bibfiles", from_utf8(bibfiles + db));
707                 return true;
708         }
709         return false;
710 }
711
712
713 bool InsetBibtex::delDatabase(string const & db)
714 {
715         // FIXME UNICODE
716         string bibfiles(to_utf8(getParam("bibfiles")));
717         if (contains(bibfiles, db)) {
718                 int const n = tokenPos(bibfiles, ',', db);
719                 string bd = db;
720                 if (n > 0) {
721                         // this is not the first database
722                         string tmp = ',' + bd;
723                         setParam("bibfiles", from_utf8(subst(bibfiles, tmp, string())));
724                 } else if (n == 0)
725                         // this is the first (or only) database
726                         setParam("bibfiles", from_utf8(split(bibfiles, bd, ',')));
727                 else
728                         return false;
729         }
730         return true;
731 }
732
733
734 void InsetBibtex::validate(LaTeXFeatures & features) const
735 {
736         if (features.bufferParams().use_bibtopic)
737                 features.require("bibtopic");
738 }
739
740
741 } // namespace lyx