]> git.lyx.org Git - lyx.git/blobdiff - src/insets/insetbibtex.C
* In the process of fixing the math background color bug, this commit transfer backgr...
[lyx.git] / src / insets / insetbibtex.C
index 09ba80701a377147ee54dc8e4e6c19ff1294067d..e1459cf488b33d1b844292bf6f0558f3a285d9cc 100644 (file)
@@ -16,6 +16,7 @@
 #include "bufferparams.h"
 #include "dispatchresult.h"
 #include "debug.h"
+#include "encoding.h"
 #include "funcrequest.h"
 #include "gettext.h"
 #include "LaTeXFeatures.h"
 #include "support/lyxlib.h"
 #include "support/os.h"
 #include "support/path.h"
+#include "support/textutils.h"
 
 #include <boost/tokenizer.hpp>
 
-#include <fstream>
-#include <sstream>
-
 
 namespace lyx {
 
@@ -51,7 +50,6 @@ using support::latex_path;
 using support::ltrim;
 using support::makeAbsPath;
 using support::makeRelPath;
-using support::Path;
 using support::prefixIs;
 using support::removeExtension;
 using support::rtrim;
@@ -59,6 +57,7 @@ using support::split;
 using support::subst;
 using support::tokenPos;
 using support::trim;
+using support::lowercase;
 
 namespace Alert = frontend::Alert;
 namespace os = support::os;
@@ -66,10 +65,10 @@ namespace os = support::os;
 using std::endl;
 using std::getline;
 using std::string;
-using std::ifstream;
 using std::ostream;
 using std::pair;
 using std::vector;
+using std::map;
 
 
 InsetBibtex::InsetBibtex(InsetCommandParams const & p)
@@ -116,13 +115,15 @@ namespace {
 string normalize_name(Buffer const & buffer, OutputParams const & runparams,
                       string const & name, string const & ext)
 {
-       string const fname = makeAbsPath(name, buffer.filePath());
+       string const fname = makeAbsPath(name, buffer.filePath()).absFilename();
        if (absolutePath(name) || !isFileReadable(FileName(fname + ext)))
                return name;
        else if (!runparams.nice)
                return fname;
        else
-               return makeRelPath(fname, buffer.getMasterBuffer()->filePath());
+               // FIXME UNICODE: both paths go through from_utf8() for makeRelPath() and the result comes back via to_utf8()
+               return to_utf8(makeRelPath(from_utf8(fname),
+                                          from_utf8(buffer.getMasterBuffer()->filePath())));
 }
 
 }
@@ -169,8 +170,8 @@ int InsetBibtex::latex(Buffer const & buffer, odocstream & os,
                string utf8input(to_utf8(input));
                string database =
                        normalize_name(buffer, runparams, utf8input, ".bib");
-               string const try_in_file = makeAbsPath(database + ".bib", buffer.filePath());
-               bool const not_from_texmf = isFileReadable(FileName(try_in_file));
+               FileName const try_in_file(makeAbsPath(database + ".bib", buffer.filePath()));
+               bool const not_from_texmf = isFileReadable(try_in_file);
 
                if (!runparams.inComment && !runparams.dryrun && !runparams.nice &&
                    not_from_texmf) {
@@ -178,7 +179,7 @@ int InsetBibtex::latex(Buffer const & buffer, odocstream & os,
                        // mangledFilename() needs the extension
                        DocFileName const in_file = DocFileName(try_in_file);
                        database = removeExtension(in_file.mangledFilename());
-                       FileName const out_file = FileName(makeAbsPath(database + ".bib",
+                       FileName const out_file(makeAbsPath(database + ".bib",
                                        buffer.getMasterBuffer()->temppath()));
 
                        bool const success = copy(in_file, out_file);
@@ -224,8 +225,8 @@ int InsetBibtex::latex(Buffer const & buffer, odocstream & os,
        if (!style.empty()) {
                string base =
                        normalize_name(buffer, runparams, style, ".bst");
-               string const try_in_file = makeAbsPath(base + ".bst", buffer.filePath());
-               bool const not_from_texmf = isFileReadable(FileName(try_in_file));
+               FileName const try_in_file(makeAbsPath(base + ".bst", buffer.filePath()));
+               bool const not_from_texmf = isFileReadable(try_in_file);
                // If this style does not come from texmf and we are not
                // exporting to .tex copy it to the tmp directory.
                // This prevents problems with spaces and 8bit charcaters
@@ -235,7 +236,7 @@ int InsetBibtex::latex(Buffer const & buffer, odocstream & os,
                        // use new style name
                        DocFileName const in_file = DocFileName(try_in_file);
                        base = removeExtension(in_file.mangledFilename());
-                       FileName const out_file = FileName(makeAbsPath(base + ".bst",
+                       FileName const out_file(makeAbsPath(base + ".bst",
                                        buffer.getMasterBuffer()->temppath()));
                        bool const success = copy(in_file, out_file);
                        if (!success) {
@@ -307,7 +308,8 @@ int InsetBibtex::latex(Buffer const & buffer, odocstream & os,
 
 vector<FileName> const InsetBibtex::getFiles(Buffer const & buffer) const
 {
-       Path p(buffer.filePath());
+       FileName path(buffer.filePath());
+       support::Path p(path);
 
        vector<FileName> vec;
 
@@ -317,7 +319,7 @@ vector<FileName> const InsetBibtex::getFiles(Buffer const & buffer) const
        bibfiles = split(bibfiles, tmp, ',');
        while (!tmp.empty()) {
                FileName const file = findtexfile(changeExtension(tmp, "bib"), "bib");
-               lyxerr[Debug::LATEX] << "Bibfile: " << file << endl;
+               LYXERR(Debug::LATEX) << "Bibfile: " << file << endl;
 
                // If we didn't find a matching file name just fail silently
                if (!file.empty())
@@ -330,44 +332,370 @@ vector<FileName> const InsetBibtex::getFiles(Buffer const & buffer) const
        return vec;
 }
 
+namespace {
+
+       // methods for parsing bibtex files
+
+       typedef map<docstring, docstring> VarMap;
+
+       /// Skip whitespace, then at most one comma, then any whitespace
+       /// that follows the comma. A non-comma character is put back.
+       /// @return true if a comma was consumed; false if the stream is bad,
+       /// ends while skipping, or the next character is not a comma.
+       bool removeWSAndComma(idocfstream & ifs) {
+               char_type ch;
+
+               if (!ifs) 
+                       return false;
+
+               // skip leading whitespace; ch ends up as the first other character
+               do {
+                       ifs.get(ch);
+               } while (ifs && isSpace(ch));
+
+               if (!ifs) 
+                       return false;
+               // not a comma: leave the character in the stream for the caller
+               if (ch != ',') {
+                       ifs.putback(ch);
+                       return false;
+               }
+
+               // skip whitespace after the comma
+               do {
+                       ifs.get(ch);
+               } while (ifs && isSpace(ch));
+               // the last character read is not whitespace: give it back
+               if (ifs) {
+                       ifs.putback(ch);
+               }
+
+               return true;
+       }
+
+       /// Skip whitespace, read a run of characters that are neither
+       /// whitespace nor contained in delimChars, then skip trailing
+       /// whitespace. The characters are stored lowercased in val, and
+       /// the character that ended the scan is put back into the stream.
+       /// @return true if a string of length > 0 could be read.
+       ///
+       bool readTypeOrKey(docstring & val, idocfstream & ifs, docstring const & delimChars) {
+
+               char_type ch;
+
+               val.clear();
+
+               if (!ifs) 
+                       return false;
+
+               // skip leading whitespace
+               do {
+                       ifs.get(ch);
+               } while (ifs && isSpace(ch));
+
+               if (!ifs) 
+                       return false;
+
+               // read the token, lowercasing each character
+               while (ifs && !isSpace(ch) && delimChars.find(ch) == docstring::npos) {
+                       val += lowercase(ch);
+                       ifs.get(ch);
+               }
+
+               // skip trailing whitespace
+               while (ifs && isSpace(ch)) {
+                       ifs.get(ch);
+               }
+               // give the character that stopped the scan back to the stream
+               if (ifs) {
+                       ifs.putback(ch);
+               }
+
+               return val.length() > 0;
+       }
+
+       /// Read a bibtex field value: one or more parts joined by '#'. A part is
+       /// a number, a text delimited by "..." or {...}, or a name that is
+       /// replaced by its entry in strings (unknown names contribute nothing).
+       /// @return true if all parts were read and a character follows the value
+       /// (it is put back); false on a stream error/end or an unmatched '}'.
+       bool readValue(docstring & val, idocfstream & ifs, const VarMap & strings) {
+
+               char_type ch;
+
+               val.clear();
+
+               if (!ifs) 
+                       return false;
+
+               do {
+                       // skip whitespace
+                       do {
+                               ifs.get(ch);
+                       } while (ifs && isSpace(ch));
+
+                       if (!ifs)
+                               return false;
+
+                       // the first character of a part decides its kind
+                       if (isDigit(ch)) {
+
+                               // read integer value
+                               do {
+                                       val += ch;
+                                       ifs.get(ch);
+                               } while (ifs && isDigit(ch));
+
+                               if (!ifs)
+                                       return false;
+
+                       } else if (ch == '"' || ch == '{') {
+
+                               // read delimited text - set end delimiter
+                               char_type delim = ch == '"'? '"': '}';
+
+                               // inside this delimited text braces must match.
+                               // Thus we can have a closing delimiter only
+                               // when nestLevel == 0
+                               int nestLevel = 0;
+
+                               ifs.get(ch);
+                               while (ifs && (nestLevel > 0 || ch != delim)) {
+                                       val += ch;
+                                       
+                                       // update nesting level
+                                       switch (ch) {
+                                               case '{':
+                                                       ++nestLevel;
+                                                       break;
+                                               case '}':
+                                                       --nestLevel;
+                                                       if (nestLevel < 0) return false;
+                                                       break;
+                                       }
+
+                                       ifs.get(ch);
+                               }
+
+                               if (!ifs)
+                                       return false;
+                               // ch holds the closing delimiter; fetch what follows it
+                               ifs.get(ch);
+
+                               if (!ifs)
+                                       return false;
+
+                       } else {
+
+                               // reading a string name (names are looked up lowercased)
+                               docstring strName;
+
+                               while (ifs && !isSpace(ch) && ch != '#' && ch != ',' && ch != '}' && ch != ')') {
+                                       strName += lowercase(ch);
+                                       ifs.get(ch);
+                               }
+
+                               if (!ifs)
+                                       return false;
+
+                               // replace the string with its assigned value or
+                               // discard it if it's not assigned
+                               if (strName.length()) {
+                                       VarMap::const_iterator pos = strings.find(strName);
+                                       if (pos != strings.end()) {
+                                               val += pos->second;
+                                       }
+                               }
+                       }
+
+                       // skip whitespace after this part
+                       while (ifs && isSpace(ch)) {
+                               ifs.get(ch);
+                       }
+
+                       if (!ifs)
+                               return false;
+
+                       // a '#' means another part follows and is concatenated
+               } while (ch == '#');  
+               // ch is the first character after the value; leave it in the stream
+               ifs.putback(ch);
+
+               return true;
+       }
+}
+
 
 // This method returns a comma separated list of Bibtex entries
 void InsetBibtex::fillWithBibKeys(Buffer const & buffer,
-                                 std::vector<std::pair<string, string> > & keys) const
+               std::vector<std::pair<string, docstring> > & keys) const
 {
        vector<FileName> const files = getFiles(buffer);
        for (vector<FileName>::const_iterator it = files.begin();
             it != files.end(); ++ it) {
-               // This is a _very_ simple parser for Bibtex database
-               // files. All it does is to look for lines starting
-               // in @ and not being @preamble and @string entries.
-               // It does NOT do any syntax checking!
-               ifstream ifs(it->toFilesystemEncoding().c_str());
-               string linebuf0;
-               while (getline(ifs, linebuf0)) {
-                       string linebuf = trim(linebuf0);
-                       if (linebuf.empty()) continue;
-                       if (prefixIs(linebuf, "@")) {
-                               linebuf = subst(linebuf, '{', '(');
-                               string tmp;
-                               linebuf = split(linebuf, tmp, '(');
-                               tmp = ascii_lowercase(tmp);
-                               if (!prefixIs(tmp, "@string")
-                                   && !prefixIs(tmp, "@preamble")) {
-                                       linebuf = split(linebuf, tmp, ',');
-                                       tmp = ltrim(tmp, " \t");
-                                       if (!tmp.empty()) {
-                                               keys.push_back(pair<string,string>(tmp,string()));
+           // This bibtex parser is a first step to parse bibtex files
+               // more precisely. 
+               // 
+               // - it reads the whole bibtex entry and does a syntax check
+               //   (matching delimiters, missing commas,...
+               // - it recovers from errors starting with the next @-character
+               // - it reads @string definitions and replaces them in the 
+               //   field values.
+               // - it accepts more characters in keys or value names than 
+               //   bibtex does.
+               //
+               // TODOS:
+               // - the entries are split into name = value pairs by the 
+               //   parser. These have to be merged again because of the 
+               //   way lyx treats the entries ( pair<...>(...) ). The citation
+               //   mechanism in lyx should be changed such that it can use
+               //   the split entries.
+               // - messages on parsing errors can be generated.
+               //
+
+               // Officially bibtex does only support ASCII, but in practice
+               // you can use the encoding of the main document as long as
+               // some elements like keys and names are pure ASCII. Therefore
+               // we convert the file from the buffer encoding.
+               // We don't restrict keys to ASCII in LyX, since our own
+               // InsetBibitem can generate non-ASCII keys, and nonstandard
+               // 8bit clean bibtex forks exist.
+               idocfstream ifs(it->toFilesystemEncoding().c_str(),
+                               std::ios_base::in,
+                               buffer.params().encoding().iconvName());
+               
+               char_type ch;
+               VarMap strings;
+
+               while (ifs) {
+
+                       ifs.get(ch);
+                       if (!ifs) 
+                               break;
+
+                       if (ch != '@') 
+                               continue;
+
+                       docstring entryType;
+
+                       if (!readTypeOrKey(entryType, ifs, from_ascii("{(")) || !ifs)
+                               continue;
+
+                       if (entryType == from_ascii("comment")) {
+
+                               ifs.ignore(std::numeric_limits<int>::max(), '\n');
+                               continue;
+                       } 
+
+                       // check entry delimiter
+                       char_type entryDelim;
+
+                       ifs.get(ch);
+                       if (!ifs) 
+                               break;
+
+                       if (ch == '(') entryDelim = ')';
+                       else if (ch == '{') entryDelim = ')';
+                       else {
+                               // invalid entry delimiter
+                               ifs.putback(ch);
+                               continue;
+                       }
+
+                       // process the entry
+                       if (entryType == from_ascii("string")) {
+
+                               // read string and add it to the strings map 
+                               // (or replace it's old value)
+                               docstring name;
+                               docstring value;
+
+                               if (!readTypeOrKey(name, ifs, from_ascii("#=}),")) || !ifs)
+                                       continue;
+
+                               ifs.get(ch);
+                               if (!ifs || ch != '=')
+                                       continue;
+
+                               if (!readValue(value, ifs, strings))
+                                       continue;
+
+                               strings[name] = value;
+
+                       } else if (entryType == from_ascii("preamble")) {
+
+                               // preamble definitions are discarded. 
+                               // can they be of any use in lyx?
+                               docstring value;
+
+                               if (!readValue(value, ifs, strings))
+                                       continue;
+
+                       } else {
+
+                               // Citation entry. Read the key and all name = value pairs
+                               docstring key;
+                               docstring fields;
+                               docstring name;
+                               docstring value;
+                               docstring commaNewline;
+
+                               if (!readTypeOrKey(key, ifs, from_ascii(",})")) || !ifs)
+                                       continue;
+
+                               // now we have a key, so we will add an entry 
+                               // (even if it's empty, as bibtex does)
+                               // 
+                               // all items must be separated by a comma. If
+                               // it is missing the scanning of this entry is
+                               // stopped and the next is searched.
+                               bool readNext = removeWSAndComma(ifs);
+
+                               while (ifs && readNext) {
+
+                                       // read field name
+                                       if (!readTypeOrKey(name, ifs, from_ascii("=}),")) || !ifs)
+                                               break;
+
+                                       // next char must be an equal sign
+                                       ifs.get(ch);
+                                       if (!ifs)
+                                               break;
+                                       if (ch != '=') {
+                                               ifs.putback(ch);
+                                               break;
                                        }
+
+                                       // read field value
+                                       if (!readValue(value, ifs, strings)) 
+                                               break;
+
+                                       // append field to the total entry string.
+                                       //
+                                       // TODO: Here is where the fields can be put in 
+                                       //       a more intelligent structure that preserves
+                                       //           the already known parts.
+                                       fields += commaNewline;
+                                       fields += name + from_ascii(" = {") + value + '}';
+
+                                       if (!commaNewline.length()) 
+                                               commaNewline = from_ascii(",\n"); 
+
+                                       readNext = removeWSAndComma(ifs);
                                }
-                       } else if (!keys.empty()) {
-                               keys.back().second += linebuf + "\n";
+
+                               // add the new entry
+                               keys.push_back(pair<string, docstring>(
+                               to_utf8(key), fields));
                        }
-               }
-       }
+
+               } //< searching '@'
+
+       } //< for loop over files
 }
 
 
+
 bool InsetBibtex::addDatabase(string const & db)
 {
        // FIXME UNICODE