From de3e5280f62b1f3efec0ff050d23ae183a010d8d Mon Sep 17 00:00:00 2001 From: Georg Baum Date: Sun, 31 Jan 2016 12:54:59 +0100 Subject: [PATCH] Open tex2lyx input files with correct encoding We open the input file now twice: The first time in latin1 encoding to read the document encoding from the preamble. This does always work, since traditional TeX does not allow non-ASCII contents without an encoding changing command (except for comments, but we do not need them, and using latin1 rather than utf8 ensures that they do not produce an iconv exception, but are simply recorded with wrong characters), and we do detect the utf8 based TeX engines XeTeX and LuaTeX as well. The second time we open the file directly with the document encoding. This fixes a few tex2lyx tests on OS X, since changing the encoding of an open file stream does not work with clang on OS X. Files using more than one encoding are still broken, but all single-encoding files are fixed now. --- src/support/docstream.cpp | 12 +++++++++ src/support/docstream.h | 8 ++++++ src/tex2lyx/Preamble.cpp | 53 ++++++++++++++++++++++++++++-------- src/tex2lyx/Preamble.h | 8 +++++- src/tex2lyx/tex2lyx.cpp | 56 ++++++++++++++++++++++++++------------- 5 files changed, 106 insertions(+), 31 deletions(-) diff --git a/src/support/docstream.cpp b/src/support/docstream.cpp index e33dffc3c0..de5a6df035 100644 --- a/src/support/docstream.cpp +++ b/src/support/docstream.cpp @@ -334,6 +334,12 @@ ifdocstream::ifdocstream() : base() } +ifdocstream::ifdocstream(SetEnc const & enc) : base() +{ + setEncoding(*this, enc.encoding, in); +} + + ifdocstream::ifdocstream(const char* s, ios_base::openmode mode, string const & encoding) : base() @@ -349,6 +355,12 @@ ofdocstream::ofdocstream(): base() } +ofdocstream::ofdocstream(SetEnc const & enc) : base() +{ + setEncoding(*this, enc.encoding, out); +} + + ofdocstream::ofdocstream(const char* s, ios_base::openmode mode, string const & encoding) : base() diff --git a/src/support/docstream.h 
b/src/support/docstream.h index a6197cccd6..460a9b7393 100644 --- a/src/support/docstream.h +++ b/src/support/docstream.h @@ -40,6 +40,8 @@ typedef std::basic_istream idocstream; */ typedef std::basic_ostream odocstream; +struct SetEnc; + /// File stream for reading UTF8-encoded files with automatic conversion to /// UCS4. /// Buffering must be switched off if the encoding is changed after @@ -48,6 +50,9 @@ class ifdocstream : public std::basic_ifstream { typedef std::basic_ifstream base; public: ifdocstream(); + /// Create a stream with a specific encoding \p enc. + /// We must not pass \p enc as string, to avoid confusing it with a file name. + explicit ifdocstream(SetEnc const & enc); explicit ifdocstream(const char* s, std::ios_base::openmode mode = std::ios_base::in, std::string const & encoding = "UTF-8"); @@ -61,6 +66,9 @@ class ofdocstream : public std::basic_ofstream { typedef std::basic_ofstream base; public: ofdocstream(); + /// Create a stream with a specific encoding \p enc. + /// We must not pass \p enc as string, to avoid confusing it with a file name. 
+ explicit ofdocstream(SetEnc const & enc); explicit ofdocstream(const char* s, std::ios_base::openmode mode = std::ios_base::out|std::ios_base::trunc, std::string const & encoding = "UTF-8"); diff --git a/src/tex2lyx/Preamble.cpp b/src/tex2lyx/Preamble.cpp index 2a14aeeb0a..24c403d23e 100644 --- a/src/tex2lyx/Preamble.cpp +++ b/src/tex2lyx/Preamble.cpp @@ -672,7 +672,8 @@ void Preamble::handle_geometry(vector & options) void Preamble::handle_package(Parser &p, string const & name, - string const & opts, bool in_lyx_preamble) + string const & opts, bool in_lyx_preamble, + bool detectEncoding) { vector options = split_options(opts); add_package(name, options); @@ -909,9 +910,11 @@ void Preamble::handle_package(Parser &p, string const & name, string const encoding = options.back(); Encoding const * const enc = encodings.fromLaTeXName( encoding, Encoding::inputenc, true); - if (!enc) - cerr << "Unknown encoding " << encoding << ". Ignoring." << std::endl; - else { + if (!enc) { + if (!detectEncoding) + cerr << "Unknown encoding " << encoding + << ". Ignoring." << std::endl; + } else { if (!enc->unsafe() && options.size() == 1 && one_language == true) h_inputencoding = enc->name(); p.setEncoding(enc->iconvName()); @@ -1026,7 +1029,7 @@ void Preamble::handle_package(Parser &p, string const & name, } // We need to do something with the options... - if (!options.empty()) + if (!options.empty() && !detectEncoding) cerr << "Ignoring options '" << join(options, ",") << "' of package " << name << '.' 
<< endl; @@ -1260,6 +1263,13 @@ void Preamble::parse(Parser & p, string const & forceclass, { // initialize fixed types special_columns_['D'] = 3; + parse(p, forceclass, false, tc); +} + + +void Preamble::parse(Parser & p, string const & forceclass, + bool detectEncoding, TeX2LyXDocClass & tc) +{ bool is_full_document = false; bool is_lyx_file = false; bool in_lyx_preamble = false; @@ -1275,11 +1285,19 @@ void Preamble::parse(Parser & p, string const & forceclass, } p.reset(); + if (detectEncoding && !is_full_document) + return; + while (is_full_document && p.good()) { + if (detectEncoding && h_inputencoding != "auto" && + h_inputencoding != "default") + return; + Token const & t = p.get_token(); #ifdef FILEDEBUG - cerr << "t: " << t << "\n"; + if (!detectEncoding) + cerr << "t: " << t << '\n'; #endif // @@ -1314,7 +1332,8 @@ void Preamble::parse(Parser & p, string const & forceclass, if (comment.size() > magicXeLaTeX.size() && comment.substr(0, magicXeLaTeX.size()) == magicXeLaTeX && h_inputencoding == "auto") { - cerr << "XeLaTeX comment found, switching to UTF8\n"; + if (!detectEncoding) + cerr << "XeLaTeX comment found, switching to UTF8\n"; h_inputencoding = "utf8"; } smatch sub; @@ -1657,16 +1676,18 @@ void Preamble::parse(Parser & p, string const & forceclass, vector::const_iterator end = vecnames.end(); for (; it != end; ++it) handle_package(p, trimSpaceAndEol(*it), options, - in_lyx_preamble); + in_lyx_preamble, detectEncoding); } else if (t.cs() == "inputencoding") { string const encoding = p.getArg('{','}'); Encoding const * const enc = encodings.fromLaTeXName( encoding, Encoding::inputenc, true); - if (!enc) - cerr << "Unknown encoding " << encoding << ". Ignoring." << std::endl; - else { + if (!enc) { + if (!detectEncoding) + cerr << "Unknown encoding " << encoding + << ". Ignoring." 
<< std::endl; + } else { if (!enc->unsafe()) h_inputencoding = enc->name(); p.setEncoding(enc->iconvName()); @@ -1949,6 +1970,16 @@ void Preamble::parse(Parser & p, string const & forceclass, } +string Preamble::parseEncoding(Parser & p, string const & forceclass) +{ + TeX2LyXDocClass dummy; + parse(p, forceclass, true, dummy); + if (h_inputencoding != "auto" && h_inputencoding != "default") + return h_inputencoding; + return ""; +} + + string babel2lyx(string const & language) { char const * const * where = is_known(language, known_languages); diff --git a/src/tex2lyx/Preamble.h b/src/tex2lyx/Preamble.h index 01a5cd32e4..f9342efca2 100644 --- a/src/tex2lyx/Preamble.h +++ b/src/tex2lyx/Preamble.h @@ -92,6 +92,8 @@ public: /// Parses the LaTeX preamble into internal data void parse(Parser & p, std::string const & forceclass, TeX2LyXDocClass & tc); + /// Parse the encoding from a preamble. *this is unusable afterwards. + std::string parseEncoding(Parser & p, std::string const & forceclass); /// Writes the LyX file header from internal data bool writeLyXHeader(std::ostream & os, bool subdoc, std::string const & outfiledir); @@ -103,6 +105,9 @@ public: private: /// + void parse(Parser & p, std::string const & forceclass, + bool detectEncoding, TeX2LyXDocClass & tc); + /// std::map > used_packages; /// Packages that will be loaded automatically by LyX std::set auto_packages; @@ -218,7 +223,8 @@ private: void handle_geometry(std::vector & options); /// void handle_package(Parser &p, std::string const & name, - std::string const & opts, bool in_lyx_preamble); + std::string const & opts, bool in_lyx_preamble, + bool detectEncoding); /// void handle_if(Parser & p, bool in_lyx_preamble); diff --git a/src/tex2lyx/tex2lyx.cpp b/src/tex2lyx/tex2lyx.cpp index 4ce2512179..7a6bbb0412 100644 --- a/src/tex2lyx/tex2lyx.cpp +++ b/src/tex2lyx/tex2lyx.cpp @@ -839,24 +839,9 @@ namespace { * You must ensure that \p parentFilePathTeX is properly set before calling * this function! 
*/ -bool tex2lyx(idocstream & is, ostream & os, string encoding, +bool tex2lyx(idocstream & is, ostream & os, string const & encoding, string const & outfiledir) { - // Set a sensible default encoding. - // This is used until an encoding command is found. - // For child documents use the encoding of the master, else ISO-8859-1, - // (formerly known by its latex name latin1), since ISO-8859-1 does not - // cause an iconv error if the actual encoding is different (bug 7509). - if (encoding.empty()) { - if (preamble.inputencoding() == "auto") - encoding = "ISO-8859-1"; - else { - Encoding const * const enc = encodings.fromLyXName( - preamble.inputencoding(), true); - encoding = enc->iconvName(); - } - } - Parser p(is, fixed_encoding ? default_encoding : string()); p.setEncoding(encoding); //p.dump(); @@ -925,12 +910,45 @@ bool tex2lyx(idocstream & is, ostream & os, string encoding, /// convert TeX from \p infilename to LyX and write it to \p os -bool tex2lyx(FileName const & infilename, ostream & os, string const & encoding, +bool tex2lyx(FileName const & infilename, ostream & os, string encoding, string const & outfiledir) { - ifdocstream is; + // Set a sensible default encoding. + // This is used until an encoding command is found. + // For child documents use the encoding of the master, else try to + // detect it from the preamble, since setting an encoding of an open + // fstream does currently not work on OS X. + // Always start with ISO-8859-1, (formerly known by its latex name + // latin1), since ISO-8859-1 does not cause an iconv error if the + // actual encoding is different (bug 7509). 
+ if (encoding.empty()) { + Encoding const * enc = 0; + if (preamble.inputencoding() == "auto") { + ifdocstream is(setEncoding("ISO-8859-1")); + // forbid buffering on this stream + is.rdbuf()->pubsetbuf(0, 0); + is.open(infilename.toFilesystemEncoding().c_str()); + if (is.good()) { + Parser ep(is, string()); + ep.setEncoding("ISO-8859-1"); + Preamble encodingpreamble; + string const e = encodingpreamble + .parseEncoding(ep, documentclass); + if (!e.empty()) + enc = encodings.fromLyXName(e, true); + } + } else + enc = encodings.fromLyXName( + preamble.inputencoding(), true); + if (enc) + encoding = enc->iconvName(); + else + encoding = "ISO-8859-1"; + } + + ifdocstream is(setEncoding(encoding)); // forbid buffering on this stream - is.rdbuf()->pubsetbuf(0,0); + is.rdbuf()->pubsetbuf(0, 0); is.open(infilename.toFilesystemEncoding().c_str()); if (!is.good()) { cerr << "Could not open input file \"" << infilename -- 2.39.2