X-Git-Url: https://git.lyx.org/gitweb/?a=blobdiff_plain;f=src%2Fsupport%2Fdocstream.cpp;h=e8839e07f61420104c3ff49826f7dc4997123ce6;hb=faa87bf9f30b943397429a04254d96963bbf38bc;hp=8f476376e6f586ca95be86a84bebad417b76bfdc;hpb=5cbe5cf5adbc8989ddec8aed408126255207c5b0;p=lyx.git diff --git a/src/support/docstream.cpp b/src/support/docstream.cpp index 8f476376e6..e8839e07f6 100644 --- a/src/support/docstream.cpp +++ b/src/support/docstream.cpp @@ -10,18 +10,27 @@ #include -#include "docstream.h" -#include "unicode.h" +#include "support/docstream.h" +#include "support/lstrings.h" +#include "support/unicode.h" +#include #include #include +#include #include #include +using namespace std; using lyx::ucs4_codeset; +using lyx::support::contains; +using lyx::support::split; -using std::string; + +#if defined(_MSC_VER) && (_MSC_VER >= 1600) +std::locale::id numpunct::id; +#endif namespace { @@ -32,18 +41,18 @@ namespace { /// codecvt facet for conversion of UCS4 (internal representation) to UTF8 /// (external representation) or vice versa -class iconv_codecvt_facet : public std::codecvt +class iconv_codecvt_facet : public codecvt { - typedef std::codecvt base; + typedef codecvt base; public: /// Constructor. You have to specify with \p inout whether you want /// to use this facet only for input, only for output or for both. explicit iconv_codecvt_facet(string const & encoding = "UTF-8", - std::ios_base::openmode inout = std::ios_base::in | std::ios_base::out, + ios_base::openmode inout = ios_base::in | ios_base::out, size_t refs = 0) : base(refs), encoding_(encoding) { - if (inout & std::ios_base::in) { + if (inout & ios_base::in) { in_cd_ = iconv_open(ucs4_codeset, encoding.c_str()); if (in_cd_ == (iconv_t)(-1)) { fprintf(stderr, "Error %d returned from iconv_open(in_cd_): %s\n", @@ -53,7 +62,7 @@ public: } } else in_cd_ = (iconv_t)(-1); - if (inout & std::ios_base::out) { + if (inout & ios_base::out) { out_cd_ = iconv_open(encoding.c_str(), ucs4_codeset); if (out_cd_ == (iconv_t)(-1)) { fprintf(stderr, "Error %d returned from iconv_open(out_cd_): %s\n", @@ -85,13 +94,50 @@ protected: extern_type * to, extern_type * to_end, extern_type *& to_next) const { +#define WORKAROUND_ICONV_BUG 1 +#if WORKAROUND_ICONV_BUG + // Due to a bug in some iconv versions, when the last char + // in the buffer is a wide char and the output encoding is + // ISO-2022-JP and we are going to switch to another encoding, + // the appropriate escape sequence for changing the character + // set is not output (see bugs 5216, 5280, and also 5489). + // As a workaround, we append a nul char in order to force + // a switch to ASCII, and then remove it from output after + // the conversion. + intern_type * from_new = 0; + intern_type const * from_old = from; + size_t extra = 0; + if (*(from_end - 1) >= 0x80 && encoding_ == "ISO-2022-JP") { + size_t len = from_end - from; + from_new = new intern_type[len + 1]; + memcpy(from_new, from, len * sizeof(intern_type)); + from_new[len] = 0; + from_end = from_new + len + 1; + from = from_new; + extra = 1; + } +#endif size_t inbytesleft = (from_end - from) * sizeof(intern_type); size_t outbytesleft = (to_end - to) * sizeof(extern_type); +#if WORKAROUND_ICONV_BUG + outbytesleft += extra * sizeof(extern_type); +#endif from_next = from; to_next = to; result const retval = do_iconv(out_cd_, reinterpret_cast(&from_next), &inbytesleft, &to_next, &outbytesleft); +#if WORKAROUND_ICONV_BUG + // Remove from output the nul char that we inserted at the end + // of the input buffer in order to circumvent an iconv bug. + if (from_new) { + --to_next; + --from_next; + from_next = from_old + (from_next - from); + from = from_old; + delete[] from_new; + } +#endif if (retval == base::error) { fprintf(stderr, "Error %d returned from iconv when converting from %s to %s: %s\n", @@ -201,18 +247,12 @@ protected: return to_next - to; #else size_t const length = end - from; - return std::min(length, max); + return min(length, max); #endif } virtual int do_max_length() const throw() { - // UTF8 uses at most 4 bytes to represent one UCS4 code point - // (see RFC 3629). RFC 2279 specifies 6 bytes, but that - // information is outdated, and RFC 2279 has been superseded by - // RFC 3629. - // All other encodings encode one UCS4 code point in one byte - // (and can therefore only encode a subset of UCS4) - return encoding_ == "UTF-8" ? 4 : 1; + return lyx::max_encoded_bytes(encoding_); } private: /// Do the actual conversion. The interface is equivalent to that of @@ -225,6 +265,14 @@ private: inbytesleft, to, outbytesleft); if (converted == (size_t)(-1)) { switch(errno) { + case 0: + // As strange as it may seem, this + // does happen on windows when parsing + // comments with accented chars in + // tex2lyx. See the following thread + // for details + // http://thread.gmane.org/gmane.editors.lyx.devel/117636 + break; case EINVAL: case E2BIG: return base::partial; @@ -240,7 +288,7 @@ private: iconv_t in_cd_; iconv_t out_cd_; /// The narrow encoding - std::string encoding_; + string encoding_; }; } // namespace anon @@ -248,6 +296,15 @@ private: namespace lyx { +template +void setEncoding(Ios & ios, string const & encoding, ios_base::openmode mode) +{ + // We must imbue the stream before openening the file + locale global; + locale locale(global, new iconv_codecvt_facet(encoding, mode)); + ios.imbue(locale); +} + const char * iconv_codecvt_facet_exception::what() const throw() { @@ -255,46 +312,43 @@ const char * iconv_codecvt_facet_exception::what() const throw() } -idocfstream::idocfstream(string const & encoding) : base() +ifdocstream::ifdocstream() : base() { - std::locale global; - std::locale locale(global, new iconv_codecvt_facet(encoding, in)); - imbue(locale); + setEncoding(*this, "UTF-8", in); } -idocfstream::idocfstream(const char* s, std::ios_base::openmode mode, +ifdocstream::ifdocstream(const char* s, ios_base::openmode mode, string const & encoding) : base() { - // We must imbue the stream before openening the file - std::locale global; - std::locale locale(global, new iconv_codecvt_facet(encoding, in)); - imbue(locale); + setEncoding(*this, encoding, in); open(s, mode); } -odocfstream::odocfstream(string const & encoding) : base() +ofdocstream::ofdocstream(): base() { - std::locale global; - std::locale locale(global, new iconv_codecvt_facet(encoding, out)); - imbue(locale); + setEncoding(*this, "UTF-8", out); } -odocfstream::odocfstream(const char* s, std::ios_base::openmode mode, +ofdocstream::ofdocstream(const char* s, ios_base::openmode mode, string const & encoding) : base() { - // We must imbue the stream before openening the file - std::locale global; - std::locale locale(global, new iconv_codecvt_facet(encoding, out)); - imbue(locale); + setEncoding(*this, encoding, out); open(s, mode); } +void ofdocstream::reset(string const & encoding) +{ + setEncoding(*this, encoding, out); +} + + + SetEnc setEncoding(string const & encoding) { return SetEnc(encoding); @@ -303,14 +357,14 @@ SetEnc setEncoding(string const & encoding) odocstream & operator<<(odocstream & os, SetEnc e) { - if (std::has_facet(os.rdbuf()->getloc())) { + if (has_facet(os.rdbuf()->getloc())) { // This stream must be a file stream, since we never imbue // any other stream with a locale having a iconv_codecvt_facet. // Flush the stream so that all pending output is written // with the old encoding. os.flush(); - std::locale locale(os.rdbuf()->getloc(), - new iconv_codecvt_facet(e.encoding, std::ios_base::out)); + locale locale(os.rdbuf()->getloc(), + new iconv_codecvt_facet(e.encoding, ios_base::out)); // FIXME Does changing the codecvt facet of an open file // stream always work? It does with gcc 4.1, but I have read // somewhere that it does not with MSVC. @@ -320,6 +374,188 @@ odocstream & operator<<(odocstream & os, SetEnc e) return os; } + +//CHECKME: I just copied the code above, and have no idea whether it +//is correct... (JMarc) +idocstream & operator<<(idocstream & is, SetEnc e) +{ + if (has_facet(is.rdbuf()->getloc())) { + // This stream must be a file stream, since we never imbue + // any other stream with a locale having a iconv_codecvt_facet. + // Flush the stream so that all pending output is written + // with the old encoding. + //is.flush(); + locale locale(is.rdbuf()->getloc(), + new iconv_codecvt_facet(e.encoding, ios_base::in)); + // FIXME Does changing the codecvt facet of an open file + // stream always work? It does with gcc 4.1, but I have read + // somewhere that it does not with MSVC. + // What does the standard say? + is.imbue(locale); + } + return is; +} + + +#if ! defined(USE_WCHAR_T) +odocstream & operator<<(odocstream & os, char c) +{ + os.put(c); + return os; +} +#endif + + +void otexstream::put(char_type const & c) +{ + if (protectspace_) { + if (!canbreakline_ && c == ' ') + os_ << "{}"; + protectspace_ = false; + } + os_.put(c); + lastchar_ = c; + if (c == '\n') { + texrow_.newline(); + canbreakline_ = false; + } else + canbreakline_ = true; +} + + +BreakLine breakln; +SafeBreakLine safebreakln; + + +otexstream & operator<<(otexstream & ots, BreakLine) +{ + if (ots.canBreakLine()) { + ots.os().put('\n'); + ots.lastChar('\n'); + ots.canBreakLine(false); + ots.texrow().newline(); + } + ots.protectSpace(false); + return ots; +} + + +otexstream & operator<<(otexstream & ots, SafeBreakLine) +{ + if (ots.canBreakLine()) { + ots.os() << "%\n"; + ots.lastChar('\n'); + ots.canBreakLine(false); + ots.texrow().newline(); + } + ots.protectSpace(false); + return ots; +} + + +otexstream & operator<<(otexstream & ots, odocstream_manip pf) +{ + ots.os() << pf; + if (pf == static_cast(endl)) { + ots.lastChar('\n'); + ots.texrow().newline(); + } + return ots; +} + + +otexstream & operator<<(otexstream & ots, docstring const & s) +{ + size_t const len = s.length(); + + // Check whether there's something to output + if (len == 0) + return ots; + + if (ots.protectSpace()) { + if (!ots.canBreakLine() && s[0] == ' ') + ots.os() << "{}"; + ots.protectSpace(false); + } + + if (contains(s, 0xF0000)) { + // Some encoding changes for the underlying stream are embedded + // in the docstring. The encoding names to be used are enclosed + // between the code points 0xF0000 and 0xF0001, the first two + // characters of plane 15, which is a Private Use Area whose + // codepoints don't have any associated glyph. + docstring s1; + docstring s2 = split(s, s1, 0xF0000); + while (true) { + if (!s1.empty()) + ots.os() << s1; + if (s2.empty()) + break; + docstring enc; + docstring const s3 = split(s2, enc, 0xF0001); + if (!contains(s2, 0xF0001)) + s2 = split(enc, s1, 0xF0000); + else { + ots.os() << setEncoding(to_ascii(enc)); + s2 = split(s3, s1, 0xF0000); + } + } + } else + ots.os() << s; + + ots.lastChar(s[len - 1]); + ots.texrow().newlines(count(s.begin(), s.end(), '\n')); + ots.canBreakLine(s[len - 1] != '\n'); + return ots; +} + + +otexstream & operator<<(otexstream & ots, string const & s) +{ + ots << from_utf8(s); + return ots; +} + + +otexstream & operator<<(otexstream & ots, char const * s) +{ + ots << from_utf8(s); + return ots; +} + + +otexstream & operator<<(otexstream & ots, char c) +{ + if (ots.protectSpace()) { + if (!ots.canBreakLine() && c == ' ') + ots.os() << "{}"; + ots.protectSpace(false); + } + ots.os() << c; + ots.lastChar(c); + if (c == '\n') + ots.texrow().newline(); + ots.canBreakLine(c != '\n'); + return ots; +} + + +template +otexstream & operator<<(otexstream & ots, Type value) +{ + ots.os() << value; + ots.lastChar(0); + ots.canBreakLine(true); + ots.protectSpace(false); + return ots; +} + +template otexstream & operator<< (otexstream & os, SetEnc); +template otexstream & operator<< (otexstream &, double); +template otexstream & operator<< (otexstream &, int); +template otexstream & operator<< (otexstream &, unsigned int); +template otexstream & operator<< (otexstream &, unsigned long); + } #if ! defined(USE_WCHAR_T) && defined(__GNUC__) @@ -327,21 +563,71 @@ odocstream & operator<<(odocstream & os, SetEnc e) // a bug in gcc. The implementation here does not do anything useful, since // it is overriden in iconv_codecvt_facet. namespace std { + template<> codecvt::result -codecvt::do_out(mbstate_t &, const lyx::char_type *, const lyx::char_type *, const lyx::char_type *&, - char *, char *, char *&) const { return error; } +codecvt::do_out( + mbstate_t &, const lyx::char_type *, const lyx::char_type *, + const lyx::char_type *&, char *, char *, char *&) const +{ + return error; +} + + template<> codecvt::result -codecvt::do_unshift(mbstate_t &, char *, char *, char *&) const { return error; } +codecvt::do_unshift( + mbstate_t &, char *, char *, char *&) const +{ + return error; +} + + template<> codecvt::result -codecvt::do_in(mbstate_t &, const char *, const char *, const char *&, - lyx::char_type *, lyx::char_type *, lyx::char_type *&) const { return error; } -template<> int codecvt::do_encoding() const throw() { return 0; } -template<> bool codecvt::do_always_noconv() const throw() { return true; } +codecvt::do_in( + mbstate_t &, const char *, const char *, const char *&, + lyx::char_type *, lyx::char_type *, lyx::char_type *&) const +{ + return error; +} + + +template<> +int codecvt::do_encoding() const throw() +{ + return 0; +} + + +template<> +bool codecvt::do_always_noconv() const throw() +{ + return true; +} + #if __GNUC__ == 3 && __GNUC_MINOR__ < 4 -template<> int codecvt::do_length(mbstate_t const &, const char *, const char *, size_t) const { return 1; } + +template<> +int codecvt::do_length( + mbstate_t const &, const char *, const char *, size_t) const +{ + return 1; +} + #else -template<> int codecvt::do_length(mbstate_t &, const char *, const char *, size_t) const { return 1; } + +template<> +int codecvt::do_length( + mbstate_t &, const char *, const char *, size_t) const +{ + return 1; +} + #endif -template<> int codecvt::do_max_length() const throw() { return 4; } + +template<> +int codecvt::do_max_length() const throw() +{ + return 4; } + +} // namespace std #endif