]> git.lyx.org Git - lyx.git/blobdiff - src/support/docstream.C
* src/encoding.C (latexChar,read):
[lyx.git] / src / support / docstream.C
index b6c592bd61c163bfee9c64860c29e5d1be4d9be1..ac9a8ed23050ac38e91e9c28787dda125151e798 100644 (file)
 #include <config.h>
 
 #include "docstream.h"
+#include "unicode.h"
 
 #include <cerrno>
 #include <cstdio>
 #include <iconv.h>
 #include <locale>
 
-namespace {
 
-#ifdef WORDS_BIGENDIAN
-char const * ucs4_codeset = "UCS-4BE";
-#else
-char const * ucs4_codeset = "UCS-4LE";
-#endif
-char const * utf8_codeset = "UTF-8";
+using lyx::ucs4_codeset;
 
-// We use C IO throughout this file, because the facets might be used with
-// lyxerr in the future.
+using std::string;
 
 
-class utf8_codecvt_facet_exception : public std::exception {
-public:
-       virtual ~utf8_codecvt_facet_exception() throw() {}
-       virtual const char* what() const throw()
-       {
-               return "iconv problem in utf8_codecvt_facet initialization";
-       }
-};
+namespace {
+
+// We use C IO throughout this file, because the facets might be used with
+// lyxerr in the future.
 
 
 /// codecvt facet for conversion of UCS4 (internal representation) to a
 /// narrow encoding (external representation, UTF-8 by default) or vice versa
-class utf8_codecvt_facet : public std::codecvt<lyx::char_type, char, std::mbstate_t>
+class iconv_codecvt_facet : public std::codecvt<lyx::char_type, char, std::mbstate_t>
 {
        typedef std::codecvt<lyx::char_type, char, std::mbstate_t> base;
 public:
        /// Constructor. You have to specify with \p inout whether you want
        /// to use this facet only for input, only for output or for both.
-       explicit utf8_codecvt_facet(std::ios_base::openmode inout = std::ios_base::in | std::ios_base::out,
+       explicit iconv_codecvt_facet(string const & encoding = "UTF-8",
+                       std::ios_base::openmode inout = std::ios_base::in | std::ios_base::out,
                        size_t refs = 0)
-               : base(refs)
+               : base(refs), encoding_(encoding)
        {
                if (inout & std::ios_base::in) {
-                       in_cd_ = iconv_open(ucs4_codeset, utf8_codeset);
+                       in_cd_ = iconv_open(ucs4_codeset, encoding.c_str());
                        if (in_cd_ == (iconv_t)(-1)) {
                                fprintf(stderr, "Error %d returned from iconv_open(in_cd_): %s\n",
                                        errno, strerror(errno));
                                fflush(stderr);
-                               throw utf8_codecvt_facet_exception();
+                               throw lyx::iconv_codecvt_facet_exception();
                        }
                } else
                        in_cd_ = (iconv_t)(-1);
                if (inout & std::ios_base::out) {
-                       out_cd_ = iconv_open(utf8_codeset, ucs4_codeset);
+                       out_cd_ = iconv_open(encoding.c_str(), ucs4_codeset);
                        if (out_cd_ == (iconv_t)(-1)) {
                                fprintf(stderr, "Error %d returned from iconv_open(out_cd_): %s\n",
                                        errno, strerror(errno));
                                fflush(stderr);
-                               throw utf8_codecvt_facet_exception();
+                               throw lyx::iconv_codecvt_facet_exception();
                        }
                } else
                        out_cd_ = (iconv_t)(-1);
        }
 protected:
-       virtual ~utf8_codecvt_facet()
+       virtual ~iconv_codecvt_facet()
        {
                if (in_cd_ != (iconv_t)(-1))
                        if (iconv_close(in_cd_) == -1) {
@@ -98,8 +89,38 @@ protected:
                size_t outbytesleft = (to_end - to) * sizeof(extern_type);
                from_next = from;
                to_next = to;
-               return do_iconv(out_cd_, reinterpret_cast<char const **>(&from_next),
+               result const retval = do_iconv(out_cd_,
+                               reinterpret_cast<char const **>(&from_next),
                                &inbytesleft, &to_next, &outbytesleft);
+               if (retval == base::error) {
+                       fprintf(stderr,
+                               "Error %d returned from iconv when converting from %s to %s: %s\n",
+                               errno, ucs4_codeset, encoding_.c_str(),
+                               strerror(errno));
+                       fputs("Converted input:", stderr);
+                       for (intern_type const * i = from; i < from_next; ++i) {
+                               unsigned int const c = *i;
+                               fprintf(stderr, " 0x%04x", c);
+                       }
+                       unsigned int const c = *from_next;
+                       fprintf(stderr, "\nStopped at: 0x%04x\n", c);
+                       fputs("Unconverted input:", stderr);
+                       for (intern_type const * i = from_next + 1; i < from_end; ++i) {
+                               unsigned int const c = *i;
+                               fprintf(stderr, " 0x%04x", c);
+                       }
+                       fputs("\nConverted output:", stderr);
+                       for (extern_type const * i = to; i < to_next; ++i) {
+                               // extern_type may be signed, avoid output of
+                               // something like 0xffffffc2
+                               unsigned int const c =
+                                       *reinterpret_cast<unsigned char const *>(i);
+                               fprintf(stderr, " 0x%02x", c);
+                       }
+                       fputc('\n', stderr);
+                       fflush(stderr);
+               }
+               return retval;
        }
        virtual result do_unshift(state_type &, extern_type * to,
                        extern_type *, extern_type *& to_next) const
@@ -118,9 +139,40 @@ protected:
                size_t outbytesleft = (to_end - to) * sizeof(intern_type);
                from_next = from;
                to_next = to;
-               return do_iconv(in_cd_, &from_next, &inbytesleft,
+               result const retval = do_iconv(in_cd_, &from_next, &inbytesleft,
                                reinterpret_cast<char **>(&to_next),
                                &outbytesleft);
+               if (retval == base::error) {
+                       fprintf(stderr,
+                               "Error %d returned from iconv when converting from %s to %s: %s\n",
+                               errno, encoding_.c_str(), ucs4_codeset,
+                               strerror(errno));
+                       fputs("Converted input:", stderr);
+                       for (extern_type const * i = from; i < from_next; ++i) {
+                               // extern_type may be signed, avoid output of
+                               // something like 0xffffffc2
+                               unsigned int const c =
+                                       *reinterpret_cast<unsigned char const *>(i);
+                               fprintf(stderr, " 0x%02x", c);
+                       }
+                       unsigned int const c =
+                               *reinterpret_cast<unsigned char const *>(from_next);
+                       fprintf(stderr, "\nStopped at: 0x%02x\n", c);
+                       fputs("Unconverted input:", stderr);
+                       for (extern_type const * i = from_next + 1; i < from_end; ++i) {
+                               unsigned int const c =
+                                       *reinterpret_cast<unsigned char const *>(i);
+                               fprintf(stderr, " 0x%02x", c);
+                       }
+                       fputs("\nConverted output:", stderr);
+                       for (intern_type const * i = to; i < to_next; ++i) {
+                               unsigned int const c = *i;
+                               fprintf(stderr, " 0x%02x", c);
+                       }
+                       fputc('\n', stderr);
+                       fflush(stderr);
+               }
+               return retval;
        }
        virtual int do_encoding() const throw()
        {
@@ -154,8 +206,13 @@ protected:
        }
        virtual int do_max_length() const throw()
        {
-               // UTF8 uses at most 6 bytes to represent one code point
-               return 6;
+               // UTF8 uses at most 4 bytes to represent one UCS4 code point
+               // (see RFC 3629). RFC 2279 specifies 6 bytes, but that
+               // information is outdated, and RFC 2279 has been superseded by
+               // RFC 3629.
+               // All other encodings are assumed to encode one UCS4 code point
+               // in one byte (and can therefore only encode a subset of UCS4)
+               return encoding_ == "UTF-8" ? 4 : 1;
        }
 private:
        /// Do the actual conversion. The interface is equivalent to that of
@@ -163,7 +220,7 @@ private:
        inline base::result do_iconv(iconv_t cd, char const ** from,
                        size_t * inbytesleft, char ** to, size_t * outbytesleft) const
        {
-               char const * to_start = *to;
+               char const * const to_start = *to;
                size_t converted = iconv(cd, const_cast<char ICONV_CONST **>(from),
                                inbytesleft, to, outbytesleft);
                if (converted == (size_t)(-1)) {
@@ -173,9 +230,6 @@ private:
                                return base::partial;
                        case EILSEQ:
                        default:
-                               fprintf(stderr, "Error %d returned from iconv: %s\n",
-                                       errno, strerror(errno));
-                               fflush(stderr);
                                return base::error;
                        }
                }
@@ -185,57 +239,93 @@ private:
        }
        iconv_t in_cd_;
        iconv_t out_cd_;
+       /// The narrow encoding
+       std::string encoding_;
 };
 
-}
+} // namespace anon
 
 
 namespace lyx {
 
 
-idocfstream::idocfstream() : base()
+const char * iconv_codecvt_facet_exception::what() const throw()
+{
+       return "iconv problem in iconv_codecvt_facet initialization";
+}
+
+
+idocfstream::idocfstream(string const & encoding) : base()
 {
        std::locale global;
-       std::locale locale(global, new utf8_codecvt_facet(in));
+       std::locale locale(global, new iconv_codecvt_facet(encoding, in));
        imbue(locale);
 }
 
        
-idocfstream::idocfstream(const char* s, std::ios_base::openmode mode)
+idocfstream::idocfstream(const char* s, std::ios_base::openmode mode,
+                         string const & encoding)
        : base()
 {
        // We must imbue the stream before opening the file
        std::locale global;
-       std::locale locale(global, new utf8_codecvt_facet(in));
+       std::locale locale(global, new iconv_codecvt_facet(encoding, in));
        imbue(locale);
        open(s, mode);
 }
 
 
-odocfstream::odocfstream() : base()
+odocfstream::odocfstream(string const & encoding) : base()
 {
        std::locale global;
-       std::locale locale(global, new utf8_codecvt_facet(out));
+       std::locale locale(global, new iconv_codecvt_facet(encoding, out));
        imbue(locale);
 }
 
-       
-odocfstream::odocfstream(const char* s, std::ios_base::openmode mode)
+
+odocfstream::odocfstream(const char* s, std::ios_base::openmode mode,
+                         string const & encoding)
        : base()
 {
        // We must imbue the stream before opening the file
        std::locale global;
-       std::locale locale(global, new utf8_codecvt_facet(out));
+       std::locale locale(global, new iconv_codecvt_facet(encoding, out));
        imbue(locale);
        open(s, mode);
 }
 
+
+SetEnc setEncoding(string const & encoding)
+{
+       return SetEnc(encoding);
+}
+
+
+odocstream & operator<<(odocstream & os, SetEnc e)
+{
+       if (std::has_facet<iconv_codecvt_facet>(os.rdbuf()->getloc())) {
+               // This stream must be a file stream, since we never imbue
+               // any other stream with a locale having an iconv_codecvt_facet.
+               // Flush the stream so that all pending output is written
+               // with the old encoding.
+               os.flush();
+               std::locale locale(os.rdbuf()->getloc(),
+                       new iconv_codecvt_facet(e.encoding, std::ios_base::out));
+               // FIXME Does changing the codecvt facet of an open file
+               // stream always work? It does with gcc 4.1, but I have read
+               // somewhere that it does not with MSVC.
+               // What does the standard say?
+               os.imbue(locale);
+       }
+       return os;
+}
+
 }
 
 #if (!defined(HAVE_WCHAR_T) || SIZEOF_WCHAR_T != 4) && defined(__GNUC__)
 // We get undefined references to these virtual methods. This looks like
 // a bug in gcc. The implementation here does not do anything useful, since
-// it is overriden in utf8_codecvt_facet.
+// it is overridden in iconv_codecvt_facet.
 namespace std {
 template<> codecvt<lyx::char_type, char, mbstate_t>::result
 codecvt<lyx::char_type, char, mbstate_t>::do_out(mbstate_t &, const lyx::char_type *, const lyx::char_type *, const lyx::char_type *&,