#include <config.h>
#include "docstream.h"
+#include "unicode.h"
#include <cerrno>
#include <cstdio>
#include <iconv.h>
#include <locale>
-namespace {
-#ifdef WORDS_BIGENDIAN
-char const * ucs4_codeset = "UCS-4BE";
-#else
-char const * ucs4_codeset = "UCS-4LE";
-#endif
-char const * utf8_codeset = "UTF-8";
+using lyx::ucs4_codeset;
-// We use C IO throughout this file, because the facets might be used with
-// lyxerr in the future.
+using std::string;
-class utf8_codecvt_facet_exception : public std::exception {
-public:
- virtual ~utf8_codecvt_facet_exception() throw() {}
- virtual const char* what() const throw()
- {
- return "iconv problem in utf8_codecvt_facet initialization";
- }
-};
+namespace {
+
+// We use C IO throughout this file, because the facets might be used with
+// lyxerr in the future.
/// codecvt facet for conversion of UCS4 (internal representation) to UTF8
/// (external representation) or vice versa
-class utf8_codecvt_facet : public std::codecvt<lyx::char_type, char, std::mbstate_t>
+class iconv_codecvt_facet : public std::codecvt<lyx::char_type, char, std::mbstate_t>
{
typedef std::codecvt<lyx::char_type, char, std::mbstate_t> base;
public:
/// Constructor. You have to specify with \p inout whether you want
/// to use this facet only for input, only for output or for both.
- explicit utf8_codecvt_facet(std::ios_base::openmode inout = std::ios_base::in | std::ios_base::out,
+ explicit iconv_codecvt_facet(string const & encoding = "UTF-8",
+ std::ios_base::openmode inout = std::ios_base::in | std::ios_base::out,
size_t refs = 0)
- : base(refs)
+ : base(refs), encoding_(encoding)
{
if (inout & std::ios_base::in) {
- in_cd_ = iconv_open(ucs4_codeset, utf8_codeset);
+ in_cd_ = iconv_open(ucs4_codeset, encoding.c_str());
if (in_cd_ == (iconv_t)(-1)) {
fprintf(stderr, "Error %d returned from iconv_open(in_cd_): %s\n",
errno, strerror(errno));
fflush(stderr);
- throw utf8_codecvt_facet_exception();
+ throw lyx::iconv_codecvt_facet_exception();
}
} else
in_cd_ = (iconv_t)(-1);
if (inout & std::ios_base::out) {
- out_cd_ = iconv_open(utf8_codeset, ucs4_codeset);
+ out_cd_ = iconv_open(encoding.c_str(), ucs4_codeset);
if (out_cd_ == (iconv_t)(-1)) {
fprintf(stderr, "Error %d returned from iconv_open(out_cd_): %s\n",
errno, strerror(errno));
fflush(stderr);
- throw utf8_codecvt_facet_exception();
+ throw lyx::iconv_codecvt_facet_exception();
}
} else
out_cd_ = (iconv_t)(-1);
}
protected:
- virtual ~utf8_codecvt_facet()
+ virtual ~iconv_codecvt_facet()
{
if (in_cd_ != (iconv_t)(-1))
if (iconv_close(in_cd_) == -1) {
size_t outbytesleft = (to_end - to) * sizeof(extern_type);
from_next = from;
to_next = to;
- return do_iconv(out_cd_, reinterpret_cast<char const **>(&from_next),
+ result const retval = do_iconv(out_cd_,
+ reinterpret_cast<char const **>(&from_next),
&inbytesleft, &to_next, &outbytesleft);
+ if (retval == base::error) {
+ fprintf(stderr,
+ "Error %d returned from iconv when converting from %s to %s: %s\n",
+ errno, ucs4_codeset, encoding_.c_str(),
+ strerror(errno));
+ fputs("Converted input:", stderr);
+ for (intern_type const * i = from; i < from_next; ++i) {
+ unsigned int const c = *i;
+ fprintf(stderr, " 0x%04x", c);
+ }
+ unsigned int const c = *from_next;
+ fprintf(stderr, "\nStopped at: 0x%04x\n", c);
+ fputs("Unconverted input:", stderr);
+ for (intern_type const * i = from_next + 1; i < from_end; ++i) {
+ unsigned int const c = *i;
+ fprintf(stderr, " 0x%04x", c);
+ }
+ fputs("\nConverted output:", stderr);
+ for (extern_type const * i = to; i < to_next; ++i) {
+ // extern_type may be signed, avoid output of
+ // something like 0xffffffc2
+ unsigned int const c =
+ *reinterpret_cast<unsigned char const *>(i);
+ fprintf(stderr, " 0x%02x", c);
+ }
+ fputc('\n', stderr);
+ fflush(stderr);
+ }
+ return retval;
}
virtual result do_unshift(state_type &, extern_type * to,
extern_type *, extern_type *& to_next) const
size_t outbytesleft = (to_end - to) * sizeof(intern_type);
from_next = from;
to_next = to;
- return do_iconv(in_cd_, &from_next, &inbytesleft,
+ result const retval = do_iconv(in_cd_, &from_next, &inbytesleft,
reinterpret_cast<char **>(&to_next),
&outbytesleft);
+ if (retval == base::error) {
+ fprintf(stderr,
+ "Error %d returned from iconv when converting from %s to %s: %s\n",
+ errno, encoding_.c_str(), ucs4_codeset,
+ strerror(errno));
+ fputs("Converted input:", stderr);
+ for (extern_type const * i = from; i < from_next; ++i) {
+ // extern_type may be signed, avoid output of
+ // something like 0xffffffc2
+ unsigned int const c =
+ *reinterpret_cast<unsigned char const *>(i);
+ fprintf(stderr, " 0x%02x", c);
+ }
+ unsigned int const c =
+ *reinterpret_cast<unsigned char const *>(from_next);
+ fprintf(stderr, "\nStopped at: 0x%02x\n", c);
+ fputs("Unconverted input:", stderr);
+ for (extern_type const * i = from_next + 1; i < from_end; ++i) {
+ unsigned int const c =
+ *reinterpret_cast<unsigned char const *>(i);
+ fprintf(stderr, " 0x%02x", c);
+ }
+ fputs("\nConverted output:", stderr);
+ for (intern_type const * i = to; i < to_next; ++i) {
+ unsigned int const c = *i;
+ fprintf(stderr, " 0x%02x", c);
+ }
+ fputc('\n', stderr);
+ fflush(stderr);
+ }
+ return retval;
}
virtual int do_encoding() const throw()
{
}
virtual int do_max_length() const throw()
{
- // UTF8 uses at most 6 bytes to represent one code point
- return 6;
+ // UTF8 uses at most 4 bytes to represent one UCS4 code point
+ // (see RFC 3629). RFC 2279 specifies 6 bytes, but that
+ // information is outdated, and RFC 2279 has been superseded by
+ // RFC 3629.
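+ // All other encodings are assumed to be single-byte encodings,
+ // i.e. to encode one UCS4 code point in one byte (and therefore
+ // to cover only a subset of UCS4).
+ // FIXME: This assumption does not hold for multi-byte encodings
+ // such as EUC-JP or Shift_JIS.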
+ return encoding_ == "UTF-8" ? 4 : 1;
}
private:
/// Do the actual conversion. The interface is equivalent to that of
inline base::result do_iconv(iconv_t cd, char const ** from,
size_t * inbytesleft, char ** to, size_t * outbytesleft) const
{
- char const * to_start = *to;
+ char const * const to_start = *to;
size_t converted = iconv(cd, const_cast<char ICONV_CONST **>(from),
inbytesleft, to, outbytesleft);
if (converted == (size_t)(-1)) {
return base::partial;
case EILSEQ:
default:
- fprintf(stderr, "Error %d returned from iconv: %s\n",
- errno, strerror(errno));
- fflush(stderr);
return base::error;
}
}
}
iconv_t in_cd_;
iconv_t out_cd_;
+ /// The narrow encoding
+ std::string encoding_;
};
-}
+} // namespace anon
namespace lyx {
-idocfstream::idocfstream() : base()
+const char * iconv_codecvt_facet_exception::what() const throw()
+{
+ return "iconv problem in iconv_codecvt_facet initialization";
+}
+
+
+idocfstream::idocfstream(string const & encoding) : base()
{
std::locale global;
- std::locale locale(global, new utf8_codecvt_facet(in));
+ std::locale locale(global, new iconv_codecvt_facet(encoding, in));
imbue(locale);
}
-idocfstream::idocfstream(const char* s, std::ios_base::openmode mode)
+idocfstream::idocfstream(const char* s, std::ios_base::openmode mode,
+ string const & encoding)
: base()
{
// We must imbue the stream before openening the file
std::locale global;
- std::locale locale(global, new utf8_codecvt_facet(in));
+ std::locale locale(global, new iconv_codecvt_facet(encoding, in));
imbue(locale);
open(s, mode);
}
-odocfstream::odocfstream() : base()
+odocfstream::odocfstream(string const & encoding) : base()
{
std::locale global;
- std::locale locale(global, new utf8_codecvt_facet(out));
+ std::locale locale(global, new iconv_codecvt_facet(encoding, out));
imbue(locale);
}
-
-odocfstream::odocfstream(const char* s, std::ios_base::openmode mode)
+
+odocfstream::odocfstream(const char* s, std::ios_base::openmode mode,
+ string const & encoding)
: base()
{
// We must imbue the stream before openening the file
std::locale global;
- std::locale locale(global, new utf8_codecvt_facet(out));
+ std::locale locale(global, new iconv_codecvt_facet(encoding, out));
imbue(locale);
open(s, mode);
}
+
+SetEnc setEncoding(string const & encoding)
+{
+ return SetEnc(encoding);
+}
+
+
+odocstream & operator<<(odocstream & os, SetEnc e)
+{
+ if (std::has_facet<iconv_codecvt_facet>(os.rdbuf()->getloc())) {
+ // This stream must be a file stream, since we never imbue
+ // any other stream with a locale having an iconv_codecvt_facet.
+ // Flush the stream so that all pending output is written
+ // with the old encoding.
+ os.flush();
+ std::locale locale(os.rdbuf()->getloc(),
+ new iconv_codecvt_facet(e.encoding, std::ios_base::out));
+ // FIXME Does changing the codecvt facet of an open file
+ // stream always work? It does with gcc 4.1, but I have read
+ // somewhere that it does not with MSVC.
+ // What does the standard say?
+ os.imbue(locale);
+ }
+ return os;
+}
+
}
#if (!defined(HAVE_WCHAR_T) || SIZEOF_WCHAR_T != 4) && defined(__GNUC__)
// We get undefined references to these virtual methods. This looks like
// a bug in gcc. The implementation here does not do anything useful, since
-// it is overriden in utf8_codecvt_facet.
+// it is overridden in iconv_codecvt_facet.
namespace std {
template<> codecvt<lyx::char_type, char, mbstate_t>::result
codecvt<lyx::char_type, char, mbstate_t>::do_out(mbstate_t &, const lyx::char_type *, const lyx::char_type *, const lyx::char_type *&,