X-Git-Url: https://git.lyx.org/gitweb/?a=blobdiff_plain;f=src%2Fsupport%2Funicode.cpp;h=995fa10c5979f0c2224add047f8bf312fc235040;hb=8d640dc77608bedddb5b00982c23665584f52d21;hp=39090769d2993b261445c3d71adec1417cb9ad2b;hpb=f1cba8ff64b369792fd49f5ddf90e8126ab476ac;p=lyx.git diff --git a/src/support/unicode.cpp b/src/support/unicode.cpp index 39090769d2..995fa10c59 100644 --- a/src/support/unicode.cpp +++ b/src/support/unicode.cpp @@ -15,16 +15,19 @@ #include "support/unicode.h" #include "support/debug.h" +#include + #include #include #include -#include #include #include +//Needed in MSVC #include + using namespace std; namespace { @@ -49,8 +52,13 @@ namespace lyx { static const iconv_t invalid_cd = (iconv_t)(-1); -struct IconvProcessor::Impl +class IconvProcessor::Impl { +public: + // noncopyable because iconv_close() is called in destructor + Impl(Impl const &) = delete; + Impl & operator=(Impl const &) = delete; + Impl(string const & to, string const & from) : cd(invalid_cd), tocode_(to), fromcode_(from) {} @@ -58,7 +66,7 @@ struct IconvProcessor::Impl ~Impl() { if (cd != invalid_cd && iconv_close(cd) == -1) - LYXERR0("Error returned from iconv_close(" << errno << ")"); + LYXERR0("Error returned from iconv_close(" << errno << ')'); } iconv_t cd; @@ -85,10 +93,13 @@ IconvProcessor::~IconvProcessor() } -void IconvProcessor::operator=(IconvProcessor const & other) +IconvProcessor & IconvProcessor::operator=(IconvProcessor const & other) { - if (&other != this) + if (&other != this) { + delete pimpl_; pimpl_ = new Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_); + } + return *this; } @@ -197,6 +208,18 @@ int IconvProcessor::convert(char const * buf, size_t buflen, } +std::string IconvProcessor::from() const +{ + return pimpl_->fromcode_; +} + + +std::string IconvProcessor::to() const +{ + return pimpl_->tocode_; +} + + namespace { @@ -210,23 +233,39 @@ iconv_convert(IconvProcessor & processor, InType const * buf, size_t buflen) char const * inbuf = reinterpret_cast(buf); size_t inbytesleft = buflen * sizeof(InType); - size_t const outsize = 32768; - static char out[outsize]; - char * outbuf = out; - - int bytes = processor.convert(inbuf, inbytesleft, outbuf, outsize); + static QThreadStorage *> static_outbuf; + if (!static_outbuf.hasLocalData()) + static_outbuf.setLocalData(new std::vector(32768)); + std::vector & outbuf = *static_outbuf.localData(); + // The number of UCS4 code points in buf is at most inbytesleft. + // The output encoding will use at most + // max_encoded_bytes(pimpl_->tocode_) per UCS4 code point. + size_t maxoutbufsize = max_encoded_bytes(processor.to()) * inbytesleft; + if (outbuf.size() < maxoutbufsize) + outbuf.resize(maxoutbufsize); + + int bytes = processor.convert(inbuf, inbytesleft, &outbuf[0], outbuf.size()); if (bytes <= 0) // Conversion failed // FIXME Maybe throw an exception and handle that in the caller? return vector(); - RetType const * tmp = reinterpret_cast(out); + RetType const * tmp = reinterpret_cast(&outbuf[0]); return vector(tmp, tmp + bytes / sizeof(RetType)); } } // anon namespace +IconvProcessor & utf8ToUcs4() +{ + static QThreadStorage processor; + if (!processor.hasLocalData()) + processor.setLocalData(new IconvProcessor(ucs4_codeset, "UTF-8")); + return *processor.localData(); +} + + vector utf8_to_ucs4(vector const & utf8str) { if (utf8str.empty()) @@ -239,32 +278,43 @@ vector utf8_to_ucs4(vector const & utf8str) vector utf8_to_ucs4(char const * utf8str, size_t ls) { - static IconvProcessor processor(ucs4_codeset, "UTF-8"); - return iconv_convert(processor, utf8str, ls); + return iconv_convert(utf8ToUcs4(), utf8str, ls); } vector utf16_to_ucs4(unsigned short const * s, size_t ls) { - static IconvProcessor processor(ucs4_codeset, utf16_codeset); - return iconv_convert(processor, s, ls); + static QThreadStorage processor; + if (!processor.hasLocalData()) + processor.setLocalData(new IconvProcessor(ucs4_codeset, utf16_codeset)); + return iconv_convert(*processor.localData(), s, ls); } vector ucs4_to_utf16(char_type const * s, size_t ls) { - static IconvProcessor processor(utf16_codeset, ucs4_codeset); - return iconv_convert(processor, s, ls); + static QThreadStorage processor; + if (!processor.hasLocalData()) + processor.setLocalData(new IconvProcessor(utf16_codeset, ucs4_codeset)); + return iconv_convert(*processor.localData(), s, ls); +} + + +IconvProcessor & ucs4ToUtf8() +{ + static QThreadStorage processor; + if (!processor.hasLocalData()) + processor.setLocalData(new IconvProcessor("UTF-8", ucs4_codeset)); + return *processor.localData(); } vector ucs4_to_utf8(char_type c) { - static IconvProcessor processor("UTF-8", ucs4_codeset); - return iconv_convert(processor, &c, 1); + return iconv_convert(ucs4ToUtf8(), &c, 1); } @@ -281,15 +331,17 @@ ucs4_to_utf8(vector const & ucs4str) vector ucs4_to_utf8(char_type const * ucs4str, size_t ls) { - static IconvProcessor processor("UTF-8", ucs4_codeset); - return iconv_convert(processor, ucs4str, ls); + return iconv_convert(ucs4ToUtf8(), ucs4str, ls); } vector eightbit_to_ucs4(char const * s, size_t ls, string const & encoding) { - static map processors; + static QThreadStorage *> static_processors; + if (!static_processors.hasLocalData()) + static_processors.setLocalData(new map); + map & processors = *static_processors.localData(); if (processors.find(encoding) == processors.end()) { IconvProcessor processor(ucs4_codeset, encoding.c_str()); processors.insert(make_pair(encoding, processor)); @@ -298,10 +350,23 @@ eightbit_to_ucs4(char const * s, size_t ls, string const & encoding) } +namespace { + +map & ucs4To8bitProcessors() +{ + static QThreadStorage *> processors; + if (!processors.hasLocalData()) + processors.setLocalData(new map); + return *processors.localData(); +} + +} + + vector ucs4_to_eightbit(char_type const * ucs4str, size_t ls, string const & encoding) { - static map processors; + map & processors(ucs4To8bitProcessors()); if (processors.find(encoding) == processors.end()) { IconvProcessor processor(encoding.c_str(), ucs4_codeset); processors.insert(make_pair(encoding, processor)); @@ -312,7 +377,7 @@ ucs4_to_eightbit(char_type const * ucs4str, size_t ls, string const & encoding) char ucs4_to_eightbit(char_type ucs4, string const & encoding) { - static map processors; + map & processors(ucs4To8bitProcessors()); map::iterator it = processors.find(encoding); if (it == processors.end()) { IconvProcessor processor(encoding.c_str(), ucs4_codeset); @@ -330,7 +395,10 @@ char ucs4_to_eightbit(char_type ucs4, string const & encoding) void ucs4_to_multibytes(char_type ucs4, vector & out, string const & encoding) { - static map processors; + static QThreadStorage *> static_processors; + if (!static_processors.hasLocalData()) + static_processors.setLocalData(new map); + map & processors = *static_processors.localData(); map::iterator it = processors.find(encoding); if (it == processors.end()) { IconvProcessor processor(encoding.c_str(), ucs4_codeset); @@ -345,4 +413,34 @@ void ucs4_to_multibytes(char_type ucs4, vector & out, out.clear(); } +int max_encoded_bytes(std::string const & encoding) +{ + // FIXME: this information should be transferred to lib/encodings + // UTF8 uses at most 4 bytes to represent one UCS4 code point + // (see RFC 3629). RFC 2279 specifies 6 bytes, but that + // information is outdated, and RFC 2279 has been superseded by + // RFC 3629. + // The CJK encodings use (different) multibyte representation as well. + // All other encodings encode one UCS4 code point in one byte + // (and can therefore only encode a subset of UCS4) + // Furthermore, all encodings that use shifting (like SJIS) do not work with + // iconv_codecvt_facet. + if (encoding == "UTF-8" || + encoding == "GB" || + encoding == "EUC-TW") + return 4; + else if (encoding == "EUC-JP") + return 3; + else if (encoding == "ISO-2022-JP") + return 8; + else if (encoding == "BIG5" || + encoding == "EUC-KR" || + encoding == "EUC-CN" || + encoding == "SJIS" || + encoding == "GBK") + return 2; + else + return 1; +} + } // namespace lyx