X-Git-Url: https://git.lyx.org/gitweb/?a=blobdiff_plain;f=src%2Fsupport%2Funicode.cpp;h=b48f10609f7fe9315e257575aec432b9d8fcb1dc;hb=f3ff5e083a6e89861db5fce9eea4532504f8341d;hp=f929d5665378ba122c15fb1d9a6f9d407ebf361f;hpb=0e40512c861cf69088d422de7bc159cddbde5c64;p=lyx.git diff --git a/src/support/unicode.cpp b/src/support/unicode.cpp index f929d56653..b48f10609f 100644 --- a/src/support/unicode.cpp +++ b/src/support/unicode.cpp @@ -12,9 +12,9 @@ #include -#include "unicode.h" - -#include "debug.h" +#include "support/unicode.h" +#include "support/debug.h" +#include "support/mutex.h" #include @@ -22,14 +22,12 @@ #include #include -#include #include +#include +#include + -using std::endl; -using std::map; -using std::make_pair; -using std::string; -using std::vector; +using namespace std; namespace { @@ -53,47 +51,49 @@ namespace lyx { static const iconv_t invalid_cd = (iconv_t)(-1); -struct IconvProcessor::Private { - Private(): cd(invalid_cd) {} - ~Private() +struct IconvProcessor::Impl +{ + Impl(string const & to, string const & from) + : cd(invalid_cd), tocode_(to), fromcode_(from) + {} + + ~Impl() { - if (cd != invalid_cd) { - if (iconv_close(cd) == -1) { - lyxerr << "Error returned from iconv_close(" - << errno << ")" << endl; - } - } + if (cd != invalid_cd && iconv_close(cd) == -1) + LYXERR0("Error returned from iconv_close(" << errno << ")"); } + iconv_t cd; + string tocode_; + string fromcode_; + + Mutex mutex_; // iconv() is not thread save, see #7240 }; IconvProcessor::IconvProcessor(char const * tocode, char const * fromcode) - : tocode_(tocode), fromcode_(fromcode), - pimpl_(new IconvProcessor::Private) + : pimpl_(new IconvProcessor::Impl(tocode, fromcode)) { } IconvProcessor::IconvProcessor(IconvProcessor const & other) - : tocode_(other.tocode_), fromcode_(other.fromcode_), - pimpl_(new IconvProcessor::Private) + : pimpl_(new IconvProcessor::Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_)) { } -IconvProcessor & IconvProcessor::operator=(IconvProcessor const & other) +IconvProcessor::~IconvProcessor() { - if (&other == this) - return *this; - tocode_ = other.tocode_; - fromcode_ = other.fromcode_; - pimpl_.reset(new Private); - return *this; + delete pimpl_; } -IconvProcessor::~IconvProcessor() {} +void IconvProcessor::operator=(IconvProcessor const & other) +{ + if (&other != this) + pimpl_ = new Impl(other.pimpl_->tocode_, other.pimpl_->fromcode_); +} bool IconvProcessor::init() @@ -101,15 +101,15 @@ bool IconvProcessor::init() if (pimpl_->cd != invalid_cd) return true; - pimpl_->cd = iconv_open(tocode_.c_str(), fromcode_.c_str()); + pimpl_->cd = iconv_open(pimpl_->tocode_.c_str(), pimpl_->fromcode_.c_str()); if (pimpl_->cd != invalid_cd) return true; lyxerr << "Error returned from iconv_open" << endl; switch (errno) { case EINVAL: - lyxerr << "EINVAL The conversion from " << fromcode_ - << " to " << tocode_ + lyxerr << "EINVAL The conversion from " << pimpl_->fromcode_ + << " to " << pimpl_->tocode_ << " is not supported by the implementation." << endl; break; @@ -124,6 +124,8 @@ bool IconvProcessor::init() int IconvProcessor::convert(char const * buf, size_t buflen, char * outbuf, size_t maxoutsize) { + Mutex::Locker lock(&pimpl_->mutex_); + if (buflen == 0) return 0; @@ -138,7 +140,12 @@ int IconvProcessor::convert(char const * buf, size_t buflen, int res = iconv(pimpl_->cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - //lyxerr << std::dec; + // flush out remaining data. This is needed because iconv sometimes + // holds back chars in the stream, waiting for a combination character + // (see e.g. http://sources.redhat.com/bugzilla/show_bug.cgi?id=1124) + iconv(pimpl_->cd, NULL, NULL, &outbuf, &outbytesleft); + + //lyxerr << dec; //lyxerr << "Inbytesleft: " << inbytesleft << endl; //lyxerr << "Outbytesleft: " << outbytesleft << endl; @@ -155,9 +162,9 @@ int IconvProcessor::convert(char const * buf, size_t buflen, case EILSEQ: lyxerr << "EILSEQ An invalid multibyte sequence" << " has been encountered in the input.\n" - << "When converting from " << fromcode_ - << " to " << tocode_ << ".\n"; - lyxerr << "Input:" << std::hex; + << "When converting from " << pimpl_->fromcode_ + << " to " << pimpl_->tocode_ << ".\n"; + lyxerr << "Input:" << hex; for (size_t i = 0; i < buflen; ++i) { // char may be signed, avoid output of // something like 0xffffffc2 @@ -165,14 +172,14 @@ int IconvProcessor::convert(char const * buf, size_t buflen, *reinterpret_cast(buf + i); lyxerr << " 0x" << (unsigned int)b; } - lyxerr << std::dec << endl; + lyxerr << dec << endl; break; case EINVAL: lyxerr << "EINVAL An incomplete multibyte sequence" << " has been encountered in the input.\n" - << "When converting from " << fromcode_ - << " to " << tocode_ << ".\n"; - lyxerr << "Input:" << std::hex; + << "When converting from " << pimpl_->fromcode_ + << " to " << pimpl_->tocode_ << ".\n"; + lyxerr << "Input:" << hex; for (size_t i = 0; i < buflen; ++i) { // char may be signed, avoid output of // something like 0xffffffc2 @@ -180,7 +187,7 @@ int IconvProcessor::convert(char const * buf, size_t buflen, *reinterpret_cast(buf + i); lyxerr << " 0x" << (unsigned int)b; } - lyxerr << std::dec << endl; + lyxerr << dec << endl; break; default: lyxerr << "\tSome other error: " << errno << endl; @@ -196,14 +203,24 @@ int IconvProcessor::convert(char const * buf, size_t buflen, } +std::string IconvProcessor::from() const +{ + return pimpl_->fromcode_; +} + + +std::string IconvProcessor::to() const +{ + return pimpl_->tocode_; +} + + namespace { template vector -iconv_convert(IconvProcessor & processor, - InType const * buf, - size_t buflen) +iconv_convert(IconvProcessor & processor, InType const * buf, size_t buflen) { if (buflen == 0) return vector(); @@ -211,17 +228,21 @@ iconv_convert(IconvProcessor & processor, char const * inbuf = reinterpret_cast(buf); size_t inbytesleft = buflen * sizeof(InType); - size_t const outsize = 32768; - static char out[outsize]; - char * outbuf = out; + static std::vector outbuf(32768); + // The number of UCS4 code points in buf is at most inbytesleft. + // The output encoding will use at most + // max_encoded_bytes(pimpl_->tocode_) per UCS4 code point. + size_t maxoutbufsize = max_encoded_bytes(processor.to()) * inbytesleft; + if (outbuf.size() < maxoutbufsize) + outbuf.resize(maxoutbufsize); - int bytes = processor.convert(inbuf, inbytesleft, outbuf, outsize); + int bytes = processor.convert(inbuf, inbytesleft, &outbuf[0], outbuf.size()); if (bytes <= 0) // Conversion failed // FIXME Maybe throw an exception and handle that in the caller? return vector(); - RetType const * tmp = reinterpret_cast(out); + RetType const * tmp = reinterpret_cast(&outbuf[0]); return vector(tmp, tmp + bytes / sizeof(RetType)); } @@ -346,4 +367,35 @@ void ucs4_to_multibytes(char_type ucs4, vector & out, out.clear(); } +int max_encoded_bytes(std::string const & encoding) +{ + // FIXME: this information should be transferred to lib/encodings + // UTF8 uses at most 4 bytes to represent one UCS4 code point + // (see RFC 3629). RFC 2279 specifies 6 bytes, but that + // information is outdated, and RFC 2279 has been superseded by + // RFC 3629. + // The CJK encodings use (different) multibyte representation as well. + // All other encodings encode one UCS4 code point in one byte + // (and can therefore only encode a subset of UCS4) + // Note that BIG5 and SJIS do not work with LaTeX (see lib/encodings). + // Furthermore, all encodings that use shifting (like SJIS) do not work with + // iconv_codecvt_facet. + if (encoding == "UTF-8" || + encoding == "GB" || + encoding == "EUC-TW") + return 4; + else if (encoding == "EUC-JP") + return 3; + else if (encoding == "ISO-2022-JP") + return 8; + else if (encoding == "BIG5" || + encoding == "EUC-KR" || + encoding == "EUC-CN" || + encoding == "SJIS" || + encoding == "GBK") + return 2; + else + return 1; +} + } // namespace lyx