X-Git-Url: https://git.lyx.org/gitweb/?a=blobdiff_plain;f=src%2FEncoding.cpp;h=3aa31d0099ab9a771afa860f54b42203706677cf;hb=b271e3e06d778dbe7f20973c718f06d8aac0cf6f;hp=a1beb9d7270cb82a9b5370a0b551ee523dce1a59;hpb=d88aac0e1f7ae9fa54186c8ea733f2de5e6c18ed;p=lyx.git diff --git a/src/Encoding.cpp b/src/Encoding.cpp index a1beb9d727..3aa31d0099 100644 --- a/src/Encoding.cpp +++ b/src/Encoding.cpp @@ -14,28 +14,24 @@ #include "Encoding.h" -#include "debug.h" #include "LaTeXFeatures.h" #include "Lexer.h" #include "LyXRC.h" +#include "support/debug.h" #include "support/FileName.h" #include "support/lstrings.h" #include "support/unicode.h" -#include +#include -#ifndef CXX_GLOBAL_CSTD -using std::strtol; -#endif -using std::endl; -using std::string; +#include +using namespace std; +using namespace lyx::support; namespace lyx { -using support::FileName; - Encodings encodings; namespace { @@ -241,7 +237,7 @@ struct CharInfo { }; -typedef std::map CharInfoMap; +typedef map CharInfoMap; CharInfoMap unicodesymbols; @@ -251,9 +247,21 @@ char_type const max_ucs4 = 0x110000; } // namespace anon +EncodingException::EncodingException(char_type c) + : failed_char(c), par_id(0), pos(0) +{ +} + + +const char * EncodingException::what() const throw() +{ + return "Could not find LaTeX command for a character"; +} + + Encoding::Encoding(string const & n, string const & l, string const & i, bool f, Encoding::Package p) - : Name_(n), LatexName_(l), iconvName_(i), fixedwidth_(f), package_(p) + : name_(n), latexName_(l), iconvName_(i), fixedwidth_(f), package_(p) { if (n == "ascii") { // ASCII can encode 128 code points and nothing else @@ -271,6 +279,9 @@ Encoding::Encoding(string const & n, string const & l, string const & i, void Encoding::init() const { + if (complete_) + return; + start_encodable_ = 0; // temporarily switch off lyxerr, since we will generate iconv errors lyxerr.disable(); @@ -278,14 +289,14 @@ void Encoding::init() const // We do not need to check all UCS4 code points, it is enough // if we check all 256 code points of this encoding. for (unsigned short j = 0; j < 256; ++j) { - char const c = j; - std::vector const ucs4 = eightbit_to_ucs4(&c, 1, iconvName_); - if (ucs4.size() == 1) { - char_type const c = ucs4[0]; - CharInfoMap::const_iterator const it = unicodesymbols.find(c); - if (it == unicodesymbols.end() || !it->second.force) - encodable_.insert(c); - } + char const c = char(j); + vector const ucs4 = eightbit_to_ucs4(&c, 1, iconvName_); + if (ucs4.size() != 1) + continue; + char_type const uc = ucs4[0]; + CharInfoMap::const_iterator const it = unicodesymbols.find(uc); + if (it == unicodesymbols.end() || !it->second.force) + encodable_.insert(uc); } } else { // We do not know how many code points this encoding has, and @@ -293,7 +304,7 @@ void Encoding::init() const // therefore we need to check all UCS4 code points. // This is expensive! for (char_type c = 0; c < max_ucs4; ++c) { - std::vector const eightbit = ucs4_to_eightbit(&c, 1, iconvName_); + vector const eightbit = ucs4_to_eightbit(&c, 1, iconvName_); if (!eightbit.empty()) { CharInfoMap::const_iterator const it = unicodesymbols.find(c); if (it == unicodesymbols.end() || !it->second.force) @@ -312,27 +323,40 @@ void Encoding::init() const } -docstring const Encoding::latexChar(char_type c) const +docstring Encoding::latexChar(char_type c) const { // assure the used encoding is properly initialized - if (!complete_) - init(); - BOOST_ASSERT(complete_); + init(); if (c < start_encodable_) return docstring(1, c); - if (encodable_.find(c) == encodable_.end()) { - // c cannot be encoded in this encoding - CharInfoMap::const_iterator const it = unicodesymbols.find(c); - if (it == unicodesymbols.end()) - lyxerr << "Could not find LaTeX command for character 0x" - << std::hex << c << std::dec - << ".\nLaTeX export will fail." - << endl; - else - return it->second.command; - } - return docstring(1, c); + if (encodable_.find(c) != encodable_.end()) + return docstring(1, c); + + // c cannot be encoded in this encoding + CharInfoMap::const_iterator const it = unicodesymbols.find(c); + if (it == unicodesymbols.end()) + throw EncodingException(c); + return it->second.command; +} + + +vector Encoding::symbolsList() const +{ + // assure the used encoding is properly initialized + init(); + + // first all encodable characters + vector symbols(encodable_.begin(), encodable_.end()); + // add those below start_encodable_ + for (char_type c = 0; c < start_encodable_; ++c) + symbols.push_back(c); + // now the ones from the unicodesymbols file + CharInfoMap::const_iterator const end = unicodesymbols.end(); + CharInfoMap::const_iterator it = unicodesymbols.begin(); + for (; it != end; ++it) + symbols.push_back(it->first); + return symbols; } @@ -348,10 +372,9 @@ void Encodings::validate(char_type c, LaTeXFeatures & features) } -bool Encodings::isComposeChar_hebrew(char_type c) +bool Encodings::isHebrewComposeChar(char_type c) { - return c <= 0x05c2 && c >= 0x05b0 && - c != 0x05be && c != 0x05c0; + return c <= 0x05c2 && c >= 0x05b0 && c != 0x05be && c != 0x05c0; } @@ -359,36 +382,30 @@ bool Encodings::isComposeChar_hebrew(char_type c) // they are hamza, alef_madda, alef_hamza, waw_hamza, alef_hamza_under, // alef, tah_marbota, dal, thal, rah, zai, wow, alef_maksoura -bool Encodings::is_arabic_special(char_type c) +bool Encodings::isArabicSpecialChar(char_type c) { - return (c >= 0x0621 && c <= 0x0625) || - c == 0x0627 || c == 0x0629 || - c == 0x062f || c == 0x0648 || - (c >= 0x0630 && c <= 0x0632) || - c == 0x0649 || c == 0x0698; + return (c >= 0x0621 && c <= 0x0625) || (c >= 0x0630 && c <= 0x0632) + || c == 0x0627 || c == 0x0629 || c == 0x062f || c == 0x0648 + || c == 0x0649 || c == 0x0698; } -bool Encodings::isComposeChar_arabic(char_type c) +bool Encodings::isArabicComposeChar(char_type c) { return c >= 0x064b && c <= 0x0652; } -bool Encodings::is_arabic(char_type c) +bool Encodings::isArabicChar(char_type c) { - return c >= arabic_start && c <= arabic_end && - arabic_table[c-arabic_start][0]; + return c >= arabic_start && c <= arabic_end + && arabic_table[c-arabic_start][0]; } -char_type Encodings::transformChar(char_type c, - Encodings::Letter_Form form) +char_type Encodings::transformChar(char_type c, Encodings::LetterForm form) { - if (!is_arabic(c)) - return c; - - return arabic_table[c-arabic_start][form]; + return isArabicChar(c) ? arabic_table[c-arabic_start][form] : c; } @@ -401,36 +418,34 @@ bool Encodings::isCombiningChar(char_type c) } -bool Encodings::isKnownLangChar(char_type c, string & preamble) +bool Encodings::isKnownScriptChar(char_type const c, string & preamble) { CharInfoMap::const_iterator const it = unicodesymbols.find(c); - if (it != unicodesymbols.end()) { - if (it->second.preamble != "textgreek" && - it->second.preamble != "textcyr") - return false; - if (preamble.empty()) { - preamble = it->second.preamble; - return true; - } - return it->second.preamble == preamble; + + if (it == unicodesymbols.end()) + return false; + + if (it->second.preamble != "textgreek" && it->second.preamble != "textcyr") + return false; + + if (preamble.empty()) { + preamble = it->second.preamble; + return true; } - return false; + return it->second.preamble == preamble; } -Encoding const * Encodings::getFromLyXName(string const & name) const +Encoding const * Encodings::fromLyXName(string const & name) const { - EncodingList::const_iterator it = encodinglist.find(name); - if (it != encodinglist.end()) - return &it->second; - else - return 0; + EncodingList::const_iterator const it = encodinglist.find(name); + return it != encodinglist.end() ? &it->second : 0; } -Encoding const * Encodings::getFromLaTeXName(string const & name) const +Encoding const * Encodings::fromLaTeXName(string const & name) const { - // We don't use std::find_if because it makes copies of the pairs in + // We don't use find_if because it makes copies of the pairs in // the map. // This linear search is OK since we don't have many encodings. // Users could even optimize it by putting the encodings they use @@ -459,35 +474,33 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) CharInfo info; string flags; - if (symbolslex.next(true)) { - std::istringstream is(symbolslex.getString()); - // reading symbol directly does not work if - // char_type == std::wchar_t. - boost::uint32_t tmp; - if(!(is >> std::hex >> tmp)) - break; - symbol = tmp; - } else + if (!symbolslex.next(true)) break; - if (symbolslex.next(true)) - info.command = symbolslex.getDocString(); - else + + istringstream is(symbolslex.getString()); + // reading symbol directly does not work if + // char_type == wchar_t. + boost::uint32_t tmp; + if(!(is >> hex >> tmp)) break; - if (symbolslex.next(true)) - info.preamble = symbolslex.getString(); - else + symbol = tmp; + + if (!symbolslex.next(true)) break; - if (symbolslex.next(true)) - flags = symbolslex.getString(); - else + info.command = symbolslex.getDocString(); + if (!symbolslex.next(true)) + break; + info.preamble = symbolslex.getString(); + if (!symbolslex.next(true)) break; + flags = symbolslex.getString(); info.combining = false; info.feature = false; info.force = false; while (!flags.empty()) { string flag; - flags = support::split(flags, flag, ','); + flags = split(flags, flag, ','); if (flag == "combining") info.combining = true; else if (flag == "force") @@ -495,18 +508,16 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) else lyxerr << "Ignoring unknown flag `" << flag << "' for symbol `0x" - << std::hex << symbol << std::dec + << hex << symbol << dec << "'." << endl; } if (!info.preamble.empty()) info.feature = info.preamble[0] != '\\'; - LYXERR(Debug::INFO) - << "Read unicode symbol " << symbol << " '" + LYXERR(Debug::INFO, "Read unicode symbol " << symbol << " '" << to_utf8(info.command) << "' '" << info.preamble - << "' " << info.combining << ' ' << info.feature - << endl; + << "' " << info.combining << ' ' << info.feature); unicodesymbols[symbol] = info; } @@ -541,11 +552,10 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) fixedwidth = true; else if (width == "variable") fixedwidth = false; - else { + else lex.printError("Encodings::read: " "Unknown width: `$$Token'"); - } - + lex.next(); string const p = lex.getString(); Encoding::Package package = Encoding::none; @@ -555,15 +565,14 @@ void Encodings::read(FileName const & encfile, FileName const & symbolsfile) package = Encoding::inputenc; else if (p == "CJK") package = Encoding::CJK; - else { + else lex.printError("Encodings::read: " "Unknown package: `$$Token'"); - } - - LYXERR(Debug::INFO) << "Reading encoding " << name << endl; + + LYXERR(Debug::INFO, "Reading encoding " << name); encodinglist[name] = Encoding(name, latexname, - iconvname, fixedwidth, - package); + iconvname, fixedwidth, package); + if (lex.lex() != et_end) lex.printError("Encodings::read: " "missing end");