#include "Encoding.h"
-#include "debug.h"
#include "LaTeXFeatures.h"
#include "Lexer.h"
#include "LyXRC.h"
+#include "support/debug.h"
#include "support/FileName.h"
#include "support/lstrings.h"
#include "support/unicode.h"
-#include <sstream>
+#include <boost/cstdint.hpp>
-#ifndef CXX_GLOBAL_CSTD
-using std::strtol;
-#endif
-using std::endl;
-using std::string;
+#include <sstream>
+using namespace std;
+using namespace lyx::support;
namespace lyx {
-using support::FileName;
-
Encodings encodings;
namespace {
};
-typedef std::map<char_type, CharInfo> CharInfoMap;
+typedef map<char_type, CharInfo> CharInfoMap;
CharInfoMap unicodesymbols;
} // namespace anon
+// Build the exception object for character c, the one for which no LaTeX
+// command could be found (see what()).
+// c is remembered in failed_char; par_id and pos both start out as 0.
+// NOTE(review): presumably par_id/pos are filled in later by code that
+// knows where the character sits in the document -- confirm at the
+// catch sites.
+EncodingException::EncodingException(char_type c)
+ : failed_char(c), par_id(0), pos(0)
+{
+}
+
+
+// Fixed, character-independent description of the failure. The offending
+// character is not part of this text; it is stored in failed_char.
+const char * EncodingException::what() const throw()
+{
+	char const * const message =
+		"Could not find LaTeX command for a character";
+	return message;
+}
+
+
Encoding::Encoding(string const & n, string const & l, string const & i,
bool f, Encoding::Package p)
: Name_(n), LatexName_(l), iconvName_(i), fixedwidth_(f), package_(p)
void Encoding::init() const
{
+ if (complete_)
+ return;
+
start_encodable_ = 0;
// temporarily switch off lyxerr, since we will generate iconv errors
lyxerr.disable();
// We do not need to check all UCS4 code points, it is enough
// if we check all 256 code points of this encoding.
for (unsigned short j = 0; j < 256; ++j) {
- char const c = j;
- std::vector<char_type> const ucs4 = eightbit_to_ucs4(&c, 1, iconvName_);
- if (ucs4.size() == 1) {
- char_type const c = ucs4[0];
- CharInfoMap::const_iterator const it = unicodesymbols.find(c);
- if (it == unicodesymbols.end() || !it->second.force)
- encodable_.insert(c);
- }
+ char const c = char(j);
+ vector<char_type> const ucs4 = eightbit_to_ucs4(&c, 1, iconvName_);
+ if (ucs4.size() != 1)
+ continue;
+ char_type const uc = ucs4[0];
+ CharInfoMap::const_iterator const it = unicodesymbols.find(uc);
+ if (it == unicodesymbols.end() || !it->second.force)
+ encodable_.insert(uc);
}
} else {
// We do not know how many code points this encoding has, and
// therefore we need to check all UCS4 code points.
// This is expensive!
for (char_type c = 0; c < max_ucs4; ++c) {
- std::vector<char> const eightbit = ucs4_to_eightbit(&c, 1, iconvName_);
+ vector<char> const eightbit = ucs4_to_eightbit(&c, 1, iconvName_);
if (!eightbit.empty()) {
CharInfoMap::const_iterator const it = unicodesymbols.find(c);
if (it == unicodesymbols.end() || !it->second.force)
docstring const Encoding::latexChar(char_type c) const
{
// assure the used encoding is properly initialized
- if (!complete_)
- init();
- BOOST_ASSERT(complete_);
+ init();
if (c < start_encodable_)
return docstring(1, c);
- if (encodable_.find(c) == encodable_.end()) {
- // c cannot be encoded in this encoding
- CharInfoMap::const_iterator const it = unicodesymbols.find(c);
- if (it == unicodesymbols.end())
- lyxerr << "Could not find LaTeX command for character 0x"
- << std::hex << c << std::dec
- << ".\nLaTeX export will fail."
- << endl;
- else
- return it->second.command;
- }
- return docstring(1, c);
+ if (encodable_.find(c) != encodable_.end())
+ return docstring(1, c);
+
+ // c cannot be encoded in this encoding
+ CharInfoMap::const_iterator const it = unicodesymbols.find(c);
+ if (it == unicodesymbols.end())
+ throw EncodingException(c);
+ else
+ return it->second.command;
+}
+
+
+// Build the set of code points made up of encodable_, every value below
+// start_encodable_, and every key of unicodesymbols.
+set<char_type> Encoding::getSymbolsList() const
+{
+	// encodable_ and start_encodable_ are filled in by init()
+	init();
+
+	// start from the characters this encoding can represent directly ...
+	CharSet symbols = encodable_;
+	// ... add the code points below start_encodable_ ...
+	char_type c = 0;
+	while (c < start_encodable_) {
+		symbols.insert(c);
+		++c;
+	}
+	// ... and finally everything listed in the unicodesymbols file
+	for (CharInfoMap::const_iterator it = unicodesymbols.begin(),
+	     last = unicodesymbols.end(); it != last; ++it)
+		symbols.insert(it->first);
+	return symbols;
}
bool Encodings::isComposeChar_hebrew(char_type c)
{
- return c <= 0x05c2 && c >= 0x05b0 &&
- c != 0x05be && c != 0x05c0;
+	// only code points in [0x05b0, 0x05c2] qualify ...
+	if (c < 0x05b0 || c > 0x05c2)
+		return false;
+	// ... and within that range 0x05be and 0x05c0 are excluded
+	return c != 0x05be && c != 0x05c0;
}
bool Encodings::is_arabic_special(char_type c)
{
- return (c >= 0x0621 && c <= 0x0625) ||
- c == 0x0627 || c == 0x0629 ||
- c == 0x062f || c == 0x0648 ||
- (c >= 0x0630 && c <= 0x0632) ||
- c == 0x0649 || c == 0x0698;
+	// the two contiguous ranges first ...
+	if ((c >= 0x0621 && c <= 0x0625) || (c >= 0x0630 && c <= 0x0632))
+		return true;
+	// ... then the isolated code points
+	switch (c) {
+	case 0x0627:
+	case 0x0629:
+	case 0x062f:
+	case 0x0648:
+	case 0x0649:
+	case 0x0698:
+		return true;
+	default:
+		return false;
+	}
}
bool Encodings::is_arabic(char_type c)
{
- return c >= arabic_start && c <= arabic_end &&
- arabic_table[c-arabic_start][0];
+	// reject anything outside [arabic_start, arabic_end] before indexing
+	// into arabic_table
+	if (c < arabic_start || c > arabic_end)
+		return false;
+	// inside the range, the answer is whether the first column of the
+	// table row for c is non-zero
+	return arabic_table[c - arabic_start][0] != 0;
}
}
+// Tell whether c has a unicodesymbols entry whose preamble is "textgreek"
+// or "textcyr" and, if the caller already passes a non-empty preamble,
+// whether it is that very same one.
+// When preamble is empty on entry and c qualifies, preamble is set to the
+// preamble of c. In every other case preamble is left untouched.
+bool Encodings::isKnownScriptChar(char_type const c, string & preamble)
+{
+	CharInfoMap::const_iterator const entry = unicodesymbols.find(c);
+	if (entry == unicodesymbols.end())
+		return false;
+
+	string const & script = entry->second.preamble;
+	if (script != "textgreek" && script != "textcyr")
+		return false;
+
+	// a script has already been chosen: c must belong to the same one
+	if (!preamble.empty())
+		return script == preamble;
+
+	// first character seen: remember its script for the caller
+	preamble = script;
+	return true;
+}
+
+
Encoding const * Encodings::getFromLyXName(string const & name) const
{
- EncodingList::const_iterator it = encodinglist.find(name);
- if (it != encodinglist.end())
- return &it->second;
- else
- return 0;
+	// encodinglist is keyed by name; an unknown name yields a null pointer
+	EncodingList::const_iterator const found = encodinglist.find(name);
+	if (found == encodinglist.end())
+		return 0;
+	return &found->second;
}
Encoding const * Encodings::getFromLaTeXName(string const & name) const
{
- // We don't use std::find_if because it makes copies of the pairs in
+ // We don't use find_if because it makes copies of the pairs in
// the map.
// This linear search is OK since we don't have many encodings.
// Users could even optimize it by putting the encodings they use
CharInfo info;
string flags;
- if (symbolslex.next(true)) {
- std::istringstream is(symbolslex.getString());
- // reading symbol directly does not work if
- // char_type == std::wchar_t.
- boost::uint32_t tmp;
- if(!(is >> std::hex >> tmp))
- break;
- symbol = tmp;
- } else
+ if (!symbolslex.next(true))
break;
- if (symbolslex.next(true))
- info.command = symbolslex.getDocString();
- else
+
+ istringstream is(symbolslex.getString());
+ // reading symbol directly does not work if
+ // char_type == wchar_t.
+ boost::uint32_t tmp;
+ if(!(is >> hex >> tmp))
break;
- if (symbolslex.next(true))
- info.preamble = symbolslex.getString();
- else
+ symbol = tmp;
+
+ if (!symbolslex.next(true))
break;
- if (symbolslex.next(true))
- flags = symbolslex.getString();
- else
+ info.command = symbolslex.getDocString();
+ if (!symbolslex.next(true))
+ break;
+ info.preamble = symbolslex.getString();
+ if (!symbolslex.next(true))
break;
+ flags = symbolslex.getString();
info.combining = false;
info.feature = false;
info.force = false;
while (!flags.empty()) {
string flag;
- flags = support::split(flags, flag, ',');
+ flags = split(flags, flag, ',');
if (flag == "combining")
info.combining = true;
else if (flag == "force")
else
lyxerr << "Ignoring unknown flag `" << flag
<< "' for symbol `0x"
- << std::hex << symbol << std::dec
+ << hex << symbol << dec
<< "'." << endl;
}
if (!info.preamble.empty())
info.feature = info.preamble[0] != '\\';
- LYXERR(Debug::INFO)
- << "Read unicode symbol " << symbol << " '"
+ LYXERR(Debug::INFO, "Read unicode symbol " << symbol << " '"
<< to_utf8(info.command) << "' '" << info.preamble
- << "' " << info.combining << ' ' << info.feature
- << endl;
+ << "' " << info.combining << ' ' << info.feature);
unicodesymbols[symbol] = info;
}
"Unknown package: `$$Token'");
}
- LYXERR(Debug::INFO) << "Reading encoding " << name << endl;
+ LYXERR(Debug::INFO, "Reading encoding " << name);
encodinglist[name] = Encoding(name, latexname,
iconvname, fixedwidth,
package);