4 * This file is part of LyX, the document processor.
5 * Licence details can be found in the file COPYING.
7 * \author Lars Gullik Bjønnes
8 * \author Jean-Marc Lasgouttes
10 * Full author contact details are available in file CREDITS.
16 #include "support/docstring.h"
17 #include "support/types.h"
25 namespace support { class FileName; }
27 class EncodingException : public std::exception {
29 EncodingException(char_type c);
30 virtual ~EncodingException() throw() {}
31 virtual const char * what() const throw();
33 char_type failed_char;
41 CharInfoCombining = 1,
43 CharInfoTextFeature = 2,
45 CharInfoMathFeature = 4,
49 CharInfoTextNoTermination = 16,
51 CharInfoMathNoTermination = 32,
53 CharInfoForceSelected = 64,
57 /// Information about a single UCS4 character
60 // relies on every unicodesymbols entry having at least one nonempty command
61 bool isUnicodeSymbol() const { return !(textcommand.empty() && mathcommand.empty()); }
62 /// LaTeX command (text mode) for this character
63 docstring textcommand;
64 /// LaTeX command (math mode) for this character
65 docstring mathcommand;
66 /// Needed LaTeX preamble (or feature) for text mode
67 std::string textpreamble;
68 /// Needed LaTeX preamble (or feature) for math mode
69 std::string mathpreamble;
70 /// Tells whether this is a combining character.
71 bool combining() const { return (flags & CharInfoCombining) != 0; }
72 /// Is \c textpreamble a feature known by LaTeXFeatures, or a raw LaTeX
74 bool textfeature() const { return (flags & CharInfoTextFeature) != 0; }
75 /// Is \c mathpreamble a feature known by LaTeXFeatures, or a raw LaTeX
77 bool mathfeature() const { return (flags & CharInfoMathFeature) != 0; }
78 /// Always force the LaTeX command, even if the encoding contains
80 bool force() const { return (flags & CharInfoForce) != 0; }
81 /// Must the LaTeX command be forced for some encodings?
82 bool forceselected() const { return (flags & CharInfoForceSelected) != 0; }
84 std::string tipashortcut;
85 /// No termination (such as {} or space) is needed after \c textcommand.
86 bool textnotermination() const { return (flags & CharInfoTextNoTermination) != 0; }
87 /// No termination (such as {} or space) is needed after \c mathcommand.
88 bool mathnotermination() const { return (flags & CharInfoMathNoTermination) != 0; }
97 /// Which LaTeX package handles this encoding?
104 /// Represent any of the above packages
105 static int const any;
109 Encoding(std::string const & n, std::string const & l,
110 std::string const & g, std::string const & i,
111 bool f, bool u, Package p);
115 std::string const & name() const { return name_; }
117 std::string const & latexName() const { return latexName_; }
119 std::string const & guiName() const { return guiName_; }
121 std::string const & iconvName() const { return iconvName_; }
123 bool hasFixedWidth() const { return fixedwidth_; }
125 bool unsafe() const { return unsafe_; }
126 /// \p c is representable in this encoding without a LaTeX macro
127 bool encodable(char_type c) const;
129 * Convert \p c to something that LaTeX can understand.
130 * This is either the character itself (if it is representable
131 * in this encoding), or a LaTeX macro.
132 * If the character is not representable in this encoding, but no
133 * LaTeX macro is known, a warning is given on lyxerr, and the
134 * character is returned.
135 * \return the converted character and a flag indicating whether
136 * the command needs to be terminated by {} or a space.
138 std::pair<docstring, bool> latexChar(char_type c) const;
140 * Convert \p input to something that LaTeX can understand.
141 * This is either the string itself (if it is representable
142 * in this encoding), or a LaTeX macro.
143 * If a character is not representable in this encoding, but no
144 * LaTeX macro is known, a warning is given on lyxerr, and the
145 * character is returned in the second string of the pair and
146 * omitted in the first.
147 * \p dryrun specifies whether the string is used within source
148 * preview (which yields a special warning).
150 std::pair<docstring, docstring> latexString(docstring const input,
151 bool dryrun = false) const;
152 /// Which LaTeX package handles this encoding?
153 Package package() const { return package_; }
154 /// A list of all characters usable in this encoding
155 std::vector<char_type> symbolsList() const;
158 * Do we have to output this character as LaTeX command in any case?
159 * This is true if the "force" flag is set.
160 * We need this if the inputencoding does not support a certain glyph.
162 bool isForced(char_type c) const;
166 std::string latexName_;
168 std::string guiName_;
170 std::string iconvName_;
171 /// Is this a fixed width encoding?
173 /// Is this encoding TeX unsafe, e.g. control characters like {, }
174 /// and \\ may appear in high bytes?
177 typedef std::set<char_type> CharSet;
178 /// Set of UCS4 characters that we can encode (for singlebyte
180 mutable CharSet encodable_;
181 /// Set of UCS4 characters that we can't encode
182 CharSet const * forced_;
183 /// All code points below this are encodable. This helps us to avoid
184 /// lookup of ASCII characters in encodable_ and gives about 1 sec
185 /// speedup on export of the Userguide.
186 mutable char_type start_encodable_;
187 /// Which LaTeX package handles this encoding?
190 * If this is true the stored information about the encoding covers
191 * all encodable characters. We set this to false initially so that
192 * we only need to query iconv for the actually used encodings.
193 * This is needed especially for the multibyte encodings: if we
194 * complete all encoding info on startup it takes 2-3 minutes.
196 mutable bool complete_;
202 typedef std::set<char_type> MathCommandSet;
204 typedef std::set<char_type> TextCommandSet;
206 typedef std::set<char_type> MathSymbolSet;
208 typedef std::map<std::string, Encoding> EncodingList;
209 /// iterator to iterate over all encodings.
210 /// We hide the fact that our encoding list is implemented as a map.
211 class const_iterator : public EncodingList::const_iterator {
212 typedef EncodingList::const_iterator base;
214 const_iterator() : base() {}
215 const_iterator(base const & b) : base(b) {}
216 Encoding const & operator*() const { return base::operator->()->second; }
217 Encoding const * operator->() const { return &base::operator->()->second; }
221 /// Read the encodings.
222 /// \param encfile encodings definition file
223 /// \param symbolsfile unicode->LaTeX mapping file
224 void read(support::FileName const & encfile,
225 support::FileName const & symbolsfile);
226 /// Get encoding from LyX name \p name
228 fromLyXName(std::string const & name, bool allowUnsafe = false) const;
229 /// Get encoding from LaTeX name \p name and package \p package
230 Encoding const * fromLaTeXName(std::string const & name,
231 int const & package = Encoding::any, bool allowUnsafe = false) const;
232 /// Get encoding from iconv name \p name and package \p package
233 Encoding const * fromIconvName(std::string const & name,
234 int const & package = Encoding::any, bool allowUnsafe = false) const;
237 const_iterator begin() const { return const_iterator(encodinglist.begin()); }
239 const_iterator end() const { return const_iterator(encodinglist.end()); }
253 static bool isHebrewComposeChar(char_type c);
255 static bool isArabicComposeChar(char_type c);
257 static bool isArabicSpecialChar(char_type c);
259 static bool isArabicChar(char_type c);
260 /// Accessor for the unicode information table.
261 static CharInfo const & unicodeCharInfo(char_type c);
263 static char_type transformChar(char_type c, LetterForm form);
264 /// Is this a combining char?
265 static bool isCombiningChar(char_type c);
266 /// Return the TIPA shortcut
267 static std::string const TIPAShortcut(char_type c);
269 * Is this a known char from some language?
270 * If \p preamble is empty and code point \p c is known to belong
271 * to a supported script, true is returned and \p preamble is set
272 * to the corresponding entry in the unicodesymbols file.
273 * If \p preamble is not empty, a check is made whether code point
274 * \p c is a known character matching the preamble entry.
276 static bool isKnownScriptChar(char_type const c, std::string & preamble);
278 * Do we have to display this character in italics when in mathmode?
279 * This is true if the "mathalpha" flag is set. We use this for
280 * letters and accented characters that are output as math commands.
282 static bool isMathAlpha(char_type c);
284 * Register \p c as a mathmode command.
286 static void addMathCmd(char_type c) { mathcmd.insert(c); }
288 * Register \p c as a textmode command.
290 static void addTextCmd(char_type c) { textcmd.insert(c); }
292 * Register \p c as a mathmode symbol.
294 static void addMathSym(char_type c) { mathsym.insert(c); }
296 * Tell whether \p c is registered as a mathmode command.
298 static bool isMathCmd(char_type c) { return mathcmd.find(c) != mathcmd.end(); }
300 * Tell whether \p c is registered as a textmode command.
302 static bool isTextCmd(char_type c) { return textcmd.find(c) != textcmd.end(); }
304 * Tell whether \p c is registered as a mathmode symbol.
306 static bool isMathSym(char_type c) { return mathsym.find(c) != mathsym.end(); }
308 * If \p c cannot be encoded in the given \p encoding, convert
309 * it to something that LaTeX can understand in mathmode.
310 * \p needsTermination indicates whether the command needs to be
311 * terminated by {} or a space.
312 * \return whether \p command is a mathmode command
314 static bool latexMathChar(char_type c, bool mathmode,
315 Encoding const * encoding, docstring & command,
316 bool & needsTermination);
318 * Convert the LaTeX command in \p cmd to the corresponding unicode
319 * point and set \p combining to true if it is a combining symbol.
320 * \p needsTermination indicates whether the command needs to be
321 * terminated by {} or a space.
323 static char_type fromLaTeXCommand(docstring const & cmd, int cmdtype,
324 bool & combining, bool & needsTermination,
325 std::set<std::string> * req = 0);
334 * Convert the LaTeX commands in \p cmd and \return a docstring
335 * of corresponding unicode points. The conversion stops at the
336 * first command which could not be converted, and the remaining
337 * unconverted commands are returned in \p rem.
338 * The \p cmdtype parameter can be used to limit recognized
339 * commands to math or text mode commands only.
340 * \p needsTermination indicates whether the command needs to be
341 * terminated by {} or a space.
343 static docstring fromLaTeXCommand(docstring const & cmd, int cmdtype,
344 bool & needsTermination, docstring & rem,
345 std::set<std::string> * req = 0);
349 EncodingList encodinglist;
351 static MathCommandSet mathcmd;
353 static TextCommandSet textcmd;
355 static MathSymbolSet mathsym;
358 extern Encodings encodings;