Convert to unicode.

[lyx.git] / src / mathed / MathParser.C
diff --git a/src/mathed/MathParser.C b/src/mathed/MathParser.C

index 826d61529ec680da64093d3e2c68862a57830c65..cb208ffca3c78798280fe2cb0b0a89a2f54185af 100644 (file)
--- a/src/mathed/MathParser.C
+++ b/src/mathed/MathParser.C
@@ -39,6 +39,7 @@ following hack as starting point to write some macros:
  #include <config.h>
  
  #include "MathParser.h"
+
  #include "InsetMathArray.h"
  #include "InsetMathBig.h"
  #include "InsetMathBrace.h"
@@ -47,20 +48,19 @@ following hack as starting point to write some macros:
  #include "InsetMathComment.h"
  #include "InsetMathDelim.h"
  #include "InsetMathEnv.h"
-#include "MathFactory.h"
  #include "InsetMathKern.h"
  #include "InsetMathMacro.h"
-#include "MathMacroArgument.h"
-#include "MathMacroTemplate.h"
  #include "InsetMathPar.h"
+#include "InsetMathRef.h"
  #include "InsetMathRoot.h"
  #include "InsetMathScript.h"
+#include "InsetMathSplit.h"
  #include "InsetMathSqrt.h"
-#include "MathSupport.h"
  #include "InsetMathTabular.h"
-
-//#include "insets/insetref.h"
-#include "InsetMathRef.h"
+#include "MathMacroTemplate.h"
+#include "MathFactory.h"
+#include "MathMacroArgument.h"
+#include "MathSupport.h"
  
  #include "lyxlex.h"
  #include "debug.h"
@@ -69,6 +69,9 @@ following hack as starting point to write some macros:
  
  #include <sstream>
  
+
+namespace lyx {
+
  using std::endl;
  using std::fill;
  
@@ -85,7 +88,7 @@ using std::vector;
  
  namespace {
  
-InsetMath::mode_type asMode(InsetMath::mode_type oldmode, string const & str)
+InsetMath::mode_type asMode(InsetMath::mode_type oldmode, docstring const & str)
  {
         //lyxerr << "handling mode: '" << str << "'" << endl;
         if (str == "mathmode")
@@ -96,9 +99,9 @@ InsetMath::mode_type asMode(InsetMath::mode_type oldmode, string const & str)
  }
  
  
-bool stared(string const & s)
+bool stared(docstring const & s)
  {
-       string::size_type const n = s.size();
+       size_t const n = s.size();
         return n && s[n - 1] == '*';
  }
  
@@ -109,7 +112,7 @@ bool stared(string const & s)
   * environments like "equation" that have a fixed number of rows.
   */
  bool addRow(InsetMathGrid & grid, InsetMathGrid::row_type & cellrow,
-           string const & vskip)
+           docstring const & vskip)
  {
         ++cellrow;
         if (cellrow == grid.nrows()) {
@@ -125,12 +128,12 @@ bool addRow(InsetMathGrid & grid, InsetMathGrid::row_type & cellrow,
                         --cellrow;
                         lyxerr << "ignoring extra row";
                         if (!vskip.empty())
-                               lyxerr << " with extra space " << vskip;
+                               lyxerr << " with extra space " << to_utf8(vskip);
                         lyxerr << '.' << endl;
                         return false;
                 }
         }
-       grid.vcrskip(LyXLength(vskip), cellrow - 1);
+       grid.vcrskip(LyXLength(to_utf8(vskip)), cellrow - 1);
         return true;
  }
  
@@ -210,11 +213,19 @@ enum CatCode {
         catInvalid     // 15   <delete>
  };
  
-CatCode theCatcode[256];
+CatCode theCatcode[128];
  
  
-inline CatCode catcode(unsigned char c)
+inline CatCode catcode(lyx::char_type c)
  {
+       /* The fact that we use Unicode internally does not change Knuth's TeX
+       engine. It is still 7-bit only, not even Latin-1 or something like that.
+       Therefore, the catcode table only needs to have 128 entries.
+       Everything outside that range is catOther.
+       */
+       if (c >= 128)
+               return catOther;
+
         return theCatcode[c];
  }
  
@@ -245,26 +256,26 @@ public:
         ///
         Token() : cs_(), char_(0), cat_(catIgnore) {}
         ///
-       Token(char c, CatCode cat) : cs_(), char_(c), cat_(cat) {}
+       Token(char_type c, CatCode cat) : cs_(), char_(c), cat_(cat) {}
         ///
-       Token(string const & cs) : cs_(cs), char_(0), cat_(catIgnore) {}
+       Token(docstring const & cs) : cs_(cs), char_(0), cat_(catIgnore) {}
  
         ///
-       string const & cs() const { return cs_; }
+       docstring const & cs() const { return cs_; }
         ///
         CatCode cat() const { return cat_; }
         ///
-       char character() const { return char_; }
+       char_type character() const { return char_; }
         ///
-       string asString() const { return cs_.size() ? cs_ : string(1, char_); }
+       docstring asString() const { return cs_.size() ? cs_ : docstring(1, char_); }
         ///
-       string asInput() const { return cs_.size() ? '\\' + cs_ : string(1, char_); }
+       docstring asInput() const { return cs_.size() ? '\\' + cs_ : docstring(1, char_); }
  
  private:
         ///
-       string cs_;
+       docstring cs_;
         ///
-       char char_;
+       char_type char_;
         ///
         CatCode cat_;
  };
@@ -309,19 +320,20 @@ private:
         ///
         void parse2(MathAtom & at, unsigned flags, mode_type mode, bool numbered);
         /// get arg delimited by 'left' and 'right'
-       string getArg(char left, char right);
+       docstring getArg(char_type left, char_type right);
         ///
-       char getChar();
+       char_type getChar();
         ///
         void error(string const & msg);
+       void error(docstring const & msg) { error(to_utf8(msg)); }
         /// dump contents to screen
         void dump() const;
         ///
         void tokenize(istream & is);
         ///
-       void tokenize(string const & s);
+       void tokenize(docstring const & s);
         ///
-       void skipSpaceTokens(istream & is, char c);
+       void skipSpaceTokens(idocstream & is, char_type c);
         ///
         void push_back(Token const & t);
         ///
@@ -335,13 +347,13 @@ private:
         /// skips spaces if any
         void skipSpaces();
         ///
-       void lex(string const & s);
+       void lex(docstring const & s);
         ///
         bool good() const;
         ///
-       string parse_verbatim_item();
+       docstring parse_verbatim_item();
         ///
-       string parse_verbatim_option();
+       docstring parse_verbatim_option();
  
         ///
         int lineno_;
@@ -350,7 +362,7 @@ private:
         ///
         unsigned pos_;
         /// Stack of active environments
-       vector<string> environments_;
+       vector<docstring> environments_;
  };
  
  
@@ -422,7 +434,7 @@ bool Parser::good() const
  }
  
  
-char Parser::getChar()
+char_type Parser::getChar()
  {
         if (!good())
                 error("The input stream is not well...");
@@ -430,12 +442,12 @@ char Parser::getChar()
  }
  
  
-string Parser::getArg(char left, char right)
+docstring Parser::getArg(char_type left, char_type right)
  {
         skipSpaces();
  
-       string result;
-       char c = getChar();
+       docstring result;
+       char_type c = getChar();
  
         if (c != left)
                 putback();
@@ -447,7 +459,7 @@ string Parser::getArg(char left, char right)
  }
  
  
-void Parser::skipSpaceTokens(istream & is, char c)
+void Parser::skipSpaceTokens(idocstream & is, char_type c)
  {
         // skip trailing spaces
         while (catcode(c) == catSpace || catcode(c) == catNewline)
@@ -462,7 +474,7 @@ void Parser::tokenize(istream & is)
  {
         // eat everything up to the next \end_inset or end of stream
         // and store it in s for further tokenization
-       string s;
+       std::string s;
         char c;
         while (is.get(c)) {
                 s += c;
@@ -476,15 +488,15 @@ void Parser::tokenize(istream & is)
                 is.unget();
  
         // tokenize buffer
-       tokenize(s);
+       tokenize(from_utf8(s));
  }
  
  
-void Parser::tokenize(string const & buffer)
+void Parser::tokenize(docstring const & buffer)
  {
-       istringstream is(buffer, ios::in | ios::binary);
+       idocstringstream is(buffer, ios::in | ios::binary);
  
-       char c;
+       char_type c;
         while (is.get(c)) {
                 //lyxerr << "reading c: " << c << endl;
  
@@ -515,7 +527,7 @@ void Parser::tokenize(string const & buffer)
                                 if (!is) {
                                         error("unexpected end of input");
                                 } else {
-                                       string s(1, c);
+                                       docstring s(1, c);
                                         if (catcode(c) == catLetter) {
                                                 // collect letters
                                                 while (is.get(c) && catcode(c) == catLetter)
@@ -590,10 +602,10 @@ bool Parser::parse(MathAtom & at)
  }
  
  
-string Parser::parse_verbatim_option()
+docstring Parser::parse_verbatim_option()
  {
         skipSpaces();
-       string res;
+       docstring res;
         if (nextToken().character() == '[') {
                 Token t = getToken();
                 for (Token t = getToken(); t.character() != ']' && good(); t = getToken()) {
@@ -608,10 +620,10 @@ string Parser::parse_verbatim_option()
  }
  
  
-string Parser::parse_verbatim_item()
+docstring Parser::parse_verbatim_item()
  {
         skipSpaces();
-       string res;
+       docstring res;
         if (nextToken().cat() == catBegin) {
                 Token t = getToken();
                 for (Token t = getToken(); t.cat() != catEnd && good(); t = getToken()) {
@@ -812,9 +824,13 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
                                 cell->back() = MathAtom(new InsetMathScript(cell->back(), up));
                         InsetMathScript * p = cell->back().nucleus()->asScriptInset();
                         // special handling of {}-bases
+                       // Test for empty brace inset, otherwise \xxx{\vec{H}}_{0}
+                       // where \xxx is an unknown command gets misparsed to
+                       // \xxx\vec{H}_{0}, and that is invalid LaTeX.
                         // is this always correct?
-                       if (p->nuc().size() == 1 
-                           && p->nuc().back()->asBraceInset())
+                       if (p->nuc().size() == 1 &&
+                           p->nuc().back()->asBraceInset() &&
+                           p->nuc().back()->asBraceInset()->cell(0).empty())
                                 p->nuc() = p->nuc().back()->asNestInset()->cell(0);
                         parse(p->cell(p->idxOfScript(up)), FLAG_ITEM, mode);
                         if (limits) {
@@ -832,7 +848,7 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
                         cell->push_back(MathAtom(new InsetMathChar(t.character())));
  
                 else if (t.cat() == catComment) {
-                       string s;
+                       docstring s;
                         while (good()) {
                                 Token const & t = getToken();
                                 if (t.cat() == catNewline)
@@ -856,15 +872,15 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
                         t.cs() == "newcommand" ||
                         t.cs() == "renewcommand")
                 {
-                       string const type = t.cs();
-                       string name;
+                       docstring const type = t.cs();
+                       docstring name;
                         int nargs = 0;
                         if (t.cs() == "def") {
                                 // get name
                                 name = getToken().cs();
  
                                 // read parameter
-                               string pars;
+                               docstring pars;
                                 while (good() && nextToken().cat() != catBegin) {
                                         pars += getToken().cs();
                                         ++nargs;
@@ -886,7 +902,7 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
                                         return;
                                 }
  
-                               string const arg  = getArg('[', ']');
+                               docstring const arg  = getArg('[', ']');
                                 if (!arg.empty())
                                         nargs = convert<int>(arg);
  
@@ -930,7 +946,7 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
                 else if (t.cs() == "end") {
                         if (flags & FLAG_END) {
                                 // eat environment name
-                               string const name = getArg('{', '}');
+                               docstring const name = getArg('{', '}');
                                 if (environments_.empty())
                                         error("'found \\end{" + name +
                                               "}' without matching '\\begin{" +
@@ -1065,12 +1081,12 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
                         // \| and \Vert are equivalent, and InsetMathDelim
                         // can't handle \|
                         // FIXME: fix this in InsetMathDelim itself!
-                       string const l = tl.cs() == "|" ? "Vert" : tl.asString();
+                       docstring const l = tl.cs() == "|" ? from_ascii("Vert") : tl.asString();
                         MathArray ar;
                         parse(ar, FLAG_RIGHT, mode);
                         skipSpaces();
                         Token const & tr = getToken();
-                       string const r = tr.cs() == "|" ? "Vert" : tr.asString();
+                       docstring const r = tr.cs() == "|" ? from_ascii("Vert") : tr.asString();
                         cell->push_back(MathAtom(new InsetMathDelim(l, r, ar)));
                 }
  
@@ -1083,33 +1099,33 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
                 }
  
                 else if (t.cs() == "begin") {
-                       string const name = getArg('{', '}');
+                       docstring const name = getArg('{', '}');
                         environments_.push_back(name);
  
                         if (name == "array" || name == "subarray") {
-                               string const valign = parse_verbatim_option() + 'c';
-                               string const halign = parse_verbatim_item();
-                               cell->push_back(MathAtom(new InsetMathArray(name, valign[0], halign)));
+                               docstring const valign = parse_verbatim_option() + 'c';
+                               docstring const halign = parse_verbatim_item();
+                               cell->push_back(MathAtom(new InsetMathArray(name, (char)valign[0], halign)));
                                 parse2(cell->back(), FLAG_END, mode, false);
                         }
  
                         else if (name == "tabular") {
-                               string const valign = parse_verbatim_option() + 'c';
-                               string const halign = parse_verbatim_item();
-                               cell->push_back(MathAtom(new InsetMathTabular(name, valign[0], halign)));
+                               docstring const valign = parse_verbatim_option() + 'c';
+                               docstring const halign = parse_verbatim_item();
+                               cell->push_back(MathAtom(new InsetMathTabular(name, (char)valign[0], halign)));
                                 parse2(cell->back(), FLAG_END, InsetMath::TEXT_MODE, false);
                         }
  
-                       else if (name == "split" || name == "cases" ||
-                                name == "gathered" || name == "aligned") {
+                       else if (name == "split" || name == "cases") {
                                 cell->push_back(createInsetMath(name));
                                 parse2(cell->back(), FLAG_END, mode, false);
                         }
  
                         else if (name == "alignedat") {
+                               docstring const valign = parse_verbatim_option() + 'c';
                                 // ignore this for a while
                                 getArg('{', '}');
-                               cell->push_back(createInsetMath(name));
+                               cell->push_back(MathAtom(new InsetMathSplit(name, (char)valign[0])));
                                 parse2(cell->back(), FLAG_END, mode, false);
                         }
  
@@ -1174,6 +1190,18 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
                                 if (l->inset == "matrix") {
                                         cell->push_back(createInsetMath(name));
                                         parse2(cell->back(), FLAG_END, mode, false);
+                               } else if (l->inset == "split") {
+                                       docstring const valign = parse_verbatim_option() + 'c';
+                                       cell->push_back(MathAtom(new InsetMathSplit(name, (char)valign[0])));
+                                       parse2(cell->back(), FLAG_END, mode, false);
+                               } else {
+                                       dump();
+                                       lyxerr << "found math environment `" << name
+                                              << "' in symbols file with unsupported inset `"
+                                              << l->inset << "'." << endl;
+                                       // create generic environment inset
+                                       cell->push_back(MathAtom(new InsetMathEnv(name)));
+                                       parse(cell->back().nucleus()->cell(0), FLAG_ITEM, mode);
                                 }
                         }
  
@@ -1190,7 +1218,7 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
  #ifdef WITH_WARNINGS
  #warning A hack...
  #endif
-                       string s;
+                       docstring s;
                         while (true) {
                                 Token const & t = getToken();
                                 if (!good()) {
@@ -1198,7 +1226,7 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
                                         break;
                                 }
                                 s += t.character();
-                               if (isValidLength(s))
+                               if (isValidLength(to_utf8(s)))
                                         break;
                         }
                         cell->push_back(MathAtom(new InsetMathKern(s)));
@@ -1206,7 +1234,7 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
  
                 else if (t.cs() == "label") {
                         // FIXME: This is swallowed in inline formulas
-                       string label = parse_verbatim_item();
+                       docstring label = parse_verbatim_item();
                         MathArray ar;
                         asArray(label, ar);
                         if (grid.asHullInset()) {
@@ -1227,14 +1255,14 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
                 }
  
                 else if (t.cs() == "color") {
-                       string const color = parse_verbatim_item();
+                       docstring const color = parse_verbatim_item();
                         cell->push_back(MathAtom(new InsetMathColor(true, color)));
                         parse(cell->back().nucleus()->cell(0), flags, mode);
                         return;
                 }
  
                 else if (t.cs() == "textcolor") {
-                       string const color = parse_verbatim_item();
+                       docstring const color = parse_verbatim_item();
                         cell->push_back(MathAtom(new InsetMathColor(false, color)));
                         parse(cell->back().nucleus()->cell(0), FLAG_ITEM, InsetMath::TEXT_MODE);
                 }
@@ -1251,7 +1279,10 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
                 }
  
                 else if (t.cs() == "xymatrix") {
-                       cell->push_back(createInsetMath(t.cs()));
+                       odocstringstream os;
+                       while (good() && nextToken().cat() != catBegin)
+                               os << getToken().asInput();
+                       cell->push_back(createInsetMath(t.cs() + os.str()));
                         parse2(cell->back(), FLAG_ITEM, mode, false);
                 }
  
@@ -1302,14 +1333,13 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
                         if (l) {
                                 if (l->inset == "big") {
                                         skipSpaces();
-                                       string const delim = getToken().asInput();
+                                       docstring const delim = getToken().asInput();
                                         if (InsetMathBig::isBigInsetDelim(delim))
                                                 cell->push_back(MathAtom(
                                                         new InsetMathBig(t.cs(), delim)));
                                         else {
                                                 cell->push_back(createInsetMath(t.cs()));
-                                               cell->push_back(createInsetMath(
-                                                               delim.substr(1)));
+                                               putback();
                                         }
                                 }
  
@@ -1385,9 +1415,9 @@ void Parser::parse1(InsetMathGrid & grid, unsigned flags,
  } // anonymous namespace
  
  
-void mathed_parse_cell(MathArray & ar, string const & str)
+void mathed_parse_cell(MathArray & ar, docstring const & str)
  {
-       istringstream is(str);
+       istringstream is(to_utf8(str));
         mathed_parse_cell(ar, is);
  }
  
@@ -1426,7 +1456,7 @@ void mathed_parse_normal(InsetMathGrid & grid, string const & str)
  
  void initParser()
  {
-       fill(theCatcode, theCatcode + 256, catOther);
+       fill(theCatcode, theCatcode + 128, catOther);
         fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
         fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
  
@@ -1446,3 +1476,6 @@ void initParser()
         theCatcode[int('~')]  = catActive;
         theCatcode[int('%')]  = catComment;
  }
+
+
+} // namespace lyx