First serious step for utf8 file format. tex2lyx is now able to

author Jean-Marc Lasgouttes <lasgouttes@lyx.org>

Sun, 16 Nov 2008 17:02:00 +0000 (17:02 +0000)

committer Jean-Marc Lasgouttes <lasgouttes@lyx.org>

Sun, 16 Nov 2008 17:02:00 +0000 (17:02 +0000)
author Jean-Marc Lasgouttes <lasgouttes@lyx.org>
Sun, 16 Nov 2008 17:02:00 +0000 (17:02 +0000)
committer Jean-Marc Lasgouttes <lasgouttes@lyx.org>
Sun, 16 Nov 2008 17:02:00 +0000 (17:02 +0000)
diff --git a/src/tex2lyx/Parser.cpp b/src/tex2lyx/Parser.cpp

index b386d3c79ec9867c190ae6d28b4a845cb0d3fab7..83f4c479805e7aac7f16ae1f538a87221ed1d945 100644 (file)
--- a/src/tex2lyx/Parser.cpp
+++ b/src/tex2lyx/Parser.cpp
@@ -13,7 +13,6 @@
  #include "Parser.h"
  
  #include <iostream>
  #include "Parser.h"
  
  #include <iostream>
-#include <sstream>
  
  using namespace std;
  
  
  using namespace std;
  
@@ -54,13 +53,12 @@ void catInit()
         theCatcode[int('@')]  = catLetter;
  }
  
         theCatcode[int('@')]  = catLetter;
  }
  
-
  /*!
   * Translate a line ending to '\n'.
   * \p c must have catcode catNewline, and it must be the last character read
   * from \p is.
   */
  /*!
   * Translate a line ending to '\n'.
   * \p c must have catcode catNewline, and it must be the last character read
   * from \p is.
   */
-char getNewline(istream & is, char c)
+char getNewline(idocstream & is, char c)
  {
         // we have to handle 3 different line endings:
         // - UNIX (\n)
  {
         // we have to handle 3 different line endings:
         // - UNIX (\n)
@@ -68,9 +66,10 @@ char getNewline(istream & is, char c)
         // - DOS  (\r\n)
         if (c == '\r') {
                 // MAC or DOS
         // - DOS  (\r\n)
         if (c == '\r') {
                 // MAC or DOS
-               if (is.get(c) && c != '\n') {
+               char_type wc;
+               if (is.get(wc) && wc != '\n') {
                         // MAC
                         // MAC
-                       is.putback(c);
+                       is.putback(wc);
                 }
                 return '\n';
         }
                 }
                 return '\n';
         }
@@ -78,18 +77,14 @@ char getNewline(istream & is, char c)
         return c;
  }
  
         return c;
  }
  
-}
-
-
-//
-// catcodes
-//
-
-CatCode catcode(unsigned char c)
+CatCode catcode(char_type c)
  {
  {
-       return theCatcode[c];
+       if (c < 256)
+               return theCatcode[(unsigned char)c];
+       return catOther;
  }
  
  }
  
+}
  
  
  //
  
  
  //
@@ -105,18 +100,18 @@ ostream & operator<<(ostream & os, Token const & t)
         else if (t.cat() == catEscape)
                 os << '\\' << t.cs() << ' ';
         else if (t.cat() == catLetter)
         else if (t.cat() == catEscape)
                 os << '\\' << t.cs() << ' ';
         else if (t.cat() == catLetter)
-               os << t.character();
+               os << t.cs();
         else if (t.cat() == catNewline)
                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
         else
         else if (t.cat() == catNewline)
                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
         else
-               os << '[' << t.character() << ',' << t.cat() << ']';
+               os << '[' << t.cs() << ',' << t.cat() << ']';
         return os;
  }
  
  
  string Token::asString() const
  {
         return os;
  }
  
  
  string Token::asString() const
  {
-       return cs_.size() ? cs_ : string(1, char_);
+       return cs_;
  }
  
  
  }
  
  
@@ -124,9 +119,9 @@ string Token::asInput() const
  {
         if (cat_ == catComment)
                 return '%' + cs_ + '\n';
  {
         if (cat_ == catComment)
                 return '%' + cs_ + '\n';
-       if (cat_ == catSpace || cat_ == catNewline)
-               return cs_;
-       return char_ ? string(1, char_) : '\\' + cs_;
+       if (cat_ == catEscape)
+               return '\\' + cs_;
+       return cs_;
  }
  
  
  }
  
  
@@ -135,14 +130,15 @@ string Token::asInput() const
  //
  
  
  //
  
  
-Parser::Parser(istream & is)
+Parser::Parser(idocstream & is)
         : lineno_(0), pos_(0), iss_(0), is_(is)
  {
  }
  
  
  Parser::Parser(string const & s)
         : lineno_(0), pos_(0), iss_(0), is_(is)
  {
  }
  
  
  Parser::Parser(string const & s)
-       : lineno_(0), pos_(0), iss_(new istringstream(s)), is_(*iss_)
+       : lineno_(0), pos_(0), 
+         iss_(new idocstringstream(from_utf8(s))), is_(*iss_)
  {
  }
  
  {
  }
  
@@ -267,7 +263,7 @@ char Parser::getChar()
  {
         if (!good())
                 error("The input stream is not well...");
  {
         if (!good())
                 error("The input stream is not well...");
-       return tokens_[pos_++].character();
+       return get_token().character();
  }
  
  
  }
  
  
@@ -365,14 +361,13 @@ string const Parser::verbatimEnvironment(string const & name)
  void Parser::tokenize_one()
  {
         catInit();
  void Parser::tokenize_one()
  {
         catInit();
-       char c;
+       char_type c;
         if (!is_.get(c)) 
                 return;
         if (!is_.get(c)) 
                 return;
-       //cerr << "reading c: " << c << "\n";
  
         switch (catcode(c)) {
         case catSpace: {
  
         switch (catcode(c)) {
         case catSpace: {
-               string s(1, c);
+               docstring s(1, c);
                 while (is_.get(c) && catcode(c) == catSpace)
                         s += c;
                 if (catcode(c) != catSpace)
                 while (is_.get(c) && catcode(c) == catSpace)
                         s += c;
                 if (catcode(c) != catSpace)
@@ -383,7 +378,7 @@ void Parser::tokenize_one()
                 
         case catNewline: {
                 ++lineno_;
                 
         case catNewline: {
                 ++lineno_;
-               string s(1, getNewline(is_, c));
+               docstring s(1, getNewline(is_, c));
                 while (is_.get(c) && catcode(c) == catNewline) {
                         ++lineno_;
                         s += getNewline(is_, c);
                 while (is_.get(c) && catcode(c) == catNewline) {
                         ++lineno_;
                         s += getNewline(is_, c);
@@ -397,7 +392,7 @@ void Parser::tokenize_one()
         case catComment: {
                 // We don't treat "%\n" combinations here specially because
                 // we want to preserve them in the preamble
         case catComment: {
                 // We don't treat "%\n" combinations here specially because
                 // we want to preserve them in the preamble
-               string s;
+               docstring s;
                 while (is_.get(c) && catcode(c) != catNewline)
                         s += c;
                 // handle possible DOS line ending
                 while (is_.get(c) && catcode(c) != catNewline)
                         s += c;
                 // handle possible DOS line ending
@@ -415,7 +410,7 @@ void Parser::tokenize_one()
                 if (!is_) {
                         error("unexpected end of input");
                 } else {
                 if (!is_) {
                         error("unexpected end of input");
                 } else {
-                       string s(1, c);
+                       docstring s(1, c);
                         if (catcode(c) == catLetter) {
                                 // collect letters
                                 while (is_.get(c) && catcode(c) == catLetter)
                         if (catcode(c) == catLetter) {
                                 // collect letters
                                 while (is_.get(c) && catcode(c) == catLetter)
@@ -429,13 +424,14 @@ void Parser::tokenize_one()
         }
                 
         case catIgnore: {
         }
                 
         case catIgnore: {
-               cerr << "ignoring a char: " << int(c) << "\n";
+               cerr << "ignoring a char: " << c << "\n";
                 break;
         }
                 
         default:
                 break;
         }
                 
         default:
-               push_back(Token(c, catcode(c)));
+               push_back(Token(docstring(1, c), catcode(c)));
         }
         }
+       //cerr << tokens_.back();
  }
  
  
  }
  
  
@@ -464,7 +460,7 @@ string Parser::verbatimOption()
         string res;
         if (next_token().character() == '[') {
                 Token t = get_token();
         string res;
         if (next_token().character() == '[') {
                 Token t = get_token();
-               for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
+               for (t = get_token(); t.character() != ']' && good(); t = get_token()) {
                         if (t.cat() == catBegin) {
                                 putback();
                                 res += '{' + verbatim_item() + '}';
                         if (t.cat() == catBegin) {
                                 putback();
                                 res += '{' + verbatim_item() + '}';
diff --git a/src/tex2lyx/Parser.h b/src/tex2lyx/Parser.h

index 94c1100aae4f7dbda7550c7199ffe4c8907975bd..7c70ecca97c1a0cae3a5ddb0b4f81b221718b5bf 100644 (file)
--- a/src/tex2lyx/Parser.h
+++ b/src/tex2lyx/Parser.h
@@ -12,10 +12,11 @@
  #ifndef PARSER_H
  #define PARSER_H
  
  #ifndef PARSER_H
  #define PARSER_H
  
-#include <vector>
  #include <string>
  #include <utility>
  #include <string>
  #include <utility>
+#include <vector>
  
  
+#include "support/docstream.h"
  
  namespace lyx {
  
  
  namespace lyx {
  
@@ -46,9 +47,6 @@ enum CatCode {
  };
  
  
  };
  
  
-CatCode catcode(unsigned char c);
-
-
  enum {
         FLAG_BRACE_LAST = 1 << 1,  //  last closing brace ends the parsing
         FLAG_RIGHT      = 1 << 2,  //  next \\right ends the parsing process
  enum {
         FLAG_BRACE_LAST = 1 << 1,  //  last closing brace ends the parsing
         FLAG_RIGHT      = 1 << 2,  //  next \\right ends the parsing process
@@ -75,18 +73,16 @@ enum {
  class Token {
  public:
         ///
  class Token {
  public:
         ///
-       Token() : cs_(), char_(0), cat_(catIgnore) {}
-       ///
-       Token(char c, CatCode cat) : cs_(), char_(c), cat_(cat) {}
+       Token() : cs_(), cat_(catIgnore) {}
         ///
         ///
-       Token(std::string const & cs, CatCode cat) : cs_(cs), char_(0), cat_(cat) {}
+       Token(docstring const & cs, CatCode cat) : cs_(to_utf8(cs)), cat_(cat) {}
  
         ///
         std::string const & cs() const { return cs_; }
         /// Returns the catcode of the token
         CatCode cat() const { return cat_; }
         ///
  
         ///
         std::string const & cs() const { return cs_; }
         /// Returns the catcode of the token
         CatCode cat() const { return cat_; }
         ///
-       char character() const { return char_; }
+       char character() const { return cs_.empty() ? 0 : cs_[0]; }
         /// Returns the token as string
         std::string asString() const;
         /// Returns the token verbatim
         /// Returns the token as string
         std::string asString() const;
         /// Returns the token verbatim
@@ -96,8 +92,6 @@ private:
         ///
         std::string cs_;
         ///
         ///
         std::string cs_;
         ///
-       char char_;
-       ///
         CatCode cat_;
  };
  
         CatCode cat_;
  };
  
@@ -119,7 +113,7 @@ class Parser {
  
  public:
         ///
  
  public:
         ///
-       Parser(std::istream & is);
+       Parser(idocstream & is);
         ///
         Parser(std::string const & s);
         ///
         ///
         Parser(std::string const & s);
         ///
@@ -217,9 +211,9 @@ private:
         ///
         unsigned pos_;
         ///
         ///
         unsigned pos_;
         ///
-       std::istringstream * iss_;
+       idocstringstream * iss_;
         ///
         ///
-       std::istream & is_;
+       idocstream & is_;
  };
  
  
  };
  
  
diff --git a/src/tex2lyx/math.cpp b/src/tex2lyx/math.cpp

index be2800643e7f6cc30fc58e083bab2180f633351b..44b0f083897a0dbac249416194b1982255e03e00 100644 (file)
--- a/src/tex2lyx/math.cpp
+++ b/src/tex2lyx/math.cpp
@@ -94,7 +94,7 @@ void parse_math(Parser & p, ostream & os, unsigned flags, const mode_type mode)
                                t.cat() == catAlign ||
                                t.cat() == catActive ||
                                t.cat() == catParameter)
                                t.cat() == catAlign ||
                                t.cat() == catActive ||
                                t.cat() == catParameter)
-                       os << t.character();
+                       os << t.cs();
  
                 else if (t.cat() == catBegin) {
                         os << '{';
  
                 else if (t.cat() == catBegin) {
                         os << '{';
diff --git a/src/tex2lyx/preamble.cpp b/src/tex2lyx/preamble.cpp

index 28f44ee84449c3fdc812be2413a1b7fa2ed34613..b911b373aec9fbd00003e1e7b0817d3840992339 100644 (file)
--- a/src/tex2lyx/preamble.cpp
+++ b/src/tex2lyx/preamble.cpp
@@ -413,7 +413,7 @@ void handle_package(string const & name, string const & opts,
  void end_preamble(ostream & os, TextClass const & /*textclass*/)
  {
         os << "#LyX file created by tex2lyx " << PACKAGE_VERSION << "\n"
  void end_preamble(ostream & os, TextClass const & /*textclass*/)
  {
         os << "#LyX file created by tex2lyx " << PACKAGE_VERSION << "\n"
-          << "\\lyxformat 247\n"
+          << "\\lyxformat 249\n"
            << "\\begin_document\n"
            << "\\begin_header\n"
            << "\\textclass " << h_textclass << "\n";
            << "\\begin_document\n"
            << "\\begin_header\n"
            << "\\textclass " << h_textclass << "\n";
diff --git a/src/tex2lyx/table.cpp b/src/tex2lyx/table.cpp

index e04be70bb232110b34dd96dea5118a2ffa754c21..d23db7088a1bd700bc85be7a7ca5db4b4c671c59 100644 (file)
--- a/src/tex2lyx/table.cpp
+++ b/src/tex2lyx/table.cpp
@@ -661,16 +661,15 @@ void parse_table(Parser & p, ostream & os, bool is_long_tabular,
                         }
                 }
  
                         }
                 }
  
-               else if (t.cat() == catSpace || t.cat() == catNewline)
-                               os << t.cs();
-
-               else if (t.cat() == catLetter ||
-                              t.cat() == catSuper ||
-                              t.cat() == catSub ||
-                              t.cat() == catOther ||
-                              t.cat() == catActive ||
-                              t.cat() == catParameter)
-                       os << t.character();
+               else if (t.cat() == catSpace 
+                        || t.cat() == catNewline
+                        || t.cat() == catLetter 
+                        || t.cat() == catSuper 
+                        || t.cat() == catSub 
+                        || t.cat() == catOther 
+                        || t.cat() == catActive 
+                        || t.cat() == catParameter)
+                       os << t.cs();
  
                 else if (t.cat() == catBegin) {
                         os << '{';
  
                 else if (t.cat() == catBegin) {
                         os << '{';
diff --git a/src/tex2lyx/tex2lyx.cpp b/src/tex2lyx/tex2lyx.cpp

index 4c1366d85bee49791371f7a0a0f02db8e004f983..eacea79ac8a52c8caf98ee759028e9dbc3921ba9 100644 (file)
--- a/src/tex2lyx/tex2lyx.cpp
+++ b/src/tex2lyx/tex2lyx.cpp
@@ -18,17 +18,16 @@
  #include "TextClass.h"
  #include "Layout.h"
  
  #include "TextClass.h"
  #include "Layout.h"
  
-#include "support/lassert.h"
  #include "support/convert.h"
  #include "support/debug.h"
  #include "support/ExceptionMessage.h"
  #include "support/filetools.h"
  #include "support/convert.h"
  #include "support/debug.h"
  #include "support/ExceptionMessage.h"
  #include "support/filetools.h"
+#include "support/lassert.h"
  #include "support/lstrings.h"
  #include "support/os.h"
  #include "support/Package.h"
  
  #include <cstdlib>
  #include "support/lstrings.h"
  #include "support/os.h"
  #include "support/Package.h"
  
  #include <cstdlib>
-#include <fstream>
  #include <iostream>
  #include <string>
  #include <sstream>
  #include <iostream>
  #include <string>
  #include <sstream>
@@ -202,7 +201,7 @@ void read_environment(Parser & p, string const & begin,
   */
  void read_syntaxfile(FileName const & file_name)
  {
   */
  void read_syntaxfile(FileName const & file_name)
  {
-       ifstream is(file_name.toFilesystemEncoding().c_str());
+       ifdocstream is(file_name.toFilesystemEncoding().c_str());
         if (!is.good()) {
                 cerr << "Could not open syntax file \"" << file_name
                      << "\" for reading." << endl;
         if (!is.good()) {
                 cerr << "Could not open syntax file \"" << file_name
                      << "\" for reading." << endl;
@@ -389,7 +388,7 @@ namespace {
   *  You must ensure that \p parentFilePath is properly set before calling
   *  this function!
   */
   *  You must ensure that \p parentFilePath is properly set before calling
   *  this function!
   */
-void tex2lyx(istream & is, ostream & os)
+void tex2lyx(idocstream & is, ostream & os)
  {
         Parser p(is);
         //p.dump();
  {
         Parser p(is);
         //p.dump();
@@ -411,7 +410,7 @@ void tex2lyx(istream & is, ostream & os)
         os << ss.str();
  #ifdef TEST_PARSER
         p.reset();
         os << ss.str();
  #ifdef TEST_PARSER
         p.reset();
-       ofstream parsertest("parsertest.tex");
+       ofdocstream parsertest("parsertest.tex");
         while (p.good())
                 parsertest << p.get_token().asInput();
         // <origfile> and parsertest.tex should now have identical content
         while (p.good())
                 parsertest << p.get_token().asInput();
         // <origfile> and parsertest.tex should now have identical content
@@ -422,7 +421,7 @@ void tex2lyx(istream & is, ostream & os)
  /// convert TeX from \p infilename to LyX and write it to \p os
  bool tex2lyx(FileName const & infilename, ostream & os)
  {
  /// convert TeX from \p infilename to LyX and write it to \p os
  bool tex2lyx(FileName const & infilename, ostream & os)
  {
-       ifstream is(infilename.toFilesystemEncoding().c_str());
+       ifdocstream is(infilename.toFilesystemEncoding().c_str());
         if (!is.good()) {
                 cerr << "Could not open input file \"" << infilename
                      << "\" for reading." << endl;
         if (!is.good()) {
                 cerr << "Could not open input file \"" << infilename
                      << "\" for reading." << endl;
diff --git a/src/tex2lyx/text.cpp b/src/tex2lyx/text.cpp

index df5f51175cff3479f49f66749f5104e0da199811..dfb9c1fe51abcd99cf4643e7d998dd8d5ca57096 100644 (file)
--- a/src/tex2lyx/text.cpp
+++ b/src/tex2lyx/text.cpp
@@ -1262,7 +1262,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
                                t.cat() == catParameter) {
                         // This translates "&" to "\\&" which may be wrong...
                         context.check_layout(os);
                                t.cat() == catParameter) {
                         // This translates "&" to "\\&" which may be wrong...
                         context.check_layout(os);
-                       os << t.character();
+                       os << t.cs();
                 }
  
                 else if (p.isParagraph()) {
                 }
  
                 else if (p.isParagraph()) {
@@ -1281,7 +1281,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
                                 else
                                         os << "\\InsetSpace ~\n";
                         } else
                                 else
                                         os << "\\InsetSpace ~\n";
                         } else
-                               os << t.character();
+                               os << t.cs();
                 }
  
                 else if (t.cat() == catBegin &&
                 }
  
                 else if (t.cat() == catBegin &&
@@ -1309,7 +1309,7 @@ void parse_text(Parser & p, ostream & os, unsigned flags, bool outer,
                             next.character() == '*') {
                                 p.get_token();
                                 if (p.next_token().cat() == catEnd) {
                             next.character() == '*') {
                                 p.get_token();
                                 if (p.next_token().cat() == catEnd) {
-                                       os << next.character();
+                                       os << next.cs();
                                         p.get_token();
                                 } else {
                                         p.putback();
                                         p.get_token();
                                 } else {
                                         p.putback();
author	Jean-Marc Lasgouttes <lasgouttes@lyx.org>
	Sun, 16 Nov 2008 17:02:00 +0000 (17:02 +0000)
committer	Jean-Marc Lasgouttes <lasgouttes@lyx.org>
	Sun, 16 Nov 2008 17:02:00 +0000 (17:02 +0000)
src/tex2lyx/Parser.cpp		patch \| blob \| history
src/tex2lyx/Parser.h		patch \| blob \| history
src/tex2lyx/math.cpp		patch \| blob \| history
src/tex2lyx/preamble.cpp		patch \| blob \| history
src/tex2lyx/table.cpp		patch \| blob \| history
src/tex2lyx/tex2lyx.cpp		patch \| blob \| history
src/tex2lyx/text.cpp		patch \| blob \| history