]> git.lyx.org Git - features.git/blob - src/tex2lyx/Parser.cpp
Now tex2lyx is able to set the encoding from what it reads in the preamble.
[features.git] / src / tex2lyx / Parser.cpp
1 /**
2  * \file Parser.cpp
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author André Pönitz 
7  *
8  * Full author contact details are available in file CREDITS.
9  */
10
11 #include <config.h>
12
13 #include "Encoding.h"
14 #include "Parser.h"
15
16 #include <iostream>
17
18 using namespace std;
19
20 namespace lyx {
21
22 namespace {
23
/// Category code of every 8-bit character value, indexed by the character
/// cast to unsigned char. Filled with defaults by catInit() and changed at
/// run time through Parser::setCatCode().
CatCode theCatcode[256];
25
26 void catInit()
27 {
28         static bool init_done = false;
29         if (init_done) 
30                 return;
31         init_done = true;
32
33         fill(theCatcode, theCatcode + 256, catOther);
34         fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
35         fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
36
37         theCatcode[int('\\')] = catEscape;
38         theCatcode[int('{')]  = catBegin;
39         theCatcode[int('}')]  = catEnd;
40         theCatcode[int('$')]  = catMath;
41         theCatcode[int('&')]  = catAlign;
42         theCatcode[int('\n')] = catNewline;
43         theCatcode[int('#')]  = catParameter;
44         theCatcode[int('^')]  = catSuper;
45         theCatcode[int('_')]  = catSub;
46         theCatcode[0x7f]      = catIgnore;
47         theCatcode[int(' ')]  = catSpace;
48         theCatcode[int('\t')] = catSpace;
49         theCatcode[int('\r')] = catNewline;
50         theCatcode[int('~')]  = catActive;
51         theCatcode[int('%')]  = catComment;
52
53         // This is wrong!
54         theCatcode[int('@')]  = catLetter;
55 }
56
57 /*!
58  * Translate a line ending to '\n'.
59  * \p c must have catcode catNewline, and it must be the last character read
60  * from \p is.
61  */
62 char getNewline(idocstream & is, char c)
63 {
64         // we have to handle 3 different line endings:
65         // - UNIX (\n)
66         // - MAC  (\r)
67         // - DOS  (\r\n)
68         if (c == '\r') {
69                 // MAC or DOS
70                 char_type wc;
71                 if (is.get(wc) && wc != '\n') {
72                         // MAC
73                         is.putback(wc);
74                 }
75                 return '\n';
76         }
77         // UNIX
78         return c;
79 }
80
81 CatCode catcode(char_type c)
82 {
83         if (c < 256)
84                 return theCatcode[(unsigned char)c];
85         return catOther;
86 }
87
88 }
89
90
91 //
92 // Token
93 //
94
95 ostream & operator<<(ostream & os, Token const & t)
96 {
97         if (t.cat() == catComment)
98                 os << '%' << t.cs() << '\n';
99         else if (t.cat() == catSpace)
100                 os << t.cs();
101         else if (t.cat() == catEscape)
102                 os << '\\' << t.cs() << ' ';
103         else if (t.cat() == catLetter)
104                 os << t.cs();
105         else if (t.cat() == catNewline)
106                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
107         else
108                 os << '[' << t.cs() << ',' << t.cat() << ']';
109         return os;
110 }
111
112
/// Return the stored text of the token exactly as kept in cs_, i.e. without
/// the leading '\\' or '%' that asInput() adds for escapes and comments.
string Token::asString() const
{
        return cs_;
}
117
118
119 string Token::asInput() const
120 {
121         if (cat_ == catComment)
122                 return '%' + cs_ + '\n';
123         if (cat_ == catEscape)
124                 return '\\' + cs_;
125         return cs_;
126 }
127
128
129 //
130 // Parser
131 //
132
133
/// Create a parser that reads from the caller's stream \p is.
/// iss_ stays null, so the destructor has no stream of its own to delete.
Parser::Parser(idocstream & is)
        : lineno_(0), pos_(0), iss_(0), is_(is)
{
}
138
139
/// Create a parser that reads from the UTF-8 string \p s.
/// The string is converted with from_utf8() and wrapped in a heap-allocated
/// idocstringstream that is deleted again in the destructor.
// NOTE(review): is_(*iss_) needs iss_ to be initialized first, i.e. iss_
// must be declared before is_ in the class - confirm in Parser.h.
Parser::Parser(string const & s)
        : lineno_(0), pos_(0), 
          iss_(new idocstringstream(from_utf8(s))), is_(*iss_)
{
}
145
146
/// Delete the string stream created by the string constructor.
/// For a parser built on an external stream iss_ is null and this is a no-op.
Parser::~Parser()
{
        delete iss_;
}
151
152
153 void Parser::setEncoding(std::string const & e)
154 {
155         Encoding const * enc = encodings.fromLaTeXName(e);
156         cerr << "setting encoding to " << enc->iconvName();
157         is_ << lyx::setEncoding(enc->iconvName());
158 }
159
160
/// Append \p t to the end of the token list. The read position pos_ is
/// not changed.
void Parser::push_back(Token const & t)
{
        tokens_.push_back(t);
}
165
166
167 Token const & Parser::prev_token() const
168 {
169         static const Token dummy;
170         return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
171 }
172
173
174 Token const & Parser::curr_token() const
175 {
176         static const Token dummy;
177         return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
178 }
179
180
181 Token const & Parser::next_token()
182 {
183         static const Token dummy;
184         return good() ? tokens_[pos_] : dummy;
185 }
186
187
188 Token const & Parser::get_token()
189 {
190         static const Token dummy;
191         //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
192         return good() ? tokens_[pos_++] : dummy;
193 }
194
195
196 bool Parser::isParagraph()
197 {
198         // A new paragraph in TeX ist started
199         // - either by a newline, following any amount of whitespace
200         //   characters (including zero), and another newline
201         // - or the token \par
202         if (curr_token().cat() == catNewline &&
203             (curr_token().cs().size() > 1 ||
204              (next_token().cat() == catSpace &&
205               pos_ < tokens_.size() - 1 &&
206               tokens_[pos_ + 1].cat() == catNewline)))
207                 return true;
208         if (curr_token().cat() == catEscape && curr_token().cs() == "par")
209                 return true;
210         return false;
211 }
212
213
214 void Parser::skip_spaces(bool skip_comments)
215 {
216         // We just silently return if we have no more tokens.
217         // skip_spaces() should be callable at any time,
218         // the caller must check p::good() anyway.
219         while (good()) {
220                 get_token();
221                 if (isParagraph()) {
222                         putback();
223                         break;
224                 }
225                 if ( curr_token().cat() == catSpace ||
226                      curr_token().cat() == catNewline ||
227                     (curr_token().cat() == catComment && curr_token().cs().empty()))
228                         continue;
229                 if (skip_comments && curr_token().cat() == catComment)
230                         cerr << "  Ignoring comment: " << curr_token().asInput();
231                 else {
232                         putback();
233                         break;
234                 }
235         }
236 }
237
238
239 void Parser::unskip_spaces(bool skip_comments)
240 {
241         while (pos_ > 0) {
242                 if ( curr_token().cat() == catSpace ||
243                     (curr_token().cat() == catNewline && curr_token().cs().size() == 1))
244                         putback();
245                 else if (skip_comments && curr_token().cat() == catComment) {
246                         // TODO: Get rid of this
247                         cerr << "Unignoring comment: " << curr_token().asInput();
248                         putback();
249                 }
250                 else
251                         break;
252         }
253 }
254
255
/// Move the read position back by one token, so that the token read last
/// is delivered again by the next get_token().
// NOTE(review): there is no check for pos_ == 0 here; callers appear to be
// expected to call this only after a successful read - confirm.
void Parser::putback()
{
        --pos_;
}
260
261
262 bool Parser::good()
263 {
264         if (pos_ < tokens_.size())
265                 return true;
266         tokenize_one();
267         return pos_ < tokens_.size();
268 }
269
270
/// Consume the next token and return its character().
/// If no token is available an error is reported first; get_token() then
/// yields its default-constructed token, whose character() is returned.
char Parser::getChar()
{
        if (!good())
                error("The input stream is not well...");
        return get_token().character();
}
277
278
279 Parser::Arg Parser::getFullArg(char left, char right)
280 {
281         skip_spaces(true);
282
283         // This is needed if a partial file ends with a command without arguments,
284         // e. g. \medskip
285         if (! good())
286                 return make_pair(false, string());
287
288         string result;
289         char c = getChar();
290
291         if (c != left) {
292                 putback();
293                 return make_pair(false, string());
294         } else
295                 while ((c = getChar()) != right && good()) {
296                         // Ignore comments
297                         if (curr_token().cat() == catComment) {
298                                 if (!curr_token().cs().empty())
299                                         cerr << "Ignoring comment: " << curr_token().asInput();
300                         }
301                         else
302                                 result += curr_token().asInput();
303                 }
304
305         return make_pair(true, result);
306 }
307
308
/// Like getFullArg(), but return only the collected text. A missing
/// argument and an empty argument both yield an empty string.
string Parser::getArg(char left, char right)
{
        return getFullArg(left, right).second;
}
313
314
315 string Parser::getFullOpt()
316 {
317         Arg arg = getFullArg('[', ']');
318         if (arg.first)
319                 return '[' + arg.second + ']';
320         return string();
321 }
322
323
324 string Parser::getOpt()
325 {
326         string const res = getArg('[', ']');
327         return res.empty() ? string() : '[' + res + ']';
328 }
329
330
331 string Parser::getFullParentheseArg()
332 {
333         Arg arg = getFullArg('(', ')');
334         if (arg.first)
335                 return '(' + arg.second + ')';
336         return string();
337 }
338
339
340 string const Parser::verbatimEnvironment(string const & name)
341 {
342         if (!good())
343                 return string();
344
345         ostringstream os;
346         for (Token t = get_token(); good(); t = get_token()) {
347                 if (t.cat() == catBegin) {
348                         putback();
349                         os << '{' << verbatim_item() << '}';
350                 } else if (t.asInput() == "\\begin") {
351                         string const env = getArg('{', '}');
352                         os << "\\begin{" << env << '}'
353                            << verbatimEnvironment(env)
354                            << "\\end{" << env << '}';
355                 } else if (t.asInput() == "\\end") {
356                         string const end = getArg('{', '}');
357                         if (end != name)
358                                 cerr << "\\end{" << end
359                                      << "} does not match \\begin{" << name
360                                      << "}." << endl;
361                         return os.str();
362                 } else
363                         os << t.asInput();
364         }
365         cerr << "unexpected end of input" << endl;
366         return os.str();
367 }
368
369
/*!
 * Read characters from is_ and append at most one token to tokens_.
 * No token is appended if the stream cannot deliver a character, if the
 * input ends directly after an escape character, or if the character read
 * has catcode catIgnore.
 */
void Parser::tokenize_one()
{
        // make sure the catcode table is set up before the first lookup
        catInit();
        char_type c;
        if (!is_.get(c)) 
                return;

        switch (catcode(c)) {
        case catSpace: {
                // merge a run of space characters into a single token
                docstring s(1, c);
                while (is_.get(c) && catcode(c) == catSpace)
                        s += c;
                // Give back the character that ended the run. If the loop
                // ended because get() failed, c still holds a space and
                // nothing is put back.
                if (catcode(c) != catSpace)
                        is_.putback(c);
                push_back(Token(s, catSpace));
                break;
        }
                
        case catNewline: {
                // Merge consecutive line endings into a single token that
                // holds one '\n' per line ending; isParagraph() relies on
                // the length of this text.
                ++lineno_;
                docstring s(1, getNewline(is_, c));
                while (is_.get(c) && catcode(c) == catNewline) {
                        ++lineno_;
                        s += getNewline(is_, c);
                }
                // give back the first character that is not a line ending
                if (catcode(c) != catNewline)
                        is_.putback(c);
                push_back(Token(s, catNewline));
                break;
        }
                
        case catComment: {
                // We don't treat "%\n" combinations here specially because
                // we want to preserve them in the preamble
                docstring s;
                // collect everything up to, but not including, the line ending
                while (is_.get(c) && catcode(c) != catNewline)
                        s += c;
                // handle possible DOS line ending
                if (catcode(c) == catNewline)
                        c = getNewline(is_, c);
                // Note: The '%' at the beginning and the '\n' at the end
                // of the comment are not stored.
                // The line is counted even if the input ended without a
                // line ending.
                ++lineno_;
                push_back(Token(s, catComment));
                break;
        }
                
        case catEscape: {
                // the character after the escape character decides what
                // kind of control sequence this is
                is_.get(c);
                if (!is_) {
                        error("unexpected end of input");
                } else {
                        // a single non-letter character, or ...
                        docstring s(1, c);
                        if (catcode(c) == catLetter) {
                                // collect letters
                                while (is_.get(c) && catcode(c) == catLetter)
                                        s += c;
                                // give back the character that ended the name
                                if (catcode(c) != catLetter)
                                        is_.putback(c);
                        }
                        // the escape character itself is not stored
                        push_back(Token(s, catEscape));
                }
                break;
        }
                
        case catIgnore: {
                // the character is dropped, no token is created for it
                cerr << "ignoring a char: " << c << "\n";
                break;
        }
                
        default:
                // every other character is a token of its own
                push_back(Token(docstring(1, c), catcode(c)));
        }
        //cerr << tokens_.back();
}
445
446
447 void Parser::dump() const
448 {
449         cerr << "\nTokens: ";
450         for (unsigned i = 0; i < tokens_.size(); ++i) {
451                 if (i == pos_)
452                         cerr << " <#> ";
453                 cerr << tokens_[i];
454         }
455         cerr << " pos: " << pos_ << "\n";
456 }
457
458
/// Report the parse error \p msg on stderr together with the approximate
/// line number, then dump the token list. Parsing is not aborted: the
/// exit() call is commented out, so the function returns to the caller.
void Parser::error(string const & msg)
{
        cerr << "Line ~" << lineno_ << ":  parse error: " << msg << endl;
        dump();
        //exit(1);
}
465
466
467 string Parser::verbatimOption()
468 {
469         string res;
470         if (next_token().character() == '[') {
471                 Token t = get_token();
472                 for (t = get_token(); t.character() != ']' && good(); t = get_token()) {
473                         if (t.cat() == catBegin) {
474                                 putback();
475                                 res += '{' + verbatim_item() + '}';
476                         } else
477                                 res += t.asString();
478                 }
479         }
480         return res;
481 }
482
483
484 string Parser::verbatim_item()
485 {
486         if (!good())
487                 error("stream bad");
488         skip_spaces();
489         if (next_token().cat() == catBegin) {
490                 Token t = get_token(); // skip brace
491                 string res;
492                 for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
493                         if (t.cat() == catBegin) {
494                                 putback();
495                                 res += '{' + verbatim_item() + '}';
496                         }
497                         else
498                                 res += t.asInput();
499                 }
500                 return res;
501         }
502         return get_token().asInput();
503 }
504
505
/// Move the read position back to the first token. The tokens produced so
/// far are kept, so they are delivered again by get_token().
void Parser::reset()
{
        pos_ = 0;
}
510
511
512 void Parser::setCatCode(char c, CatCode cat)
513 {
514         theCatcode[(unsigned char)c] = cat;
515 }
516
517
518 CatCode Parser::getCatCode(char c) const
519 {
520         return theCatcode[(unsigned char)c];
521 }
522
523
524 } // namespace lyx