]> git.lyx.org Git - lyx.git/blob - src/tex2lyx/Parser.cpp
5759cd544d310a38626fb808924332642160cf09
[lyx.git] / src / tex2lyx / Parser.cpp
1 /**
2  * \file Parser.cpp
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author André Pönitz 
7  *
8  * Full author contact details are available in file CREDITS.
9  */
10
11 #include <config.h>
12
13 #include "Parser.h"
14
15 #include <iostream>
16 #include <sstream>
17
18 using namespace std;
19
20 namespace lyx {
21
22 namespace {
23
24 CatCode theCatcode[256];
25
26 void catInit()
27 {
28         fill(theCatcode, theCatcode + 256, catOther);
29         fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
30         fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
31
32         theCatcode[int('\\')] = catEscape;
33         theCatcode[int('{')]  = catBegin;
34         theCatcode[int('}')]  = catEnd;
35         theCatcode[int('$')]  = catMath;
36         theCatcode[int('&')]  = catAlign;
37         theCatcode[int('\n')] = catNewline;
38         theCatcode[int('#')]  = catParameter;
39         theCatcode[int('^')]  = catSuper;
40         theCatcode[int('_')]  = catSub;
41         theCatcode[0x7f]      = catIgnore;
42         theCatcode[int(' ')]  = catSpace;
43         theCatcode[int('\t')] = catSpace;
44         theCatcode[int('\r')] = catNewline;
45         theCatcode[int('~')]  = catActive;
46         theCatcode[int('%')]  = catComment;
47
48         // This is wrong!
49         theCatcode[int('@')]  = catLetter;
50 }
51
52
/*!
 * Normalize a line ending to '\n'.
 * \p c must have catcode catNewline, and it must be the last character
 * read from \p is.
 * Three conventions are recognized: UNIX (\n), MAC (\r) and DOS (\r\n).
 * For a DOS line ending the '\n' following the '\r' is consumed from
 * \p is; any other following character is left in the stream.
 * \return '\n' if \p c is '\r', otherwise \p c itself.
 */
char getNewline(std::istream & is, char c)
{
        // UNIX: nothing to translate
        if (c != '\r')
                return c;

        // MAC or DOS: look at the character after the '\r'
        char next;
        if (is.get(next) && next != '\n') {
                // MAC: the character belongs to the next line
                is.putback(next);
        }
        return '\n';
}
75
76 }
77
78
79 //
80 // catcodes
81 //
82
83 CatCode catcode(unsigned char c)
84 {
85         return theCatcode[c];
86 }
87
88
89
90 //
91 // Token
92 //
93
94 ostream & operator<<(ostream & os, Token const & t)
95 {
96         if (t.cat() == catComment)
97                 os << '%' << t.cs() << '\n';
98         else if (t.cat() == catSpace)
99                 os << t.cs();
100         else if (t.cat() == catEscape)
101                 os << '\\' << t.cs() << ' ';
102         else if (t.cat() == catLetter)
103                 os << t.character();
104         else if (t.cat() == catNewline)
105                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
106         else
107                 os << '[' << t.character() << ',' << t.cat() << ']';
108         return os;
109 }
110
111
112 string Token::asString() const
113 {
114         return cs_.size() ? cs_ : string(1, char_);
115 }
116
117
118 string Token::asInput() const
119 {
120         if (cat_ == catComment)
121                 return '%' + cs_ + '\n';
122         if (cat_ == catSpace || cat_ == catNewline)
123                 return cs_;
124         return char_ ? string(1, char_) : '\\' + cs_;
125 }
126
127
128 //
129 // Parser
130 //
131
132
133 Parser::Parser(istream & is)
134         : lineno_(0), pos_(0)
135 {
136         tokenize(is);
137 }
138
139
140 Parser::Parser(string const & s)
141         : lineno_(0), pos_(0)
142 {
143         istringstream is(s);
144         tokenize(is);
145 }
146
147
148 void Parser::push_back(Token const & t)
149 {
150         tokens_.push_back(t);
151 }
152
153
154 Token const & Parser::prev_token() const
155 {
156         static const Token dummy;
157         return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
158 }
159
160
161 Token const & Parser::curr_token() const
162 {
163         static const Token dummy;
164         return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
165 }
166
167
168 Token const & Parser::next_token() const
169 {
170         static const Token dummy;
171         return good() ? tokens_[pos_] : dummy;
172 }
173
174
175 Token const & Parser::get_token()
176 {
177         static const Token dummy;
178         //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
179         return good() ? tokens_[pos_++] : dummy;
180 }
181
182
183 bool Parser::isParagraph() const
184 {
185         // A new paragraph in TeX ist started
186         // - either by a newline, following any amount of whitespace
187         //   characters (including zero), and another newline
188         // - or the token \par
189         if (curr_token().cat() == catNewline &&
190             (curr_token().cs().size() > 1 ||
191              (next_token().cat() == catSpace &&
192               pos_ < tokens_.size() - 1 &&
193               tokens_[pos_ + 1].cat() == catNewline)))
194                 return true;
195         if (curr_token().cat() == catEscape && curr_token().cs() == "par")
196                 return true;
197         return false;
198 }
199
200
201 void Parser::skip_spaces(bool skip_comments)
202 {
203         // We just silently return if we have no more tokens.
204         // skip_spaces() should be callable at any time,
205         // the caller must check p::good() anyway.
206         while (good()) {
207                 get_token();
208                 if (isParagraph()) {
209                         putback();
210                         break;
211                 }
212                 if ( curr_token().cat() == catSpace ||
213                      curr_token().cat() == catNewline ||
214                     (curr_token().cat() == catComment && curr_token().cs().empty()))
215                         continue;
216                 if (skip_comments && curr_token().cat() == catComment)
217                         cerr << "  Ignoring comment: " << curr_token().asInput();
218                 else {
219                         putback();
220                         break;
221                 }
222         }
223 }
224
225
226 void Parser::unskip_spaces(bool skip_comments)
227 {
228         while (pos_ > 0) {
229                 if ( curr_token().cat() == catSpace ||
230                     (curr_token().cat() == catNewline && curr_token().cs().size() == 1))
231                         putback();
232                 else if (skip_comments && curr_token().cat() == catComment) {
233                         // TODO: Get rid of this
234                         cerr << "Unignoring comment: " << curr_token().asInput();
235                         putback();
236                 }
237                 else
238                         break;
239         }
240 }
241
242
243 void Parser::putback()
244 {
245         --pos_;
246 }
247
248
249 bool Parser::good() const
250 {
251         return pos_ < tokens_.size();
252 }
253
254
255 char Parser::getChar()
256 {
257         if (!good())
258                 error("The input stream is not well...");
259         return tokens_[pos_++].character();
260 }
261
262
263 Parser::Arg Parser::getFullArg(char left, char right)
264 {
265         skip_spaces(true);
266
267         // This is needed if a partial file ends with a command without arguments,
268         // e. g. \medskip
269         if (! good())
270                 return make_pair(false, string());
271
272         string result;
273         char c = getChar();
274
275         if (c != left) {
276                 putback();
277                 return make_pair(false, string());
278         } else
279                 while ((c = getChar()) != right && good()) {
280                         // Ignore comments
281                         if (curr_token().cat() == catComment) {
282                                 if (!curr_token().cs().empty())
283                                         cerr << "Ignoring comment: " << curr_token().asInput();
284                         }
285                         else
286                                 result += curr_token().asInput();
287                 }
288
289         return make_pair(true, result);
290 }
291
292
293 string Parser::getArg(char left, char right)
294 {
295         return getFullArg(left, right).second;
296 }
297
298
299 string Parser::getFullOpt()
300 {
301         Arg arg = getFullArg('[', ']');
302         if (arg.first)
303                 return '[' + arg.second + ']';
304         return string();
305 }
306
307
308 string Parser::getOpt()
309 {
310         string const res = getArg('[', ']');
311         return res.empty() ? string() : '[' + res + ']';
312 }
313
314
315 string Parser::getFullParentheseArg()
316 {
317         Arg arg = getFullArg('(', ')');
318         if (arg.first)
319                 return '(' + arg.second + ')';
320         return string();
321 }
322
323
324 string const Parser::verbatimEnvironment(string const & name)
325 {
326         if (!good())
327                 return string();
328
329         ostringstream os;
330         for (Token t = get_token(); good(); t = get_token()) {
331                 if (t.cat() == catBegin) {
332                         putback();
333                         os << '{' << verbatim_item() << '}';
334                 } else if (t.asInput() == "\\begin") {
335                         string const env = getArg('{', '}');
336                         os << "\\begin{" << env << '}'
337                            << verbatimEnvironment(env)
338                            << "\\end{" << env << '}';
339                 } else if (t.asInput() == "\\end") {
340                         string const end = getArg('{', '}');
341                         if (end != name)
342                                 cerr << "\\end{" << end
343                                      << "} does not match \\begin{" << name
344                                      << "}." << endl;
345                         return os.str();
346                 } else
347                         os << t.asInput();
348         }
349         cerr << "unexpected end of input" << endl;
350         return os.str();
351 }
352
353
354 void Parser::tokenize(istream & is)
355 {
356         static bool init_done = false;
357
358         if (!init_done) {
359                 catInit();
360                 init_done = true;
361         }
362
363         char c;
364         while (is.get(c)) {
365                 //cerr << "reading c: " << c << "\n";
366
367                 switch (catcode(c)) {
368                         case catSpace: {
369                                 string s(1, c);
370                                 while (is.get(c) && catcode(c) == catSpace)
371                                         s += c;
372                                 if (catcode(c) != catSpace)
373                                         is.putback(c);
374                                 push_back(Token(s, catSpace));
375                                 break;
376                         }
377
378                         case catNewline: {
379                                 ++lineno_;
380                                 string s(1, getNewline(is, c));
381                                 while (is.get(c) && catcode(c) == catNewline) {
382                                         ++lineno_;
383                                         s += getNewline(is, c);
384                                 }
385                                 if (catcode(c) != catNewline)
386                                         is.putback(c);
387                                 push_back(Token(s, catNewline));
388                                 break;
389                         }
390
391                         case catComment: {
392                                 // We don't treat "%\n" combinations here specially because
393                                 // we want to preserve them in the preamble
394                                 string s;
395                                 while (is.get(c) && catcode(c) != catNewline)
396                                         s += c;
397                                 // handle possible DOS line ending
398                                 if (catcode(c) == catNewline)
399                                         c = getNewline(is, c);
400                                 // Note: The '%' at the beginning and the '\n' at the end
401                                 // of the comment are not stored.
402                                 ++lineno_;
403                                 push_back(Token(s, catComment));
404                                 break;
405                         }
406
407                         case catEscape: {
408                                 is.get(c);
409                                 if (!is) {
410                                         error("unexpected end of input");
411                                 } else {
412                                         string s(1, c);
413                                         if (catcode(c) == catLetter) {
414                                                 // collect letters
415                                                 while (is.get(c) && catcode(c) == catLetter)
416                                                         s += c;
417                                                 if (catcode(c) != catLetter)
418                                                         is.putback(c);
419                                         }
420                                         push_back(Token(s, catEscape));
421                                 }
422                                 break;
423                         }
424
425                         case catIgnore: {
426                                 cerr << "ignoring a char: " << int(c) << "\n";
427                                 break;
428                         }
429
430                         default:
431                                 push_back(Token(c, catcode(c)));
432                 }
433         }
434 }
435
436
437 void Parser::dump() const
438 {
439         cerr << "\nTokens: ";
440         for (unsigned i = 0; i < tokens_.size(); ++i) {
441                 if (i == pos_)
442                         cerr << " <#> ";
443                 cerr << tokens_[i];
444         }
445         cerr << " pos: " << pos_ << "\n";
446 }
447
448
449 void Parser::error(string const & msg)
450 {
451         cerr << "Line ~" << lineno_ << ":  parse error: " << msg << endl;
452         dump();
453         //exit(1);
454 }
455
456
457 string Parser::verbatimOption()
458 {
459         string res;
460         if (next_token().character() == '[') {
461                 Token t = get_token();
462                 for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
463                         if (t.cat() == catBegin) {
464                                 putback();
465                                 res += '{' + verbatim_item() + '}';
466                         } else
467                                 res += t.asString();
468                 }
469         }
470         return res;
471 }
472
473
474 string Parser::verbatim_item()
475 {
476         if (!good())
477                 error("stream bad");
478         skip_spaces();
479         if (next_token().cat() == catBegin) {
480                 Token t = get_token(); // skip brace
481                 string res;
482                 for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
483                         if (t.cat() == catBegin) {
484                                 putback();
485                                 res += '{' + verbatim_item() + '}';
486                         }
487                         else
488                                 res += t.asInput();
489                 }
490                 return res;
491         }
492         return get_token().asInput();
493 }
494
495
496 void Parser::reset()
497 {
498         pos_ = 0;
499 }
500
501
502 void Parser::setCatCode(char c, CatCode cat)
503 {
504         theCatcode[(unsigned char)c] = cat;
505 }
506
507
508 CatCode Parser::getCatCode(char c) const
509 {
510         return theCatcode[(unsigned char)c];
511 }
512
513
514 } // namespace lyx