]> git.lyx.org Git - lyx.git/blob - src/tex2lyx/texparser.C
Fix bug 2667
[lyx.git] / src / tex2lyx / texparser.C
1 /**
2  * \file texparser.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author André Pönitz
7  *
8  * Full author contact details are available in file CREDITS.
9  */
10
11 #include <config.h>
12
13 #include "texparser.h"
14
15 #include <iostream>
16 #include <sstream>
17
18 using std::cerr;
19 using std::endl;
20 using std::fill;
21 using std::istream;
22 using std::istringstream;
23 using std::ostringstream;
24 using std::ostream;
25 using std::string;
26
27
28 namespace {
29
30 CatCode theCatcode[256];
31
32 void catInit()
33 {
34         fill(theCatcode, theCatcode + 256, catOther);
35         fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
36         fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
37
38         theCatcode[int('\\')] = catEscape;
39         theCatcode[int('{')]  = catBegin;
40         theCatcode[int('}')]  = catEnd;
41         theCatcode[int('$')]  = catMath;
42         theCatcode[int('&')]  = catAlign;
43         theCatcode[int('\n')] = catNewline;
44         theCatcode[int('#')]  = catParameter;
45         theCatcode[int('^')]  = catSuper;
46         theCatcode[int('_')]  = catSub;
47         theCatcode[0x7f]      = catIgnore;
48         theCatcode[int(' ')]  = catSpace;
49         theCatcode[int('\t')] = catSpace;
50         theCatcode[int('\r')] = catNewline;
51         theCatcode[int('~')]  = catActive;
52         theCatcode[int('%')]  = catComment;
53
54         // This is wrong!
55         theCatcode[int('@')]  = catLetter;
56 }
57
58
/*!
 * Normalize a line ending to '\n'.
 * \p c is the line ending character that was just read from \p is.
 * Three conventions are handled:
 * - UNIX (\n):   \p c is returned unchanged
 * - MAC  (\r):   '\n' is returned, the following character stays in \p is
 * - DOS  (\r\n): '\n' is returned, the '\n' after the '\r' is consumed
 */
char getNewline(istream & is, char c)
{
        // UNIX: nothing to translate
        if (c != '\r')
                return c;

        // MAC or DOS: look at the character after the '\r'
        char next;
        if (is.get(next) && next != '\n') {
                // MAC: the character belongs to the next line
                is.putback(next);
        }
        return '\n';
}
81
82 }
83
84
85 //
86 // catcodes
87 //
88
89 CatCode catcode(unsigned char c)
90 {
91         return theCatcode[c];
92 }
93
94
95
96 //
97 // Token
98 //
99
100 ostream & operator<<(ostream & os, Token const & t)
101 {
102         if (t.cat() == catComment)
103                 os << '%' << t.cs() << '\n';
104         else if (t.cat() == catSpace)
105                 os << t.cs();
106         else if (t.cat() == catEscape)
107                 os << '\\' << t.cs() << ' ';
108         else if (t.cat() == catLetter)
109                 os << t.character();
110         else if (t.cat() == catNewline)
111                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
112         else
113                 os << '[' << t.character() << ',' << t.cat() << ']';
114         return os;
115 }
116
117
118 string Token::asString() const
119 {
120         return cs_.size() ? cs_ : string(1, char_);
121 }
122
123
124 string Token::asInput() const
125 {
126         if (cat_ == catComment)
127                 return '%' + cs_ + '\n';
128         if (cat_ == catSpace || cat_ == catNewline)
129                 return cs_;
130         return char_ ? string(1, char_) : '\\' + cs_;
131 }
132
133
134 //
135 // Parser
136 //
137
138
139 Parser::Parser(istream & is)
140         : lineno_(0), pos_(0)
141 {
142         tokenize(is);
143 }
144
145
146 Parser::Parser(string const & s)
147         : lineno_(0), pos_(0)
148 {
149         istringstream is(s);
150         tokenize(is);
151 }
152
153
154 void Parser::push_back(Token const & t)
155 {
156         tokens_.push_back(t);
157 }
158
159
160 void Parser::pop_back()
161 {
162         tokens_.pop_back();
163 }
164
165
166 Token const & Parser::prev_token() const
167 {
168         static const Token dummy;
169         return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
170 }
171
172
173 Token const & Parser::curr_token() const
174 {
175         static const Token dummy;
176         return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
177 }
178
179
180 Token const & Parser::next_token() const
181 {
182         static const Token dummy;
183         return good() ? tokens_[pos_] : dummy;
184 }
185
186
187 Token const & Parser::get_token()
188 {
189         static const Token dummy;
190         //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
191         return good() ? tokens_[pos_++] : dummy;
192 }
193
194
195 bool Parser::isParagraph() const
196 {
197         // A new paragraph in TeX ist started
198         // - either by a newline, following any amount of whitespace
199         //   characters (including zero), and another newline
200         // - or the token \par
201         if (curr_token().cat() == catNewline &&
202             (curr_token().cs().size() > 1 ||
203              (next_token().cat() == catSpace &&
204               pos_ < tokens_.size() - 1 &&
205               tokens_[pos_ + 1].cat() == catNewline)))
206                 return true;
207         if (curr_token().cat() == catEscape && curr_token().cs() == "par")
208                 return true;
209         return false;
210 }
211
212
213 void Parser::skip_spaces(bool skip_comments)
214 {
215         // We just silently return if we have no more tokens.
216         // skip_spaces() should be callable at any time,
217         // the caller must check p::good() anyway.
218         while (good()) {
219                 get_token();
220                 if (isParagraph()) {
221                         putback();
222                         break;
223                 }
224                 if ( curr_token().cat() == catSpace ||
225                      curr_token().cat() == catNewline ||
226                     (curr_token().cat() == catComment && curr_token().cs().empty()))
227                         continue;
228                 if (skip_comments && curr_token().cat() == catComment)
229                         cerr << "  Ignoring comment: " << curr_token().asInput();
230                 else {
231                         putback();
232                         break;
233                 }
234         }
235 }
236
237
238 void Parser::unskip_spaces(bool skip_comments)
239 {
240         while (pos_ > 0) {
241                 if ( curr_token().cat() == catSpace ||
242                     (curr_token().cat() == catNewline && curr_token().cs().size() == 1))
243                         putback();
244                 else if (skip_comments && curr_token().cat() == catComment) {
245                         // TODO: Get rid of this
246                         cerr << "Unignoring comment: " << curr_token().asInput();
247                         putback();
248                 }
249                 else
250                         break;
251         }
252 }
253
254
255 void Parser::putback()
256 {
257         --pos_;
258 }
259
260
261 bool Parser::good() const
262 {
263         return pos_ < tokens_.size();
264 }
265
266
267 char Parser::getChar()
268 {
269         if (!good())
270                 error("The input stream is not well...");
271         return tokens_[pos_++].character();
272 }
273
274
275 Parser::Arg Parser::getFullArg(char left, char right)
276 {
277         skip_spaces(true);
278
279         // This is needed if a partial file ends with a command without arguments,
280         // e. g. \medskip
281         if (! good())
282                 return std::make_pair(false, string());
283
284         string result;
285         char c = getChar();
286
287         if (c != left) {
288                 putback();
289                 return std::make_pair(false, string());
290         } else
291                 while ((c = getChar()) != right && good()) {
292                         // Ignore comments
293                         if (curr_token().cat() == catComment) {
294                                 if (!curr_token().cs().empty())
295                                         cerr << "Ignoring comment: " << curr_token().asInput();
296                         }
297                         else
298                                 result += curr_token().asInput();
299                 }
300
301         return std::make_pair(true, result);
302 }
303
304
305 string Parser::getArg(char left, char right)
306 {
307         return getFullArg(left, right).second;
308 }
309
310
311 string Parser::getFullOpt()
312 {
313         Arg arg = getFullArg('[', ']');
314         if (arg.first)
315                 return '[' + arg.second + ']';
316         return arg.second;
317 }
318
319
320 string Parser::getOpt()
321 {
322         string const res = getArg('[', ']');
323         return res.empty() ? string() : '[' + res + ']';
324 }
325
326
327 string const Parser::verbatimEnvironment(string const & name)
328 {
329         if (!good())
330                 return string();
331
332         ostringstream os;
333         for (Token t = get_token(); good(); t = get_token()) {
334                 if (t.cat() == catBegin) {
335                         putback();
336                         os << '{' << verbatim_item() << '}';
337                 } else if (t.asInput() == "\\begin") {
338                         string const env = getArg('{', '}');
339                         os << "\\begin{" << env << '}'
340                            << verbatimEnvironment(env)
341                            << "\\end{" << env << '}';
342                 } else if (t.asInput() == "\\end") {
343                         string const end = getArg('{', '}');
344                         if (end != name)
345                                 cerr << "\\end{" << end
346                                      << "} does not match \\begin{" << name
347                                      << "}." << endl;
348                         return os.str();
349                 } else
350                         os << t.asInput();
351         }
352         cerr << "unexpected end of input" << endl;
353         return os.str();
354 }
355
356
357 void Parser::tokenize(istream & is)
358 {
359         static bool init_done = false;
360
361         if (!init_done) {
362                 catInit();
363                 init_done = true;
364         }
365
366         char c;
367         while (is.get(c)) {
368                 //cerr << "reading c: " << c << "\n";
369
370                 switch (catcode(c)) {
371                         case catSpace: {
372                                 string s(1, c);
373                                 while (is.get(c) && catcode(c) == catSpace)
374                                         s += c;
375                                 if (catcode(c) != catSpace)
376                                         is.putback(c);
377                                 push_back(Token(s, catSpace));
378                                 break;
379                         }
380
381                         case catNewline: {
382                                 ++lineno_;
383                                 string s(1, getNewline(is, c));
384                                 while (is.get(c) && catcode(c) == catNewline) {
385                                         ++lineno_;
386                                         s += getNewline(is, c);
387                                 }
388                                 if (catcode(c) != catNewline)
389                                         is.putback(c);
390                                 push_back(Token(s, catNewline));
391                                 break;
392                         }
393
394                         case catComment: {
395                                 // We don't treat "%\n" combinations here specially because
396                                 // we want to preserve them in the preamble
397                                 string s;
398                                 while (is.get(c) && catcode(c) != catNewline)
399                                         s += c;
400                                 // handle possible DOS line ending
401                                 if (catcode(c) == catNewline)
402                                         c = getNewline(is, c);
403                                 // Note: The '%' at the beginning and the '\n' at the end
404                                 // of the comment are not stored.
405                                 ++lineno_;
406                                 push_back(Token(s, catComment));
407                                 break;
408                         }
409
410                         case catEscape: {
411                                 is.get(c);
412                                 if (!is) {
413                                         error("unexpected end of input");
414                                 } else {
415                                         string s(1, c);
416                                         if (catcode(c) == catLetter) {
417                                                 // collect letters
418                                                 while (is.get(c) && catcode(c) == catLetter)
419                                                         s += c;
420                                                 if (catcode(c) != catLetter)
421                                                         is.putback(c);
422                                         }
423                                         push_back(Token(s, catEscape));
424                                 }
425                                 break;
426                         }
427
428                         case catIgnore: {
429                                 cerr << "ignoring a char: " << int(c) << "\n";
430                                 break;
431                         }
432
433                         default:
434                                 push_back(Token(c, catcode(c)));
435                 }
436         }
437 }
438
439
440 void Parser::dump() const
441 {
442         cerr << "\nTokens: ";
443         for (unsigned i = 0; i < tokens_.size(); ++i) {
444                 if (i == pos_)
445                         cerr << " <#> ";
446                 cerr << tokens_[i];
447         }
448         cerr << " pos: " << pos_ << "\n";
449 }
450
451
452 void Parser::error(string const & msg)
453 {
454         cerr << "Line ~" << lineno_ << ":  parse error: " << msg << endl;
455         dump();
456         //exit(1);
457 }
458
459
460 string Parser::verbatimOption()
461 {
462         string res;
463         if (next_token().character() == '[') {
464                 Token t = get_token();
465                 for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
466                         if (t.cat() == catBegin) {
467                                 putback();
468                                 res += '{' + verbatim_item() + '}';
469                         } else
470                                 res += t.asString();
471                 }
472         }
473         return res;
474 }
475
476
477 string Parser::verbatim_item()
478 {
479         if (!good())
480                 error("stream bad");
481         skip_spaces();
482         if (next_token().cat() == catBegin) {
483                 Token t = get_token(); // skip brace
484                 string res;
485                 for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
486                         if (t.cat() == catBegin) {
487                                 putback();
488                                 res += '{' + verbatim_item() + '}';
489                         }
490                         else
491                                 res += t.asInput();
492                 }
493                 return res;
494         }
495         return get_token().asInput();
496 }
497
498
499 void Parser::reset()
500 {
501         pos_ = 0;
502 }
503
504
505 void Parser::setCatCode(char c, CatCode cat)
506 {
507         theCatcode[(unsigned char)c] = cat;
508 }
509
510
511 CatCode Parser::getCatCode(char c) const
512 {
513         return theCatcode[(unsigned char)c];
514 }