]> git.lyx.org Git - lyx.git/blob - src/tex2lyx/texparser.C
fix bug 1730
[lyx.git] / src / tex2lyx / texparser.C
1 /**
2  * \file texparser.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author André Pönitz
7  *
8  * Full author contact details are available in file CREDITS.
9  */
10
11 #include <config.h>
12
13 #include "texparser.h"
14
15 #include <iostream>
16 #include <sstream>
17
18 using std::cerr;
19 using std::endl;
20 using std::fill;
21 using std::istream;
22 using std::istringstream;
23 using std::ostream;
24 using std::string;
25
26
27 namespace {
28
29 CatCode theCatcode[256];
30
31 void catInit()
32 {
33         fill(theCatcode, theCatcode + 256, catOther);
34         fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
35         fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
36
37         theCatcode[int('\\')] = catEscape;
38         theCatcode[int('{')]  = catBegin;
39         theCatcode[int('}')]  = catEnd;
40         theCatcode[int('$')]  = catMath;
41         theCatcode[int('&')]  = catAlign;
42         theCatcode[int('\n')] = catNewline;
43         theCatcode[int('#')]  = catParameter;
44         theCatcode[int('^')]  = catSuper;
45         theCatcode[int('_')]  = catSub;
46         theCatcode[0x7f]      = catIgnore;
47         theCatcode[int(' ')]  = catSpace;
48         theCatcode[int('\t')] = catSpace;
49         theCatcode[int('\r')] = catNewline;
50         theCatcode[int('~')]  = catActive;
51         theCatcode[int('%')]  = catComment;
52
53         // This is wrong!
54         theCatcode[int('@')]  = catLetter;
55 }
56
57
/*!
 * Normalise a line ending to '\n'.
 * \p c must have catcode catNewline and must be the character that was
 * read last from \p is. Three conventions are recognised:
 * - UNIX (\n)
 * - MAC  (\r)
 * - DOS  (\r\n)
 * For a DOS ending the trailing '\n' is consumed from \p is as well.
 */
char getNewline(istream & is, char c)
{
        // UNIX: nothing to translate
        if (c != '\r')
                return c;

        // MAC or DOS: look one character ahead to tell them apart.
        char next;
        if (is.get(next) && next != '\n') {
                // MAC: the character belongs to the next line
                is.putback(next);
        }
        return '\n';
}
80
81 }
82
83
84 //
85 // catcodes
86 //
87
88 CatCode catcode(unsigned char c)
89 {
90         return theCatcode[c];
91 }
92
93
94
95 //
96 // Token
97 //
98
99 ostream & operator<<(ostream & os, Token const & t)
100 {
101         if (t.cat() == catComment)
102                 os << '%' << t.cs() << '\n';
103         else if (t.cat() == catSpace)
104                 os << t.cs();
105         else if (t.cat() == catEscape)
106                 os << '\\' << t.cs() << ' ';
107         else if (t.cat() == catLetter)
108                 os << t.character();
109         else if (t.cat() == catNewline)
110                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
111         else
112                 os << '[' << t.character() << ',' << t.cat() << ']';
113         return os;
114 }
115
116
117 string Token::asString() const
118 {
119         return cs_.size() ? cs_ : string(1, char_);
120 }
121
122
123 string Token::asInput() const
124 {
125         if (cat_ == catComment)
126                 return '%' + cs_ + '\n';
127         if (cat_ == catSpace || cat_ == catNewline)
128                 return cs_;
129         return char_ ? string(1, char_) : '\\' + cs_;
130 }
131
132
133 //
134 // Parser
135 //
136
137
138 Parser::Parser(istream & is)
139         : lineno_(0), pos_(0)
140 {
141         tokenize(is);
142 }
143
144
145 Parser::Parser(string const & s)
146         : lineno_(0), pos_(0)
147 {
148         istringstream is(s);
149         tokenize(is);
150 }
151
152
153 void Parser::push_back(Token const & t)
154 {
155         tokens_.push_back(t);
156 }
157
158
159 void Parser::pop_back()
160 {
161         tokens_.pop_back();
162 }
163
164
165 Token const & Parser::prev_token() const
166 {
167         static const Token dummy;
168         return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
169 }
170
171
172 Token const & Parser::curr_token() const
173 {
174         static const Token dummy;
175         return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
176 }
177
178
179 Token const & Parser::next_token() const
180 {
181         static const Token dummy;
182         return good() ? tokens_[pos_] : dummy;
183 }
184
185
186 Token const & Parser::get_token()
187 {
188         static const Token dummy;
189         //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
190         return good() ? tokens_[pos_++] : dummy;
191 }
192
193
194 bool Parser::isParagraph() const
195 {
196         // A new paragraph in TeX ist started
197         // - either by a newline, following any amount of whitespace
198         //   characters (including zero), and another newline
199         // - or the token \par
200         if (curr_token().cat() == catNewline &&
201             (curr_token().cs().size() > 1 ||
202              (next_token().cat() == catSpace &&
203               pos_ < tokens_.size() - 1 &&
204               tokens_[pos_ + 1].cat() == catNewline)))
205                 return true;
206         if (curr_token().cat() == catEscape && curr_token().cs() == "par")
207                 return true;
208         return false;
209 }
210
211
212 void Parser::skip_spaces(bool skip_comments)
213 {
214         // We just silently return if we have no more tokens.
215         // skip_spaces() should be callable at any time,
216         // the caller must check p::good() anyway.
217         while (good()) {
218                 get_token();
219                 if (isParagraph()) {
220                         putback();
221                         break;
222                 }
223                 if ( curr_token().cat() == catSpace ||
224                      curr_token().cat() == catNewline ||
225                     (curr_token().cat() == catComment && curr_token().cs().empty()))
226                         continue;
227                 if (skip_comments && curr_token().cat() == catComment)
228                         cerr << "  Ignoring comment: " << curr_token().asInput();
229                 else {
230                         putback();
231                         break;
232                 }
233         }
234 }
235
236
237 void Parser::unskip_spaces(bool skip_comments)
238 {
239         while (pos_ > 0) {
240                 if ( curr_token().cat() == catSpace ||
241                     (curr_token().cat() == catNewline && curr_token().cs().size() == 1))
242                         putback();
243                 else if (skip_comments && curr_token().cat() == catComment) {
244                         // TODO: Get rid of this
245                         cerr << "Unignoring comment: " << curr_token().asInput();
246                         putback();
247                 }
248                 else
249                         break;
250         }
251 }
252
253
254 void Parser::putback()
255 {
256         --pos_;
257 }
258
259
260 bool Parser::good() const
261 {
262         return pos_ < tokens_.size();
263 }
264
265
266 char Parser::getChar()
267 {
268         if (!good())
269                 error("The input stream is not well...");
270         return tokens_[pos_++].character();
271 }
272
273
274 string Parser::getArg(char left, char right)
275 {
276         skip_spaces(true);
277
278         // This is needed if a partial file ends with a command without arguments,
279         // e. g. \medskip
280         if (! good())
281                 return string();
282
283         string result;
284         char c = getChar();
285
286         if (c != left)
287                 putback();
288         else
289                 while ((c = getChar()) != right && good()) {
290                         // Ignore comments
291                         if (curr_token().cat() == catComment) {
292                                 if (!curr_token().cs().empty())
293                                         cerr << "Ignoring comment: " << curr_token().asInput();
294                         }
295                         else
296                                 result += curr_token().asInput();
297                 }
298
299         return result;
300 }
301
302
303 string Parser::getOpt()
304 {
305         string const res = getArg('[', ']');
306         return res.size() ? '[' + res + ']' : string();
307 }
308
309
310 void Parser::tokenize(istream & is)
311 {
312         static bool init_done = false;
313
314         if (!init_done) {
315                 catInit();
316                 init_done = true;
317         }
318
319         char c;
320         while (is.get(c)) {
321                 //cerr << "reading c: " << c << "\n";
322
323                 switch (catcode(c)) {
324                         case catSpace: {
325                                 string s(1, c);
326                                 while (is.get(c) && catcode(c) == catSpace)
327                                         s += c;
328                                 if (catcode(c) != catSpace)
329                                         is.putback(c);
330                                 push_back(Token(s, catSpace));
331                                 break;
332                         }
333
334                         case catNewline: {
335                                 ++lineno_;
336                                 string s(1, getNewline(is, c));
337                                 while (is.get(c) && catcode(c) == catNewline) {
338                                         ++lineno_;
339                                         s += getNewline(is, c);
340                                 }
341                                 if (catcode(c) != catNewline)
342                                         is.putback(c);
343                                 push_back(Token(s, catNewline));
344                                 break;
345                         }
346
347                         case catComment: {
348                                 // We don't treat "%\n" combinations here specially because
349                                 // we want to preserve them in the preamble
350                                 string s;
351                                 while (is.get(c) && catcode(c) != catNewline)
352                                         s += c;
353                                 // handle possible DOS line ending
354                                 if (catcode(c) == catNewline)
355                                         c = getNewline(is, c);
356                                 // Note: The '%' at the beginning and the '\n' at the end
357                                 // of the comment are not stored.
358                                 ++lineno_;
359                                 push_back(Token(s, catComment));
360                                 break;
361                         }
362
363                         case catEscape: {
364                                 is.get(c);
365                                 if (!is) {
366                                         error("unexpected end of input");
367                                 } else {
368                                         string s(1, c);
369                                         if (catcode(c) == catLetter) {
370                                                 // collect letters
371                                                 while (is.get(c) && catcode(c) == catLetter)
372                                                         s += c;
373                                                 if (catcode(c) != catLetter)
374                                                         is.putback(c);
375                                         }
376                                         push_back(Token(s, catEscape));
377                                 }
378                                 break;
379                         }
380
381                         case catIgnore: {
382                                 cerr << "ignoring a char: " << int(c) << "\n";
383                                 break;
384                         }
385
386                         default:
387                                 push_back(Token(c, catcode(c)));
388                 }
389         }
390 }
391
392
393 void Parser::dump() const
394 {
395         cerr << "\nTokens: ";
396         for (unsigned i = 0; i < tokens_.size(); ++i) {
397                 if (i == pos_)
398                         cerr << " <#> ";
399                 cerr << tokens_[i];
400         }
401         cerr << " pos: " << pos_ << "\n";
402 }
403
404
405 void Parser::error(string const & msg)
406 {
407         cerr << "Line ~" << lineno_ << ":  parse error: " << msg << endl;
408         dump();
409         //exit(1);
410 }
411
412
413 string Parser::verbatimOption()
414 {
415         string res;
416         if (next_token().character() == '[') {
417                 Token t = get_token();
418                 for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
419                         if (t.cat() == catBegin) {
420                                 putback();
421                                 res += '{' + verbatim_item() + '}';
422                         } else
423                                 res += t.asString();
424                 }
425         }
426         return res;
427 }
428
429
430 string Parser::verbatim_item()
431 {
432         if (!good())
433                 error("stream bad");
434         skip_spaces();
435         if (next_token().cat() == catBegin) {
436                 Token t = get_token(); // skip brace
437                 string res;
438                 for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
439                         if (t.cat() == catBegin) {
440                                 putback();
441                                 res += '{' + verbatim_item() + '}';
442                         }
443                         else
444                                 res += t.asInput();
445                 }
446                 return res;
447         }
448         return get_token().asInput();
449 }
450
451
452 void Parser::reset()
453 {
454         pos_ = 0;
455 }
456
457
458 void Parser::setCatCode(char c, CatCode cat)
459 {
460         theCatcode[(unsigned char)c] = cat;
461 }
462
463
464 CatCode Parser::getCatCode(char c) const
465 {
466         return theCatcode[(unsigned char)c];
467 }