]> git.lyx.org Git - lyx.git/blob - src/tex2lyx/texparser.C
The package reworking.
[lyx.git] / src / tex2lyx / texparser.C
1 /**
2  * \file texparser.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author André Pönitz
7  *
8  * Full author contact details are available in file CREDITS.
9  */
10
11 #include <config.h>
12
13 #include "texparser.h"
14
15 #include <iostream>
16 #include <sstream>
17
18 using std::cerr;
19 using std::endl;
20 using std::fill;
21 using std::istream;
22 using std::istringstream;
23 using std::ostream;
24 using std::string;
25
26
27 namespace {
28
29 CatCode theCatcode[256];
30
31 void catInit()
32 {
33         fill(theCatcode, theCatcode + 256, catOther);
34         fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
35         fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
36
37         theCatcode[int('\\')] = catEscape;
38         theCatcode[int('{')]  = catBegin;
39         theCatcode[int('}')]  = catEnd;
40         theCatcode[int('$')]  = catMath;
41         theCatcode[int('&')]  = catAlign;
42         theCatcode[int('\n')] = catNewline;
43         theCatcode[int('#')]  = catParameter;
44         theCatcode[int('^')]  = catSuper;
45         theCatcode[int('_')]  = catSub;
46         theCatcode[0x7f]      = catIgnore;
47         theCatcode[int(' ')]  = catSpace;
48         theCatcode[int('\t')] = catSpace;
49         theCatcode[int('\r')] = catNewline;
50         theCatcode[int('~')]  = catActive;
51         theCatcode[int('%')]  = catComment;
52
53         // This is wrong!
54         theCatcode[int('@')]  = catLetter;
55 }
56
57
/*!
 * Normalise a line ending to '\n'.
 * \p c is the character that was just read from \p is and must have
 * catcode catNewline. Three conventions are handled:
 * - UNIX ("\n")
 * - MAC  ("\r")
 * - DOS  ("\r\n")
 * For a DOS ending the trailing '\n' is consumed from \p is as well.
 * \return '\n' if \p c is '\r', otherwise \p c unchanged.
 */
char getNewline(istream & is, char c)
{
        // Anything but '\r' (i.e. the UNIX ending) is passed through as is.
        if (c != '\r')
                return c;

        // '\r' is either a complete MAC ending or the first half of a DOS
        // ending: look at the next character and give it back unless it is
        // the matching '\n'.
        char next;
        if (is.get(next) && next != '\n')
                is.putback(next);
        return '\n';
}
80
81 }
82
83
84 //
85 // catcodes
86 //
87
88 CatCode catcode(unsigned char c)
89 {
90         return theCatcode[c];
91 }
92
93
94
95 //
96 // Token
97 //
98
99 ostream & operator<<(ostream & os, Token const & t)
100 {
101         if (t.cat() == catComment)
102                 os << '%' << t.cs() << '\n';
103         else if (t.cat() == catSpace)
104                 os << t.cs();
105         else if (t.cat() == catEscape)
106                 os << '\\' << t.cs() << ' ';
107         else if (t.cat() == catLetter)
108                 os << t.character();
109         else if (t.cat() == catNewline)
110                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
111         else
112                 os << '[' << t.character() << ',' << t.cat() << ']';
113         return os;
114 }
115
116
117 string Token::asString() const
118 {
119         return cs_.size() ? cs_ : string(1, char_);
120 }
121
122
123 string Token::asInput() const
124 {
125         if (cat_ == catComment)
126                 return '%' + cs_ + '\n';
127         if (cat_ == catSpace || cat_ == catNewline)
128                 return cs_;
129         return char_ ? string(1, char_) : '\\' + cs_;
130 }
131
132
133 //
134 // Parser
135 //
136
137
138 Parser::Parser(istream & is)
139         : lineno_(0), pos_(0)
140 {
141         tokenize(is);
142 }
143
144
145 Parser::Parser(string const & s)
146         : lineno_(0), pos_(0)
147 {
148         istringstream is(s);
149         tokenize(is);
150 }
151
152
153 void Parser::push_back(Token const & t)
154 {
155         tokens_.push_back(t);
156 }
157
158
159 void Parser::pop_back()
160 {
161         tokens_.pop_back();
162 }
163
164
165 Token const & Parser::prev_token() const
166 {
167         static const Token dummy;
168         return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
169 }
170
171
172 Token const & Parser::curr_token() const
173 {
174         static const Token dummy;
175         return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
176 }
177
178
179 Token const & Parser::next_token() const
180 {
181         static const Token dummy;
182         return good() ? tokens_[pos_] : dummy;
183 }
184
185
186 Token const & Parser::get_token()
187 {
188         static const Token dummy;
189         //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
190         return good() ? tokens_[pos_++] : dummy;
191 }
192
193
194 bool Parser::isParagraph() const
195 {
196         // A new paragraph in TeX ist started
197         // - either by a newline, following any amount of whitespace
198         //   characters (including zero), and another newline
199         // - or the token \par
200         if (curr_token().cat() == catNewline &&
201             (curr_token().cs().size() > 1 ||
202              (next_token().cat() == catSpace &&
203               pos_ < tokens_.size() - 1 &&
204               tokens_[pos_ + 1].cat() == catNewline)))
205                 return true;
206         if (curr_token().cat() == catEscape && curr_token().cs() == "par")
207                 return true;
208         return false;
209 }
210
211
212 void Parser::skip_spaces(bool skip_comments)
213 {
214         // We just silently return if we have no more tokens.
215         // skip_spaces() should be callable at any time,
216         // the caller must check p::good() anyway.
217         while (good()) {
218                 get_token();
219                 if (isParagraph()) {
220                         putback();
221                         break;
222                 }
223                 if ( curr_token().cat() == catSpace ||
224                      curr_token().cat() == catNewline ||
225                     (curr_token().cat() == catComment && curr_token().cs().empty()))
226                         continue;
227                 if (skip_comments && curr_token().cat() == catComment)
228                         cerr << "  Ignoring comment: " << curr_token().asInput();
229                 else {
230                         putback();
231                         break;
232                 }
233         }
234 }
235
236
237 void Parser::unskip_spaces(bool skip_comments)
238 {
239         while (pos_ > 0) {
240                 if ( curr_token().cat() == catSpace ||
241                     (curr_token().cat() == catNewline && curr_token().cs().size() == 1))
242                         putback();
243                 else if (skip_comments && curr_token().cat() == catComment) {
244                         // TODO: Get rid of this
245                         cerr << "Unignoring comment: " << curr_token().asInput();
246                         putback();
247                 }
248                 else
249                         break;
250         }
251 }
252
253
254 void Parser::putback()
255 {
256         --pos_;
257 }
258
259
260 bool Parser::good() const
261 {
262         return pos_ < tokens_.size();
263 }
264
265
266 char Parser::getChar()
267 {
268         if (!good())
269                 error("The input stream is not well...");
270         return tokens_[pos_++].character();
271 }
272
273
274 Parser::Arg Parser::getFullArg(char left, char right)
275 {
276         skip_spaces(true);
277
278         // This is needed if a partial file ends with a command without arguments,
279         // e. g. \medskip
280         if (! good())
281                 return std::make_pair(false, string());
282
283         string result;
284         char c = getChar();
285
286         if (c != left) {
287                 putback();
288                 return std::make_pair(false, string());
289         } else
290                 while ((c = getChar()) != right && good()) {
291                         // Ignore comments
292                         if (curr_token().cat() == catComment) {
293                                 if (!curr_token().cs().empty())
294                                         cerr << "Ignoring comment: " << curr_token().asInput();
295                         }
296                         else
297                                 result += curr_token().asInput();
298                 }
299
300         return std::make_pair(true, result);
301 }
302
303
304 string Parser::getArg(char left, char right)
305 {
306         return getFullArg(left, right).second;
307 }
308
309
310 string Parser::getFullOpt()
311 {
312         Arg arg = getFullArg('[', ']');
313         if (arg.first)
314                 return '[' + arg.second + ']';
315         return arg.second;
316 }
317
318
319 string Parser::getOpt()
320 {
321         string const res = getArg('[', ']');
322         return res.empty() ? string() : '[' + res + ']';
323 }
324
325
326 void Parser::tokenize(istream & is)
327 {
328         static bool init_done = false;
329
330         if (!init_done) {
331                 catInit();
332                 init_done = true;
333         }
334
335         char c;
336         while (is.get(c)) {
337                 //cerr << "reading c: " << c << "\n";
338
339                 switch (catcode(c)) {
340                         case catSpace: {
341                                 string s(1, c);
342                                 while (is.get(c) && catcode(c) == catSpace)
343                                         s += c;
344                                 if (catcode(c) != catSpace)
345                                         is.putback(c);
346                                 push_back(Token(s, catSpace));
347                                 break;
348                         }
349
350                         case catNewline: {
351                                 ++lineno_;
352                                 string s(1, getNewline(is, c));
353                                 while (is.get(c) && catcode(c) == catNewline) {
354                                         ++lineno_;
355                                         s += getNewline(is, c);
356                                 }
357                                 if (catcode(c) != catNewline)
358                                         is.putback(c);
359                                 push_back(Token(s, catNewline));
360                                 break;
361                         }
362
363                         case catComment: {
364                                 // We don't treat "%\n" combinations here specially because
365                                 // we want to preserve them in the preamble
366                                 string s;
367                                 while (is.get(c) && catcode(c) != catNewline)
368                                         s += c;
369                                 // handle possible DOS line ending
370                                 if (catcode(c) == catNewline)
371                                         c = getNewline(is, c);
372                                 // Note: The '%' at the beginning and the '\n' at the end
373                                 // of the comment are not stored.
374                                 ++lineno_;
375                                 push_back(Token(s, catComment));
376                                 break;
377                         }
378
379                         case catEscape: {
380                                 is.get(c);
381                                 if (!is) {
382                                         error("unexpected end of input");
383                                 } else {
384                                         string s(1, c);
385                                         if (catcode(c) == catLetter) {
386                                                 // collect letters
387                                                 while (is.get(c) && catcode(c) == catLetter)
388                                                         s += c;
389                                                 if (catcode(c) != catLetter)
390                                                         is.putback(c);
391                                         }
392                                         push_back(Token(s, catEscape));
393                                 }
394                                 break;
395                         }
396
397                         case catIgnore: {
398                                 cerr << "ignoring a char: " << int(c) << "\n";
399                                 break;
400                         }
401
402                         default:
403                                 push_back(Token(c, catcode(c)));
404                 }
405         }
406 }
407
408
409 void Parser::dump() const
410 {
411         cerr << "\nTokens: ";
412         for (unsigned i = 0; i < tokens_.size(); ++i) {
413                 if (i == pos_)
414                         cerr << " <#> ";
415                 cerr << tokens_[i];
416         }
417         cerr << " pos: " << pos_ << "\n";
418 }
419
420
421 void Parser::error(string const & msg)
422 {
423         cerr << "Line ~" << lineno_ << ":  parse error: " << msg << endl;
424         dump();
425         //exit(1);
426 }
427
428
429 string Parser::verbatimOption()
430 {
431         string res;
432         if (next_token().character() == '[') {
433                 Token t = get_token();
434                 for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
435                         if (t.cat() == catBegin) {
436                                 putback();
437                                 res += '{' + verbatim_item() + '}';
438                         } else
439                                 res += t.asString();
440                 }
441         }
442         return res;
443 }
444
445
446 string Parser::verbatim_item()
447 {
448         if (!good())
449                 error("stream bad");
450         skip_spaces();
451         if (next_token().cat() == catBegin) {
452                 Token t = get_token(); // skip brace
453                 string res;
454                 for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
455                         if (t.cat() == catBegin) {
456                                 putback();
457                                 res += '{' + verbatim_item() + '}';
458                         }
459                         else
460                                 res += t.asInput();
461                 }
462                 return res;
463         }
464         return get_token().asInput();
465 }
466
467
468 void Parser::reset()
469 {
470         pos_ = 0;
471 }
472
473
474 void Parser::setCatCode(char c, CatCode cat)
475 {
476         theCatcode[(unsigned char)c] = cat;
477 }
478
479
480 CatCode Parser::getCatCode(char c) const
481 {
482         return theCatcode[(unsigned char)c];
483 }