]> git.lyx.org Git - lyx.git/blob - src/tex2lyx/texparser.C
* support/qstring_helpers.h: erase ucs4_to_qstring() method.
[lyx.git] / src / tex2lyx / texparser.C
1 /**
2  * \file texparser.C
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author André Pönitz
7  *
8  * Full author contact details are available in file CREDITS.
9  */
10
11 #include <config.h>
12
13 #include "texparser.h"
14
15 #include <iostream>
16 #include <sstream>
17
18
19 namespace lyx {
20
21 using std::cerr;
22 using std::endl;
23 using std::fill;
24 using std::istream;
25 using std::istringstream;
26 using std::ostringstream;
27 using std::ostream;
28 using std::string;
29
30
31 namespace {
32
33 CatCode theCatcode[256];
34
35 void catInit()
36 {
37         fill(theCatcode, theCatcode + 256, catOther);
38         fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
39         fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
40
41         theCatcode[int('\\')] = catEscape;
42         theCatcode[int('{')]  = catBegin;
43         theCatcode[int('}')]  = catEnd;
44         theCatcode[int('$')]  = catMath;
45         theCatcode[int('&')]  = catAlign;
46         theCatcode[int('\n')] = catNewline;
47         theCatcode[int('#')]  = catParameter;
48         theCatcode[int('^')]  = catSuper;
49         theCatcode[int('_')]  = catSub;
50         theCatcode[0x7f]      = catIgnore;
51         theCatcode[int(' ')]  = catSpace;
52         theCatcode[int('\t')] = catSpace;
53         theCatcode[int('\r')] = catNewline;
54         theCatcode[int('~')]  = catActive;
55         theCatcode[int('%')]  = catComment;
56
57         // This is wrong!
58         theCatcode[int('@')]  = catLetter;
59 }
60
61
/*!
 * Normalize a line ending to a single '\n'.
 * \p c must have catcode catNewline, and it must be the last character
 * that was read from \p is.
 *
 * Three conventions are recognized:
 * - UNIX (\n):   \p c is returned unchanged.
 * - MAC  (\r):   the character following '\r' is looked at and pushed back.
 * - DOS  (\r\n): the '\n' following '\r' is consumed.
 *
 * \return '\n' if \p c is '\r', otherwise \p c itself.
 */
char getNewline(istream & is, char c)
{
        // UNIX: nothing to translate.
        if (c != '\r')
                return c;

        // MAC or DOS: peek at the next character. Only a '\n' belongs to
        // this line ending; anything else goes back into the stream.
        char next;
        if (is.get(next) && next != '\n')
                is.putback(next);
        return '\n';
}
84
85 }
86
87
88 //
89 // catcodes
90 //
91
92 CatCode catcode(unsigned char c)
93 {
94         return theCatcode[c];
95 }
96
97
98
99 //
100 // Token
101 //
102
103 ostream & operator<<(ostream & os, Token const & t)
104 {
105         if (t.cat() == catComment)
106                 os << '%' << t.cs() << '\n';
107         else if (t.cat() == catSpace)
108                 os << t.cs();
109         else if (t.cat() == catEscape)
110                 os << '\\' << t.cs() << ' ';
111         else if (t.cat() == catLetter)
112                 os << t.character();
113         else if (t.cat() == catNewline)
114                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
115         else
116                 os << '[' << t.character() << ',' << t.cat() << ']';
117         return os;
118 }
119
120
121 string Token::asString() const
122 {
123         return cs_.size() ? cs_ : string(1, char_);
124 }
125
126
127 string Token::asInput() const
128 {
129         if (cat_ == catComment)
130                 return '%' + cs_ + '\n';
131         if (cat_ == catSpace || cat_ == catNewline)
132                 return cs_;
133         return char_ ? string(1, char_) : '\\' + cs_;
134 }
135
136
137 //
138 // Parser
139 //
140
141
142 Parser::Parser(istream & is)
143         : lineno_(0), pos_(0)
144 {
145         tokenize(is);
146 }
147
148
149 Parser::Parser(string const & s)
150         : lineno_(0), pos_(0)
151 {
152         istringstream is(s);
153         tokenize(is);
154 }
155
156
157 void Parser::push_back(Token const & t)
158 {
159         tokens_.push_back(t);
160 }
161
162
163 void Parser::pop_back()
164 {
165         tokens_.pop_back();
166 }
167
168
169 Token const & Parser::prev_token() const
170 {
171         static const Token dummy;
172         return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
173 }
174
175
176 Token const & Parser::curr_token() const
177 {
178         static const Token dummy;
179         return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
180 }
181
182
183 Token const & Parser::next_token() const
184 {
185         static const Token dummy;
186         return good() ? tokens_[pos_] : dummy;
187 }
188
189
190 Token const & Parser::get_token()
191 {
192         static const Token dummy;
193         //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
194         return good() ? tokens_[pos_++] : dummy;
195 }
196
197
198 bool Parser::isParagraph() const
199 {
200         // A new paragraph in TeX ist started
201         // - either by a newline, following any amount of whitespace
202         //   characters (including zero), and another newline
203         // - or the token \par
204         if (curr_token().cat() == catNewline &&
205             (curr_token().cs().size() > 1 ||
206              (next_token().cat() == catSpace &&
207               pos_ < tokens_.size() - 1 &&
208               tokens_[pos_ + 1].cat() == catNewline)))
209                 return true;
210         if (curr_token().cat() == catEscape && curr_token().cs() == "par")
211                 return true;
212         return false;
213 }
214
215
216 void Parser::skip_spaces(bool skip_comments)
217 {
218         // We just silently return if we have no more tokens.
219         // skip_spaces() should be callable at any time,
220         // the caller must check p::good() anyway.
221         while (good()) {
222                 get_token();
223                 if (isParagraph()) {
224                         putback();
225                         break;
226                 }
227                 if ( curr_token().cat() == catSpace ||
228                      curr_token().cat() == catNewline ||
229                     (curr_token().cat() == catComment && curr_token().cs().empty()))
230                         continue;
231                 if (skip_comments && curr_token().cat() == catComment)
232                         cerr << "  Ignoring comment: " << curr_token().asInput();
233                 else {
234                         putback();
235                         break;
236                 }
237         }
238 }
239
240
241 void Parser::unskip_spaces(bool skip_comments)
242 {
243         while (pos_ > 0) {
244                 if ( curr_token().cat() == catSpace ||
245                     (curr_token().cat() == catNewline && curr_token().cs().size() == 1))
246                         putback();
247                 else if (skip_comments && curr_token().cat() == catComment) {
248                         // TODO: Get rid of this
249                         cerr << "Unignoring comment: " << curr_token().asInput();
250                         putback();
251                 }
252                 else
253                         break;
254         }
255 }
256
257
258 void Parser::putback()
259 {
260         --pos_;
261 }
262
263
264 bool Parser::good() const
265 {
266         return pos_ < tokens_.size();
267 }
268
269
270 char Parser::getChar()
271 {
272         if (!good())
273                 error("The input stream is not well...");
274         return tokens_[pos_++].character();
275 }
276
277
278 Parser::Arg Parser::getFullArg(char left, char right)
279 {
280         skip_spaces(true);
281
282         // This is needed if a partial file ends with a command without arguments,
283         // e. g. \medskip
284         if (! good())
285                 return std::make_pair(false, string());
286
287         string result;
288         char c = getChar();
289
290         if (c != left) {
291                 putback();
292                 return std::make_pair(false, string());
293         } else
294                 while ((c = getChar()) != right && good()) {
295                         // Ignore comments
296                         if (curr_token().cat() == catComment) {
297                                 if (!curr_token().cs().empty())
298                                         cerr << "Ignoring comment: " << curr_token().asInput();
299                         }
300                         else
301                                 result += curr_token().asInput();
302                 }
303
304         return std::make_pair(true, result);
305 }
306
307
308 string Parser::getArg(char left, char right)
309 {
310         return getFullArg(left, right).second;
311 }
312
313
314 string Parser::getFullOpt()
315 {
316         Arg arg = getFullArg('[', ']');
317         if (arg.first)
318                 return '[' + arg.second + ']';
319         return arg.second;
320 }
321
322
323 string Parser::getOpt()
324 {
325         string const res = getArg('[', ']');
326         return res.empty() ? string() : '[' + res + ']';
327 }
328
329
330 string const Parser::verbatimEnvironment(string const & name)
331 {
332         if (!good())
333                 return string();
334
335         ostringstream os;
336         for (Token t = get_token(); good(); t = get_token()) {
337                 if (t.cat() == catBegin) {
338                         putback();
339                         os << '{' << verbatim_item() << '}';
340                 } else if (t.asInput() == "\\begin") {
341                         string const env = getArg('{', '}');
342                         os << "\\begin{" << env << '}'
343                            << verbatimEnvironment(env)
344                            << "\\end{" << env << '}';
345                 } else if (t.asInput() == "\\end") {
346                         string const end = getArg('{', '}');
347                         if (end != name)
348                                 cerr << "\\end{" << end
349                                      << "} does not match \\begin{" << name
350                                      << "}." << endl;
351                         return os.str();
352                 } else
353                         os << t.asInput();
354         }
355         cerr << "unexpected end of input" << endl;
356         return os.str();
357 }
358
359
360 void Parser::tokenize(istream & is)
361 {
362         static bool init_done = false;
363
364         if (!init_done) {
365                 catInit();
366                 init_done = true;
367         }
368
369         char c;
370         while (is.get(c)) {
371                 //cerr << "reading c: " << c << "\n";
372
373                 switch (catcode(c)) {
374                         case catSpace: {
375                                 string s(1, c);
376                                 while (is.get(c) && catcode(c) == catSpace)
377                                         s += c;
378                                 if (catcode(c) != catSpace)
379                                         is.putback(c);
380                                 push_back(Token(s, catSpace));
381                                 break;
382                         }
383
384                         case catNewline: {
385                                 ++lineno_;
386                                 string s(1, getNewline(is, c));
387                                 while (is.get(c) && catcode(c) == catNewline) {
388                                         ++lineno_;
389                                         s += getNewline(is, c);
390                                 }
391                                 if (catcode(c) != catNewline)
392                                         is.putback(c);
393                                 push_back(Token(s, catNewline));
394                                 break;
395                         }
396
397                         case catComment: {
398                                 // We don't treat "%\n" combinations here specially because
399                                 // we want to preserve them in the preamble
400                                 string s;
401                                 while (is.get(c) && catcode(c) != catNewline)
402                                         s += c;
403                                 // handle possible DOS line ending
404                                 if (catcode(c) == catNewline)
405                                         c = getNewline(is, c);
406                                 // Note: The '%' at the beginning and the '\n' at the end
407                                 // of the comment are not stored.
408                                 ++lineno_;
409                                 push_back(Token(s, catComment));
410                                 break;
411                         }
412
413                         case catEscape: {
414                                 is.get(c);
415                                 if (!is) {
416                                         error("unexpected end of input");
417                                 } else {
418                                         string s(1, c);
419                                         if (catcode(c) == catLetter) {
420                                                 // collect letters
421                                                 while (is.get(c) && catcode(c) == catLetter)
422                                                         s += c;
423                                                 if (catcode(c) != catLetter)
424                                                         is.putback(c);
425                                         }
426                                         push_back(Token(s, catEscape));
427                                 }
428                                 break;
429                         }
430
431                         case catIgnore: {
432                                 cerr << "ignoring a char: " << int(c) << "\n";
433                                 break;
434                         }
435
436                         default:
437                                 push_back(Token(c, catcode(c)));
438                 }
439         }
440 }
441
442
443 void Parser::dump() const
444 {
445         cerr << "\nTokens: ";
446         for (unsigned i = 0; i < tokens_.size(); ++i) {
447                 if (i == pos_)
448                         cerr << " <#> ";
449                 cerr << tokens_[i];
450         }
451         cerr << " pos: " << pos_ << "\n";
452 }
453
454
455 void Parser::error(string const & msg)
456 {
457         cerr << "Line ~" << lineno_ << ":  parse error: " << msg << endl;
458         dump();
459         //exit(1);
460 }
461
462
463 string Parser::verbatimOption()
464 {
465         string res;
466         if (next_token().character() == '[') {
467                 Token t = get_token();
468                 for (Token t = get_token(); t.character() != ']' && good(); t = get_token()) {
469                         if (t.cat() == catBegin) {
470                                 putback();
471                                 res += '{' + verbatim_item() + '}';
472                         } else
473                                 res += t.asString();
474                 }
475         }
476         return res;
477 }
478
479
480 string Parser::verbatim_item()
481 {
482         if (!good())
483                 error("stream bad");
484         skip_spaces();
485         if (next_token().cat() == catBegin) {
486                 Token t = get_token(); // skip brace
487                 string res;
488                 for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
489                         if (t.cat() == catBegin) {
490                                 putback();
491                                 res += '{' + verbatim_item() + '}';
492                         }
493                         else
494                                 res += t.asInput();
495                 }
496                 return res;
497         }
498         return get_token().asInput();
499 }
500
501
502 void Parser::reset()
503 {
504         pos_ = 0;
505 }
506
507
508 void Parser::setCatCode(char c, CatCode cat)
509 {
510         theCatcode[(unsigned char)c] = cat;
511 }
512
513
514 CatCode Parser::getCatCode(char c) const
515 {
516         return theCatcode[(unsigned char)c];
517 }
518
519
520 } // namespace lyx