]> git.lyx.org Git - features.git/blob - src/tex2lyx/Parser.cpp
Fix wrong setting of bibinset options if \cite{*} was found.
[features.git] / src / tex2lyx / Parser.cpp
1 /**
2  * \file Parser.cpp
3  * This file is part of LyX, the document processor.
4  * Licence details can be found in the file COPYING.
5  *
6  * \author André Pönitz 
7  *
8  * Full author contact details are available in file CREDITS.
9  */
10
11 #include <config.h>
12
13 #include "Encoding.h"
14 #include "Parser.h"
15 #include "support/textutils.h"
16
17 #include <iostream>
18
19 using namespace std;
20
21 namespace lyx {
22
23 namespace {
24
25 CatCode theCatcode[256];
26
27 void catInit()
28 {
29         static bool init_done = false;
30         if (init_done) 
31                 return;
32         init_done = true;
33
34         fill(theCatcode, theCatcode + 256, catOther);
35         fill(theCatcode + 'a', theCatcode + 'z' + 1, catLetter);
36         fill(theCatcode + 'A', theCatcode + 'Z' + 1, catLetter);
37
38         theCatcode[int('\\')] = catEscape;
39         theCatcode[int('{')]  = catBegin;
40         theCatcode[int('}')]  = catEnd;
41         theCatcode[int('$')]  = catMath;
42         theCatcode[int('&')]  = catAlign;
43         theCatcode[int('\n')] = catNewline;
44         theCatcode[int('#')]  = catParameter;
45         theCatcode[int('^')]  = catSuper;
46         theCatcode[int('_')]  = catSub;
47         theCatcode[0x7f]      = catIgnore;
48         theCatcode[int(' ')]  = catSpace;
49         theCatcode[int('\t')] = catSpace;
50         theCatcode[int('\r')] = catNewline;
51         theCatcode[int('~')]  = catActive;
52         theCatcode[int('%')]  = catComment;
53
54         // This is wrong!
55         theCatcode[int('@')]  = catLetter;
56 }
57
58 /*!
59  * Translate a line ending to '\n'.
60  * \p c must have catcode catNewline, and it must be the last character read
61  * from \p is.
62  */
63 char_type getNewline(idocstream & is, char_type c)
64 {
65         // we have to handle 3 different line endings:
66         // - UNIX (\n)
67         // - MAC  (\r)
68         // - DOS  (\r\n)
69         if (c == '\r') {
70                 // MAC or DOS
71                 char_type wc;
72                 if (is.get(wc) && wc != '\n') {
73                         // MAC
74                         is.putback(wc);
75                 }
76                 return '\n';
77         }
78         // UNIX
79         return c;
80 }
81
82 CatCode catcode(char_type c)
83 {
84         if (c < 256)
85                 return theCatcode[(unsigned char)c];
86         return catOther;
87 }
88
89 }
90
91
92 //
93 // Token
94 //
95
96 ostream & operator<<(ostream & os, Token const & t)
97 {
98         if (t.cat() == catComment)
99                 os << '%' << t.cs() << '\n';
100         else if (t.cat() == catSpace)
101                 os << t.cs();
102         else if (t.cat() == catEscape)
103                 os << '\\' << t.cs() << ' ';
104         else if (t.cat() == catLetter)
105                 os << t.cs();
106         else if (t.cat() == catNewline)
107                 os << "[" << t.cs().size() << "\\n," << t.cat() << "]\n";
108         else
109                 os << '[' << t.cs() << ',' << t.cat() << ']';
110         return os;
111 }
112
113
114 string Token::asInput() const
115 {
116         if (cat_ == catComment)
117                 return '%' + cs_ + '\n';
118         if (cat_ == catEscape)
119                 return '\\' + cs_;
120         return cs_;
121 }
122
123
124 bool Token::isAlnumASCII() const
125 {
126         return cat_ == catLetter ||
127                (cat_ == catOther && cs_.length() == 1 && isDigitASCII(cs_[0]));
128 }
129
130
131 //
132 // Parser
133 //
134
135
136 Parser::Parser(idocstream & is)
137         : lineno_(0), pos_(0), iss_(0), is_(is), encoding_latex_("utf8")
138 {
139 }
140
141
142 Parser::Parser(string const & s)
143         : lineno_(0), pos_(0), 
144           iss_(new idocstringstream(from_utf8(s))), is_(*iss_), 
145           encoding_latex_("utf8")
146 {
147 }
148
149
150 Parser::~Parser()
151 {
152         delete iss_;
153 }
154
155
156 void Parser::setEncoding(std::string const & e)
157 {
158         Encoding const * enc = encodings.fromLaTeXName(e);
159         if (!enc) {
160                 cerr << "Unknown encoding " << e << ". Ignoring." << std::endl;
161                 return;
162         }
163         //cerr << "setting encoding to " << enc->iconvName() << std::endl;
164         is_ << lyx::setEncoding(enc->iconvName());
165         encoding_latex_ = e;
166 }
167
168
169 void Parser::push_back(Token const & t)
170 {
171         tokens_.push_back(t);
172 }
173
174
175 // We return a copy here because the tokens_ vector may get reallocated
176 Token const Parser::prev_token() const
177 {
178         static const Token dummy;
179         return pos_ > 1 ? tokens_[pos_ - 2] : dummy;
180 }
181
182
183 // We return a copy here because the tokens_ vector may get reallocated
184 Token const Parser::curr_token() const
185 {
186         static const Token dummy;
187         return pos_ > 0 ? tokens_[pos_ - 1] : dummy;
188 }
189
190
191 // We return a copy here because the tokens_ vector may get reallocated
192 Token const Parser::next_token()
193 {
194         static const Token dummy;
195         return good() ? tokens_[pos_] : dummy;
196 }
197
198
199 // We return a copy here because the tokens_ vector may get reallocated
200 Token const Parser::get_token()
201 {
202         static const Token dummy;
203         //cerr << "looking at token " << tokens_[pos_] << " pos: " << pos_ << '\n';
204         return good() ? tokens_[pos_++] : dummy;
205 }
206
207
208 bool Parser::isParagraph()
209 {
210         // A new paragraph in TeX ist started
211         // - either by a newline, following any amount of whitespace
212         //   characters (including zero), and another newline
213         // - or the token \par
214         if (curr_token().cat() == catNewline &&
215             (curr_token().cs().size() > 1 ||
216              (next_token().cat() == catSpace &&
217               pos_ < tokens_.size() - 1 &&
218               tokens_[pos_ + 1].cat() == catNewline)))
219                 return true;
220         if (curr_token().cat() == catEscape && curr_token().cs() == "par")
221                 return true;
222         return false;
223 }
224
225
226 bool Parser::skip_spaces(bool skip_comments)
227 {
228         // We just silently return if we have no more tokens.
229         // skip_spaces() should be callable at any time,
230         // the caller must check p::good() anyway.
231         bool skipped = false;
232         while (good()) {
233                 get_token();
234                 if (isParagraph()) {
235                         putback();
236                         break;
237                 }
238                 if (curr_token().cat() == catSpace ||
239                     curr_token().cat() == catNewline) {
240                         skipped = true;
241                         continue;
242                 }
243                 if ((curr_token().cat() == catComment && curr_token().cs().empty()))
244                         continue;
245                 if (skip_comments && curr_token().cat() == catComment)
246                         cerr << "  Ignoring comment: " << curr_token().asInput();
247                 else {
248                         putback();
249                         break;
250                 }
251         }
252         return skipped;
253 }
254
255
256 void Parser::unskip_spaces(bool skip_comments)
257 {
258         while (pos_ > 0) {
259                 if ( curr_token().cat() == catSpace ||
260                     (curr_token().cat() == catNewline && curr_token().cs().size() == 1))
261                         putback();
262                 else if (skip_comments && curr_token().cat() == catComment) {
263                         // TODO: Get rid of this
264                         cerr << "Unignoring comment: " << curr_token().asInput();
265                         putback();
266                 }
267                 else
268                         break;
269         }
270 }
271
272
273 void Parser::putback()
274 {
275         --pos_;
276 }
277
278
279 void Parser::pushPosition()
280 {
281         positions_.push_back(pos_);
282 }
283
284
285 void Parser::popPosition()
286 {
287         pos_ = positions_.back();
288         positions_.pop_back();
289 }
290
291
292 bool Parser::good()
293 {
294         if (pos_ < tokens_.size())
295                 return true;
296         tokenize_one();
297         return pos_ < tokens_.size();
298 }
299
300
301 char Parser::getChar()
302 {
303         if (!good())
304                 error("The input stream is not well...");
305         return get_token().character();
306 }
307
308
309 bool Parser::hasOpt()
310 {
311         // An optional argument can occur in any of the following forms:
312         // - \foo[bar]
313         // - \foo [bar]
314         // - \foo
315         //   [bar]
316         // - \foo %comment
317         //   [bar]
318
319         // remember current position
320         unsigned int oldpos = pos_;
321         // skip spaces and comments
322         while (good()) {
323                 get_token();
324                 if (isParagraph()) {
325                         putback();
326                         break;
327                 }
328                 if (curr_token().cat() == catSpace ||
329                     curr_token().cat() == catNewline ||
330                     curr_token().cat() == catComment)
331                         continue;
332                 putback();
333                 break;
334         }
335         bool const retval = (next_token().asInput() == "[");
336         pos_ = oldpos;
337         return retval;
338 }
339
340
341 Parser::Arg Parser::getFullArg(char left, char right)
342 {
343         skip_spaces(true);
344
345         // This is needed if a partial file ends with a command without arguments,
346         // e. g. \medskip
347         if (! good())
348                 return make_pair(false, string());
349
350         string result;
351         char c = getChar();
352
353         if (c != left) {
354                 putback();
355                 return make_pair(false, string());
356         } else
357                 while ((c = getChar()) != right && good()) {
358                         // Ignore comments
359                         if (curr_token().cat() == catComment) {
360                                 if (!curr_token().cs().empty())
361                                         cerr << "Ignoring comment: " << curr_token().asInput();
362                         }
363                         else
364                                 result += curr_token().asInput();
365                 }
366
367         return make_pair(true, result);
368 }
369
370
371 string Parser::getArg(char left, char right)
372 {
373         return getFullArg(left, right).second;
374 }
375
376
377 string Parser::getFullOpt()
378 {
379         Arg arg = getFullArg('[', ']');
380         if (arg.first)
381                 return '[' + arg.second + ']';
382         return string();
383 }
384
385
386 string Parser::getOpt(bool keepws)
387 {
388         string const res = getArg('[', ']');
389         if (res.empty()) {
390                 if (keepws)
391                         unskip_spaces(true);
392                 return string();
393         }
394         return '[' + res + ']';
395 }
396
397
398 string Parser::getOptContent()
399 // the same as getOpt but without the brackets
400 {
401         string const res = getArg('[', ']');
402         return res.empty() ? string() : res;
403 }
404
405
406 string Parser::getFullParentheseArg()
407 {
408         Arg arg = getFullArg('(', ')');
409         if (arg.first)
410                 return '(' + arg.second + ')';
411         return string();
412 }
413
414
415 string const Parser::verbatimEnvironment(string const & name)
416 {
417         if (!good())
418                 return string();
419
420         ostringstream os;
421         for (Token t = get_token(); good(); t = get_token()) {
422                 if (t.cat() == catBegin) {
423                         putback();
424                         os << '{' << verbatim_item() << '}';
425                 } else if (t.asInput() == "\\begin") {
426                         string const env = getArg('{', '}');
427                         os << "\\begin{" << env << '}'
428                            << verbatimEnvironment(env)
429                            << "\\end{" << env << '}';
430                 } else if (t.asInput() == "\\end") {
431                         string const end = getArg('{', '}');
432                         if (end != name)
433                                 cerr << "\\end{" << end
434                                      << "} does not match \\begin{" << name
435                                      << "}." << endl;
436                         return os.str();
437                 } else
438                         os << t.asInput();
439         }
440         cerr << "unexpected end of input" << endl;
441         return os.str();
442 }
443
444
445 void Parser::tokenize_one()
446 {
447         catInit();
448         char_type c;
449         if (!is_.get(c)) 
450                 return;
451
452         switch (catcode(c)) {
453         case catSpace: {
454                 docstring s(1, c);
455                 while (is_.get(c) && catcode(c) == catSpace)
456                         s += c;
457                 if (catcode(c) != catSpace)
458                         is_.putback(c);
459                 push_back(Token(s, catSpace));
460                 break;
461         }
462                 
463         case catNewline: {
464                 ++lineno_;
465                 docstring s(1, getNewline(is_, c));
466                 while (is_.get(c) && catcode(c) == catNewline) {
467                         ++lineno_;
468                         s += getNewline(is_, c);
469                 }
470                 if (catcode(c) != catNewline)
471                         is_.putback(c);
472                 push_back(Token(s, catNewline));
473                 break;
474         }
475                 
476         case catComment: {
477                 // We don't treat "%\n" combinations here specially because
478                 // we want to preserve them in the preamble
479                 docstring s;
480                 while (is_.get(c) && catcode(c) != catNewline)
481                         s += c;
482                 // handle possible DOS line ending
483                 if (catcode(c) == catNewline)
484                         c = getNewline(is_, c);
485                 // Note: The '%' at the beginning and the '\n' at the end
486                 // of the comment are not stored.
487                 ++lineno_;
488                 push_back(Token(s, catComment));
489                 break;
490         }
491                 
492         case catEscape: {
493                 is_.get(c);
494                 if (!is_) {
495                         error("unexpected end of input");
496                 } else {
497                         docstring s(1, c);
498                         if (catcode(c) == catLetter) {
499                                 // collect letters
500                                 while (is_.get(c) && catcode(c) == catLetter)
501                                         s += c;
502                                 if (catcode(c) != catLetter)
503                                         is_.putback(c);
504                         }
505                         push_back(Token(s, catEscape));
506                 }
507                 break;
508         }
509                 
510         case catIgnore: {
511                 cerr << "ignoring a char: " << c << "\n";
512                 break;
513         }
514                 
515         default:
516                 push_back(Token(docstring(1, c), catcode(c)));
517         }
518         //cerr << tokens_.back();
519 }
520
521
522 void Parser::dump() const
523 {
524         cerr << "\nTokens: ";
525         for (unsigned i = 0; i < tokens_.size(); ++i) {
526                 if (i == pos_)
527                         cerr << " <#> ";
528                 cerr << tokens_[i];
529         }
530         cerr << " pos: " << pos_ << "\n";
531 }
532
533
534 void Parser::error(string const & msg)
535 {
536         cerr << "Line ~" << lineno_ << ":  parse error: " << msg << endl;
537         dump();
538         //exit(1);
539 }
540
541
542 string Parser::verbatimOption()
543 {
544         string res;
545         if (next_token().character() == '[') {
546                 Token t = get_token();
547                 for (t = get_token(); t.character() != ']' && good(); t = get_token()) {
548                         if (t.cat() == catBegin) {
549                                 putback();
550                                 res += '{' + verbatim_item() + '}';
551                         } else
552                                 res += t.cs();
553                 }
554         }
555         return res;
556 }
557
558
559 string Parser::verbatim_item()
560 {
561         if (!good())
562                 error("stream bad");
563         skip_spaces();
564         if (next_token().cat() == catBegin) {
565                 Token t = get_token(); // skip brace
566                 string res;
567                 for (Token t = get_token(); t.cat() != catEnd && good(); t = get_token()) {
568                         if (t.cat() == catBegin) {
569                                 putback();
570                                 res += '{' + verbatim_item() + '}';
571                         }
572                         else
573                                 res += t.asInput();
574                 }
575                 return res;
576         }
577         return get_token().asInput();
578 }
579
580
581 void Parser::reset()
582 {
583         pos_ = 0;
584 }
585
586
587 void Parser::setCatCode(char c, CatCode cat)
588 {
589         theCatcode[(unsigned char)c] = cat;
590 }
591
592
593 CatCode Parser::getCatCode(char c) const
594 {
595         return theCatcode[(unsigned char)c];
596 }
597
598
599 } // namespace lyx