1 # This file is part of lyx2lyx
2 # -*- coding: utf-8 -*-
3 # Copyright (C) 2006 José Matos <jamatos@lyx.org>
4 # Copyright (C) 2004-2006 Georg Baum <Georg.Baum@post.rwth-aachen.de>
6 # This program is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU General Public License
8 # as published by the Free Software Foundation; either version 2
9 # of the License, or (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program; if not, write to the Free Software
18 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20 """ Convert files to the file format generated by lyx 1.5"""
25 from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
26 from LyX import get_encoding
29 ####################################################################
30 # Private helper functions
def find_end_of_inset(lines, i):
    """Find end of inset, where lines[i] is included.

    Thin wrapper around parser_tools.find_end_of using the inset
    begin/end tokens; returns the index of the matching ``\\end_inset``
    line (or -1 if none is found — per parser_tools convention, TODO
    confirm against parser_tools.find_end_of).
    """
    # Defect fixed: this listing had the original line numbers fused
    # onto each line and indentation stripped; restored valid Python.
    return find_end_of(lines, i, "\\begin_inset", "\\end_inset")
def find_end_of_layout(lines, i):
    """Find end of layout, where lines[i] is included.

    Thin wrapper around parser_tools.find_end_of using the layout
    begin/end tokens; returns the index of the matching
    ``\\end_layout`` line (or -1 if none is found — per parser_tools
    convention, TODO confirm against parser_tools.find_end_of).
    """
    # Defect fixed: restored valid Python from the mangled numbered
    # listing (embedded line numbers, lost indentation).
    return find_end_of(lines, i, "\\begin_layout", "\\end_layout")
40 # End of helper functions
41 ####################################################################
45 # Notes: Framed/Shaded
# revert_framed: rewrites "\begin_inset Note Framed" and
# "\begin_inset Note Shaded" insets back to plain "\begin_inset Note".
# NOTE(review): this chunk is a numbered listing with original line
# numbers fused onto each line, indentation stripped, and lines dropped
# (embedded numbering skips 50-51 and 53-55: loop scaffolding such as
# the while loop and the i == -1 guard is presumably missing — restore
# from the upstream lyx2lyx source before running).
48 def revert_framed(document):
49 "Revert framed notes. "
52 i = find_tokens(document.body, ["\\begin_inset Note Framed", "\\begin_inset Note Shaded"], i)
56 document.body[i] = "\\begin_inset Note"
# Font-scheme translation tables: map a LyX 1.4 \fontscheme name to the
# LyX 1.5 \font_roman / \font_sans header value (used by
# convert_font_settings / revert_font_settings below).
# NOTE(review): the closing entry/brace of each dict (original lines 68
# and 73) is missing from this listing — both literals are truncated.
64 roman_fonts = {'default' : 'default', 'ae' : 'ae',
65 'times' : 'times', 'palatino' : 'palatino',
66 'helvet' : 'default', 'avant' : 'default',
67 'newcent' : 'newcent', 'bookman' : 'bookman',
69 sans_fonts = {'default' : 'default', 'ae' : 'default',
70 'times' : 'default', 'palatino' : 'default',
71 'helvet' : 'helvet', 'avant' : 'avant',
72 'newcent' : 'default', 'bookman' : 'default',
# Map a LyX 1.4 \fontscheme name to the LyX 1.5 \font_typewriter value.
# Only 'pslatex' implies a non-default typewriter font (courier).
# Defect fixed: restored valid Python from the mangled numbered listing
# (embedded line numbers, lost indentation); entries unchanged.
typewriter_fonts = {'default' : 'default', 'ae' : 'default',
                    'times' : 'default', 'palatino' : 'default',
                    'helvet' : 'default', 'avant' : 'default',
                    'newcent' : 'default', 'bookman' : 'default',
                    'pslatex' : 'courier'}
# convert_font_settings: replace the single 1.4 \fontscheme header line
# with the 1.5 \font_* header lines, looked up in the tables above;
# falls back to 'default' for an empty or unknown scheme.
# NOTE(review): lines are missing from this listing (embedded numbering
# skips 82, 84, 86, 88 and 98-99) — guard/branch scaffolding and two of
# the inserted header lines are absent; not runnable as-is.
80 def convert_font_settings(document):
81 " Convert font settings. "
83 i = find_token_exact(document.header, "\\fontscheme", i)
85 document.warning("Malformed LyX document: Missing `\\fontscheme'.")
87 font_scheme = get_value(document.header, "\\fontscheme", i, i + 1)
89 document.warning("Malformed LyX document: Empty `\\fontscheme'.")
90 font_scheme = 'default'
91 if not font_scheme in roman_fonts.keys():
92 document.warning("Malformed LyX document: Unknown `\\fontscheme' `%s'." % font_scheme)
93 font_scheme = 'default'
# Replace the \fontscheme line in place with the expanded 1.5 settings.
94 document.header[i:i+1] = ['\\font_roman %s' % roman_fonts[font_scheme],
95 '\\font_sans %s' % sans_fonts[font_scheme],
96 '\\font_typewriter %s' % typewriter_fonts[font_scheme],
97 '\\font_default_family default',
100 '\\font_sf_scale 100',
101 '\\font_tt_scale 100']
# revert_font_settings: collapse the 1.5 \font_* header lines back into
# a single 1.4 \fontscheme line (when the roman/sans/typewriter triple
# matches one of the known schemes) or \fontscheme default plus explicit
# LaTeX preamble commands otherwise. Settings with no 1.4 equivalent
# (\font_sc, \font_sf_scale, \font_tt_scale != defaults) only warn.
# NOTE(review): many lines are missing from this listing (embedded
# numbering skips e.g. 106-107, 112, 114-115, 117, 121, 124, 128,
# 130-131, 137, 139-140, 144, 147, 153, 156, 170, 177) — the i == -1
# guards, 'continue' statements and the else branch header around line
# 170 are absent; not runnable as-is.
104 def revert_font_settings(document):
105 " Revert font settings. "
108 fonts = {'roman' : 'default', 'sans' : 'default', 'typewriter' : 'default'}
109 for family in 'roman', 'sans', 'typewriter':
110 name = '\\font_%s' % family
111 i = find_token_exact(document.header, name, i)
113 document.warning("Malformed LyX document: Missing `%s'." % name)
116 if (insert_line < 0):
118 fonts[family] = get_value(document.header, name, i, i + 1)
119 del document.header[i]
120 i = find_token_exact(document.header, '\\font_default_family', i)
122 document.warning("Malformed LyX document: Missing `\\font_default_family'.")
123 font_default_family = 'default'
125 font_default_family = get_value(document.header, "\\font_default_family", i, i + 1)
126 del document.header[i]
127 i = find_token_exact(document.header, '\\font_sc', i)
129 document.warning("Malformed LyX document: Missing `\\font_sc'.")
132 font_sc = get_value(document.header, '\\font_sc', i, i + 1)
133 del document.header[i]
134 if font_sc != 'false':
135 document.warning("Conversion of '\\font_sc' not yet implemented.")
136 i = find_token_exact(document.header, '\\font_osf', i)
138 document.warning("Malformed LyX document: Missing `\\font_osf'.")
141 font_osf = get_value(document.header, '\\font_osf', i, i + 1)
142 del document.header[i]
143 i = find_token_exact(document.header, '\\font_sf_scale', i)
145 document.warning("Malformed LyX document: Missing `\\font_sf_scale'.")
146 font_sf_scale = '100'
148 font_sf_scale = get_value(document.header, '\\font_sf_scale', i, i + 1)
149 del document.header[i]
150 if font_sf_scale != '100':
151 document.warning("Conversion of '\\font_sf_scale' not yet implemented.")
152 i = find_token_exact(document.header, '\\font_tt_scale', i)
154 document.warning("Malformed LyX document: Missing `\\font_tt_scale'.")
155 font_tt_scale = '100'
157 font_tt_scale = get_value(document.header, '\\font_tt_scale', i, i + 1)
158 del document.header[i]
# Try to find a 1.4 scheme that produces exactly this font triple.
161 for font_scheme in roman_fonts.keys():
162 if (roman_fonts[font_scheme] == fonts['roman'] and
163 sans_fonts[font_scheme] == fonts['sans'] and
164 typewriter_fonts[font_scheme] == fonts['typewriter']):
165 document.header.insert(insert_line, '\\fontscheme %s' % font_scheme)
166 if font_default_family != 'default':
167 document.preamble.append('\\renewcommand{\\familydefault}{\\%s}' % font_default_family)
168 if font_osf == 'true':
169 document.warning("Ignoring `\\font_osf = true'")
# No matching scheme: fall back to 'default' and emit preamble code.
171 font_scheme = 'default'
172 document.header.insert(insert_line, '\\fontscheme %s' % font_scheme)
173 if fonts['roman'] == 'cmr':
174 document.preamble.append('\\renewcommand{\\rmdefault}{cmr}')
175 if font_osf == 'true':
176 document.preamble.append('\\usepackage{eco}')
178 for font in 'lmodern', 'charter', 'utopia', 'beraserif', 'ccfonts', 'chancery':
179 if fonts['roman'] == font:
180 document.preamble.append('\\usepackage{%s}' % font)
181 for font in 'cmss', 'lmss', 'cmbr':
182 if fonts['sans'] == font:
183 document.preamble.append('\\renewcommand{\\sfdefault}{%s}' % font)
184 for font in 'berasans':
185 if fonts['sans'] == font:
186 document.preamble.append('\\usepackage{%s}' % font)
187 for font in 'cmtt', 'lmtt', 'cmtl':
188 if fonts['typewriter'] == font:
189 document.preamble.append('\\renewcommand{\\ttdefault}{%s}' % font)
190 for font in 'courier', 'beramono', 'luximono':
191 if fonts['typewriter'] == font:
192 document.preamble.append('\\usepackage{%s}' % font)
193 if font_default_family != 'default':
194 document.preamble.append('\\renewcommand{\\familydefault}{\\%s}' % font_default_family)
195 if font_osf == 'true':
196 document.warning("Ignoring `\\font_osf = true'")
# revert_booktabs: strip the booktabs="true" attribute from tabular
# <features> lines and remove the topspace/bottomspace/interlinespace
# row attributes, since the older format cannot represent them.
# NOTE(review): lines are missing from this listing (embedded numbering
# skips 205-206, 208-209, 211, 213, 223-225) — the while loop, the
# i == -1 guards and the loop increment are absent; not runnable as-is.
199 def revert_booktabs(document):
200 " We remove the booktabs flag or everything else will become a mess. "
201 re_row = re.compile(r'^<row.*space="[^"]+".*>$')
202 re_tspace = re.compile(r'\s+topspace="[^"]+"')
203 re_bspace = re.compile(r'\s+bottomspace="[^"]+"')
204 re_ispace = re.compile(r'\s+interlinespace="[^"]+"')
207 i = find_token(document.body, "\\begin_inset Tabular", i)
210 j = find_end_of_inset(document.body, i + 1)
212 document.warning("Malformed LyX document: Could not find end of tabular.")
# Scan all lines of this tabular inset.
214 for k in range(i, j):
215 if re.search('^<features.* booktabs="true".*>$', document.body[k]):
216 document.warning("Converting 'booktabs' table to normal table.")
217 document.body[k] = document.body[k].replace(' booktabs="true"', '')
218 if re.search(re_row, document.body[k]):
219 document.warning("Removing extra row space.")
220 document.body[k] = re_tspace.sub('', document.body[k])
221 document.body[k] = re_bspace.sub('', document.body[k])
222 document.body[k] = re_ispace.sub('', document.body[k])
# convert_multiencoding: re-encode body lines of pre-249 documents that
# mix per-language 8-bit encodings, tracking the active encoding with a
# stack that is pushed/popped on \begin_layout / \end_layout and updated
# on \lang changes (default encoding per language comes from
# lyx2lyx_lang.lang[language][3]).
# NOTE(review): lines are missing from this listing (embedded numbering
# skips 232-234, 238, 241, 247, 252, 263, 271) — the end of the
# docstring, a result guard and the if forward / else split between the
# two encode/decode branches are absent; not runnable as-is.
226 def convert_multiencoding(document, forward):
227 """ Fix files with multiple encodings.
228 Files with an inputencoding of "auto" or "default" and multiple languages
229 where at least two languages have different default encodings are encoded
230 in multiple encodings for file formats < 249. These files are incorrectly
231 read and written (as if the whole file was in the encoding of the main
235 - converts from fake unicode values to true unicode if forward is true, and
236 - converts from true unicode values to fake unicode if forward is false.
237 document.encoding must be set to the old value (format 248) in both cases.
239 We do this here and not in LyX.py because it is far easier to do the
240 necessary parsing in modern formats than in ancient ones.
242 encoding_stack = [document.encoding]
243 lang_re = re.compile(r"^\\lang\s(\S+)")
244 if document.inputencoding == "auto" or document.inputencoding == "default":
245 for i in range(len(document.body)):
246 result = lang_re.match(document.body[i])
248 language = result.group(1)
249 if language == "default":
250 document.warning("Resetting encoding from %s to %s." % (encoding_stack[-1], document.encoding))
251 encoding_stack[-1] = document.encoding
253 from lyx2lyx_lang import lang
254 document.warning("Setting encoding from %s to %s." % (encoding_stack[-1], lang[language][3]))
255 encoding_stack[-1] = lang[language][3]
256 elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
257 document.warning("Adding nested encoding %s." % encoding_stack[-1])
258 encoding_stack.append(encoding_stack[-1])
259 elif find_token(document.body, "\\end_layout", i, i + 1) == i:
260 document.warning("Removing nested encoding %s." % encoding_stack[-1])
261 del encoding_stack[-1]
262 if encoding_stack[-1] != document.encoding:
264 # This line has been incorrectly interpreted as if it was
265 # encoded in 'encoding'.
266 # Convert back to the 8bit string that was in the file.
267 orig = document.body[i].encode(document.encoding)
268 # Convert the 8bit string that was in the file to unicode
269 # with the correct encoding.
270 document.body[i] = orig.decode(encoding_stack[-1])
272 # Convert unicode to the 8bit string that will be written
273 # to the file with the correct encoding.
274 orig = document.body[i].encode(encoding_stack[-1])
275 # Convert the 8bit string that will be written to the
276 # file to fake unicode with the encoding that will later
277 # be used when writing to the file.
278 document.body[i] = orig.decode(document.encoding)
def convert_utf8(document):
    """Set document encoding to UTF-8.

    convert_multiencoding must run first, while document.encoding still
    holds the old (format 248) value it needs; only then is the new
    encoding recorded on the document.
    """
    # Defect fixed: restored valid Python from the mangled numbered
    # listing (embedded line numbers, lost indentation).
    convert_multiencoding(document, True)
    document.encoding = "utf8"
# revert_utf8: set \inputencoding back to "auto" when it is missing or
# "utf8", then recompute document.encoding for format 248 and fold the
# body back into the per-language 8-bit encodings.
# NOTE(review): one line is missing from this listing (embedded
# numbering skips 290, presumably the "if i == -1:" guard before the
# append); not runnable as-is.
287 def revert_utf8(document):
288 " Set document encoding to the value corresponding to inputencoding. "
289 i = find_token(document.header, "\\inputencoding", 0)
291 document.header.append("\\inputencoding auto")
292 elif get_value(document.header, "\\inputencoding", i) == "utf8":
293 document.header[i] = "\\inputencoding auto"
294 document.inputencoding = get_value(document.header, "\\inputencoding", 0)
295 document.encoding = get_encoding(document.language, document.inputencoding, 248)
296 convert_multiencoding(document, False)
# revert_cs_label: drop the status line that follows 'show_label'
# inside CharStyle insets.
# NOTE(review): lines are missing from this listing (embedded numbering
# skips 301-302, 304-305, 308-309, 311-312, 315+) — the while loop,
# i == -1 guard, inner scan loop and the deletion itself are absent;
# not runnable as-is.
299 def revert_cs_label(document):
300 " Remove status flag of charstyle label. "
303 i = find_token(document.body, "\\begin_inset CharStyle", i)
306 # Search for a line starting 'show_label'
307 # If it is not there, break with a warning message
310 if (document.body[i][:10] == "show_label"):
313 elif (document.body[i][:13] == "\\begin_layout"):
314 document.warning("Malformed LyX document: Missing 'show_label'.")
# convert_bibitem: turn a raw "\bibitem [option]{argument}" body line
# into a "\begin_inset LatexCommand bibitem" inset with quoted
# label/key parameters (quotes in values are escaped as \").
# NOTE(review): lines are missing from this listing (embedded numbering
# skips 322, 324-326, 328-332, 334, 336-337, 339-340, 345, 348,
# 351-352) — most of the docstring, the while loop, guards and the
# "if option:" branch header are absent; not runnable as-is.
321 def convert_bibitem(document):
323 \bibitem [option]{argument}
327 \begin_inset LatexCommand bibitem
333 This must be called after convert_commandparams.
335 regex = re.compile(r'\S+\s*(\[[^\[\{]*\])?(\{[^}]*\})')
338 i = find_token(document.body, "\\bibitem", i)
341 match = re.match(regex, document.body[i])
342 option = match.group(1)
343 argument = match.group(2)
344 lines = ['\\begin_inset LatexCommand bibitem']
# option/argument are sliced [1:-1] to drop the surrounding []/{}.
346 lines.append('label "%s"' % option[1:-1].replace('"', '\\"'))
347 lines.append('key "%s"' % argument[1:-1].replace('"', '\\"'))
349 lines.append('\\end_inset')
350 document.body[i:i+1] = lines
# Parameter names used when (de)serializing LatexCommand insets:
# command : [option1, option2, argument]. An empty string means the
# command has no such slot (see convert_commandparams /
# revert_commandparams below).
# Defect fixed: restored valid Python from the mangled numbered listing
# (embedded line numbers, lost indentation); entries unchanged.
commandparams_info = {
    # command : [option1, option2, argument]
    "bibitem" : ["label", "", "key"],
    "bibtex" : ["options", "btprint", "bibfiles"],
    "cite"        : ["after", "before", "key"],
    "citet"       : ["after", "before", "key"],
    "citep"       : ["after", "before", "key"],
    "citealt"     : ["after", "before", "key"],
    "citealp"     : ["after", "before", "key"],
    "citeauthor"  : ["after", "before", "key"],
    "citeyear"    : ["after", "before", "key"],
    "citeyearpar" : ["after", "before", "key"],
    "citet*"      : ["after", "before", "key"],
    "citep*"      : ["after", "before", "key"],
    "citealt*"    : ["after", "before", "key"],
    "citealp*"    : ["after", "before", "key"],
    "citeauthor*" : ["after", "before", "key"],
    "Citet"       : ["after", "before", "key"],
    "Citep"       : ["after", "before", "key"],
    "Citealt"     : ["after", "before", "key"],
    "Citealp"     : ["after", "before", "key"],
    "Citeauthor"  : ["after", "before", "key"],
    "Citet*"      : ["after", "before", "key"],
    "Citep*"      : ["after", "before", "key"],
    "Citealt*"    : ["after", "before", "key"],
    "Citealp*"    : ["after", "before", "key"],
    "Citeauthor*" : ["after", "before", "key"],
    "citefield"   : ["after", "before", "key"],
    "citetitle"   : ["after", "before", "key"],
    "cite*"       : ["after", "before", "key"],
    "hfill" : ["", "", ""],
    "index" : ["", "", "name"],
    "printindex" : ["", "", "name"],
    "label" : ["", "", "name"],
    "eqref"     : ["name", "", "reference"],
    "pageref"   : ["name", "", "reference"],
    "prettyref" : ["name", "", "reference"],
    "ref"       : ["name", "", "reference"],
    "vpageref"  : ["name", "", "reference"],
    "vref"      : ["name", "", "reference"],
    "tableofcontents" : ["", "", "type"],
    "htmlurl" : ["name", "", "target"],
    "url"     : ["name", "", "target"]}
# convert_commandparams: parse the inline form
# "\begin_inset LatexCommand \cmdname[opt1][opt2]{arg}" with a
# character-by-character state machine (CMDNAME / OPTION / SECOPTION /
# CONTENT, with nestdepth tracking [] and {} nesting) and rewrite it as
# named key "value" parameter lines, using commandparams_info above.
# NOTE(review): many lines are missing from this listing (embedded
# numbering skips e.g. 400-401, 403-412, 414, 417-419, 421-422, 424,
# 426-428, 430-434, 436-438, 442, 446-448, 455, 457, 459, 461-464, 466,
# 469, 471-472, 474-475, 478, 481, 483, 486, 488, 491, 494+) — the
# docstring body, loop/guard scaffolding, the accumulator assignments
# of the state machine and several state transitions are absent; not
# runnable as-is.
399 def convert_commandparams(document):
402 \begin_inset LatexCommand \cmdname[opt1][opt2]{arg}
407 \begin_inset LatexCommand cmdname
413 name1, name2 and name3 can be different for each command.
415 # \begin_inset LatexCommand bibitem was not the official version (see
416 # convert_bibitem()), but could be read in, so we convert it here, too.
420 i = find_token(document.body, "\\begin_inset LatexCommand", i)
423 command = document.body[i][26:].strip()
425 document.warning("Malformed LyX document: Missing LatexCommand name.")
429 # The following parser is taken from the original InsetCommandParams::scanCommand
435 # Used to handle things like \command[foo[bar]]{foo{bar}}
439 if ((state == "CMDNAME" and c == ' ') or
440 (state == "CMDNAME" and c == '[') or
441 (state == "CMDNAME" and c == '{')):
443 if ((state == "OPTION" and c == ']') or
444 (state == "SECOPTION" and c == ']') or
445 (state == "CONTENT" and c == '}')):
449 nestdepth = nestdepth - 1
450 if ((state == "OPTION" and c == '[') or
451 (state == "SECOPTION" and c == '[') or
452 (state == "CONTENT" and c == '{')):
453 nestdepth = nestdepth + 1
454 if state == "CMDNAME":
456 elif state == "OPTION":
458 elif state == "SECOPTION":
460 elif state == "CONTENT":
465 elif c == '[' and b != ']':
467 nestdepth = 0 # Just to be sure
468 elif c == '[' and b == ']':
470 nestdepth = 0 # Just to be sure
473 nestdepth = 0 # Just to be sure
476 # Now we have parsed the command, output the parameters
477 lines = ["\\begin_inset LatexCommand %s" % name]
479 if commandparams_info[name][0] == "":
480 document.warning("Ignoring invalid option `%s' of command `%s'." % (option1, name))
482 lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('"', '\\"')))
484 if commandparams_info[name][1] == "":
485 document.warning("Ignoring invalid second option `%s' of command `%s'." % (option2, name))
487 lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('"', '\\"')))
489 if commandparams_info[name][2] == "":
490 document.warning("Ignoring invalid argument `%s' of command `%s'." % (argument, name))
492 lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('"', '\\"')))
493 document.body[i:i+1] = lines
# revert_commandparams: inverse of convert_commandparams — read the
# named 'key "value"' parameter lines of a LatexCommand inset (plus an
# optional preview line) and rebuild the inline
# "\begin_inset LatexCommand \name[opt1][opt2]{arg}" form; bibitem
# becomes a raw \bibitem line instead.
# NOTE(review): lines are missing from this listing (embedded numbering
# skips e.g. 499-500, 502-503, 506-509, 512, 529, 531, 533-535, 537,
# 539-540, 542, 547, 550+) — the while loop, guards, the local-variable
# initialisations (preview_line/option1/option2/argument) and the
# if/elif chain headers selecting the output form are absent; not
# runnable as-is.
497 def revert_commandparams(document):
498 regex = re.compile(r'(\S+)\s+(.+)')
501 i = find_token(document.body, "\\begin_inset LatexCommand", i)
504 name = document.body[i].split()[2]
505 j = find_end_of_inset(document.body, i + 1)
# Collect the parameter lines of this inset.
510 for k in range(i + 1, j):
511 match = re.match(regex, document.body[k])
513 pname = match.group(1)
514 pvalue = match.group(2)
515 if pname == "preview":
516 preview_line = document.body[k]
517 elif (commandparams_info[name][0] != "" and
518 pname == commandparams_info[name][0]):
519 option1 = pvalue.strip('"').replace('\\"', '"')
520 elif (commandparams_info[name][1] != "" and
521 pname == commandparams_info[name][1]):
522 option2 = pvalue.strip('"').replace('\\"', '"')
523 elif (commandparams_info[name][2] != "" and
524 pname == commandparams_info[name][2]):
525 argument = pvalue.strip('"').replace('\\"', '"')
526 elif document.body[k].strip() != "":
527 document.warning("Ignoring unknown contents `%s' in command inset %s." % (document.body[k], name))
528 if name == "bibitem":
530 lines = ["\\bibitem {%s}" % argument]
532 lines = ["\\bibitem [%s]{%s}" % (option1, argument)]
536 lines = ["\\begin_inset LatexCommand \\%s{%s}" % (name, argument)]
538 lines = ["\\begin_inset LatexCommand \\%s[][%s]{%s}" % (name, option2, argument)]
541 lines = ["\\begin_inset LatexCommand \\%s[%s]{%s}" % (name, option1, argument)]
543 lines = ["\\begin_inset LatexCommand \\%s[%s][%s]{%s}" % (name, option1, option2, argument)]
544 if name != "bibitem":
545 if preview_line != "":
546 lines.append(preview_line)
548 lines.append('\\end_inset')
549 document.body[i:j+1] = lines
# revert_nomenclature: replace each LatexCommand nomenclature inset
# with an ERT inset containing the raw \nomenclature[prefix]{symbol}
# {description} command, and make sure the preamble loads nomencl.
# NOTE(review): lines are missing from this listing (embedded numbering
# skips e.g. 556-558, 560-562, 564-567, 570, 583, 585, 588-589,
# 591-598) — the while loop, guards, local initialisations
# (symbol/description/prefix/preview_line/use_nomencl) and the tail of
# the inserted ERT block are absent; not runnable as-is.
553 def revert_nomenclature(document):
554 " Convert nomenclature entry to ERT. "
555 regex = re.compile(r'(\S+)\s+(.+)')
559 i = find_token(document.body, "\\begin_inset LatexCommand nomenclature", i)
563 j = find_end_of_inset(document.body, i + 1)
568 for k in range(i + 1, j):
569 match = re.match(regex, document.body[k])
571 name = match.group(1)
572 value = match.group(2)
573 if name == "preview":
574 preview_line = document.body[k]
575 elif name == "symbol":
576 symbol = value.strip('"').replace('\\"', '"')
577 elif name == "description":
578 description = value.strip('"').replace('\\"', '"')
579 elif name == "prefix":
580 prefix = value.strip('"').replace('\\"', '"')
581 elif document.body[k].strip() != "":
582 document.warning("Ignoring unknown contents `%s' in nomenclature inset." % document.body[k])
584 command = 'nomenclature{%s}{%s}' % (symbol, description)
586 command = 'nomenclature[%s]{%s}{%s}' % (prefix, symbol, description)
587 document.body[i:j+1] = ['\\begin_inset ERT',
590 '\\begin_layout %s' % document.default_layout,
599 if use_nomencl and find_token(document.preamble, '\\usepackage{nomencl}[2005/09/22]', 0) == -1:
600 document.preamble.append('\\usepackage{nomencl}[2005/09/22]')
601 document.preamble.append('\\makenomenclature')
# revert_printnomenclature: replace each LatexCommand printnomenclature
# inset with an ERT inset containing the raw \nomenclature... print
# command (with optional [labelwidth]), and make sure the preamble
# loads nomencl.
# NOTE(review): lines are missing from this listing (embedded numbering
# skips e.g. 607-609, 611-613, 615-616, 619, 628, 630, 633-634,
# 636-643) — the while loop, guards, local initialisations and the tail
# of the inserted ERT block are absent; not runnable as-is.
604 def revert_printnomenclature(document):
605 " Convert printnomenclature to ERT. "
606 regex = re.compile(r'(\S+)\s+(.+)')
610 i = find_token(document.body, "\\begin_inset LatexCommand printnomenclature", i)
614 j = find_end_of_inset(document.body, i + 1)
617 for k in range(i + 1, j):
618 match = re.match(regex, document.body[k])
620 name = match.group(1)
621 value = match.group(2)
622 if name == "preview":
623 preview_line = document.body[k]
624 elif name == "labelwidth":
625 labelwidth = value.strip('"').replace('\\"', '"')
626 elif document.body[k].strip() != "":
627 document.warning("Ignoring unknown contents `%s' in printnomenclature inset." % document.body[k])
629 command = 'nomenclature{}'
631 command = 'nomenclature[%s]' % labelwidth
632 document.body[i:j+1] = ['\\begin_inset ERT',
635 '\\begin_layout %s' % document.default_layout,
644 if use_nomencl and find_token(document.preamble, '\\usepackage{nomencl}[2005/09/22]', 0) == -1:
645 document.preamble.append('\\usepackage{nomencl}[2005/09/22]')
646 document.preamble.append('\\makenomenclature')
# convert_esint: insert a "\use_esint 0" line into the header, placed
# before \cite_engine.
# NOTE(review): lines are missing from this listing (embedded numbering
# skips 652 and 654, presumably the "if i == -1:" guard and its
# return); not runnable as-is.
649 def convert_esint(document):
650 " Add \\use_esint setting to header. "
651 i = find_token(document.header, "\\cite_engine", 0)
653 document.warning("Malformed LyX document: Missing `\\cite_engine'.")
655 # 0 is off, 1 is auto, 2 is on.
656 document.header.insert(i, '\\use_esint 0')
# revert_esint: remove the \use_esint header line; judging from line
# 669, the esint package is added to the preamble for some use_esint
# value (the condition at original line 668 is missing from this copy).
# NOTE(review): lines are missing from this listing (embedded numbering
# skips 662, 664 and 668 — the i == -1 guard/return and the condition
# before the preamble append); not runnable as-is.
659 def revert_esint(document):
660 " Remove \\use_esint setting from header. "
661 i = find_token(document.header, "\\use_esint", 0)
663 document.warning("Malformed LyX document: Missing `\\use_esint'.")
665 use_esint = document.header[i].split()[1]
666 del document.header[i]
667 # 0 is off, 1 is auto, 2 is on.
669 document.preamble.append('\\usepackage{esint}')
# revert_clearpage: replace a \clearpage body line with an ERT inset
# (the inserted list is truncated in this copy).
# NOTE(review): lines are missing from this listing (embedded numbering
# skips 673-675, 677-678, 680-681 and 683+ — the docstring, while loop,
# guard and most of the inserted ERT lines); not runnable as-is.
672 def revert_clearpage(document):
676 i = find_token(document.body, "\\clearpage", i)
679 document.body[i:i+1] = ['\\begin_inset ERT',
682 '\\begin_layout %s' % document.default_layout,
# revert_cleardoublepage: same as revert_clearpage, for the
# \cleardoublepage command.
# NOTE(review): lines are missing from this listing (embedded numbering
# skips 695-696, 698-699, 701-702 and 704+ — while loop, guard and most
# of the inserted ERT lines); not runnable as-is.
693 def revert_cleardoublepage(document):
694 " cleardoublepage -> ERT "
697 i = find_token(document.body, "\\cleardoublepage", i)
700 document.body[i:i+1] = ['\\begin_inset ERT',
703 '\\begin_layout %s' % document.default_layout,
# convert_lyxline: delete a "\size <fontsize>" line when it sits
# exactly two lines before a \lyxline, so old documents keep the same
# line thickness under the new \lyxline definition.
# NOTE(review): lines are missing from this listing (embedded numbering
# skips 723-724 and 731-735 — presumably the loop initialisation and
# the loop tail / break handling); not runnable as-is.
714 def convert_lyxline(document):
715 " remove fontsize commands for \lyxline "
716 # The problematic is: The old \lyxline definition doesn't handle the fontsize
717 # to change the line thickness. The new definiton does this so that imported
718 # \lyxlines would have a different line thickness. The eventual fontsize command
719 # before \lyxline is therefore removed to get the same output.
720 fontsizes = ["tiny", "scriptsize", "footnotesize", "small", "normalsize",
721 "large", "Large", "LARGE", "huge", "Huge"]
722 for n in range(0, len(fontsizes)):
725 while i < len(document.body):
726 i = find_token(document.body, "\\size " + fontsizes[n], i)
727 k = find_token(document.body, "\\lyxline",i)
728 # the corresponding fontsize command is always 2 lines before the \lyxline
729 if (i != -1 and k == i+2):
730 document.body[i:i+1] = []
# revert_encodings: if \inputencoding names one of the encodings that
# only the newer format supports, reset it to "auto" (appending the
# header line if it was missing) and refresh document.inputencoding.
# NOTE(review): lines are missing from this listing (embedded numbering
# skips 742 and 744 — the "if i == -1:" / else split around the append);
# not runnable as-is.
736 def revert_encodings(document):
737 " Set new encodings to auto. "
738 encodings = ["8859-6", "8859-8", "cp437", "cp437de", "cp850", "cp852",
739 "cp855", "cp858", "cp862", "cp865", "cp866", "cp1250",
740 "cp1252", "cp1256", "cp1257", "latin10", "pt254", "tis620-0"]
741 i = find_token(document.header, "\\inputencoding", 0)
743 document.header.append("\\inputencoding auto")
745 inputenc = get_value(document.header, "\\inputencoding", i)
746 if inputenc in encodings:
747 document.header[i] = "\\inputencoding auto"
748 document.inputencoding = get_value(document.header, "\\inputencoding", 0)
# convert_caption: turn a "\begin_layout Caption" paragraph into a
# default-layout paragraph containing a "\begin_inset Caption" inset.
# NOTE(review): lines are missing from this listing (embedded numbering
# skips 753-754, 756-757, 759, 761-762 and 767+ — the while loop,
# i == -1 guard and the loop increment); not runnable as-is.
751 def convert_caption(document):
752 " Convert caption layouts to caption insets. "
755 i = find_token(document.body, "\\begin_layout Caption", i)
758 j = find_end_of_layout(document.body, i)
760 document.warning("Malformed LyX document: Missing `\\end_layout'.")
# Close the inner layout and the inset before the old \end_layout...
763 document.body[j:j] = ["\\end_layout", "", "\\end_inset", "", ""]
# ...and open the wrapper layout plus the Caption inset in its place.
764 document.body[i:i+1] = ["\\begin_layout %s" % document.default_layout,
765 "\\begin_inset Caption", "",
766 "\\begin_layout %s" % document.default_layout]
# revert_caption: inverse of convert_caption — dissolve each
# "\begin_inset Caption" inset back into a "\begin_layout Caption"
# paragraph, either deleting the enclosing begin/end layout lines or
# splitting/restarting the surrounding paragraph when the inset is not
# alone in it.
# NOTE(review): many lines are missing from this listing (embedded
# numbering skips e.g. 773-774, 776-778, 785, 789, 792-793, 796-797,
# 799-800, 803, 805, 807, 809-810, 816, 818, 822-824, 826, 828, 832,
# 834-835, 837, 841-842) — while loop, i == -1 guards, 'continue'
# statements, the else branches and several deletions; not runnable
# as-is.
770 def revert_caption(document):
771 " Convert caption insets to caption layouts. "
772 " This assumes that the text class has a caption style. "
775 i = find_token(document.body, "\\begin_inset Caption", i)
779 # We either need to delete the previous \begin_layout line, or we
780 # need to end the previous layout if this inset is not in the first
781 # position of the paragraph.
782 layout_before = find_token_backwards(document.body, "\\begin_layout", i)
783 if layout_before == -1:
784 document.warning("Malformed LyX document: Missing `\\begin_layout'.")
786 layout_line = document.body[layout_before]
787 del_layout_before = True
788 l = layout_before + 1
790 if document.body[l] != "":
791 del_layout_before = False
794 if del_layout_before:
795 del document.body[layout_before:i]
798 document.body[i:i] = ["\\end_layout", ""]
801 # Find start of layout in the inset and end of inset
802 j = find_token(document.body, "\\begin_layout", i)
804 document.warning("Malformed LyX document: Missing `\\begin_layout'.")
806 k = find_end_of_inset(document.body, i)
808 document.warning("Malformed LyX document: Missing `\\end_inset'.")
811 # We either need to delete the following \end_layout line, or we need
812 # to restart the old layout if this inset is not at the paragraph end.
813 layout_after = find_token(document.body, "\\end_layout", k)
814 if layout_after == -1:
815 document.warning("Malformed LyX document: Missing `\\end_layout'.")
817 del_layout_after = True
819 while l < layout_after:
820 if document.body[l] != "":
821 del_layout_after = False
825 del document.body[k+1:layout_after+1]
827 document.body[k+1:k+1] = [layout_line, ""]
829 # delete \begin_layout and \end_inset and replace \begin_inset with
830 # "\begin_layout Caption". This works because we can only have one
831 # paragraph in the caption inset: The old \end_layout will be recycled.
833 if document.body[k] == "":
836 if document.body[j] == "":
838 document.body[i] = "\\begin_layout Caption"
839 if document.body[i+1] == "":
840 del document.body[i+1]
# Accents of InsetLaTeXAccent: LaTeX accent command letter -> the
# corresponding Unicode combining character (U+0300 block).
# Defect fixed: restored valid Python from the mangled numbered
# listing; the assignment line and closing brace were dropped by the
# extraction and have been reinstated (the name is grounded by the
# accent_map[k] uses in _convert_accent and revert_accent below).
accent_map = {
    "`" : u'\u0300', # grave
    "'" : u'\u0301', # acute
    "^" : u'\u0302', # circumflex
    "~" : u'\u0303', # tilde
    "=" : u'\u0304', # macron
    "u" : u'\u0306', # breve
    "." : u'\u0307', # dot above
    "\"": u'\u0308', # diaresis
    "r" : u'\u030a', # ring above
    "H" : u'\u030b', # double acute
    "v" : u'\u030c', # caron
    "b" : u'\u0320', # minus sign below
    "d" : u'\u0323', # dot below
    "c" : u'\u0327', # cedilla
    "k" : u'\u0328', # ogonek
    "t" : u'\u0361'  # tie. This is special: It spans two characters, but
                     # only one is given as argument, so we don't need to
                     # treat it differently.
}
# special accents of InsetLaTeXAccent without argument: LaTeX command
# letter -> the pre-composed Unicode character.
# Defect fixed: restored valid Python from the mangled numbered
# listing; the closing brace (original line 873) was dropped by the
# extraction and has been reinstated.
special_accent_map = {
    'i' : u'\u0131', # dotless i
    'j' : u'\u0237', # dotless j
    'l' : u'\u0142', # l with stroke
    'L' : u'\u0141'  # L with stroke
}
# special accent arguments of InsetLaTeXAccent: LaTeX argument form ->
# the Unicode character it denotes.
# Defect fixed: restored valid Python from the mangled numbered
# listing; the assignment line and closing brace were dropped by the
# extraction and have been reinstated (the name is grounded by the
# accented_map[k] uses in _convert_accent and revert_accent below).
accented_map = {
    '\\i' : u'\u0131', # dotless i
    '\\j' : u'\u0237'  # dotless j
}
# _convert_accent: turn an (accent, accented_char) pair from an
# InsetLaTeXAccent into the equivalent NFKC-normalized Unicode string,
# using the three maps above; the names 'type' and 'char' used in the
# body are presumably local aliases assigned on the missing lines.
# NOTE(review): lines are missing from this listing (embedded numbering
# skips 884-886, 890, 898, 903, 905 and 907+ — the local assignments,
# the empty-char handling, and the failure return path); not runnable
# as-is.
883 def _convert_accent(accent, accented_char):
887 if type in special_accent_map:
888 return special_accent_map[type]
889 # a missing char is treated as space by LyX
891 elif type == 'q' and char in ['t', 'd', 'l', 'L']:
892 # Special caron, only used with t, d, l and L.
893 # It is not in the map because we convert it to the same unicode
894 # character as the normal caron: \q{} is only defined if babel with
895 # the czech or slovak language is used, and the normal caron
896 # produces the correct output if the T1 font encoding is used.
897 # For the same reason we never convert to \q{} in the other direction.
899 elif char in accented_map:
900 char = accented_map[char]
901 elif (len(char) > 1):
902 # We can only convert accents on a single char
904 a = accent_map.get(type)
906 return unicodedata.normalize("NFKC", "%s%s" % (char, a))
# convert_ertbackslash: append 'ert' text to body[i] as valid ERT,
# turning backslashes into '\backslash ' and newlines into a layout
# break; returns the (possibly advanced) line index.
# NOTE(review): lines are missing from this listing (embedded numbering
# skips 914-916, 918-920, 922-923 and 925+ — the per-character loop,
# its branch headers and the return); not runnable as-is.
910 def convert_ertbackslash(body, i, ert, default_layout):
911 r""" -------------------------------------------------------------------------------------------
912 Convert backslashes and '\n' into valid ERT code, append the converted
913 text to body[i] and return the (maybe incremented) line index i"""
917 body[i] = body[i] + '\\backslash '
921 body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, '']
924 body[i] = body[i] + c
# convert_accent: find InsetLaTeXAccent markup ('\i \"{a}' and the
# variants listed below), convert it to the plain Unicode character via
# _convert_accent, and fall back to emitting the raw command as an ERT
# inset when the accent is unknown.
# NOTE(review): lines are missing from this listing (embedded numbering
# skips e.g. 938-939, 941-942, 947, 954-955, 957, 959-960, 964-965,
# 967-969, 971 and 974+ — the while loop, guards, the branch headers
# around the converted/unconverted split, and parts of the inserted ERT
# block); not runnable as-is. Note the trailing comma on the line
# numbered 956 makes 'contents' a tuple in this copy — confirm against
# upstream before relying on it.
928 def convert_accent(document):
929 # The following forms are supported by LyX:
930 # '\i \"{a}' (standard form, as written by LyX)
931 # '\i \"{}' (standard form, as written by LyX if the accented char is a space)
932 # '\i \"{ }' (also accepted if the accented char is a space)
933 # '\i \" a' (also accepted)
934 # '\i \"' (also accepted)
935 re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$')
936 re_contents = re.compile(r'^([^\s{]+)(.*)$')
937 re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$')
940 i = find_re(document.body, re_wholeinset, i)
943 match = re_wholeinset.match(document.body[i])
944 prefix = match.group(1)
945 contents = match.group(3).strip()
946 match = re_contents.match(contents)
948 # Strip first char (always \)
949 accent = match.group(1)[1:]
950 accented_contents = match.group(2).strip()
951 match = re_accentedcontents.match(accented_contents)
952 accented_char = match.group(1)
953 converted = _convert_accent(accent, accented_char)
956 contents = '%s{%s}' % (accent, accented_char),
958 document.body[i] = '%s%s' % (prefix, converted)
961 document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents)
962 document.body[i] = prefix
963 document.body[i+1:i+1] = ['\\begin_inset ERT',
966 '\\begin_layout %s' % document.default_layout,
970 i = convert_ertbackslash(document.body, i + 7,
972 document.default_layout)
973 document.body[i+1:i+1] = ['\\end_layout',
# revert_accent: inverse of convert_accent — rejoin words split across
# lines, NFKD-decompose the body, replace characters that cannot be
# encoded in the active per-language encoding with InsetLaTeXAccent
# markup (tracking the encoding with a \begin_layout/\end_layout
# stack, as in convert_multiencoding), then NFKC-recompose.
# NOTE(review): many lines are missing from this listing (embedded
# numbering skips e.g. 981, 989, 997, 1004, 1011, 1013, 1015, 1018,
# 1022, 1026, 1030, 1033, 1036, 1039-1040, 1048, 1055, 1057, 1061,
# 1066, 1070, 1077, 1079, 1083) — for-loop headers, 'continue'
# statements, the try/except split around the normalization calls, and
# several else branches; not runnable as-is.
979 def revert_accent(document):
# Build the inverse of the three accent tables defined above.
980 inverse_accent_map = {}
982 inverse_accent_map[accent_map[k]] = k
983 inverse_special_accent_map = {}
984 for k in special_accent_map:
985 inverse_special_accent_map[special_accent_map[k]] = k
986 inverse_accented_map = {}
987 for k in accented_map:
988 inverse_accented_map[accented_map[k]] = k
990 # Since LyX may insert a line break within a word we must combine all
991 # words before unicode normalization.
992 # We do this only if the next line starts with an accent, otherwise we
993 # would create things like '\begin_inset ERTstatus'.
994 numberoflines = len(document.body)
995 for i in range(numberoflines-1):
996 if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
998 if (document.body[i+1][0] in inverse_accent_map):
999 # the last character of this line and the first of the next line
1000 # form probably a surrogate pair.
1001 while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
1002 document.body[i] += document.body[i+1][0]
1003 document.body[i+1] = document.body[i+1][1:]
1005 # Normalize to "Normal form D" (NFD, also known as canonical decomposition).
1006 # This is needed to catch all accented characters.
1007 for i in range(numberoflines):
1008 # Unfortunately we have a mixture of unicode strings and plain strings,
1009 # because we never use u'xxx' for string literals, but 'xxx'.
1010 # Therefore we may have to try two times to normalize the data.
1012 document.body[i] = unicodedata.normalize("NFKD", document.body[i])
# Python-2 era fallback: decode a byte string before normalizing.
1014 document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8'))
1016 # Replace accented characters with InsetLaTeXAccent
1017 # Do not convert characters that can be represented in the chosen
1019 encoding_stack = [get_encoding(document.language, document.inputencoding, 248)]
1020 lang_re = re.compile(r"^\\lang\s(\S+)")
1021 for i in range(len(document.body)):
1023 if document.inputencoding == "auto" or document.inputencoding == "default":
1024 # Track the encoding of the current line
1025 result = lang_re.match(document.body[i])
1027 language = result.group(1)
1028 if language == "default":
1029 encoding_stack[-1] = document.encoding
1031 from lyx2lyx_lang import lang
1032 encoding_stack[-1] = lang[language][3]
1034 elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
1035 encoding_stack.append(encoding_stack[-1])
1037 elif find_token(document.body, "\\end_layout", i, i + 1) == i:
1038 del encoding_stack[-1]
1041 for j in range(len(document.body[i])):
1042 # dotless i and dotless j are both in special_accent_map and can
1043 # occur as an accented character, so we need to test that the
1044 # following character is no accent
1045 if (document.body[i][j] in inverse_special_accent_map and
1046 (j == len(document.body[i]) - 1 or document.body[i][j+1] not in inverse_accent_map)):
1047 accent = document.body[i][j]
# Encoding succeeds -> representable, leave the char alone; the
# except branch below converts it to an inset instead.
1049 dummy = accent.encode(encoding_stack[-1])
1050 except UnicodeEncodeError:
1051 # Insert the rest of the line as new line
1052 if j < len(document.body[i]) - 1:
1053 document.body[i+1:i+1] = document.body[i][j+1:]
1054 # Delete the accented character
1056 document.body[i] = document.body[i][:j-1]
1058 document.body[i] = u''
1059 # Finally add the InsetLaTeXAccent
1060 document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
1062 elif j > 0 and document.body[i][j] in inverse_accent_map:
1063 accented_char = document.body[i][j-1]
1064 if accented_char == ' ':
1065 # Conform to LyX output
1067 elif accented_char in inverse_accented_map:
1068 accented_char = inverse_accented_map[accented_char]
1069 accent = document.body[i][j]
1071 dummy = unicodedata.normalize("NFKC", accented_char + accent).encode(encoding_stack[-1])
1072 except UnicodeEncodeError:
1073 # Insert the rest of the line as new line
1074 if j < len(document.body[i]) - 1:
1075 document.body[i+1:i+1] = document.body[i][j+1:]
1076 # Delete the accented characters
1078 document.body[i] = document.body[i][:j-2]
1080 document.body[i] = u''
1081 # Finally add the InsetLaTeXAccent
1082 document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
1084 # Normalize to "Normal form C" (NFC, pre-composed characters) again
1085 for i in range(numberoflines):
1086 document.body[i] = unicodedata.normalize("NFKC", document.body[i])
1089 def normalize_font_whitespace(document):
1090 """ Before format 259 the font changes were ignored if a
1091 whitespace was the first or last character in the sequence, this function
1092 transfers the whitespace outside."""
# NOTE(review): this listing appears to be missing several original lines
# (the early `return` after the backend check, the initialization of
# `changes` and `i`, the `else:` branches, the `if k != words[0]:` guards
# inside the for-loops, and the trailing `i = i + 1`) — confirm against
# the complete file before relying on the control flow shown here.
# Only the LaTeX backend emitted font-change markup this way, so other
# backends are excluded up front.
1094 if document.backend != "latex":
1097 lines = document.body
# Map of font-change tokens to their "unset"/default value; a line like
# "\series default" closes the property, anything else opens it.
1099 char_properties = {"\\series": "default",
1100 "\\emph": "default",
1102 "\\shape": "default",
1104 "\\family": "default"}
# Scan the body line by line; `lines` is mutated in place, so the loop
# bound is re-evaluated each iteration and `i` is advanced manually when
# lines are inserted.
1108 while i < len(lines):
1109 words = lines[i].split()
1111 if len(words) > 0 and words[0] == "\\begin_layout":
1112 # a new paragraph resets all font changes
1115 elif len(words) > 1 and words[0] in char_properties.keys():
1116 # we have a font change
1117 if char_properties[words[0]] == words[1]:
1118 # property gets reset
# `changes` tracks currently-open (non-default) properties so they can
# be closed and reopened around a transferred space.
1119 if words[0] in changes.keys():
1120 del changes[words[0]]
1121 defaultproperty = True
1124 changes[words[0]] = words[1]
1125 defaultproperty = False
1127 # We need to explicitly reset all changed properties if we find
1128 # a space below, because LyX 1.4 would output the space after
1129 # closing the previous change and before starting the new one,
1130 # and closing a font change means to close all properties, not
1131 # just the changed one.
# Case 1: the previous line ends in a space — strip it there and
# re-emit it between a full close/reopen of the active properties.
1133 if lines[i-1] and lines[i-1][-1] == " ":
1134 lines[i-1] = lines[i-1][:-1]
1135 # a space before the font change
1137 for k in changes.keys():
1138 # exclude property k because that is already in lines[i]
1140 added_lines[1:1] = ["%s %s" % (k, changes[k])]
1141 for k in changes.keys():
1142 # exclude property k because that must be added below anyway
1144 added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
1146 # Property is reset in lines[i], so add the new stuff afterwards
1147 lines[i+1:i+1] = added_lines
1149 # Reset property for the space
1150 added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])]
# Insert before the current line and skip past the insertion so the
# new lines are not re-scanned.
1151 lines[i:i] = added_lines
1152 i = i + len(added_lines)
# Case 2: the following line begins with a space — move it in front of
# the font change, with the same close/reopen bracketing.
1154 elif lines[i+1] and lines[i+1][0] == " " and (len(changes) > 0 or not defaultproperty):
1155 # a space after the font change
1156 if (lines[i+1] == " " and lines[i+2]):
1157 next_words = lines[i+2].split()
1158 if len(next_words) > 0 and next_words[0] == words[0]:
1159 # a single blank with a property different from the
1160 # previous and the next line must not be changed
# Strip the leading space from the next line; it is re-emitted inside
# the reset bracket built below.
1163 lines[i+1] = lines[i+1][1:]
1165 for k in changes.keys():
1166 # exclude property k because that is already in lines[i]
1168 added_lines[1:1] = ["%s %s" % (k, changes[k])]
1169 for k in changes.keys():
1170 # exclude property k because that must be added below anyway
1172 added_lines[0:0] = ["%s %s" % (k, char_properties[k])]
1173 # Reset property for the space
1174 added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])]
1175 lines[i:i] = added_lines
1176 i = i + len(added_lines)
# Conversion machinery consumed by the lyx2lyx driver.
# `supported_versions` lists the LyX releases this module targets.
1184 supported_versions = ["1.5.0","1.5"]
# `convert` maps each step up to [target_format, [functions to apply]];
# an empty function list means the format bump needs no content change.
# NOTE(review): this excerpt appears to omit some table rows (e.g. the
# entries for formats 248/250/251/253/255/256 in `convert`) — confirm
# against the complete file.
1185 convert = [[246, []],
1186 [247, [convert_font_settings]],
1188 [249, [convert_utf8]],
1191 [252, [convert_commandparams, convert_bibitem]],
1193 [254, [convert_esint]],
1196 [257, [convert_caption]],
1197 [258, [convert_lyxline]],
1198 [259, [convert_accent, normalize_font_whitespace]]]
# `revert` is the mirror table for downgrading: each row is
# [target_format, [functions that undo the corresponding convert step]].
1200 revert = [[258, []],
1202 [256, [revert_caption]],
1203 [255, [revert_encodings]],
1204 [254, [revert_clearpage, revert_cleardoublepage]],
1205 [253, [revert_esint]],
1206 [252, [revert_nomenclature, revert_printnomenclature]],
1207 [251, [revert_commandparams]],
1208 [250, [revert_cs_label]],
1210 [248, [revert_accent, revert_utf8]],
1211 [247, [revert_booktabs]],
1212 [246, [revert_font_settings]],
1213 [245, [revert_framed]]]
1216 if __name__ == "__main__":