X-Git-Url: https://git.lyx.org/gitweb/?a=blobdiff_plain;f=lib%2Flyx2lyx%2Flyx_1_5.py;h=a57282b2561d669f6c60844636e9a0f039e58f33;hb=9fd8a869616cad617d0db8b022119c35855ea39d;hp=e27dbc533672583c086b28c8c9cfa615f7b72300;hpb=0c0c43b8eb42b4a895e12c774a9b12f2c5bc6c03;p=lyx.git diff --git a/lib/lyx2lyx/lyx_1_5.py b/lib/lyx2lyx/lyx_1_5.py index e27dbc5336..a57282b256 100644 --- a/lib/lyx2lyx/lyx_1_5.py +++ b/lib/lyx2lyx/lyx_1_5.py @@ -1,6 +1,6 @@ # This file is part of lyx2lyx -# -*- coding: iso-8859-1 -*- -# Copyright (C) 2006 José Matos +# -*- coding: utf-8 -*- +# Copyright (C) 2006 José Matos # Copyright (C) 2004-2006 Georg Baum # # This program is free software; you can redistribute it and/or @@ -17,23 +17,43 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +""" Convert files to the file format generated by lyx 1.5""" + import re -from parser_tools import find_token, find_token_exact, find_tokens, find_end_of_inset, get_value -from string import replace +import unicodedata + +from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value +from LyX import get_encoding + + +#################################################################### +# Private helper functions + +def find_end_of_inset(lines, i): + " Find end of inset, where lines[i] is included." + return find_end_of(lines, i, "\\begin_inset", "\\end_inset") + +def find_end_of_layout(lines, i): + " Find end of layout, where lines[i] is included." + return find_end_of(lines, i, "\\begin_layout", "\\end_layout") + +# End of helper functions +#################################################################### ## # Notes: Framed/Shaded # -def revert_framed(file): +def revert_framed(document): + "Revert framed notes. " i = 0 while 1: - i = find_tokens(file.body, ["\\begin_inset Note Framed", "\\begin_inset Note Shaded"], i) + i = find_tokens(document.body, ["\\begin_inset Note Framed", "\\begin_inset Note Shaded"], i) if i == -1: return - file.body[i] = "\\begin_inset Note" + document.body[i] = "\\begin_inset Note" i = i + 1 @@ -57,20 +77,21 @@ typewriter_fonts = {'default' : 'default', 'ae' : 'default', 'newcent' : 'default', 'bookman' : 'default', 'pslatex' : 'courier'} -def convert_font_settings(file): +def convert_font_settings(document): + " Convert font settings. " i = 0 - i = find_token_exact(file.header, "\\fontscheme", i) + i = find_token_exact(document.header, "\\fontscheme", i) if i == -1: - file.warning("Malformed LyX file: Missing `\\fontscheme'.") + document.warning("Malformed LyX document: Missing `\\fontscheme'.") return - font_scheme = get_value(file.header, "\\fontscheme", i, i + 1) + font_scheme = get_value(document.header, "\\fontscheme", i, i + 1) if font_scheme == '': - file.warning("Malformed LyX file: Empty `\\fontscheme'.") + document.warning("Malformed LyX document: Empty `\\fontscheme'.") font_scheme = 'default' if not font_scheme in roman_fonts.keys(): - file.warning("Malformed LyX file: Unknown `\\fontscheme' `%s'." % font_scheme) + document.warning("Malformed LyX document: Unknown `\\fontscheme' `%s'." % font_scheme) font_scheme = 'default' - file.header[i:i+1] = ['\\font_roman %s' % roman_fonts[font_scheme], + document.header[i:i+1] = ['\\font_roman %s' % roman_fonts[font_scheme], '\\font_sans %s' % sans_fonts[font_scheme], '\\font_typewriter %s' % typewriter_fonts[font_scheme], '\\font_default_family default', @@ -80,136 +101,1133 @@ def convert_font_settings(file): '\\font_tt_scale 100'] -def revert_font_settings(file): +def revert_font_settings(document): + " Revert font settings. " i = 0 insert_line = -1 fonts = {'roman' : 'default', 'sans' : 'default', 'typewriter' : 'default'} for family in 'roman', 'sans', 'typewriter': name = '\\font_%s' % family - i = find_token_exact(file.header, name, i) + i = find_token_exact(document.header, name, i) if i == -1: - file.warning("Malformed LyX file: Missing `%s'." % name) + document.warning("Malformed LyX document: Missing `%s'." % name) i = 0 else: if (insert_line < 0): insert_line = i - fonts[family] = get_value(file.header, name, i, i + 1) - del file.header[i] - i = find_token_exact(file.header, '\\font_default_family', i) + fonts[family] = get_value(document.header, name, i, i + 1) + del document.header[i] + i = find_token_exact(document.header, '\\font_default_family', i) if i == -1: - file.warning("Malformed LyX file: Missing `\\font_default_family'.") + document.warning("Malformed LyX document: Missing `\\font_default_family'.") font_default_family = 'default' else: - font_default_family = get_value(file.header, "\\font_default_family", i, i + 1) - del file.header[i] - i = find_token_exact(file.header, '\\font_sc', i) + font_default_family = get_value(document.header, "\\font_default_family", i, i + 1) + del document.header[i] + i = find_token_exact(document.header, '\\font_sc', i) if i == -1: - file.warning("Malformed LyX file: Missing `\\font_sc'.") + document.warning("Malformed LyX document: Missing `\\font_sc'.") font_sc = 'false' else: - font_sc = get_value(file.header, '\\font_sc', i, i + 1) - del file.header[i] + font_sc = get_value(document.header, '\\font_sc', i, i + 1) + del document.header[i] if font_sc != 'false': - file.warning("Conversion of '\\font_sc' not yet implemented.") - i = find_token_exact(file.header, '\\font_osf', i) + document.warning("Conversion of '\\font_sc' not yet implemented.") + i = find_token_exact(document.header, '\\font_osf', i) if i == -1: - file.warning("Malformed LyX file: Missing `\\font_osf'.") + document.warning("Malformed LyX document: Missing `\\font_osf'.") font_osf = 'false' else: - font_osf = get_value(file.header, '\\font_osf', i, i + 1) - del file.header[i] - i = find_token_exact(file.header, '\\font_sf_scale', i) + font_osf = get_value(document.header, '\\font_osf', i, i + 1) + del document.header[i] + i = find_token_exact(document.header, '\\font_sf_scale', i) if i == -1: - file.warning("Malformed LyX file: Missing `\\font_sf_scale'.") + document.warning("Malformed LyX document: Missing `\\font_sf_scale'.") font_sf_scale = '100' else: - font_sf_scale = get_value(file.header, '\\font_sf_scale', i, i + 1) - del file.header[i] + font_sf_scale = get_value(document.header, '\\font_sf_scale', i, i + 1) + del document.header[i] if font_sf_scale != '100': - file.warning("Conversion of '\\font_sf_scale' not yet implemented.") - i = find_token_exact(file.header, '\\font_tt_scale', i) + document.warning("Conversion of '\\font_sf_scale' not yet implemented.") + i = find_token_exact(document.header, '\\font_tt_scale', i) if i == -1: - file.warning("Malformed LyX file: Missing `\\font_tt_scale'.") + document.warning("Malformed LyX document: Missing `\\font_tt_scale'.") font_tt_scale = '100' else: - font_tt_scale = get_value(file.header, '\\font_tt_scale', i, i + 1) - del file.header[i] + font_tt_scale = get_value(document.header, '\\font_tt_scale', i, i + 1) + del document.header[i] if font_tt_scale != '100': - file.warning("Conversion of '\\font_tt_scale' not yet implemented.") + document.warning("Conversion of '\\font_tt_scale' not yet implemented.") for font_scheme in roman_fonts.keys(): if (roman_fonts[font_scheme] == fonts['roman'] and sans_fonts[font_scheme] == fonts['sans'] and typewriter_fonts[font_scheme] == fonts['typewriter']): - file.header.insert(insert_line, '\\fontscheme %s' % font_scheme) + document.header.insert(insert_line, '\\fontscheme %s' % font_scheme) if font_default_family != 'default': - file.preamble.append('\\renewcommand{\\familydefault}{\\%s}' % font_default_family) + document.preamble.append('\\renewcommand{\\familydefault}{\\%s}' % font_default_family) if font_osf == 'true': - file.warning("Ignoring `\\font_osf = true'") + document.warning("Ignoring `\\font_osf = true'") return font_scheme = 'default' - file.header.insert(insert_line, '\\fontscheme %s' % font_scheme) + document.header.insert(insert_line, '\\fontscheme %s' % font_scheme) if fonts['roman'] == 'cmr': - file.preamble.append('\\renewcommand{\\rmdefault}{cmr}') + document.preamble.append('\\renewcommand{\\rmdefault}{cmr}') if font_osf == 'true': - file.preamble.append('\\usepackage{eco}') + document.preamble.append('\\usepackage{eco}') font_osf = 'false' for font in 'lmodern', 'charter', 'utopia', 'beraserif', 'ccfonts', 'chancery': if fonts['roman'] == font: - file.preamble.append('\\usepackage{%s}' % font) + document.preamble.append('\\usepackage{%s}' % font) for font in 'cmss', 'lmss', 'cmbr': if fonts['sans'] == font: - file.preamble.append('\\renewcommand{\\sfdefault}{%s}' % font) + document.preamble.append('\\renewcommand{\\sfdefault}{%s}' % font) for font in 'berasans': if fonts['sans'] == font: - file.preamble.append('\\usepackage{%s}' % font) + document.preamble.append('\\usepackage{%s}' % font) for font in 'cmtt', 'lmtt', 'cmtl': if fonts['typewriter'] == font: - file.preamble.append('\\renewcommand{\\ttdefault}{%s}' % font) + document.preamble.append('\\renewcommand{\\ttdefault}{%s}' % font) for font in 'courier', 'beramono', 'luximono': if fonts['typewriter'] == font: - file.preamble.append('\\usepackage{%s}' % font) + document.preamble.append('\\usepackage{%s}' % font) if font_default_family != 'default': - file.preamble.append('\\renewcommand{\\familydefault}{\\%s}' % font_default_family) + document.preamble.append('\\renewcommand{\\familydefault}{\\%s}' % font_default_family) if font_osf == 'true': - file.warning("Ignoring `\\font_osf = true'") + document.warning("Ignoring `\\font_osf = true'") -def revert_booktabs(file): -# we just remove the booktabs flag, everything else will become a mess. +def revert_booktabs(document): + " We remove the booktabs flag or everything else will become a mess. " re_row = re.compile(r'^$') re_tspace = re.compile(r'\s+topspace="[^"]+"') re_bspace = re.compile(r'\s+bottomspace="[^"]+"') re_ispace = re.compile(r'\s+interlinespace="[^"]+"') i = 0 while 1: - i = find_token(file.body, "\\begin_inset Tabular", i) + i = find_token(document.body, "\\begin_inset Tabular", i) if i == -1: return - j = find_end_of_inset(file.body, i + 1) + j = find_end_of_inset(document.body, i + 1) if j == -1: - file.warning("Malformed LyX file: Could not find end of tabular.") - continue + document.warning("Malformed LyX document: Could not find end of tabular.") + continue for k in range(i, j): - if re.search('^$', file.body[k]): - file.warning("Converting 'booktabs' table to normal table.") - file.body[k] = replace(file.body[k], ' booktabs="true"', '') - if re.search(re_row, file.body[k]): - file.warning("Removing extra row space.") - file.body[k] = re_tspace.sub('', file.body[k]) - file.body[k] = re_bspace.sub('', file.body[k]) - file.body[k] = re_ispace.sub('', file.body[k]) + if re.search('^$', document.body[k]): + document.warning("Converting 'booktabs' table to normal table.") + document.body[k] = document.body[k].replace(' booktabs="true"', '') + if re.search(re_row, document.body[k]): + document.warning("Removing extra row space.") + document.body[k] = re_tspace.sub('', document.body[k]) + document.body[k] = re_bspace.sub('', document.body[k]) + document.body[k] = re_ispace.sub('', document.body[k]) + i = i + 1 + + +def convert_multiencoding(document, forward): + """ Fix files with multiple encodings. +Files with an inputencoding of "auto" or "default" and multiple languages +where at least two languages have different default encodings are encoded +in multiple encodings for file formats < 249. These files are incorrectly +read and written (as if the whole file was in the encoding of the main +language). +This is not true for files written by CJK-LyX, they are always in the locale +encoding. + +This function +- converts from fake unicode values to true unicode if forward is true, and +- converts from true unicode values to fake unicode if forward is false. +document.encoding must be set to the old value (format 248) in both cases. + +We do this here and not in LyX.py because it is far easier to do the +necessary parsing in modern formats than in ancient ones. +""" + if document.cjk_encoding != '': + return + encoding_stack = [document.encoding] + lang_re = re.compile(r"^\\lang\s(\S+)") + if document.inputencoding == "auto" or document.inputencoding == "default": + for i in range(len(document.body)): + result = lang_re.match(document.body[i]) + if result: + language = result.group(1) + if language == "default": + document.warning("Resetting encoding from %s to %s." % (encoding_stack[-1], document.encoding)) + encoding_stack[-1] = document.encoding + else: + from lyx2lyx_lang import lang + document.warning("Setting encoding from %s to %s." % (encoding_stack[-1], lang[language][3])) + encoding_stack[-1] = lang[language][3] + elif find_token(document.body, "\\begin_layout", i, i + 1) == i: + document.warning("Adding nested encoding %s." % encoding_stack[-1]) + encoding_stack.append(encoding_stack[-1]) + elif find_token(document.body, "\\end_layout", i, i + 1) == i: + document.warning("Removing nested encoding %s." % encoding_stack[-1]) + del encoding_stack[-1] + if encoding_stack[-1] != document.encoding: + if forward: + # This line has been incorrectly interpreted as if it was + # encoded in 'encoding'. + # Convert back to the 8bit string that was in the file. + orig = document.body[i].encode(document.encoding) + # Convert the 8bit string that was in the file to unicode + # with the correct encoding. + document.body[i] = orig.decode(encoding_stack[-1]) + else: + # Convert unicode to the 8bit string that will be written + # to the file with the correct encoding. + orig = document.body[i].encode(encoding_stack[-1]) + # Convert the 8bit string that will be written to the + # file to fake unicode with the encoding that will later + # be used when writing to the file. + document.body[i] = orig.decode(document.encoding) + + +def convert_utf8(document): + " Set document encoding to UTF-8. " + convert_multiencoding(document, True) + document.encoding = "utf8" + + +def revert_utf8(document): + " Set document encoding to the value corresponding to inputencoding. " + i = find_token(document.header, "\\inputencoding", 0) + if i == -1: + document.header.append("\\inputencoding auto") + elif get_value(document.header, "\\inputencoding", i) == "utf8": + document.header[i] = "\\inputencoding auto" + document.inputencoding = get_value(document.header, "\\inputencoding", 0) + document.encoding = get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding) + convert_multiencoding(document, False) + + +def revert_cs_label(document): + " Remove status flag of charstyle label. " + i = 0 + while 1: + i = find_token(document.body, "\\begin_inset CharStyle", i) + if i == -1: + return + # Seach for a line starting 'show_label' + # If it is not there, break with a warning message + i = i + 1 + while 1: + if (document.body[i][:10] == "show_label"): + del document.body[i] + break + elif (document.body[i][:13] == "\\begin_layout"): + document.warning("Malformed LyX document: Missing 'show_label'.") + break + i = i + 1 + + i = i + 1 + + +def convert_bibitem(document): + """ Convert +\bibitem [option]{argument} + +to + +\begin_inset LatexCommand bibitem +label "option" +key "argument" + +\end_inset + +This must be called after convert_commandparams. +""" + regex = re.compile(r'\S+\s*(\[[^\[\{]*\])?(\{[^}]*\})') + i = 0 + while 1: + i = find_token(document.body, "\\bibitem", i) + if i == -1: + break + match = re.match(regex, document.body[i]) + option = match.group(1) + argument = match.group(2) + lines = ['\\begin_inset LatexCommand bibitem'] + if option != None: + lines.append('label "%s"' % option[1:-1].replace('"', '\\"')) + lines.append('key "%s"' % argument[1:-1].replace('"', '\\"')) + lines.append('') + lines.append('\\end_inset') + document.body[i:i+1] = lines + i = i + 1 + + +commandparams_info = { + # command : [option1, option2, argument] + "bibitem" : ["label", "", "key"], + "bibtex" : ["options", "btprint", "bibfiles"], + "cite" : ["after", "before", "key"], + "citet" : ["after", "before", "key"], + "citep" : ["after", "before", "key"], + "citealt" : ["after", "before", "key"], + "citealp" : ["after", "before", "key"], + "citeauthor" : ["after", "before", "key"], + "citeyear" : ["after", "before", "key"], + "citeyearpar" : ["after", "before", "key"], + "citet*" : ["after", "before", "key"], + "citep*" : ["after", "before", "key"], + "citealt*" : ["after", "before", "key"], + "citealp*" : ["after", "before", "key"], + "citeauthor*" : ["after", "before", "key"], + "Citet" : ["after", "before", "key"], + "Citep" : ["after", "before", "key"], + "Citealt" : ["after", "before", "key"], + "Citealp" : ["after", "before", "key"], + "Citeauthor" : ["after", "before", "key"], + "Citet*" : ["after", "before", "key"], + "Citep*" : ["after", "before", "key"], + "Citealt*" : ["after", "before", "key"], + "Citealp*" : ["after", "before", "key"], + "Citeauthor*" : ["after", "before", "key"], + "citefield" : ["after", "before", "key"], + "citetitle" : ["after", "before", "key"], + "cite*" : ["after", "before", "key"], + "hfill" : ["", "", ""], + "index" : ["", "", "name"], + "printindex" : ["", "", "name"], + "label" : ["", "", "name"], + "eqref" : ["name", "", "reference"], + "pageref" : ["name", "", "reference"], + "prettyref" : ["name", "", "reference"], + "ref" : ["name", "", "reference"], + "vpageref" : ["name", "", "reference"], + "vref" : ["name", "", "reference"], + "tableofcontents" : ["", "", "type"], + "htmlurl" : ["name", "", "target"], + "url" : ["name", "", "target"]} + + +def convert_commandparams(document): + """ Convert + + \begin_inset LatexCommand \cmdname[opt1][opt2]{arg} + \end_inset + + to + + \begin_inset LatexCommand cmdname + name1 "opt1" + name2 "opt2" + name3 "arg" + \end_inset + + name1, name2 and name3 can be different for each command. +""" + # \begin_inset LatexCommand bibitem was not the official version (see + # convert_bibitem()), but could be read in, so we convert it here, too. + + i = 0 + while 1: + i = find_token(document.body, "\\begin_inset LatexCommand", i) + if i == -1: + break + command = document.body[i][26:].strip() + if command == "": + document.warning("Malformed LyX document: Missing LatexCommand name.") + i = i + 1 + continue + + # The following parser is taken from the original InsetCommandParams::scanCommand + name = "" + option1 = "" + option2 = "" + argument = "" + state = "WS" + # Used to handle things like \command[foo[bar]]{foo{bar}} + nestdepth = 0 + b = 0 + for c in command: + if ((state == "CMDNAME" and c == ' ') or + (state == "CMDNAME" and c == '[') or + (state == "CMDNAME" and c == '{')): + state = "WS" + if ((state == "OPTION" and c == ']') or + (state == "SECOPTION" and c == ']') or + (state == "CONTENT" and c == '}')): + if nestdepth == 0: + state = "WS" + else: + nestdepth = nestdepth - 1 + if ((state == "OPTION" and c == '[') or + (state == "SECOPTION" and c == '[') or + (state == "CONTENT" and c == '{')): + nestdepth = nestdepth + 1 + if state == "CMDNAME": + name += c + elif state == "OPTION": + option1 += c + elif state == "SECOPTION": + option2 += c + elif state == "CONTENT": + argument += c + elif state == "WS": + if c == '\\': + state = "CMDNAME" + elif c == '[' and b != ']': + state = "OPTION" + nestdepth = 0 # Just to be sure + elif c == '[' and b == ']': + state = "SECOPTION" + nestdepth = 0 # Just to be sure + elif c == '{': + state = "CONTENT" + nestdepth = 0 # Just to be sure + b = c + + # Now we have parsed the command, output the parameters + lines = ["\\begin_inset LatexCommand %s" % name] + if option1 != "": + if commandparams_info[name][0] == "": + document.warning("Ignoring invalid option `%s' of command `%s'." % (option1, name)) + else: + lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('"', '\\"'))) + if option2 != "": + if commandparams_info[name][1] == "": + document.warning("Ignoring invalid second option `%s' of command `%s'." % (option2, name)) + else: + lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('"', '\\"'))) + if argument != "": + if commandparams_info[name][2] == "": + document.warning("Ignoring invalid argument `%s' of command `%s'." % (argument, name)) + else: + lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('"', '\\"'))) + document.body[i:i+1] = lines i = i + 1 +def revert_commandparams(document): + regex = re.compile(r'(\S+)\s+(.+)') + i = 0 + while 1: + i = find_token(document.body, "\\begin_inset LatexCommand", i) + if i == -1: + break + name = document.body[i].split()[2] + j = find_end_of_inset(document.body, i + 1) + preview_line = "" + option1 = "" + option2 = "" + argument = "" + for k in range(i + 1, j): + match = re.match(regex, document.body[k]) + if match: + pname = match.group(1) + pvalue = match.group(2) + if pname == "preview": + preview_line = document.body[k] + elif (commandparams_info[name][0] != "" and + pname == commandparams_info[name][0]): + option1 = pvalue.strip('"').replace('\\"', '"') + elif (commandparams_info[name][1] != "" and + pname == commandparams_info[name][1]): + option2 = pvalue.strip('"').replace('\\"', '"') + elif (commandparams_info[name][2] != "" and + pname == commandparams_info[name][2]): + argument = pvalue.strip('"').replace('\\"', '"') + elif document.body[k].strip() != "": + document.warning("Ignoring unknown contents `%s' in command inset %s." % (document.body[k], name)) + if name == "bibitem": + if option1 == "": + lines = ["\\bibitem {%s}" % argument] + else: + lines = ["\\bibitem [%s]{%s}" % (option1, argument)] + else: + if option1 == "": + if option2 == "": + lines = ["\\begin_inset LatexCommand \\%s{%s}" % (name, argument)] + else: + lines = ["\\begin_inset LatexCommand \\%s[][%s]{%s}" % (name, option2, argument)] + else: + if option2 == "": + lines = ["\\begin_inset LatexCommand \\%s[%s]{%s}" % (name, option1, argument)] + else: + lines = ["\\begin_inset LatexCommand \\%s[%s][%s]{%s}" % (name, option1, option2, argument)] + if name != "bibitem": + if preview_line != "": + lines.append(preview_line) + lines.append('') + lines.append('\\end_inset') + document.body[i:j+1] = lines + i = j + 1 + + +def revert_nomenclature(document): + " Convert nomenclature entry to ERT. " + regex = re.compile(r'(\S+)\s+(.+)') + i = 0 + use_nomencl = 0 + while 1: + i = find_token(document.body, "\\begin_inset LatexCommand nomenclature", i) + if i == -1: + break + use_nomencl = 1 + j = find_end_of_inset(document.body, i + 1) + preview_line = "" + symbol = "" + description = "" + prefix = "" + for k in range(i + 1, j): + match = re.match(regex, document.body[k]) + if match: + name = match.group(1) + value = match.group(2) + if name == "preview": + preview_line = document.body[k] + elif name == "symbol": + symbol = value.strip('"').replace('\\"', '"') + elif name == "description": + description = value.strip('"').replace('\\"', '"') + elif name == "prefix": + prefix = value.strip('"').replace('\\"', '"') + elif document.body[k].strip() != "": + document.warning("Ignoring unknown contents `%s' in nomenclature inset." % document.body[k]) + if prefix == "": + command = 'nomenclature{%s}{%s}' % (symbol, description) + else: + command = 'nomenclature[%s]{%s}{%s}' % (prefix, symbol, description) + document.body[i:j+1] = ['\\begin_inset ERT', + 'status collapsed', + '', + '\\begin_layout %s' % document.default_layout, + '', + '', + '\\backslash', + command, + '\\end_layout', + '', + '\\end_inset'] + i = i + 11 + if use_nomencl and find_token(document.preamble, '\\usepackage{nomencl}[2005/09/22]', 0) == -1: + document.preamble.append('\\usepackage{nomencl}[2005/09/22]') + document.preamble.append('\\makenomenclature') + + +def revert_printnomenclature(document): + " Convert printnomenclature to ERT. " + regex = re.compile(r'(\S+)\s+(.+)') + i = 0 + use_nomencl = 0 + while 1: + i = find_token(document.body, "\\begin_inset LatexCommand printnomenclature", i) + if i == -1: + break + use_nomencl = 1 + j = find_end_of_inset(document.body, i + 1) + preview_line = "" + labelwidth = "" + for k in range(i + 1, j): + match = re.match(regex, document.body[k]) + if match: + name = match.group(1) + value = match.group(2) + if name == "preview": + preview_line = document.body[k] + elif name == "labelwidth": + labelwidth = value.strip('"').replace('\\"', '"') + elif document.body[k].strip() != "": + document.warning("Ignoring unknown contents `%s' in printnomenclature inset." % document.body[k]) + if labelwidth == "": + command = 'nomenclature{}' + else: + command = 'nomenclature[%s]' % labelwidth + document.body[i:j+1] = ['\\begin_inset ERT', + 'status collapsed', + '', + '\\begin_layout %s' % document.default_layout, + '', + '', + '\\backslash', + command, + '\\end_layout', + '', + '\\end_inset'] + i = i + 11 + if use_nomencl and find_token(document.preamble, '\\usepackage{nomencl}[2005/09/22]', 0) == -1: + document.preamble.append('\\usepackage{nomencl}[2005/09/22]') + document.preamble.append('\\makenomenclature') + + +def convert_esint(document): + " Add \\use_esint setting to header. " + i = find_token(document.header, "\\cite_engine", 0) + if i == -1: + document.warning("Malformed LyX document: Missing `\\cite_engine'.") + return + # 0 is off, 1 is auto, 2 is on. + document.header.insert(i, '\\use_esint 0') + + +def revert_esint(document): + " Remove \\use_esint setting from header. " + i = find_token(document.header, "\\use_esint", 0) + if i == -1: + document.warning("Malformed LyX document: Missing `\\use_esint'.") + return + use_esint = document.header[i].split()[1] + del document.header[i] + # 0 is off, 1 is auto, 2 is on. + if (use_esint == 2): + document.preamble.append('\\usepackage{esint}') + + +def revert_clearpage(document): + " clearpage -> ERT " + i = 0 + while 1: + i = find_token(document.body, "\\clearpage", i) + if i == -1: + break + document.body[i:i+1] = ['\\begin_inset ERT', + 'status collapsed', + '', + '\\begin_layout %s' % document.default_layout, + '', + '', + '\\backslash', + 'clearpage', + '\\end_layout', + '', + '\\end_inset'] + i = i + 1 + + +def revert_cleardoublepage(document): + " cleardoublepage -> ERT " + i = 0 + while 1: + i = find_token(document.body, "\\cleardoublepage", i) + if i == -1: + break + document.body[i:i+1] = ['\\begin_inset ERT', + 'status collapsed', + '', + '\\begin_layout %s' % document.default_layout, + '', + '', + '\\backslash', + 'cleardoublepage', + '\\end_layout', + '', + '\\end_inset'] + i = i + 1 + + +def convert_lyxline(document): + " remove fontsize commands for \lyxline " + # The problematic is: The old \lyxline definition doesn't handle the fontsize + # to change the line thickness. The new definiton does this so that imported + # \lyxlines would have a different line thickness. The eventual fontsize command + # before \lyxline is therefore removed to get the same output. + fontsizes = ["tiny", "scriptsize", "footnotesize", "small", "normalsize", + "large", "Large", "LARGE", "huge", "Huge"] + for n in range(0, len(fontsizes)): + i = 0 + k = 0 + while i < len(document.body): + i = find_token(document.body, "\\size " + fontsizes[n], i) + k = find_token(document.body, "\\lyxline",i) + # the corresponding fontsize command is always 2 lines before the \lyxline + if (i != -1 and k == i+2): + document.body[i:i+1] = [] + else: + break + i = i + 1 + + +def revert_encodings(document): + " Set new encodings to auto. " + encodings = ["8859-6", "8859-8", "cp437", "cp437de", "cp850", "cp852", + "cp855", "cp858", "cp862", "cp865", "cp866", "cp1250", + "cp1252", "cp1256", "cp1257", "latin10", "pt254", "tis620-0"] + i = find_token(document.header, "\\inputencoding", 0) + if i == -1: + document.header.append("\\inputencoding auto") + else: + inputenc = get_value(document.header, "\\inputencoding", i) + if inputenc in encodings: + document.header[i] = "\\inputencoding auto" + document.inputencoding = get_value(document.header, "\\inputencoding", 0) + + +def convert_caption(document): + " Convert caption layouts to caption insets. " + i = 0 + while 1: + i = find_token(document.body, "\\begin_layout Caption", i) + if i == -1: + return + j = find_end_of_layout(document.body, i) + if j == -1: + document.warning("Malformed LyX document: Missing `\\end_layout'.") + return + + document.body[j:j] = ["\\end_layout", "", "\\end_inset", "", ""] + document.body[i:i+1] = ["\\begin_layout %s" % document.default_layout, + "\\begin_inset Caption", "", + "\\begin_layout %s" % document.default_layout] + i = i + 1 + + +def revert_caption(document): + " Convert caption insets to caption layouts. " + " This assumes that the text class has a caption style. " + i = 0 + while 1: + i = find_token(document.body, "\\begin_inset Caption", i) + if i == -1: + return + + # We either need to delete the previous \begin_layout line, or we + # need to end the previous layout if this inset is not in the first + # position of the paragraph. + layout_before = find_token_backwards(document.body, "\\begin_layout", i) + if layout_before == -1: + document.warning("Malformed LyX document: Missing `\\begin_layout'.") + return + layout_line = document.body[layout_before] + del_layout_before = True + l = layout_before + 1 + while l < i: + if document.body[l] != "": + del_layout_before = False + break + l = l + 1 + if del_layout_before: + del document.body[layout_before:i] + i = layout_before + else: + document.body[i:i] = ["\\end_layout", ""] + i = i + 2 + + # Find start of layout in the inset and end of inset + j = find_token(document.body, "\\begin_layout", i) + if j == -1: + document.warning("Malformed LyX document: Missing `\\begin_layout'.") + return + k = find_end_of_inset(document.body, i) + if k == -1: + document.warning("Malformed LyX document: Missing `\\end_inset'.") + return + + # We either need to delete the following \end_layout line, or we need + # to restart the old layout if this inset is not at the paragraph end. + layout_after = find_token(document.body, "\\end_layout", k) + if layout_after == -1: + document.warning("Malformed LyX document: Missing `\\end_layout'.") + return + del_layout_after = True + l = k + 1 + while l < layout_after: + if document.body[l] != "": + del_layout_after = False + break + l = l + 1 + if del_layout_after: + del document.body[k+1:layout_after+1] + else: + document.body[k+1:k+1] = [layout_line, ""] + + # delete \begin_layout and \end_inset and replace \begin_inset with + # "\begin_layout Caption". This works because we can only have one + # paragraph in the caption inset: The old \end_layout will be recycled. + del document.body[k] + if document.body[k] == "": + del document.body[k] + del document.body[j] + if document.body[j] == "": + del document.body[j] + document.body[i] = "\\begin_layout Caption" + if document.body[i+1] == "": + del document.body[i+1] + i = i + 1 + + +# Accents of InsetLaTeXAccent +accent_map = { + "`" : u'\u0300', # grave + "'" : u'\u0301', # acute + "^" : u'\u0302', # circumflex + "~" : u'\u0303', # tilde + "=" : u'\u0304', # macron + "u" : u'\u0306', # breve + "." : u'\u0307', # dot above + "\"": u'\u0308', # diaresis + "r" : u'\u030a', # ring above + "H" : u'\u030b', # double acute + "v" : u'\u030c', # caron + "b" : u'\u0320', # minus sign below + "d" : u'\u0323', # dot below + "c" : u'\u0327', # cedilla + "k" : u'\u0328', # ogonek + "t" : u'\u0361' # tie. This is special: It spans two characters, but + # only one is given as argument, so we don't need to + # treat it differently. +} + + +# special accents of InsetLaTeXAccent without argument +special_accent_map = { + 'i' : u'\u0131', # dotless i + 'j' : u'\u0237', # dotless j + 'l' : u'\u0142', # l with stroke + 'L' : u'\u0141' # L with stroke +} + + +# special accent arguments of InsetLaTeXAccent +accented_map = { + '\\i' : u'\u0131', # dotless i + '\\j' : u'\u0237' # dotless j +} + + +def _convert_accent(accent, accented_char): + type = accent + char = accented_char + if char == '': + if type in special_accent_map: + return special_accent_map[type] + # a missing char is treated as space by LyX + char = ' ' + elif type == 'q' and char in ['t', 'd', 'l', 'L']: + # Special caron, only used with t, d, l and L. + # It is not in the map because we convert it to the same unicode + # character as the normal caron: \q{} is only defined if babel with + # the czech or slovak language is used, and the normal caron + # produces the correct output if the T1 font encoding is used. + # For the same reason we never convert to \q{} in the other direction. + type = 'v' + elif char in accented_map: + char = accented_map[char] + elif (len(char) > 1): + # We can only convert accents on a single char + return '' + a = accent_map.get(type) + if a: + return unicodedata.normalize("NFKC", "%s%s" % (char, a)) + return '' + + +def convert_ertbackslash(body, i, ert, default_layout): + r""" ------------------------------------------------------------------------------------------- + Convert backslashes and '\n' into valid ERT code, append the converted + text to body[i] and return the (maybe incremented) line index i""" + + for c in ert: + if c == '\\': + body[i] = body[i] + '\\backslash ' + i = i + 1 + body.insert(i, '') + elif c == '\n': + body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, ''] + i = i + 4 + else: + body[i] = body[i] + c + return i + + +def convert_accent(document): + # The following forms are supported by LyX: + # '\i \"{a}' (standard form, as written by LyX) + # '\i \"{}' (standard form, as written by LyX if the accented char is a space) + # '\i \"{ }' (also accepted if the accented char is a space) + # '\i \" a' (also accepted) + # '\i \"' (also accepted) + re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$') + re_contents = re.compile(r'^([^\s{]+)(.*)$') + re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$') + i = 0 + while 1: + i = find_re(document.body, re_wholeinset, i) + if i == -1: + return + match = re_wholeinset.match(document.body[i]) + prefix = match.group(1) + contents = match.group(3).strip() + match = re_contents.match(contents) + if match: + # Strip first char (always \) + accent = match.group(1)[1:] + accented_contents = match.group(2).strip() + match = re_accentedcontents.match(accented_contents) + accented_char = match.group(1) + converted = _convert_accent(accent, accented_char) + if converted == '': + # Normalize contents + contents = '%s{%s}' % (accent, accented_char), + else: + document.body[i] = '%s%s' % (prefix, converted) + i += 1 + continue + document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents) + document.body[i] = prefix + document.body[i+1:i+1] = ['\\begin_inset ERT', + 'status collapsed', + '', + '\\begin_layout %s' % document.default_layout, + '', + '', + ''] + i = convert_ertbackslash(document.body, i + 7, + '\\%s' % contents, + document.default_layout) + document.body[i+1:i+1] = ['\\end_layout', + '', + '\\end_inset'] + i += 3 + + +def revert_accent(document): + inverse_accent_map = {} + for k in accent_map: + inverse_accent_map[accent_map[k]] = k + inverse_special_accent_map = {} + for k in special_accent_map: + inverse_special_accent_map[special_accent_map[k]] = k + inverse_accented_map = {} + for k in accented_map: + inverse_accented_map[accented_map[k]] = k + + # Since LyX may insert a line break within a word we must combine all + # words before unicode normalization. + # We do this only if the next line starts with an accent, otherwise we + # would create things like '\begin_inset ERTstatus'. + numberoflines = len(document.body) + for i in range(numberoflines-1): + if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ': + continue + if (document.body[i+1][0] in inverse_accent_map): + # the last character of this line and the first of the next line + # form probably a surrogate pair. + while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '): + document.body[i] += document.body[i+1][0] + document.body[i+1] = document.body[i+1][1:] + + # Normalize to "Normal form D" (NFD, also known as canonical decomposition). + # This is needed to catch all accented characters. + for i in range(numberoflines): + # Unfortunately we have a mixture of unicode strings and plain strings, + # because we never use u'xxx' for string literals, but 'xxx'. + # Therefore we may have to try two times to normalize the data. + try: + document.body[i] = unicodedata.normalize("NFKD", document.body[i]) + except TypeError: + document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8')) + + # Replace accented characters with InsetLaTeXAccent + # Do not convert characters that can be represented in the chosen + # encoding. + encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)] + lang_re = re.compile(r"^\\lang\s(\S+)") + for i in range(len(document.body)): + + if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '': + # Track the encoding of the current line + result = lang_re.match(document.body[i]) + if result: + language = result.group(1) + if language == "default": + encoding_stack[-1] = document.encoding + else: + from lyx2lyx_lang import lang + encoding_stack[-1] = lang[language][3] + continue + elif find_token(document.body, "\\begin_layout", i, i + 1) == i: + encoding_stack.append(encoding_stack[-1]) + continue + elif find_token(document.body, "\\end_layout", i, i + 1) == i: + del encoding_stack[-1] + continue + + for j in range(len(document.body[i])): + # dotless i and dotless j are both in special_accent_map and can + # occur as an accented character, so we need to test that the + # following character is no accent + if (document.body[i][j] in inverse_special_accent_map and + (j == len(document.body[i]) - 1 or document.body[i][j+1] not in inverse_accent_map)): + accent = document.body[i][j] + try: + dummy = accent.encode(encoding_stack[-1]) + except UnicodeEncodeError: + # Insert the rest of the line as new line + if j < len(document.body[i]) - 1: + document.body[i+1:i+1] = document.body[i][j+1:] + # Delete the accented character + if j > 0: + document.body[i] = document.body[i][:j-1] + else: + document.body[i] = u'' + # Finally add the InsetLaTeXAccent + document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent] + break + elif j > 0 and document.body[i][j] in inverse_accent_map: + accented_char = document.body[i][j-1] + if accented_char == ' ': + # Conform to LyX output + accented_char = '' + elif accented_char in inverse_accented_map: + accented_char = inverse_accented_map[accented_char] + accent = document.body[i][j] + try: + dummy = unicodedata.normalize("NFKC", accented_char + accent).encode(encoding_stack[-1]) + except UnicodeEncodeError: + # Insert the rest of the line as new line + if j < len(document.body[i]) - 1: + document.body[i+1:i+1] = document.body[i][j+1:] + # Delete the accented characters + if j > 1: + document.body[i] = document.body[i][:j-2] + else: + document.body[i] = u'' + # Finally add the InsetLaTeXAccent + document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char) + break + # Normalize to "Normal form C" (NFC, pre-composed characters) again + for i in range(numberoflines): + document.body[i] = unicodedata.normalize("NFKC", document.body[i]) + + +def normalize_font_whitespace(document): + """ Before format 259 the font changes were ignored if a + whitespace was the first or last character in the sequence, this function + transfers the whitespace outside.""" + + if document.backend != "latex": + return + + lines = document.body + + char_properties = {"\\series": "default", + "\\emph": "default", + "\\color": "none", + "\\shape": "default", + "\\bar": "default", + "\\family": "default"} + changes = {} + + i = 0 + while i < len(lines): + words = lines[i].split() + + if len(words) > 0 and words[0] == "\\begin_layout": + # a new paragraph resets all font changes + changes.clear() + + elif len(words) > 1 and words[0] in char_properties.keys(): + # we have a font change + if char_properties[words[0]] == words[1]: + # property gets reset + if words[0] in changes.keys(): + del changes[words[0]] + defaultproperty = True + else: + # property gets set + changes[words[0]] = words[1] + defaultproperty = False + + # We need to explicitly reset all changed properties if we find + # a space below, because LyX 1.4 would output the space after + # closing the previous change and before starting the new one, + # and closing a font change means to close all properties, not + # just the changed one. + + if lines[i-1] and lines[i-1][-1] == " ": + lines[i-1] = lines[i-1][:-1] + # a space before the font change + added_lines = [" "] + for k in changes.keys(): + # exclude property k because that is already in lines[i] + if k != words[0]: + added_lines[1:1] = ["%s %s" % (k, changes[k])] + for k in changes.keys(): + # exclude property k because that must be added below anyway + if k != words[0]: + added_lines[0:0] = ["%s %s" % (k, char_properties[k])] + if defaultproperty: + # Property is reset in lines[i], so add the new stuff afterwards + lines[i+1:i+1] = added_lines + else: + # Reset property for the space + added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])] + lines[i:i] = added_lines + i = i + len(added_lines) + + elif lines[i+1] and lines[i+1][0] == " " and (len(changes) > 0 or not defaultproperty): + # a space after the font change + if (lines[i+1] == " " and lines[i+2]): + next_words = lines[i+2].split() + if len(next_words) > 0 and next_words[0] == words[0]: + # a single blank with a property different from the + # previous and the next line must not be changed + i = i + 2 + continue + lines[i+1] = lines[i+1][1:] + added_lines = [" "] + for k in changes.keys(): + # exclude property k because that is already in lines[i] + if k != words[0]: + added_lines[1:1] = ["%s %s" % (k, changes[k])] + for k in changes.keys(): + # exclude property k because that must be added below anyway + if k != words[0]: + added_lines[0:0] = ["%s %s" % (k, char_properties[k])] + # Reset property for the space + added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])] + lines[i:i] = added_lines + i = i + len(added_lines) + + i = i + 1 + + +def revert_utf8x(document): + " Set utf8x encoding to utf8. " + i = find_token(document.header, "\\inputencoding", 0) + if i == -1: + document.header.append("\\inputencoding auto") + else: + inputenc = get_value(document.header, "\\inputencoding", i) + if inputenc == "utf8x": + document.header[i] = "\\inputencoding utf8" + document.inputencoding = get_value(document.header, "\\inputencoding", 0) + + ## # Conversion hub # +supported_versions = ["1.5.0","1.5"] convert = [[246, []], [247, [convert_font_settings]], - [248, []]] + [248, []], + [249, [convert_utf8]], + [250, []], + [251, []], + [252, [convert_commandparams, convert_bibitem]], + [253, []], + [254, [convert_esint]], + [255, []], + [256, []], + [257, [convert_caption]], + [258, [convert_lyxline]], + [259, [convert_accent, normalize_font_whitespace]], + [260, []]] -revert = [[247, [revert_booktabs]], +revert = [[259, [revert_utf8x]], + [258, []], + [257, []], + [256, [revert_caption]], + [255, [revert_encodings]], + [254, [revert_clearpage, revert_cleardoublepage]], + [253, [revert_esint]], + [252, [revert_nomenclature, revert_printnomenclature]], + [251, [revert_commandparams]], + [250, [revert_cs_label]], + [249, []], + [248, [revert_accent, revert_utf8]], + [247, [revert_booktabs]], [246, [revert_font_settings]], [245, [revert_framed]]] @@ -217,3 +1235,4 @@ revert = [[247, [revert_booktabs]], if __name__ == "__main__": pass +