X-Git-Url: https://git.lyx.org/gitweb/?a=blobdiff_plain;f=lib%2Flyx2lyx%2Flyx_1_5.py;h=a57282b2561d669f6c60844636e9a0f039e58f33;hb=9fd8a869616cad617d0db8b022119c35855ea39d;hp=bf4b6631e6773eaa622cb36bbbf1b6e83fe6887f;hpb=82b0e98d549605b38c5666708707f21ebee151ed;p=lyx.git diff --git a/lib/lyx2lyx/lyx_1_5.py b/lib/lyx2lyx/lyx_1_5.py index bf4b6631e6..a57282b256 100644 --- a/lib/lyx2lyx/lyx_1_5.py +++ b/lib/lyx2lyx/lyx_1_5.py @@ -20,7 +20,9 @@ """ Convert files to the file format generated by lyx 1.5""" import re -from parser_tools import find_token, find_token_exact, find_tokens, find_end_of, get_value +import unicodedata + +from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value from LyX import get_encoding @@ -28,9 +30,13 @@ from LyX import get_encoding # Private helper functions def find_end_of_inset(lines, i): - " Find beginning of inset, where lines[i] is included." + " Find end of inset, where lines[i] is included." return find_end_of(lines, i, "\\begin_inset", "\\end_inset") +def find_end_of_layout(lines, i): + " Find end of layout, where lines[i] is included." + return find_end_of(lines, i, "\\begin_layout", "\\end_layout") + # End of helper functions #################################################################### @@ -217,18 +223,81 @@ def revert_booktabs(document): i = i + 1 +def convert_multiencoding(document, forward): + """ Fix files with multiple encodings. +Files with an inputencoding of "auto" or "default" and multiple languages +where at least two languages have different default encodings are encoded +in multiple encodings for file formats < 249. These files are incorrectly +read and written (as if the whole file was in the encoding of the main +language). +This is not true for files written by CJK-LyX, they are always in the locale +encoding. + +This function +- converts from fake unicode values to true unicode if forward is true, and +- converts from true unicode values to fake unicode if forward is false. +document.encoding must be set to the old value (format 248) in both cases. + +We do this here and not in LyX.py because it is far easier to do the +necessary parsing in modern formats than in ancient ones. +""" + if document.cjk_encoding != '': + return + encoding_stack = [document.encoding] + lang_re = re.compile(r"^\\lang\s(\S+)") + if document.inputencoding == "auto" or document.inputencoding == "default": + for i in range(len(document.body)): + result = lang_re.match(document.body[i]) + if result: + language = result.group(1) + if language == "default": + document.warning("Resetting encoding from %s to %s." % (encoding_stack[-1], document.encoding)) + encoding_stack[-1] = document.encoding + else: + from lyx2lyx_lang import lang + document.warning("Setting encoding from %s to %s." % (encoding_stack[-1], lang[language][3])) + encoding_stack[-1] = lang[language][3] + elif find_token(document.body, "\\begin_layout", i, i + 1) == i: + document.warning("Adding nested encoding %s." % encoding_stack[-1]) + encoding_stack.append(encoding_stack[-1]) + elif find_token(document.body, "\\end_layout", i, i + 1) == i: + document.warning("Removing nested encoding %s." % encoding_stack[-1]) + del encoding_stack[-1] + if encoding_stack[-1] != document.encoding: + if forward: + # This line has been incorrectly interpreted as if it was + # encoded in 'encoding'. + # Convert back to the 8bit string that was in the file. + orig = document.body[i].encode(document.encoding) + # Convert the 8bit string that was in the file to unicode + # with the correct encoding. + document.body[i] = orig.decode(encoding_stack[-1]) + else: + # Convert unicode to the 8bit string that will be written + # to the file with the correct encoding. + orig = document.body[i].encode(encoding_stack[-1]) + # Convert the 8bit string that will be written to the + # file to fake unicode with the encoding that will later + # be used when writing to the file. + document.body[i] = orig.decode(document.encoding) + + def convert_utf8(document): + " Set document encoding to UTF-8. " + convert_multiencoding(document, True) document.encoding = "utf8" def revert_utf8(document): + " Set document encoding to the value corresponding to inputencoding. " i = find_token(document.header, "\\inputencoding", 0) if i == -1: document.header.append("\\inputencoding auto") elif get_value(document.header, "\\inputencoding", i) == "utf8": document.header[i] = "\\inputencoding auto" document.inputencoding = get_value(document.header, "\\inputencoding", 0) - document.encoding = get_encoding(document.language, document.inputencoding, 248) + document.encoding = get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding) + convert_multiencoding(document, False) def revert_cs_label(document): @@ -605,7 +674,7 @@ def revert_esint(document): def revert_clearpage(document): - " clearpage -> ERT" + " clearpage -> ERT " i = 0 while 1: i = find_token(document.body, "\\clearpage", i) @@ -626,7 +695,7 @@ def revert_clearpage(document): def revert_cleardoublepage(document): - " cleardoublepage -> ERT" + " cleardoublepage -> ERT " i = 0 while 1: i = find_token(document.body, "\\cleardoublepage", i) @@ -646,6 +715,485 @@ def revert_cleardoublepage(document): i = i + 1 +def convert_lyxline(document): + " remove fontsize commands for \lyxline " + # The problematic is: The old \lyxline definition doesn't handle the fontsize + # to change the line thickness. The new definiton does this so that imported + # \lyxlines would have a different line thickness. The eventual fontsize command + # before \lyxline is therefore removed to get the same output. + fontsizes = ["tiny", "scriptsize", "footnotesize", "small", "normalsize", + "large", "Large", "LARGE", "huge", "Huge"] + for n in range(0, len(fontsizes)): + i = 0 + k = 0 + while i < len(document.body): + i = find_token(document.body, "\\size " + fontsizes[n], i) + k = find_token(document.body, "\\lyxline",i) + # the corresponding fontsize command is always 2 lines before the \lyxline + if (i != -1 and k == i+2): + document.body[i:i+1] = [] + else: + break + i = i + 1 + + +def revert_encodings(document): + " Set new encodings to auto. " + encodings = ["8859-6", "8859-8", "cp437", "cp437de", "cp850", "cp852", + "cp855", "cp858", "cp862", "cp865", "cp866", "cp1250", + "cp1252", "cp1256", "cp1257", "latin10", "pt254", "tis620-0"] + i = find_token(document.header, "\\inputencoding", 0) + if i == -1: + document.header.append("\\inputencoding auto") + else: + inputenc = get_value(document.header, "\\inputencoding", i) + if inputenc in encodings: + document.header[i] = "\\inputencoding auto" + document.inputencoding = get_value(document.header, "\\inputencoding", 0) + + +def convert_caption(document): + " Convert caption layouts to caption insets. " + i = 0 + while 1: + i = find_token(document.body, "\\begin_layout Caption", i) + if i == -1: + return + j = find_end_of_layout(document.body, i) + if j == -1: + document.warning("Malformed LyX document: Missing `\\end_layout'.") + return + + document.body[j:j] = ["\\end_layout", "", "\\end_inset", "", ""] + document.body[i:i+1] = ["\\begin_layout %s" % document.default_layout, + "\\begin_inset Caption", "", + "\\begin_layout %s" % document.default_layout] + i = i + 1 + + +def revert_caption(document): + " Convert caption insets to caption layouts. " + " This assumes that the text class has a caption style. " + i = 0 + while 1: + i = find_token(document.body, "\\begin_inset Caption", i) + if i == -1: + return + + # We either need to delete the previous \begin_layout line, or we + # need to end the previous layout if this inset is not in the first + # position of the paragraph. + layout_before = find_token_backwards(document.body, "\\begin_layout", i) + if layout_before == -1: + document.warning("Malformed LyX document: Missing `\\begin_layout'.") + return + layout_line = document.body[layout_before] + del_layout_before = True + l = layout_before + 1 + while l < i: + if document.body[l] != "": + del_layout_before = False + break + l = l + 1 + if del_layout_before: + del document.body[layout_before:i] + i = layout_before + else: + document.body[i:i] = ["\\end_layout", ""] + i = i + 2 + + # Find start of layout in the inset and end of inset + j = find_token(document.body, "\\begin_layout", i) + if j == -1: + document.warning("Malformed LyX document: Missing `\\begin_layout'.") + return + k = find_end_of_inset(document.body, i) + if k == -1: + document.warning("Malformed LyX document: Missing `\\end_inset'.") + return + + # We either need to delete the following \end_layout line, or we need + # to restart the old layout if this inset is not at the paragraph end. + layout_after = find_token(document.body, "\\end_layout", k) + if layout_after == -1: + document.warning("Malformed LyX document: Missing `\\end_layout'.") + return + del_layout_after = True + l = k + 1 + while l < layout_after: + if document.body[l] != "": + del_layout_after = False + break + l = l + 1 + if del_layout_after: + del document.body[k+1:layout_after+1] + else: + document.body[k+1:k+1] = [layout_line, ""] + + # delete \begin_layout and \end_inset and replace \begin_inset with + # "\begin_layout Caption". This works because we can only have one + # paragraph in the caption inset: The old \end_layout will be recycled. + del document.body[k] + if document.body[k] == "": + del document.body[k] + del document.body[j] + if document.body[j] == "": + del document.body[j] + document.body[i] = "\\begin_layout Caption" + if document.body[i+1] == "": + del document.body[i+1] + i = i + 1 + + +# Accents of InsetLaTeXAccent +accent_map = { + "`" : u'\u0300', # grave + "'" : u'\u0301', # acute + "^" : u'\u0302', # circumflex + "~" : u'\u0303', # tilde + "=" : u'\u0304', # macron + "u" : u'\u0306', # breve + "." : u'\u0307', # dot above + "\"": u'\u0308', # diaresis + "r" : u'\u030a', # ring above + "H" : u'\u030b', # double acute + "v" : u'\u030c', # caron + "b" : u'\u0320', # minus sign below + "d" : u'\u0323', # dot below + "c" : u'\u0327', # cedilla + "k" : u'\u0328', # ogonek + "t" : u'\u0361' # tie. This is special: It spans two characters, but + # only one is given as argument, so we don't need to + # treat it differently. +} + + +# special accents of InsetLaTeXAccent without argument +special_accent_map = { + 'i' : u'\u0131', # dotless i + 'j' : u'\u0237', # dotless j + 'l' : u'\u0142', # l with stroke + 'L' : u'\u0141' # L with stroke +} + + +# special accent arguments of InsetLaTeXAccent +accented_map = { + '\\i' : u'\u0131', # dotless i + '\\j' : u'\u0237' # dotless j +} + + +def _convert_accent(accent, accented_char): + type = accent + char = accented_char + if char == '': + if type in special_accent_map: + return special_accent_map[type] + # a missing char is treated as space by LyX + char = ' ' + elif type == 'q' and char in ['t', 'd', 'l', 'L']: + # Special caron, only used with t, d, l and L. + # It is not in the map because we convert it to the same unicode + # character as the normal caron: \q{} is only defined if babel with + # the czech or slovak language is used, and the normal caron + # produces the correct output if the T1 font encoding is used. + # For the same reason we never convert to \q{} in the other direction. + type = 'v' + elif char in accented_map: + char = accented_map[char] + elif (len(char) > 1): + # We can only convert accents on a single char + return '' + a = accent_map.get(type) + if a: + return unicodedata.normalize("NFKC", "%s%s" % (char, a)) + return '' + + +def convert_ertbackslash(body, i, ert, default_layout): + r""" ------------------------------------------------------------------------------------------- + Convert backslashes and '\n' into valid ERT code, append the converted + text to body[i] and return the (maybe incremented) line index i""" + + for c in ert: + if c == '\\': + body[i] = body[i] + '\\backslash ' + i = i + 1 + body.insert(i, '') + elif c == '\n': + body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, ''] + i = i + 4 + else: + body[i] = body[i] + c + return i + + +def convert_accent(document): + # The following forms are supported by LyX: + # '\i \"{a}' (standard form, as written by LyX) + # '\i \"{}' (standard form, as written by LyX if the accented char is a space) + # '\i \"{ }' (also accepted if the accented char is a space) + # '\i \" a' (also accepted) + # '\i \"' (also accepted) + re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$') + re_contents = re.compile(r'^([^\s{]+)(.*)$') + re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$') + i = 0 + while 1: + i = find_re(document.body, re_wholeinset, i) + if i == -1: + return + match = re_wholeinset.match(document.body[i]) + prefix = match.group(1) + contents = match.group(3).strip() + match = re_contents.match(contents) + if match: + # Strip first char (always \) + accent = match.group(1)[1:] + accented_contents = match.group(2).strip() + match = re_accentedcontents.match(accented_contents) + accented_char = match.group(1) + converted = _convert_accent(accent, accented_char) + if converted == '': + # Normalize contents + contents = '%s{%s}' % (accent, accented_char), + else: + document.body[i] = '%s%s' % (prefix, converted) + i += 1 + continue + document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents) + document.body[i] = prefix + document.body[i+1:i+1] = ['\\begin_inset ERT', + 'status collapsed', + '', + '\\begin_layout %s' % document.default_layout, + '', + '', + ''] + i = convert_ertbackslash(document.body, i + 7, + '\\%s' % contents, + document.default_layout) + document.body[i+1:i+1] = ['\\end_layout', + '', + '\\end_inset'] + i += 3 + + +def revert_accent(document): + inverse_accent_map = {} + for k in accent_map: + inverse_accent_map[accent_map[k]] = k + inverse_special_accent_map = {} + for k in special_accent_map: + inverse_special_accent_map[special_accent_map[k]] = k + inverse_accented_map = {} + for k in accented_map: + inverse_accented_map[accented_map[k]] = k + + # Since LyX may insert a line break within a word we must combine all + # words before unicode normalization. + # We do this only if the next line starts with an accent, otherwise we + # would create things like '\begin_inset ERTstatus'. + numberoflines = len(document.body) + for i in range(numberoflines-1): + if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ': + continue + if (document.body[i+1][0] in inverse_accent_map): + # the last character of this line and the first of the next line + # form probably a surrogate pair. + while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '): + document.body[i] += document.body[i+1][0] + document.body[i+1] = document.body[i+1][1:] + + # Normalize to "Normal form D" (NFD, also known as canonical decomposition). + # This is needed to catch all accented characters. + for i in range(numberoflines): + # Unfortunately we have a mixture of unicode strings and plain strings, + # because we never use u'xxx' for string literals, but 'xxx'. + # Therefore we may have to try two times to normalize the data. + try: + document.body[i] = unicodedata.normalize("NFKD", document.body[i]) + except TypeError: + document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8')) + + # Replace accented characters with InsetLaTeXAccent + # Do not convert characters that can be represented in the chosen + # encoding. + encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)] + lang_re = re.compile(r"^\\lang\s(\S+)") + for i in range(len(document.body)): + + if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '': + # Track the encoding of the current line + result = lang_re.match(document.body[i]) + if result: + language = result.group(1) + if language == "default": + encoding_stack[-1] = document.encoding + else: + from lyx2lyx_lang import lang + encoding_stack[-1] = lang[language][3] + continue + elif find_token(document.body, "\\begin_layout", i, i + 1) == i: + encoding_stack.append(encoding_stack[-1]) + continue + elif find_token(document.body, "\\end_layout", i, i + 1) == i: + del encoding_stack[-1] + continue + + for j in range(len(document.body[i])): + # dotless i and dotless j are both in special_accent_map and can + # occur as an accented character, so we need to test that the + # following character is no accent + if (document.body[i][j] in inverse_special_accent_map and + (j == len(document.body[i]) - 1 or document.body[i][j+1] not in inverse_accent_map)): + accent = document.body[i][j] + try: + dummy = accent.encode(encoding_stack[-1]) + except UnicodeEncodeError: + # Insert the rest of the line as new line + if j < len(document.body[i]) - 1: + document.body[i+1:i+1] = document.body[i][j+1:] + # Delete the accented character + if j > 0: + document.body[i] = document.body[i][:j-1] + else: + document.body[i] = u'' + # Finally add the InsetLaTeXAccent + document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent] + break + elif j > 0 and document.body[i][j] in inverse_accent_map: + accented_char = document.body[i][j-1] + if accented_char == ' ': + # Conform to LyX output + accented_char = '' + elif accented_char in inverse_accented_map: + accented_char = inverse_accented_map[accented_char] + accent = document.body[i][j] + try: + dummy = unicodedata.normalize("NFKC", accented_char + accent).encode(encoding_stack[-1]) + except UnicodeEncodeError: + # Insert the rest of the line as new line + if j < len(document.body[i]) - 1: + document.body[i+1:i+1] = document.body[i][j+1:] + # Delete the accented characters + if j > 1: + document.body[i] = document.body[i][:j-2] + else: + document.body[i] = u'' + # Finally add the InsetLaTeXAccent + document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char) + break + # Normalize to "Normal form C" (NFC, pre-composed characters) again + for i in range(numberoflines): + document.body[i] = unicodedata.normalize("NFKC", document.body[i]) + + +def normalize_font_whitespace(document): + """ Before format 259 the font changes were ignored if a + whitespace was the first or last character in the sequence, this function + transfers the whitespace outside.""" + + if document.backend != "latex": + return + + lines = document.body + + char_properties = {"\\series": "default", + "\\emph": "default", + "\\color": "none", + "\\shape": "default", + "\\bar": "default", + "\\family": "default"} + changes = {} + + i = 0 + while i < len(lines): + words = lines[i].split() + + if len(words) > 0 and words[0] == "\\begin_layout": + # a new paragraph resets all font changes + changes.clear() + + elif len(words) > 1 and words[0] in char_properties.keys(): + # we have a font change + if char_properties[words[0]] == words[1]: + # property gets reset + if words[0] in changes.keys(): + del changes[words[0]] + defaultproperty = True + else: + # property gets set + changes[words[0]] = words[1] + defaultproperty = False + + # We need to explicitly reset all changed properties if we find + # a space below, because LyX 1.4 would output the space after + # closing the previous change and before starting the new one, + # and closing a font change means to close all properties, not + # just the changed one. + + if lines[i-1] and lines[i-1][-1] == " ": + lines[i-1] = lines[i-1][:-1] + # a space before the font change + added_lines = [" "] + for k in changes.keys(): + # exclude property k because that is already in lines[i] + if k != words[0]: + added_lines[1:1] = ["%s %s" % (k, changes[k])] + for k in changes.keys(): + # exclude property k because that must be added below anyway + if k != words[0]: + added_lines[0:0] = ["%s %s" % (k, char_properties[k])] + if defaultproperty: + # Property is reset in lines[i], so add the new stuff afterwards + lines[i+1:i+1] = added_lines + else: + # Reset property for the space + added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])] + lines[i:i] = added_lines + i = i + len(added_lines) + + elif lines[i+1] and lines[i+1][0] == " " and (len(changes) > 0 or not defaultproperty): + # a space after the font change + if (lines[i+1] == " " and lines[i+2]): + next_words = lines[i+2].split() + if len(next_words) > 0 and next_words[0] == words[0]: + # a single blank with a property different from the + # previous and the next line must not be changed + i = i + 2 + continue + lines[i+1] = lines[i+1][1:] + added_lines = [" "] + for k in changes.keys(): + # exclude property k because that is already in lines[i] + if k != words[0]: + added_lines[1:1] = ["%s %s" % (k, changes[k])] + for k in changes.keys(): + # exclude property k because that must be added below anyway + if k != words[0]: + added_lines[0:0] = ["%s %s" % (k, char_properties[k])] + # Reset property for the space + added_lines[0:0] = ["%s %s" % (words[0], char_properties[words[0]])] + lines[i:i] = added_lines + i = i + len(added_lines) + + i = i + 1 + + +def revert_utf8x(document): + " Set utf8x encoding to utf8. " + i = find_token(document.header, "\\inputencoding", 0) + if i == -1: + document.header.append("\\inputencoding auto") + else: + inputenc = get_value(document.header, "\\inputencoding", i) + if inputenc == "utf8x": + document.header[i] = "\\inputencoding utf8" + document.inputencoding = get_value(document.header, "\\inputencoding", 0) + + ## # Conversion hub # @@ -660,15 +1208,25 @@ convert = [[246, []], [252, [convert_commandparams, convert_bibitem]], [253, []], [254, [convert_esint]], - [255, []]] + [255, []], + [256, []], + [257, [convert_caption]], + [258, [convert_lyxline]], + [259, [convert_accent, normalize_font_whitespace]], + [260, []]] -revert = [[254, [revert_clearpage, revert_cleardoublepage]], +revert = [[259, [revert_utf8x]], + [258, []], + [257, []], + [256, [revert_caption]], + [255, [revert_encodings]], + [254, [revert_clearpage, revert_cleardoublepage]], [253, [revert_esint]], [252, [revert_nomenclature, revert_printnomenclature]], [251, [revert_commandparams]], [250, [revert_cs_label]], [249, []], - [248, [revert_utf8]], + [248, [revert_accent, revert_utf8]], [247, [revert_booktabs]], [246, [revert_font_settings]], [245, [revert_framed]]] @@ -677,3 +1235,4 @@ revert = [[254, [revert_clearpage, revert_cleardoublepage]], if __name__ == "__main__": pass +