From 2fce4d49ee685e5af8b3bf8dae551de30144ee5f Mon Sep 17 00:00:00 2001 From: =?utf8?q?G=C3=BCnter=20Milde?= Date: Wed, 31 Jan 2018 15:09:32 +0100 Subject: [PATCH] lyx2lyx refactoring. * use unicode.transform() instead of loop over replacements * telling variable names * remove trailing whitespace * documentation update * don't set use_ligature_dashes if both dash types are found * remove spurious warning, normalize indentation, and use Python idioms in revert_baselineskip() --- autotests/export/lyx2lyx/lyx_2_3_test.lyx | 66 ++++++++++- lib/lyx2lyx/lyx2lyx_tools.py | 59 +++++----- lib/lyx2lyx/lyx_1_6.py | 52 +-------- lib/lyx2lyx/lyx_2_2.py | 4 +- lib/lyx2lyx/lyx_2_3.py | 136 +++++++++------------- lib/lyx2lyx/parser_tools.py | 13 ++- lib/lyx2lyx/test_lyx2lyx_tools.py | 52 +++++++++ lib/lyx2lyx/unicode_symbols.py | 69 ++++++----- 8 files changed, 251 insertions(+), 200 deletions(-) create mode 100644 lib/lyx2lyx/test_lyx2lyx_tools.py diff --git a/autotests/export/lyx2lyx/lyx_2_3_test.lyx b/autotests/export/lyx2lyx/lyx_2_3_test.lyx index 33eae27f73..dbed95330e 100644 --- a/autotests/export/lyx2lyx/lyx_2_3_test.lyx +++ b/autotests/export/lyx2lyx/lyx_2_3_test.lyx @@ -94,7 +94,7 @@ Test reversion/conversion between 2.3 and 2.2 formats with lyx2lyx. \end_layout \begin_layout Description -Allowbreak: +allowbreak: \bar under \begin_inset Box Boxed @@ -169,5 +169,69 @@ without spaces. 
\end_layout +\begin_layout Description +baselineskip%: +\begin_inset Box Boxed +position "t" +hor_pos "c" +has_inner_box 1 +inner_pos "t" +use_parbox 1 +use_makebox 0 +width "250baselineskip%" +special "none" +height "50baselineskip%" +height_special "none" +thickness "4baselineskip%" +separation "9baselineskip%" +shadowsize "4pt" +framecolor "black" +backgroundcolor "none" +status open + +\begin_layout Plain Layout +test +\end_layout + +\end_inset + + +\begin_inset CommandInset line +LatexCommand rule +offset "40baselineskip%" +width "800baselineskip%" +height "5.3baselineskip%" + +\end_inset + + +\end_layout + +\begin_deeper +\begin_layout Standard +\begin_inset VSpace 200baselineskip% +\end_inset + + +\end_layout + +\begin_layout Standard +Vertical space above this paragraph is 2·baselineskip. +\end_layout + +\begin_layout Standard +\begin_inset space \hspace*{} +\length 75.2baselineskip% +\end_inset + +Paragraph with +\begin_inset space \hspace{} +\length 135baselineskip% +\end_inset + +horizontal space insets using baselineskip. +\end_layout + +\end_deeper \end_body \end_document diff --git a/lib/lyx2lyx/lyx2lyx_tools.py b/lib/lyx2lyx/lyx2lyx_tools.py index f63f402616..2f75cdcda3 100644 --- a/lib/lyx2lyx/lyx2lyx_tools.py +++ b/lib/lyx2lyx/lyx2lyx_tools.py @@ -17,8 +17,8 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ''' -This module offers several free functions to help with lyx2lyx'ing. -More documentaton is below, but here is a quick guide to what +This module offers several free functions to help with lyx2lyx'ing. +More documentaton is below, but here is a quick guide to what they do. Optional arguments are marked by brackets. add_to_preamble(document, text): @@ -37,8 +37,8 @@ insert_to_preamble(document, text[, index]): default index is 0, so the material is inserted at the beginning. Prepends a comment "% Added by lyx2lyx" to text. 
-put_cmd_in_ert(arg): - Here arg should be a list of strings (lines), which we want to +put_cmd_in_ert(cmd): + Here cmd should be a list of strings (lines), which we want to wrap in ERT. Returns a list of strings so wrapped. A call to this routine will often go something like this: i = find_token('\\begin_inset FunkyInset', ...) @@ -81,7 +81,6 @@ import string from parser_tools import find_token, find_end_of_inset from unicode_symbols import unicode_reps - # This will accept either a list of lines or a single line. # It is bad practice to pass something with embedded newlines, # though we will handle that. @@ -118,34 +117,37 @@ def add_to_preamble(document, text): # It should really be a list. def insert_to_preamble(document, text, index = 0): """ Insert text to the preamble at a given line""" - + if not type(text) is list: # split on \n just in case # it'll give us the one element list we want # if there's no \n, too text = text.split('\n') - + text.insert(0, "% Added by lyx2lyx") document.preamble[index:index] = text -def put_cmd_in_ert(arg): - ''' - arg should be a list of lines we want to wrap in ERT. - Returns a list of strings, with the lines so wrapped. - ''' - +# A dictionary of Unicode->LICR mappings for use in a Unicode string's translate() method +# Created from the reversed list to keep the first of alternative definitions. +licr_table = dict((ord(ch), cmd) for cmd, ch in unicode_reps[::-1]) + +def put_cmd_in_ert(cmd): + """ + Return ERT inset wrapping `cmd` as a list of strings. + + `cmd` can be a string or list of lines. Non-ASCII characters are converted + to the respective LICR macros if defined in unicodesymbols. + """ ret = ["\\begin_inset ERT", "status collapsed", "", "\\begin_layout Plain Layout", ""] - # It will be faster for us to work with a single string internally. - # That way, we only go through the unicode_reps loop once. - if type(arg) is list: - s = "\n".join(arg) + # It will be faster to work with a single string internally. 
+ if isinstance(cmd, list): + cmd = u"\n".join(cmd) else: - s = arg - for rep in unicode_reps: - s = s.replace(rep[1], rep[0]) - s = s.replace('\\', "\\backslash\n") - ret += s.splitlines() + cmd = u"%s" % cmd # ensure it is an unicode instance + cmd = cmd.translate(licr_table) + cmd = cmd.replace("\\", "\\backslash\n") + ret += cmd.splitlines() ret += ["\\end_layout", "", "\\end_inset"] return ret @@ -300,7 +302,7 @@ def lyx2verbatim(document, lines): def latex_length(slen): - ''' + ''' Convert lengths to their LaTeX representation. Returns (bool, length), where the bool tells us if it was a percentage, and the length is the LaTeX representation. @@ -314,9 +316,14 @@ def latex_length(slen): # the + always precedes the - # Convert relative lengths to LaTeX units - units = {"text%":"\\textwidth", "col%":"\\columnwidth", - "page%":"\\paperwidth", "line%":"\\linewidth", - "theight%":"\\textheight", "pheight%":"\\paperheight"} + units = {"col%": "\\columnwidth", + "text%": "\\textwidth", + "page%": "\\paperwidth", + "line%": "\\linewidth", + "theight%": "\\textheight", + "pheight%": "\\paperheight", + "baselineskip%": "\\baselineskip" + } for unit in list(units.keys()): i = slen.find(unit) if i == -1: diff --git a/lib/lyx2lyx/lyx_1_6.py b/lib/lyx2lyx/lyx_1_6.py index 071280836b..c022b875aa 100644 --- a/lib/lyx2lyx/lyx_1_6.py +++ b/lib/lyx2lyx/lyx_1_6.py @@ -23,7 +23,7 @@ import unicodedata import sys, os from parser_tools import find_token, find_end_of, find_tokens, get_value -from unicode_symbols import read_unicodesymbols +from unicode_symbols import unicode_reps #################################################################### # Private helper functions @@ -146,54 +146,6 @@ def set_option(document, m, option, value): return l -# FIXME: Remove this function if the version imported from unicode_symbols works. -# This function was the predecessor from that function, that in the meanwhile got -# new fixes. 
-def read_unicodesymbols2(): - " Read the unicodesymbols list of unicode characters and corresponding commands." - - # Provide support for both python 2 and 3 - PY2 = sys.version_info[0] == 2 - if not PY2: - unichr = chr - # End of code to support for both python 2 and 3 - - pathname = os.path.abspath(os.path.dirname(sys.argv[0])) - fp = open(os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols')) - spec_chars = [] - # Two backslashes, followed by some non-word character, and then a character - # in brackets. The idea is to check for constructs like: \"{u}, which is how - # they are written in the unicodesymbols file; but they can also be written - # as: \"u or even \" u. - r = re.compile(r'\\\\(\W)\{(\w)\}') - for line in fp.readlines(): - if line[0] != '#' and line.strip() != "": - line=line.replace(' "',' ') # remove all quotation marks with spaces before - line=line.replace('" ',' ') # remove all quotation marks with spaces after - line=line.replace(r'\"','"') # replace \" by " (for characters with diaeresis) - try: - [ucs4,command,dead] = line.split(None,2) - if command[0:1] != "\\": - continue - spec_chars.append([command, unichr(eval(ucs4))]) - except: - continue - m = r.match(command) - if m != None: - command = "\\\\" - # If the character is a double-quote, then we need to escape it, too, - # since it is done that way in the LyX file. - if m.group(1) == "\"": - command += "\\" - commandbl = command - command += m.group(1) + m.group(2) - commandbl += m.group(1) + ' ' + m.group(2) - spec_chars.append([command, unichr(eval(ucs4))]) - spec_chars.append([commandbl, unichr(eval(ucs4))]) - fp.close() - return spec_chars - - def extract_argument(line): 'Extracts a LaTeX argument from the start of line. Returns (arg, rest).' @@ -280,8 +232,6 @@ def latex2ert(line, isindex): return retval -unicode_reps = read_unicodesymbols() - #Bug 5022.... #Might should do latex2ert first, then deal with stuff that DOESN'T #end up inside ERT. 
That routine could be modified so that it returned diff --git a/lib/lyx2lyx/lyx_2_2.py b/lib/lyx2lyx/lyx_2_2.py index 664dd6c3f7..3ba32b31e5 100644 --- a/lib/lyx2lyx/lyx_2_2.py +++ b/lib/lyx2lyx/lyx_2_2.py @@ -746,10 +746,10 @@ def convert_phrases(document): if len(words) > 1 and words[0] == "\\begin_inset" and \ words[1] in ["CommandInset", "External", "Formula", "Graphics", "listings"]: # must not replace anything in insets that store LaTeX contents in .lyx files - # (math and command insets withut overridden read() and write() methods + # (math and command insets without overridden read() and write() methods) j = find_end_of_inset(document.body, i) if j == -1: - document.warning("Malformed LyX document: Can't find end of Formula inset at line " + str(i)) + document.warning("Malformed LyX document: Can't find end of inset at line " + str(i)) i += 1 else: i = j diff --git a/lib/lyx2lyx/lyx_2_3.py b/lib/lyx2lyx/lyx_2_3.py index 310de38899..6551175515 100644 --- a/lib/lyx2lyx/lyx_2_3.py +++ b/lib/lyx2lyx/lyx_2_3.py @@ -27,15 +27,14 @@ import sys, os from parser_tools import (del_token, del_value, del_complete_lines, find_complete_lines, find_end_of, find_end_of_layout, find_end_of_inset, find_re, find_token, find_token_backwards, get_containing_inset, - get_containing_layout, get_bool_value, get_value, get_quoted_value) -# find_tokens, find_token_exact, is_in_inset, -# check_token, get_option_value + get_containing_layout, get_bool_value, get_value, get_quoted_value, + is_in_inset) +# find_tokens, find_token_exact, check_token, get_option_value -from lyx2lyx_tools import add_to_preamble, put_cmd_in_ert, revert_font_attrs, \ - insert_to_preamble -# get_ert, lyx2latex, \ -# lyx2verbatim, length_in_bp, convert_info_insets -# latex_length, revert_flex_inset, hex2ratio, str2bool +from lyx2lyx_tools import (add_to_preamble, put_cmd_in_ert, revert_font_attrs, + insert_to_preamble, latex_length) +# get_ert, lyx2latex, lyx2verbatim, length_in_bp, convert_info_insets +# 
revert_flex_inset, hex2ratio, str2bool #################################################################### # Private helper functions @@ -88,13 +87,12 @@ def convert_dateinset(document): continue if get_value(document.body, 'template', i, j) == "Date": document.body[i : j + 1] = put_cmd_in_ert("\\today ") - i += 1 - continue + i = j+1 # skip inset def convert_inputenc(document): " Replace no longer supported input encoding settings. " - i = find_token(document.header, "\\inputenc", 0) + i = find_token(document.header, "\\inputenc") if i == -1: return if get_value(document.header, "\\inputencoding", i) == "pt254": @@ -1859,7 +1857,7 @@ def convert_dashligatures(document): while i+1 < len(lines): i += 1 line = lines[i] - # skip lines without any dashes: + # skip lines without dashes: if not re.search(u"[\u2013\u2014]|\\twohyphens|\\threehyphens", line): continue # skip label width string (see bug 10243): @@ -1867,28 +1865,28 @@ def convert_dashligatures(document): continue # do not touch hyphens in some insets (cf. 
lyx_2_2.convert_dashes): try: - value, start, end = get_containing_inset(lines, i) + inset_type, start, end = get_containing_inset(lines, i) except TypeError: # no containing inset - value, start, end = "no inset", -1, -1 - if (value.split()[0] in + inset_type, start, end = "no inset", -1, -1 + if (inset_type.split()[0] in ["CommandInset", "ERT", "External", "Formula", "FormulaMacro", "Graphics", "IPA", "listings"] - or value == "Flex Code"): + or inset_type == "Flex Code"): i = end continue try: - layout, start, end, j = get_containing_layout(lines, i) + layoutname, start, end, j = get_containing_layout(lines, i) except TypeError: # no (or malformed) containing layout document.warning("Malformed LyX document: " "Can't find layout at line %d" % i) continue - if layout == "LyX-Code": + if layoutname == "LyX-Code": i = end continue # literal dash followed by a word or no-break space: - if re.search(u"[\u2013\u2014]([\w\u00A0]|$)", line, - flags=re.UNICODE): + if re.search(u"[\u2013\u2014]([\w\u00A0]|$)", + line, flags=re.UNICODE): has_literal_dashes = True # ligature dash followed by word or no-break space on next line: if (re.search(r"(\\twohyphens|\\threehyphens)", line) and @@ -1900,14 +1898,15 @@ def convert_dashligatures(document): '"ligature" dashes.\n Line breaks may have changed. 
' 'See UserGuide chapter 3.9.1 for details.') break - if has_literal_dashes: + + if has_literal_dashes and not has_ligature_dashes: use_dash_ligatures = False - elif has_ligature_dashes: + elif has_ligature_dashes and not has_literal_dashes: use_dash_ligatures = True + # insert the setting if there is a preferred value if use_dash_ligatures is not None: - i = find_token(document.header, "\\graphics") - document.header.insert(i, "\\use_dash_ligatures %s" + document.header.insert(-1, "\\use_dash_ligatures %s" % str(use_dash_ligatures).lower()) @@ -2020,64 +2019,37 @@ def revert_mathindent(document): def revert_baselineskip(document): - " Revert baselineskips to TeX code " - i = 0 - vspaceLine = 0 - hspaceLine = 0 - while True: - regexp = re.compile(r'^.*baselineskip%.*$') - i = find_re(document.body, regexp, i) - if i == -1: - return - vspaceLine = find_token(document.body, "\\begin_inset VSpace", i) - if vspaceLine == i: - # output VSpace inset as TeX code - # first read out the values - beg = document.body[i].rfind("VSpace "); - end = document.body[i].rfind("baselineskip%"); - baselineskip = float(document.body[i][beg + 7:end]); - # we store the value in percent, thus divide by 100 - baselineskip = baselineskip/100; - baselineskip = str(baselineskip); - # check if it is the starred version - if document.body[i].find('*') != -1: - star = '*' - else: - star = '' - # now output TeX code - endInset = find_end_of_inset(document.body, i) - if endInset == -1: - document.warning("Malformed LyX document: Missing '\\end_inset' of VSpace inset.") - return - else: - document.body[vspaceLine: endInset + 1] = put_cmd_in_ert("\\vspace" + star + '{' + baselineskip + "\\baselineskip}") - hspaceLine = find_token(document.body, "\\begin_inset space \\hspace", i - 1) - document.warning("hspaceLine: " + str(hspaceLine)) - document.warning("i: " + str(i)) - if hspaceLine == i - 1: - # output space inset as TeX code - # first read out the values - beg = document.body[i].rfind("\\length 
"); - end = document.body[i].rfind("baselineskip%"); - baselineskip = float(document.body[i][beg + 7:end]); - document.warning("baselineskip: " + str(baselineskip)) - # we store the value in percent, thus divide by 100 - baselineskip = baselineskip/100; - baselineskip = str(baselineskip); - # check if it is the starred version - if document.body[i-1].find('*') != -1: - star = '*' - else: - star = '' - # now output TeX code - endInset = find_end_of_inset(document.body, i) - if endInset == -1: - document.warning("Malformed LyX document: Missing '\\end_inset' of space inset.") - return - else: - document.body[hspaceLine: endInset + 1] = put_cmd_in_ert("\\hspace" + star + '{' + baselineskip + "\\baselineskip}") - - i = i + 1 + " Revert baselineskips to TeX code " + i = 0 + regexp = re.compile(r'.*baselineskip%.*') + while True: + i = i + 1 + i = find_re(document.body, regexp, i) + if i == -1: + return + if document.body[i].startswith("\\begin_inset VSpace"): + # output VSpace inset as TeX code + end = find_end_of_inset(document.body, i) + if end == -1: + document.warning("Malformed LyX document: " + "Can't find end of VSpace inset at line %d." 
% i) + continue + # read out the value + baselineskip = document.body[i].split()[-1] + # check if it is the starred version + star = '*' if '*' in document.body[i] else '' + # now output TeX code + cmd = "\\vspace%s{%s}" %(star, latex_length(baselineskip)[1]) + document.body[i:end+1] = put_cmd_in_ert(cmd) + i += 8 + continue + begin, end = is_in_inset(document.body, i, "\\begin_inset space \\hspace") + if begin != - 1: + # output space inset as TeX code + baselineskip = document.body[i].split()[-1] + star = '*' if '*' in document.body[i-1] else '' + cmd = "\\hspace%s{%s}" %(star, latex_length(baselineskip)[1]) + document.body[begin:end+1] = put_cmd_in_ert(cmd) def revert_rotfloat(document): diff --git a/lib/lyx2lyx/parser_tools.py b/lib/lyx2lyx/parser_tools.py index 06cb41df91..f36cd42b94 100644 --- a/lib/lyx2lyx/parser_tools.py +++ b/lib/lyx2lyx/parser_tools.py @@ -23,7 +23,7 @@ This module offers several free functions to help parse lines. More documentaton is below, but here is a quick guide to what they do. Optional arguments are marked by brackets. -find_token(lines, token, start[, end[, ignorews]]): +find_token(lines, token[, start[, end[, ignorews]]]): Returns the first line i, start <= i < end, on which token is found at the beginning. Returns -1 if not found. @@ -31,10 +31,10 @@ find_token(lines, token, start[, end[, ignorews]]): in whitespace do not count, except that there must be no extra whitespace following token itself. -find_token_exact(lines, token, start[, end]): +find_token_exact(lines, token[, start[, end]]): As find_token, but with ignorews set to True. -find_tokens(lines, tokens, start[, end[, ignorews]]): +find_tokens(lines, tokens[, start[, end[, ignorews]]]): Returns the first line i, start <= i < end, on which one of the tokens in tokens is found at the beginning. Returns -1 if not found.
@@ -42,7 +42,7 @@ find_tokens(lines, tokens, start[, end[, ignorews]]): in whitespace do not count, except that there must be no extra whitespace following token itself. -find_tokens_exact(lines, token, start[, end]): +find_tokens_exact(lines, token[, start[, end]]): As find_tokens, but with ignorews True. find_token_backwards(lines, token, start): @@ -543,8 +543,9 @@ def is_in_inset(lines, i, inset, default=(-1,-1)): is_in_inset(document.body, i, "\\begin_inset Tabular") returns (-1,-1) if `i` is not within a "Tabular" inset (i.e. a table). If it is, then it returns the line on which the table begins and the one - on which it ends. Note that this pair will evaulate to - boolean True, so + on which it ends. + Note that this pair will evaluate to boolean True, so (with the optional + default value set to False) if is_in_inset(..., default=False): will do what you expect. """ diff --git a/lib/lyx2lyx/test_lyx2lyx_tools.py b/lib/lyx2lyx/test_lyx2lyx_tools.py new file mode 100644 index 0000000000..9988d8dcf8 --- /dev/null +++ b/lib/lyx2lyx/test_lyx2lyx_tools.py @@ -0,0 +1,52 @@ +# This file is part of lyx2lyx +# -*- coding: utf-8 -*- +# Copyright (C) 2018 The LyX team +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +" This module tests the auxiliary functions for lyx2lyx."
+ +from lyx2lyx_tools import * + +import unittest + +class TestParserTools(unittest.TestCase): + + def test_put_cmd_in_ert(self): + ert = ['\\begin_inset ERT', + 'status collapsed', + '', + '\\begin_layout Plain Layout', + '', + u'\\backslash', + u'texttt{Gr\\backslash', + u'"{u}\\backslash', + u'ss{}e}', + '\\end_layout', + '', + '\\end_inset'] + self.assertEqual(put_cmd_in_ert(u"\\texttt{Grüße}"), ert) + self.assertEqual(put_cmd_in_ert([u"\\texttt{Grüße}"]), ert) + + def test_latex_length(self): + self.assertEqual(latex_length("-30.5col%"), (True, "-0.305\\columnwidth")) + self.assertEqual(latex_length("35baselineskip%"), (True, "0.35\\baselineskip")) + self.assertEqual(latex_length("11em"), (False, "11em")) + self.assertEqual(latex_length("-0.4pt"), (False, "-0.4pt")) + + + +if __name__ == '__main__': + unittest.main() diff --git a/lib/lyx2lyx/unicode_symbols.py b/lib/lyx2lyx/unicode_symbols.py index d9eeff9683..0c5032aceb 100644 --- a/lib/lyx2lyx/unicode_symbols.py +++ b/lib/lyx2lyx/unicode_symbols.py @@ -18,7 +18,7 @@ " Import unicode_reps from this module for access to the unicode<->LaTeX mapping. " -import sys, os, re +import sys, os, re, codecs # Provide support for both python 2 and 3 PY2 = sys.version_info[0] == 2 @@ -28,14 +28,13 @@ if not PY2: def read_unicodesymbols(): " Read the unicodesymbols list of unicode characters and corresponding commands." 
- pathname = os.path.abspath(os.path.dirname(sys.argv[0])) + pathname = os.path.abspath(os.path.dirname(__file__)) filename = os.path.join(pathname.strip('lyx2lyx'), 'unicodesymbols') - # For python 3+ we have to specify the encoding for those systems - # where the default is not UTF-8 - fp = open(filename, encoding="utf8") if (not PY2) else open(filename) + # Read as Unicode strings in both, Python 2 and 3 + # Specify the encoding for those systems where the default is not UTF-8 + fp = codecs.open(filename, encoding="utf8") - spec_chars = [] # A backslash, followed by some non-word character, and then a character # in brackets. The idea is to check for constructs like: \"{u}, which is how # they are written in the unicodesymbols file; but they can also be written @@ -43,36 +42,42 @@ def read_unicodesymbols(): # The two backslashes in the string literal are needed to specify a literal # backslash in the regex. Without r prefix, these would be four backslashes. r = re.compile(r'\\(\W)\{(\w)\}') + + spec_chars = [] for line in fp.readlines(): - if line[0] != '#' and line.strip() != "": - # Note: backslashes in the string literals with r prefix are not escaped, - # so one backslash in the source file equals one backslash in memory. - # Without r prefix backslahses are escaped, so two backslashes in the - # source file equal one backslash in memory. 
- line=line.replace(' "',' ') # remove all quotation marks with spaces before - line=line.replace('" ',' ') # remove all quotation marks with spaces after - line=line.replace(r'\"','"') # unescape " - line=line.replace(r'\\','\\') # unescape \ - try: - [ucs4,command,dead] = line.split(None,2) - if command[0:1] != "\\": - continue - if (line.find("notermination=text") < 0 and - line.find("notermination=both") < 0 and command[-1] != "}"): - command = command + "{}" - spec_chars.append([command, unichr(eval(ucs4))]) - except: + if not line.strip() or line.startswith('#'): + # skip empty lines and comments + continue + # Note: backslashes in the string literals with r prefix are not escaped, + # so one backslash in the source file equals one backslash in memory. + # Without r prefix backslashes are escaped, so two backslashes in the + # source file equal one backslash in memory. + line=line.replace(' "',' ') # remove all quotation marks with spaces before + line=line.replace('" ',' ') # remove all quotation marks with spaces after + line=line.replace(r'\"','"') # unescape " + line=line.replace(r'\\','\\') # unescape \ + try: + [ucs4,command,dead] = line.split(None,2) + if command[0:1] != "\\": continue - m = r.match(command) - if m != None: - command = "\\" - commandbl = command - command += m.group(1) + m.group(2) - commandbl += m.group(1) + ' ' + m.group(2) - spec_chars.append([command, unichr(eval(ucs4))]) - spec_chars.append([commandbl, unichr(eval(ucs4))]) + literal_char = unichr(int(ucs4, 16)) + if (line.find("notermination=text") < 0 and + line.find("notermination=both") < 0 and command[-1] != "}"): + command = command + "{}" + spec_chars.append([command, literal_char]) + except: + continue + m = r.match(command) + if m != None: + command = "\\" + commandbl = command + command += m.group(1) + m.group(2) + commandbl += m.group(1) + ' ' + m.group(2) + spec_chars.append([command, literal_char]) + spec_chars.append([commandbl, literal_char]) fp.close() return
spec_chars unicode_reps = read_unicodesymbols() + -- 2.39.2