insets = []
lang_re = re.compile(r"^\\lang\s(\S+)")
inset_re = re.compile(r"^\\begin_inset\s(\S+)")
+ if not forward: # no need to read file unless we are reverting
+ spec_chars = read_unicodesymbols()
+
if document.inputencoding == "auto" or document.inputencoding == "default":
- for i in range(len(document.body)):
+ i = 0
+ while i < len(document.body):
result = lang_re.match(document.body[i])
if result:
language = result.group(1)
# with the correct encoding.
document.body[i] = orig.decode(encoding_stack[-1])
else:
- # Convert unicode to the 8bit string that will be written
- # to the file with the correct encoding.
- orig = document.body[i].encode(encoding_stack[-1])
- # Convert the 8bit string that will be written to the
- # file to fake unicode with the encoding that will later
- # be used when writing to the file.
- document.body[i] = orig.decode(document.encoding)
+ try:
+ # Convert unicode to the 8bit string that will be written
+ # to the file with the correct encoding.
+ orig = document.body[i].encode(encoding_stack[-1])
+ # Convert the 8bit string that will be written to the
+ # file to fake unicode with the encoding that will later
+ # be used when writing to the file.
+ document.body[i] = orig.decode(document.encoding)
+ except:
+ mod_line = revert_unicode_line(document, i, insets, spec_chars)
+ document.body[i:i+1] = mod_line.split('\n')
+ i += len(mod_line.split('\n')) - 1
+ i += 1
def convert_utf8(document):
except:
pass
fp.close()
-
return spec_chars
def revert_unicode_line(document, i, insets, spec_chars, replacement_character='???'):
    """Return body line ``i`` with unencodable characters replaced.

    Every character of ``document.body[i]`` that cannot be encoded with
    ``document.encoding`` is replaced by the command stored for it in
    ``spec_chars``, wrapped in an ERT or math inset as required by the
    innermost active inset.  Characters without an entry in ``spec_chars``
    are replaced by ``replacement_character``.

    Parameters:
        document: object providing ``body`` (list of lines), ``encoding``
            and ``default_layout``.
        i: index into ``document.body`` of the line to process.
        insets: list of the names of the currently active insets, innermost
            last; may be empty.
        spec_chars: mapping from a character to a sequence whose item 0 is
            the replacement command and whose items 1 and 2 are flag
            strings (the callers obtain it from read_unicodesymbols()).
        replacement_character: text used for characters that have no entry
            in ``spec_chars``.

    Returns:
        The rewritten line.  It may contain newline characters; the callers
        split it on them before storing it back into the body.

    Side effect:
        When a combining character is the first thing on the line and its
        base character is the last character of the previous line, that
        base character is removed from ``document.body[i-1]``.
    """
    # Strings that open and close the ERT and math insets emitted below
    ert_intro = '\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash\n' % document.default_layout
    ert_outro = '\n\\end_layout\n\n\\end_inset\n'
    math_intro = '\n\\begin_inset Formula $'
    math_outro = '$\n\\end_inset'

    # Innermost active inset; None when we are not inside any inset
    current_inset = None
    if insets:
        current_inset = insets[-1]

    mod_line = u''
    # A combining character at the very start of this line applies to the
    # last character of the previous line, unless that line is an inset line.
    if i and not is_inset_line(document, i-1):
        last_char = document.body[i - 1][-1:]
    else:
        last_char = ''

    for character in document.body[i]:
        try:
            # Try to write the character
            character.encode(document.encoding)
        except UnicodeError:
            # Not representable in the document encoding; replaced below
            pass
        else:
            mod_line += character
            last_char = character
            continue

        if character not in spec_chars:
            # No known command: fall back to the replacement string
            mod_line += replacement_character
            continue

        # Try to replace with ERT/math inset
        command = spec_chars[character][0] # the command to replace unicode
        flag1 = spec_chars[character][1]
        flag2 = spec_chars[character][2]
        if 'combining' in flag1 or 'combining' in flag2:
            # We have a character that should be combined with the previous
            command += '{' + last_char + '}'
            # Remove the last character. Ignore if it is whitespace
            if last_char.rstrip():
                # last_char was found and is not whitespace
                if mod_line:
                    mod_line = mod_line[:-1]
                else: # last_char belongs to the last line
                    document.body[i-1] = document.body[i-1][:-1]
            # Otherwise the last character was replaced by a command. For
            # now it is ignored. This could be handled better.
        if command[0:2] == '\\\\':
            if command[2:12] == 'ensuremath':
                if current_inset == "ERT":
                    # math in ERT
                    command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash\n')
                    command = command.replace('}', '$\n')
                elif current_inset != "Formula":
                    # add a math inset with the replacement character
                    command = command.replace('\\\\ensuremath{\\', math_intro)
                    command = command.replace('}', math_outro)
                else:
                    # we are already in a math inset
                    command = command.replace('\\\\ensuremath{\\', '')
                    command = command.replace('}', '')
            else:
                if current_inset == "Formula":
                    # avoid putting an ERT in a math; instead put command as text
                    command = command.replace('\\\\', '\\mathrm{')
                    command = command + '}'
                elif current_inset != "ERT":
                    # add an ERT inset with the replacement character
                    command = command.replace('\\\\', ert_intro)
                    command = command + ert_outro
                else:
                    command = command.replace('\\\\', '\n\\backslash\n')
            last_char = '' # indicate that the character should not be removed
        mod_line += command
    return mod_line
+
+
def revert_unicode(document):
'''Transform unicode characters that can not be written using the
document encoding to commands according to the unicodesymbols
file. Characters that can not be replaced by commands are replaced by
a replacement string. Flags other than 'combining' are currently not
implemented.'''
-
- replacement_character = '???'
spec_chars = read_unicodesymbols()
-
- # Define strings to start and end ERT and math insets
- ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout %s\n\\backslash\n' % document.default_layout
- ert_outro='\n\\end_layout\n\n\\end_inset\n'
- math_intro='\n\\begin_inset Formula $'
- math_outro='$\n\\end_inset'
- # Find unicode characters and replace them
- in_ert = False # flag set to 1 if in ERT inset
- in_math = False # flag set to 1 if in math inset
insets = [] # list of active insets
- # Go through the file to capture all combining characters
- last_char = '' # to store the previous character
-
+ # Go through the document to capture all combining characters
i = 0
while i < len(document.body):
line = document.body[i]
try:
# If all goes well the line is written here
dummy = line.encode(document.encoding)
- last_char = line[-1]
i += 1
except:
# Error, some character(s) in the line need to be replaced
- mod_line = u''
- for character in line:
- try:
- # Try to write the character
- dummy = character.encode(document.encoding)
- mod_line += character
- last_char = character
- except:
- # Try to replace with ERT/math inset
- if spec_chars.has_key(character):
- command = spec_chars[character][0] # the command to replace unicode
- flag1 = spec_chars[character][1]
- flag2 = spec_chars[character][2]
- if flag1.find('combining') > -1 or flag2.find('combining') > -1:
- # We have a character that should be combined with the previous
- command += '{' + last_char + '}'
- # Remove the last character. Ignore if it is whitespace
- if len(last_char.rstrip()):
- # last_char was found and is not whitespace
- if mod_line:
- mod_line = mod_line[:-1]
- else: # last_char belongs to the last line
- document.body[i-1] = document.body[i-1][:-1]
- else:
- # The last character was replaced by a command. For now it is
- # ignored. This could be handled better.
- pass
- if command[0:2] == '\\\\':
- if command[2:12]=='ensuremath':
- if insets[-1] == "ERT":
- # math in ERT
- command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash\n')
- command = command.replace('}', '$\n')
- elif insets[-1] != "Formula":
- # add a math inset with the replacement character
- command = command.replace('\\\\ensuremath{\\', math_intro)
- command = command.replace('}', math_outro)
- else:
- # we are already in a math inset
- command = command.replace('\\\\ensuremath{\\', '')
- command = command.replace('}', '')
- else:
- if insets[-1] == "Formula":
- # avoid putting an ERT in a math; instead put command as text
- command = command.replace('\\\\', '\mathrm{')
- command = command + '}'
- elif insets[-1] != "ERT":
- # add an ERT inset with the replacement character
- command = command.replace('\\\\', ert_intro)
- command = command + ert_outro
- else:
- command = command.replace('\\\\', '\n\\backslash\n')
- last_char = '' # indicate that the character should not be removed
- mod_line += command
- else:
- # Replace with replacement string
- mod_line += replacement_character
+ mod_line = revert_unicode_line(document, i, insets, spec_chars)
document.body[i:i+1] = mod_line.split('\n')
i += len(mod_line.split('\n'))
if commandparams_info[name][0] == "":
document.warning("Ignoring invalid option `%s' of command `%s'." % (option1, name))
else:
- lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('"', '\\"')))
+ lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('\\', '\\\\').replace('"', '\\"')))
if option2 != "":
if commandparams_info[name][1] == "":
document.warning("Ignoring invalid second option `%s' of command `%s'." % (option2, name))
else:
- lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('"', '\\"')))
+ lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('\\', '\\\\').replace('"', '\\"')))
if argument != "":
if commandparams_info[name][2] == "":
document.warning("Ignoring invalid argument `%s' of command `%s'." % (argument, name))
else:
- lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('"', '\\"')))
+ lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('\\', '\\\\').replace('"', '\\"')))
document.body[i:i+1] = lines
i = i + 1
preview_line = document.body[k]
elif (commandparams_info[name][0] != "" and
pname == commandparams_info[name][0]):
- option1 = pvalue.strip('"').replace('\\"', '"')
+ option1 = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
elif (commandparams_info[name][1] != "" and
pname == commandparams_info[name][1]):
- option2 = pvalue.strip('"').replace('\\"', '"')
+ option2 = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
elif (commandparams_info[name][2] != "" and
pname == commandparams_info[name][2]):
- argument = pvalue.strip('"').replace('\\"', '"')
+ argument = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
elif document.body[k].strip() != "":
document.warning("Ignoring unknown contents `%s' in command inset %s." % (document.body[k], name))
if name == "bibitem":
i += 3
def is_inset_line(document, i):
    """Tell whether line ``i`` of the document body has an inset.

    The test is a heuristic: the line counts as an inset line when it
    starts with a backslash, or when a backslash occurs in one of its last
    two whitespace-separated tokens.

    Parameters:
        document: object whose ``body`` attribute is a list of lines.
        i: index of the line to inspect.

    Returns:
        True if the line matches the heuristic, False otherwise (an empty
        line never matches).
    """
    line = document.body[i]
    if line.startswith('\\'):
        return True
    # Only the tail of the line matters here: look at its last two tokens
    last_tokens = "".join(line.split()[-2:])
    return '\\' in last_tokens
+
+
# A wrapper around normalize that handles special cases (cf. bug 3313)
def normalize(form, text):
    """Normalize ``text`` like unicodedata.normalize, keeping some characters.

    OHM SIGN (U+2126) and ANGSTROM SIGN (U+212B) are copied to the result
    unchanged; every run of other characters between them is passed to
    unicodedata.normalize(form, ...).

    Parameters:
        form: normalization form name handed to unicodedata.normalize
            (the callers use "NFD" and "NFC").
        text: the string to normalize.

    Returns:
        The normalized string; an empty input yields an empty string
        without calling unicodedata.normalize at all.
    """
    # do not normalize OHM, ANGSTROM
    keep_characters = (0x2126, 0x212b)
    pieces = []
    run_start = 0   # start of the pending run of characters to normalize
    for pos, char in enumerate(text):
        if ord(char) in keep_characters:
            if pos > run_start:
                pieces.append(unicodedata.normalize(form, text[run_start:pos]))
            pieces.append(char)
            run_start = pos + 1
    if run_start < len(text):
        pieces.append(unicodedata.normalize(form, text[run_start:]))
    return ''.join(pieces)
+
+
def revert_accent(document):
inverse_accent_map = {}
for k in accent_map:
# words before unicode normalization.
# We do this only if the next line starts with an accent, otherwise we
# would create things like '\begin_inset ERTstatus'.
- numberoflines = len(document.body)
- for i in range(numberoflines-1):
+ for i in range(len(document.body) - 1):
if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
continue
- if (document.body[i+1][0] in inverse_accent_map):
+ if (document.body[i+1][0] in inverse_accent_map and not is_inset_line(document, i)):
# the last character of this line and the first of the next line
- # form probably a surrogate pair.
+ # form probably a surrogate pair, inline insets are excluded (second part of the test)
while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
document.body[i] += document.body[i+1][0]
document.body[i+1] = document.body[i+1][1:]
# Normalize to "Normal form D" (NFD, also known as canonical decomposition).
# This is needed to catch all accented characters.
- for i in range(numberoflines):
+ for i in range(len(document.body)):
# Unfortunately we have a mixture of unicode strings and plain strings,
# because we never use u'xxx' for string literals, but 'xxx'.
# Therefore we may have to try two times to normalize the data.
try:
- document.body[i] = unicodedata.normalize("NFD", document.body[i])
+ document.body[i] = normalize("NFD", document.body[i])
except TypeError:
- document.body[i] = unicodedata.normalize("NFD", unicode(document.body[i], 'utf-8'))
+ document.body[i] = normalize("NFD", unicode(document.body[i], 'utf-8'))
# Replace accented characters with InsetLaTeXAccent
# Do not convert characters that can be represented in the chosen
# encoding.
encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
lang_re = re.compile(r"^\\lang\s(\S+)")
+
i = 0
while i < len(document.body):
-
if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
# Track the encoding of the current line
result = lang_re.match(document.body[i])
if j < len(document.body[i]) - 1:
document.body.insert(i+1, document.body[i][j+1:])
# Delete the accented character
- if j > 0:
- document.body[i] = document.body[i][:j-1]
- else:
- document.body[i] = u''
+ document.body[i] = document.body[i][:j]
# Finally add the InsetLaTeXAccent
document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
break
accented_char = inverse_accented_map[accented_char]
accent = document.body[i][j]
try:
- dummy = unicodedata.normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
+ dummy = normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
except UnicodeEncodeError:
# Insert the rest of the line as new line
if j < len(document.body[i]) - 1:
document.body.insert(i+1, document.body[i][j+1:])
# Delete the accented characters
- if j > 1:
- document.body[i] = document.body[i][:j-2]
- else:
- document.body[i] = u''
+ document.body[i] = document.body[i][:j-1]
# Finally add the InsetLaTeXAccent
document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
break
i = i + 1
# Normalize to "Normal form C" (NFC, pre-composed characters) again
- for i in range(numberoflines):
- document.body[i] = unicodedata.normalize("NFC", document.body[i])
+ for i in range(len(document.body)):
+ document.body[i] = normalize("NFC", document.body[i])
def normalize_font_whitespace_259(document):
'',
r'\backslash',
r'begin{lstlisting}%s' % params,
- r'\end_layout'
+ r'\end_layout',
+ '',
+ r'\begin_layout %s' % document.default_layout,
] + document.body[k : j - 1] + \
['',
r'\begin_layout %s' % document.default_layout,