We do this here and not in LyX.py because it is far easier to do the
necessary parsing in modern formats than in ancient ones.
"""
+ inset_types = ["Foot", "Note"]
if document.cjk_encoding != '':
return
encoding_stack = [document.encoding]
+ insets = []
lang_re = re.compile(r"^\\lang\s(\S+)")
+ inset_re = re.compile(r"^\\begin_inset\s(\S+)")
+ if not forward: # no need to read file unless we are reverting
+ spec_chars = read_unicodesymbols()
+
if document.inputencoding == "auto" or document.inputencoding == "default":
- for i in range(len(document.body)):
+ i = 0
+ while i < len(document.body):
result = lang_re.match(document.body[i])
if result:
language = result.group(1)
encoding_stack[-1] = lang[language][3]
elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
document.warning("Adding nested encoding %s." % encoding_stack[-1], 3)
- encoding_stack.append(encoding_stack[-1])
+ if len(insets) > 0 and insets[-1] in inset_types:
+ from lyx2lyx_lang import lang
+ encoding_stack.append(lang[document.language][3])
+ else:
+ encoding_stack.append(encoding_stack[-1])
elif find_token(document.body, "\\end_layout", i, i + 1) == i:
document.warning("Removing nested encoding %s." % encoding_stack[-1], 3)
if len(encoding_stack) == 1:
document.warning("Malformed LyX document: Unexpected `\\end_layout'.")
else:
del encoding_stack[-1]
+ elif find_token(document.body, "\\begin_inset", i, i + 1) == i:
+ inset_result = inset_re.match(document.body[i])
+ if inset_result:
+ insets.append(inset_result.group(1))
+ else:
+ insets.append("")
+ elif find_token(document.body, "\\end_inset", i, i + 1) == i:
+ del insets[-1]
if encoding_stack[-1] != document.encoding:
if forward:
# This line has been incorrectly interpreted as if it was
# with the correct encoding.
document.body[i] = orig.decode(encoding_stack[-1])
else:
- # Convert unicode to the 8bit string that will be written
- # to the file with the correct encoding.
- orig = document.body[i].encode(encoding_stack[-1])
- # Convert the 8bit string that will be written to the
- # file to fake unicode with the encoding that will later
- # be used when writing to the file.
- document.body[i] = orig.decode(document.encoding)
+ try:
+ # Convert unicode to the 8bit string that will be written
+ # to the file with the correct encoding.
+ orig = document.body[i].encode(encoding_stack[-1])
+ # Convert the 8bit string that will be written to the
+ # file to fake unicode with the encoding that will later
+ # be used when writing to the file.
+ document.body[i] = orig.decode(document.encoding)
+ except:
+ mod_line = revert_unicode_line(document, i, insets, spec_chars)
+ document.body[i:i+1] = mod_line.split('\n')
+ i += len(mod_line.split('\n')) - 1
+ i += 1
def convert_utf8(document):
convert_multiencoding(document, False)
def read_unicodesymbols():
    """Read the unicodesymbols list of unicode characters and commands.

    The file 'unicodesymbols' is looked up relative to the directory of the
    running script (sys.argv[0]): if that directory is named 'lyx2lyx' its
    parent directory is used, otherwise the script directory itself.

    Returns a dictionary that maps a one-character unicode string to the
    list [command, flag1, flag2], i.e. the second to fourth whitespace
    separated fields of the corresponding line after the quotation marks
    have been removed.  Comment lines and lines that can not be parsed are
    skipped silently.
    """
    pathname = os.path.abspath(os.path.dirname(sys.argv[0]))
    # str.strip('lyx2lyx') removes any of the characters l, y, x and 2 from
    # BOTH ends of the path, it does not remove a suffix.  Drop the trailing
    # directory component explicitly instead.
    if os.path.basename(pathname) == 'lyx2lyx':
        pathname = os.path.dirname(pathname)

    try:
        to_char = unichr        # Python 2
    except NameError:
        to_char = chr           # Python 3

    spec_chars = {}
    fp = open(os.path.join(pathname, 'unicodesymbols'))
    try:
        for line in fp:
            if line.startswith('#'):
                continue
            line = line.replace(' "', ' ')   # remove all quotation marks with spaces before
            line = line.replace('" ', ' ')   # remove all quotation marks with spaces after
            line = line.replace(r'\"', '"')  # replace \" by " (for characters with diaeresis)
            fields = line.split(None, 3)
            if len(fields) != 4:
                # not a complete entry
                continue
            # flag1 and flag2 are preamble and other flags
            ucs4, command, flag1, flag2 = fields
            try:
                # base 0 accepts the 0x... notation without resorting to eval()
                spec_chars[to_char(int(ucs4, 0))] = [command, flag1, flag2]
            except (ValueError, OverflowError):
                # not a number, or not a valid code point: ignore the line
                continue
    finally:
        fp.close()
    return spec_chars
+
+
def revert_unicode_line(document, i, insets, spec_chars, replacement_character = '???'):
    """Return line i of the body with all unencodable characters replaced.

    Every character of document.body[i] that can not be encoded with
    document.encoding is replaced by the command found for it in spec_chars,
    wrapped in an ERT or math inset where necessary, or by
    replacement_character if spec_chars has no entry for it.

    document    -- object providing body, encoding and default_layout
    i           -- index of the line in document.body
    insets      -- names of the insets enclosing the line, innermost last
    spec_chars  -- dictionary mapping a character to [command, flag1, flag2]
    replacement_character -- text used for characters without a command

    The returned text may contain newlines; the callers split it into
    several body lines.  Side effect: if the line starts with a combining
    character, the character it combines with is removed from the end of
    document.body[i-1].
    """
    # Define strings to start and end ERT and math insets
    ert_intro = ('\n\n\\begin_inset ERT\nstatus collapsed\n'
                 '\\begin_layout %s\n\\backslash\n' % document.default_layout)
    ert_outro = '\n\\end_layout\n\n\\end_inset\n'
    math_intro = '\n\\begin_inset Formula $'
    math_outro = '$\n\\end_inset'

    # Name of the innermost enclosing inset, None outside of any inset
    innermost = None
    if insets:
        innermost = insets[-1]

    mod_line = u''
    # A combining character at the start of the line applies to the last
    # character of the previous line, unless that line belongs to an inset.
    if i and not is_inset_line(document, i-1):
        last_char = document.body[i - 1][-1:]
    else:
        last_char = ''

    for character in document.body[i]:
        try:
            # Try to write the character
            character.encode(document.encoding)
        except UnicodeError:
            pass
        else:
            mod_line += character
            last_char = character
            continue

        if character not in spec_chars:
            # No command known: replace with replacement string
            mod_line += replacement_character
            continue

        # Try to replace with ERT/math inset
        command = spec_chars[character][0] # the command to replace unicode
        flag1 = spec_chars[character][1]
        flag2 = spec_chars[character][2]
        if 'combining' in flag1 or 'combining' in flag2:
            # We have a character that should be combined with the previous
            command += '{' + last_char + '}'
            # Remove the last character. Ignore if it is whitespace.  An
            # empty last_char means that the previous character was replaced
            # by a command; this is ignored for now and could be handled
            # better.
            if last_char.rstrip():
                if mod_line:
                    mod_line = mod_line[:-1]
                else: # last_char belongs to the last line
                    document.body[i-1] = document.body[i-1][:-1]

        if command[0:2] == '\\\\':
            if command[2:12] == 'ensuremath':
                if innermost == "ERT":
                    # math in ERT
                    command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash\n')
                    command = command.replace('}', '$\n')
                elif innermost != "Formula":
                    # add a math inset with the replacement character
                    command = command.replace('\\\\ensuremath{\\', math_intro)
                    command = command.replace('}', math_outro)
                else:
                    # we are already in a math inset
                    command = command.replace('\\\\ensuremath{\\', '')
                    command = command.replace('}', '')
            else:
                if innermost == "Formula":
                    # avoid putting an ERT in a math; instead put command as text
                    command = command.replace('\\\\', '\\mathrm{')
                    command = command + '}'
                elif innermost != "ERT":
                    # add an ERT inset with the replacement character
                    command = command.replace('\\\\', ert_intro)
                    command = command + ert_outro
                else:
                    command = command.replace('\\\\', '\n\\backslash\n')
            last_char = '' # indicate that the character should not be removed
        mod_line += command
    return mod_line
+
+
def revert_unicode(document):
    '''Transform unicode characters that can not be written using the
document encoding to commands according to the unicodesymbols
file. Characters that can not be replaced by commands are replaced by
a replacement string. Flags other than 'combining' are currently not
implemented.

The body of the document is modified in place: a line that can not be
encoded with document.encoding is replaced by the line(s) returned by
revert_unicode_line().'''
    spec_chars = read_unicodesymbols()
    insets = [] # list of active insets, innermost last
    begin_inset = '\\begin_inset'

    i = 0
    while i < len(document.body):
        line = document.body[i]
        # Check for insets
        pos = line.find(begin_inset)
        if pos > -1:
            words = line[pos + len(begin_inset):].split()
            if words:
                insets.append(words[0])
            else:
                # inset without a type: keep the stack balanced anyway
                insets.append('')
        if line.find('\\end_inset') > -1:
            if insets:
                del insets[-1]
            else:
                document.warning("Malformed LyX document: Unexpected `\\end_inset'.")

        # Try to write the line
        try:
            line.encode(document.encoding)
        except UnicodeError:
            # Error, some character(s) in the line need to be replaced
            new_lines = revert_unicode_line(document, i, insets, spec_chars).split('\n')
            document.body[i:i+1] = new_lines
            # Skip the inserted lines: they can be written by construction,
            # and the insets they open are closed again within them.
            i += len(new_lines)
        else:
            # The line can be written as it is
            i += 1
+
+
def revert_cs_label(document):
" Remove status flag of charstyle label. "
i = 0
i = i + 1
continue
+ j = find_token(document.body, "\\end_inset", i + 1)
+ if j == -1:
+ document.warning("Malformed document")
+ else:
+ command += "".join(document.body[i+1:j])
+ document.body[i+1:j] = []
+
# The following parser is taken from the original InsetCommandParams::scanCommand
name = ""
option1 = ""
if commandparams_info[name][0] == "":
document.warning("Ignoring invalid option `%s' of command `%s'." % (option1, name))
else:
- lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('"', '\\"')))
+ lines.append('%s "%s"' % (commandparams_info[name][0], option1.replace('\\', '\\\\').replace('"', '\\"')))
if option2 != "":
if commandparams_info[name][1] == "":
document.warning("Ignoring invalid second option `%s' of command `%s'." % (option2, name))
else:
- lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('"', '\\"')))
+ lines.append('%s "%s"' % (commandparams_info[name][1], option2.replace('\\', '\\\\').replace('"', '\\"')))
if argument != "":
if commandparams_info[name][2] == "":
document.warning("Ignoring invalid argument `%s' of command `%s'." % (argument, name))
else:
- lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('"', '\\"')))
+ lines.append('%s "%s"' % (commandparams_info[name][2], argument.replace('\\', '\\\\').replace('"', '\\"')))
document.body[i:i+1] = lines
i = i + 1
preview_line = document.body[k]
elif (commandparams_info[name][0] != "" and
pname == commandparams_info[name][0]):
- option1 = pvalue.strip('"').replace('\\"', '"')
+ option1 = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
elif (commandparams_info[name][1] != "" and
pname == commandparams_info[name][1]):
- option2 = pvalue.strip('"').replace('\\"', '"')
+ option2 = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
elif (commandparams_info[name][2] != "" and
pname == commandparams_info[name][2]):
- argument = pvalue.strip('"').replace('\\"', '"')
+ argument = pvalue.strip('"').replace('\\"', '"').replace('\\\\', '\\')
elif document.body[k].strip() != "":
document.warning("Ignoring unknown contents `%s' in command inset %s." % (document.body[k], name))
if name == "bibitem":
"=" : u'\u0304', # macron
"u" : u'\u0306', # breve
"." : u'\u0307', # dot above
- "\"": u'\u0308', # diaresis
+ "\"": u'\u0308', # diaeresis
"r" : u'\u030a', # ring above
"H" : u'\u030b', # double acute
"v" : u'\u030c', # caron
return ''
a = accent_map.get(type)
if a:
- return unicodedata.normalize("NFKC", "%s%s" % (char, a))
+ return unicodedata.normalize("NFC", "%s%s" % (char, a))
return ''
i += 3
def is_inset_line(document, i):
    """ Line i of body has an inset

    This is the case if the line starts with a backslash, or if one of
    its last two whitespace separated words contains a backslash.
    """
    text = document.body[i]
    if text.startswith('\\'):
        return True
    for word in text.split()[-2:]:
        if '\\' in word:
            return True
    return False
+
+
# A wrapper around normalize that handles special cases (cf. bug 3313)
def normalize(form, text):
    """Normalize text like unicodedata.normalize(), but keep some characters.

    OHM SIGN (U+2126) and ANGSTROM SIGN (U+212B) are copied to the result
    unchanged; the runs of characters between them are passed to
    unicodedata.normalize(form, ...) one run at a time.

    form -- normalization form as accepted by unicodedata.normalize()
    text -- the string to normalize

    Any exception raised by unicodedata.normalize() is propagated.
    """
    # do not normalize OHM, ANGSTROM
    keep_characters = (0x2126, 0x212b)
    pieces = []     # finished parts of the result, joined once at the end
    pending = []    # characters waiting to be normalized together
    for char in text:
        if ord(char) not in keep_characters:
            pending.append(char)
            continue
        if pending:
            pieces.append(unicodedata.normalize(form, ''.join(pending)))
            pending = []
        pieces.append(char)
    if pending:
        pieces.append(unicodedata.normalize(form, ''.join(pending)))
    return ''.join(pieces)
+
+
def revert_accent(document):
inverse_accent_map = {}
for k in accent_map:
# words before unicode normalization.
# We do this only if the next line starts with an accent, otherwise we
# would create things like '\begin_inset ERTstatus'.
- numberoflines = len(document.body)
- for i in range(numberoflines-1):
+ for i in range(len(document.body) - 1):
if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
continue
- if (document.body[i+1][0] in inverse_accent_map):
+ if (document.body[i+1][0] in inverse_accent_map and not is_inset_line(document, i)):
# the last character of this line and the first of the next line
- # form probably a surrogate pair.
+ # form probably a surrogate pair, inline insets are excluded (second part of the test)
while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
document.body[i] += document.body[i+1][0]
document.body[i+1] = document.body[i+1][1:]
# Normalize to "Normal form D" (NFD, also known as canonical decomposition).
# This is needed to catch all accented characters.
- for i in range(numberoflines):
+ for i in range(len(document.body)):
# Unfortunately we have a mixture of unicode strings and plain strings,
# because we never use u'xxx' for string literals, but 'xxx'.
# Therefore we may have to try two times to normalize the data.
try:
- document.body[i] = unicodedata.normalize("NFKD", document.body[i])
+ document.body[i] = normalize("NFD", document.body[i])
except TypeError:
- document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8'))
+ document.body[i] = normalize("NFD", unicode(document.body[i], 'utf-8'))
# Replace accented characters with InsetLaTeXAccent
# Do not convert characters that can be represented in the chosen
# encoding.
encoding_stack = [get_encoding(document.language, document.inputencoding, 248, document.cjk_encoding)]
lang_re = re.compile(r"^\\lang\s(\S+)")
- for i in range(len(document.body)):
+ i = 0
+ while i < len(document.body):
if (document.inputencoding == "auto" or document.inputencoding == "default") and document.cjk_encoding != '':
# Track the encoding of the current line
result = lang_re.match(document.body[i])
except UnicodeEncodeError:
# Insert the rest of the line as new line
if j < len(document.body[i]) - 1:
- document.body[i+1:i+1] = document.body[i][j+1:]
+ document.body.insert(i+1, document.body[i][j+1:])
# Delete the accented character
- if j > 0:
- document.body[i] = document.body[i][:j-1]
- else:
- document.body[i] = u''
+ document.body[i] = document.body[i][:j]
# Finally add the InsetLaTeXAccent
document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
break
accented_char = inverse_accented_map[accented_char]
accent = document.body[i][j]
try:
- dummy = unicodedata.normalize("NFKC", accented_char + accent).encode(encoding_stack[-1])
+ dummy = normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
except UnicodeEncodeError:
# Insert the rest of the line as new line
if j < len(document.body[i]) - 1:
- document.body[i+1:i+1] = document.body[i][j+1:]
+ document.body.insert(i+1, document.body[i][j+1:])
# Delete the accented characters
- if j > 1:
- document.body[i] = document.body[i][:j-2]
- else:
- document.body[i] = u''
+ document.body[i] = document.body[i][:j-1]
# Finally add the InsetLaTeXAccent
document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
break
+ i = i + 1
+
# Normalize to "Normal form C" (NFC, pre-composed characters) again
- for i in range(numberoflines):
- document.body[i] = unicodedata.normalize("NFKC", document.body[i])
+ for i in range(len(document.body)):
+ document.body[i] = normalize("NFC", document.body[i])
def normalize_font_whitespace_259(document):
document.warning("Malformed LyX document: Could not find end of graphics inset.")
# Search for rotateAngle and width or height or scale
# If these params are not there, nothing needs to be done.
- # FIXME: this also inserts scaleBeforeRotation if "rotateAngle" is not there!
- for k in range(i+1, j):
- if (document.body[k].find("rotateAngle") and \
- (document.body[k].find("width") or \
- document.body[k].find("height") or \
- document.body[k].find("scale"))):
- document.body.insert(j, 'scaleBeforeRotation')
+ k = find_token(document.body, "\trotateAngle", i + 1, j)
+ l = find_tokens(document.body, ["\twidth", "\theight", "\tscale"], i + 1, j)
+ if (k != -1 and l != -1):
+ document.body.insert(j, 'scaleBeforeRotation')
i = i + 1
-# FIXME: does not work at all
+#
+# remove scaleBeforeRotation graphics param
def revert_graphics_rotation(document):
" remove scaleBeforeRotation graphics parameter. "
i = 0
if j == -1:
# should not happen
document.warning("Malformed LyX document: Could not find end of graphics inset.")
- for k in range(i+1, j):
- # If there's a scaleBeforeRotation param, just remove that
- if document.body[k].find('scaleBeforeRotation'):
- del document.body[k]
- break
+ # If there's a scaleBeforeRotation param, just remove that
+ k = find_token(document.body, "\tscaleBeforeRotation", i + 1, j)
+ if k != -1:
+ del document.body[k]
+ else:
# if not, and if we have rotateAngle and width or height or scale,
# we have to put the rotateAngle value to special
- rotateAngle = get_value(document.body, 'rotateAngle', i+1, j)
- special = get_value(document.body, 'special', i+1, j)
- if (document.body[k].find("width") or \
- document.body[k].find("height") or \
- document.body[k].find("scale") and \
- document.body[k].find("rotateAngle")):
- if special == "":
- document.body.insert(j-1, '\tspecial angle=%s' % rotateAngle)
- else:
- l = find_token(document.body, "special", i+1, j)
- document.body[l].replace(special, 'angle=%s,%s' % (rotateAngle, special))
+ rotateAngle = get_value(document.body, 'rotateAngle', i + 1, j)
+ special = get_value(document.body, 'special', i + 1, j)
+ if rotateAngle != "":
+ k = find_tokens(document.body, ["\twidth", "\theight", "\tscale"], i + 1, j)
+ if k == -1:
+ break
+ if special == "":
+ document.body.insert(j-1, '\tspecial angle=%s' % rotateAngle)
+ else:
+ l = find_token(document.body, "\tspecial", i + 1, j)
+ document.body[l] = document.body[l].replace(special, 'angle=%s,%s' % (rotateAngle, special))
+ k = find_token(document.body, "\trotateAngle", i + 1, j)
+ if k != -1:
+ del document.body[k]
i = i + 1
inlinecode = ''
# looking for the oneline code for lstinline
inlinecode = document.body[find_end_of_layout(document.body,
- find_token(document.body, '\\begin_layout Standard', i + 1) +1 ) - 1]
+ find_token(document.body, '\\begin_layout %s' % document.default_layout, i + 1) +1 ) - 1]
if len(caption) > 0:
if len(params) == 0:
params = 'caption={%s}' % caption
if inline == 'true':
document.body[i:(j+1)] = [r'\begin_inset ERT',
'status %s' % status,
- r'\begin_layout Standard',
+ r'\begin_layout %s' % document.default_layout,
'',
'',
r'\backslash',
document.body[i: j+1] = [r'\begin_inset ERT',
'status %s' % status,
'',
- r'\begin_layout Standard',
+ r'\begin_layout %s' % document.default_layout,
'',
'',
r'\backslash',
r'begin{lstlisting}%s' % params,
- r'\end_layout'
+ r'\end_layout',
+ '',
+ r'\begin_layout %s' % document.default_layout,
] + document.body[k : j - 1] + \
['',
- r'\begin_layout Standard',
+ r'\begin_layout %s' % document.default_layout,
'',
r'\backslash',
'end{lstlisting}',
document.body[i : j + 1] = [r'\begin_inset ERT',
'status open',
'',
- r'\begin_layout Standard',
+ r'\begin_layout %s' % document.default_layout,
'',
'',
r'\backslash',
else:
del document.header[i]
+
def revert_separator_layout(document):
r'''Revert --Separator-- to a lyx note
From
if j == -1:
# this should not happen
break
- document.body[i : j + 1] = [r'\begin_layout Standard',
+ document.body[i : j + 1] = [r'\begin_layout %s' % document.default_layout,
r'\begin_inset Note Note',
'status open',
'',
- r'\begin_layout Standard',
+ r'\begin_layout %s' % document.default_layout,
'Separate Environment',
r'\end_layout',
'',
r'\end_layout'
]
+
def convert_arabic (document):
if document.language == "arabic":
document.language = "arabic_arabtex"
# change the language name
document.body[i] = '\lang arabic_arabtex'
i = i + 1
-
+
+
def revert_arabic (document):
if document.language == "arabic_arabtex":
document.language = "arabic"
document.body[i] = '\lang arabic'
i = i + 1
-def revert_unicode(document):
- '''Transform unicode symbols according to the unicode list.
-Preamble flags are not implemented.
-Combination characters are currently ignored.
-Forced output is currently not enforced'''
- pathname = os.path.dirname(sys.argv[0])
- fp = open(pathname.strip('lyx2lyx') + 'unicodesymbols','r')
- spec_chars = {}
- for line in fp.readlines():
- if line[0] != '#':
- line=line.replace('"','') #remove all qoutation marks
- try:
- # flag1 and flag2 are preamble & flags
- # currently NOT impemented
- [ucs4,command,flag1,flag2] =line.split(None,3)
- spec_chars[unichr(eval(ucs4))] = [command, flag1, flag2]
- except:
- pass
- fp.close()
- #Define strings to start and end ERT and math insets
- ert_intro='\n\n\\begin_inset ERT\nstatus collapsed\n\\begin_layout Standard\n\\backslash\n'
- ert_outro='\n\\end_layout\n\n\\end_inset\n\n'
- math_intro='\n\\begin_inset Formula $'
- math_outro='$\n\\end_inset\n'
- # Find unicode characters and replace them
- in_ert = 0 # flag set to 1 if in ERT inset
- in_math = 0 # flag set to 1 if in math inset
- insets = [] # list of active insets
- for i, current_line in enumerate(document.body):
- if current_line.find('\\begin_inset') > -1:
- # check which inset to start
- if current_line.find('\\begin_inset ERT') > -1:
- in_ert = 1
- insets.append('ert')
- elif current_line.find('\\begin_inset Formula') > -1:
- in_math = 1
- insets.append('math')
- else:
- insets.append('other')
- if current_line.find('\\end_inset') > -1:
- # check which inset to end
- try:
- cur_inset = insets.pop()
- if cur_inset == 'ert':
- in_ert = 0
- elif cur_inset == 'math':
- in_math = 0
- else:
- pass # end of other inset
- except:
- pass # inset list was empty (for some reason)
- current_line=''; # clear to have as container for modified line
- for j in range(len(document.body[i])):
- if spec_chars.has_key(document.body[i][j]):
- flags = spec_chars[document.body[i][j]][1] + spec_chars[document.body[i][j]][2]
- if flags.find('combining') > -1:
- command = ''
- else:
- command = spec_chars[document.body[i][j]][0]; # the command to replace unicode
- if command[0:2] == '\\\\':
- if command[2:12]=='ensuremath':
- if in_ert == 1:
- # math in ERT
- command = command.replace('\\\\ensuremath{\\\\', '$\n\\backslash\n')
- command = command.replace('}', '$\n')
- elif in_math == 0:
- # add a math inset with the replacement character
- command = command.replace('\\\\ensuremath{\\', math_intro)
- command = command.replace('}', math_outro)
- else:
- # we are already in a math inset
- command = command.replace('\\\\ensuremath{\\', '')
- command = command.replace('}', '')
- else:
- if in_math == 1:
- # avoid putting an ERT in a math; instead put command as text
- command = command.replace('\\\\', '\mathrm{')
- command = command + '}'
- elif in_ert == 0:
- # add an ERT inset with the replacement character
- command = command.replace('\\\\', ert_intro)
- command = command + ert_outro
- else:
- command = command.replace('\\\\', '\n\\backslash\n')
- current_line = current_line + command
- else:
- current_line = current_line + document.body[i][j]
- document.body[i] = current_line
-
##
# Conversion hub
if __name__ == "__main__":
pass
-
-
-