""" Convert files to the file format generated by lyx 1.5"""
import re
-from parser_tools import find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
+import unicodedata
+
+from parser_tools import find_re, find_token, find_token_backwards, find_token_exact, find_tokens, find_end_of, get_value
from LyX import get_encoding
+# Accents of InsetLaTeXAccent
+accent_map = {
+ "`" : u'\u0300', # grave
+ "'" : u'\u0301', # acute
+ "^" : u'\u0302', # circumflex
+ "~" : u'\u0303', # tilde
+ "=" : u'\u0304', # macron
+ "u" : u'\u0306', # breve
+ "." : u'\u0307', # dot above
+ "\"": u'\u0308', # diaresis
+ "r" : u'\u030a', # ring above
+ "H" : u'\u030b', # double acute
+ "v" : u'\u030c', # caron
+ "b" : u'\u0320', # minus sign below
+ "d" : u'\u0323', # dot below
+ "c" : u'\u0327', # cedilla
+ "k" : u'\u0328', # ogonek
+ "t" : u'\u0361' # tie. This is special: It spans two characters, but
+ # only one is given as argument, so we don't need to
+ # treat it differently.
+}
+
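+# For reference: the values in accent_map are Unicode combining marks.
+# NFC-normalizing a base character followed by one of them yields the
+# precomposed character where one exists, e.g. u'a' + u'\u0308'
+# normalizes to u'\u00e4'.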
+
+# special accents of InsetLaTeXAccent without argument
+special_accent_map = {
+ 'i' : u'\u0131', # dotless i
+ 'j' : u'\u0237', # dotless j
+ 'l' : u'\u0142', # l with stroke
+ 'L' : u'\u0141' # L with stroke
+}
+
+
+# special accent arguments of InsetLaTeXAccent
+accented_map = {
+ '\\i' : u'\u0131', # dotless i
+ '\\j' : u'\u0237' # dotless j
+}
+
+
+def _convert_accent(accent, accented_char):
+ type = accent
+ char = accented_char
+ if char == '':
+ if type in special_accent_map:
+ return special_accent_map[type]
+ # a missing char is treated as space by LyX
+ char = ' '
+ elif type == 'q' and char in ['t', 'd', 'l', 'L']:
+ # Special caron, only used with t, d, l and L.
+ # It is not in the map because we convert it to the same unicode
+ # character as the normal caron: \q{} is only defined if babel with
+ # the czech or slovak language is used, and the normal caron
+ # produces the correct output if the T1 font encoding is used.
+ # For the same reason we never convert to \q{} in the other direction.
+ type = 'v'
+ elif char in accented_map:
+ char = accented_map[char]
+ elif (len(char) > 1):
+ # We can only convert accents on a single char
+ return ''
+ a = accent_map.get(type)
+ if a:
+ return unicodedata.normalize("NFC", "%s%s" % (char, a))
+ return ''
+
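+# Illustrative examples of _convert_accent (documentation only, not called
+# anywhere in this module):
+#   _convert_accent("'", 'e')   == u'\u00e9'   # precomposed e with acute
+#   _convert_accent('i', '')    == u'\u0131'   # dotless i needs no argument
+#   _convert_accent("'", '\\i') == u'\u0131\u0301'  # no precomposed form
+#                                                   # exists, so the combining
+#                                                   # sequence is kept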
+
+def convert_ertbackslash(body, i, ert, default_layout):
+ r""" -------------------------------------------------------------------------------------------
+ Convert backslashes and '\n' into valid ERT code, append the converted
+ text to body[i] and return the (maybe incremented) line index i"""
+
+ for c in ert:
+ if c == '\\':
+ body[i] = body[i] + '\\backslash '
+ i = i + 1
+ body.insert(i, '')
+ elif c == '\n':
+ body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, '']
+ i = i + 4
+ else:
+ body[i] = body[i] + c
+ return i
+
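+# Illustrative example (assuming a hypothetical 'Standard' default layout):
+# starting from body == [''] and i == 0,
+# convert_ertbackslash(body, 0, '\\a', 'Standard') leaves
+# body == ['\\backslash ', 'a'] and returns the new index 1.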
+
+def convert_accent(document):
+ # The following forms are supported by LyX:
+ # '\i \"{a}' (standard form, as written by LyX)
+ # '\i \"{}' (standard form, as written by LyX if the accented char is a space)
+ # '\i \"{ }' (also accepted if the accented char is a space)
+ # '\i \" a' (also accepted)
+ # '\i \"' (also accepted)
+ re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$')
+ re_contents = re.compile(r'^([^\s{]+)(.*)$')
+ re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$')
+ i = 0
+ while 1:
+ i = find_re(document.body, re_wholeinset, i)
+ if i == -1:
+ return
+ match = re_wholeinset.match(document.body[i])
+ prefix = match.group(1)
+ contents = match.group(3).strip()
+ match = re_contents.match(contents)
+ if match:
+ # Strip first char (always \)
+ accent = match.group(1)[1:]
+ accented_contents = match.group(2).strip()
+ match = re_accentedcontents.match(accented_contents)
+ accented_char = match.group(1)
+ converted = _convert_accent(accent, accented_char)
+ if converted == '':
+ # Normalize contents
+ contents = '%s{%s}' % (accent, accented_char)
+ else:
+ document.body[i] = '%s%s' % (prefix, converted)
+ i += 1
+ continue
+ document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents)
+ document.body[i] = prefix
+ document.body[i+1:i+1] = ['\\begin_inset ERT',
+ 'status collapsed',
+ '',
+ '\\begin_layout %s' % document.default_layout,
+ '',
+ '',
+ '']
+ i = convert_ertbackslash(document.body, i + 7,
+ '\\%s' % contents,
+ document.default_layout)
+ document.body[i+1:i+1] = ['\\end_layout',
+ '',
+ '\\end_inset']
+ i += 3
+
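+# Illustrative example: convert_accent() rewrites a body line such as
+# 'Sch\i \"{o}' in place to u'Sch\u00f6'; insets it cannot convert are
+# turned into ERT instead.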
+
+def revert_accent(document):
+ inverse_accent_map = {}
+ for k in accent_map:
+ inverse_accent_map[accent_map[k]] = k
+ inverse_special_accent_map = {}
+ for k in special_accent_map:
+ inverse_special_accent_map[special_accent_map[k]] = k
+ inverse_accented_map = {}
+ for k in accented_map:
+ inverse_accented_map[accented_map[k]] = k
+
+ # Since LyX may insert a line break within a word, we must combine all
+ # words before unicode normalization.
+ # We do this only if the next line starts with an accent, otherwise we
+ # would create things like '\begin_inset ERTstatus'.
+ numberoflines = len(document.body)
+ for i in range(numberoflines-1):
+ if document.body[i] == '' or document.body[i+1] == '' or document.body[i][-1] == ' ':
+ continue
+ if (document.body[i+1][0] in inverse_accent_map):
+ # the last character of this line and the first of the next line
+ # probably form a base character plus combining accent split by the break.
+ while (len(document.body[i+1]) > 0 and document.body[i+1][0] != ' '):
+ document.body[i] += document.body[i+1][0]
+ document.body[i+1] = document.body[i+1][1:]
+
+ # Normalize to "Normal form D" (NFD, also known as canonical decomposition).
+ # This is needed to catch all accented characters.
+ for i in range(numberoflines):
+ # Unfortunately we have a mixture of unicode strings and plain strings,
+ # because string literals are written as 'xxx', never as u'xxx'.
+ # Therefore we may have to try twice to normalize the data.
+ try:
+ document.body[i] = unicodedata.normalize("NFKD", document.body[i])
+ except TypeError:
+ document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8'))
+
+ # Replace accented characters with InsetLaTeXAccent
+ # Do not convert characters that can be represented in the chosen
+ # encoding.
+ encoding_stack = [get_encoding(document.language, document.inputencoding, 248)]
+ lang_re = re.compile(r"^\\lang\s(\S+)")
+ for i in range(len(document.body)):
+
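+ # Stack discipline (for reference): a \lang command replaces the top
+ # entry with that language's default encoding, \begin_layout duplicates
+ # the top entry, and \end_layout pops it, so nested paragraphs restore
+ # the encoding of the enclosing paragraph.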
+ if document.inputencoding == "auto" or document.inputencoding == "default":
+ # Track the encoding of the current line
+ result = lang_re.match(document.body[i])
+ if result:
+ language = result.group(1)
+ if language == "default":
+ encoding_stack[-1] = document.encoding
+ else:
+ from lyx2lyx_lang import lang
+ encoding_stack[-1] = lang[language][3]
+ continue
+ elif find_token(document.body, "\\begin_layout", i, i + 1) == i:
+ encoding_stack.append(encoding_stack[-1])
+ continue
+ elif find_token(document.body, "\\end_layout", i, i + 1) == i:
+ del encoding_stack[-1]
+ continue
+
+ for j in range(len(document.body[i])):
+ # dotless i and dotless j are both in special_accent_map and can
+ # occur as an accented character, so we need to test that the
+ # following character is not an accent
+ if (document.body[i][j] in inverse_special_accent_map and
+ (j == len(document.body[i]) - 1 or document.body[i][j+1] not in inverse_accent_map)):
+ accent = document.body[i][j]
+ try:
+ dummy = accent.encode(encoding_stack[-1])
+ except UnicodeEncodeError:
+ # Insert the rest of the line as new line
+ if j < len(document.body[i]) - 1:
+ document.body[i+1:i+1] = [document.body[i][j+1:]]
+ # Delete the accented character
+ if j > 0:
+ document.body[i] = document.body[i][:j]
+ else:
+ document.body[i] = u''
+ # Finally add the InsetLaTeXAccent
+ document.body[i] += "\\i \\%s{}" % inverse_special_accent_map[accent]
+ break
+ elif j > 0 and document.body[i][j] in inverse_accent_map:
+ accented_char = document.body[i][j-1]
+ if accented_char == ' ':
+ # Conform to LyX output
+ accented_char = ''
+ elif accented_char in inverse_accented_map:
+ accented_char = inverse_accented_map[accented_char]
+ accent = document.body[i][j]
+ try:
+ dummy = unicodedata.normalize("NFC", accented_char + accent).encode(encoding_stack[-1])
+ except UnicodeEncodeError:
+ # Insert the rest of the line as new line
+ if j < len(document.body[i]) - 1:
+ document.body[i+1:i+1] = [document.body[i][j+1:]]
+ # Delete the accented characters
+ if j > 1:
+ document.body[i] = document.body[i][:j-1]
+ else:
+ document.body[i] = u''
+ # Finally add the InsetLaTeXAccent
+ document.body[i] += "\\i \\%s{%s}" % (inverse_accent_map[accent], accented_char)
+ break
+ # Normalize to "Normal form C" (NFC, pre-composed characters) again
+ for i in range(numberoflines):
+ document.body[i] = unicodedata.normalize("NFKC", document.body[i])
+
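+# Illustrative example (assuming latin1 is the effective encoding): after
+# revert_accent() u'\u00f6' survives unchanged, since latin1 can encode it,
+# while u'\u01e7' (g with caron, not encodable) becomes the inset '\i \v{g}'.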
+
##
# Conversion hub
#
[251, [revert_commandparams]],
[250, [revert_cs_label]],
[249, []],
- [248, [revert_utf8]],
+ [248, [revert_accent, revert_utf8]],
[247, [revert_booktabs]],
[246, [revert_font_settings]],
[245, [revert_framed]]]