2 # -*- coding: utf-8 -*-
5 # This file is part of LyX, the document processor.
6 # Licence details can be found in the file COPYING.
10 # Full author contact details are available in file CREDITS
12 # This script converts a kmap file from LaTeX commands to unicode characters
13 # The kmap file is read and written in utf8 encoding
16 import os, re, string, sys, unicodedata
19 return ("Usage: %s unicodesymbolsfile inputfile outputfile\n" % prog_name +
20 "or %s unicodesymbolsfile <inputfile >outputfile" % prog_name)
24 sys.stderr.write(message + '\n')
29 " Remove end of line char(s)."
30 if line[-2:-1] == '\r':
32 elif line[-1:] == '\r' or line[-1:] == '\n':
35 # file with no EOL in last line
40 " Read input file and strip lineendings."
43 line = input.readline()
46 lines.append(trim_eol(line).decode('utf8'))
51 " Escape a word for LyXLex."
52 re_quote = re.compile(r'\s|,')
56 if c == '\\' or c == '"' or c == '#':
57 retval = retval + u'\\'
59 if re_quote.match(retval):
60 return u'"%s"' % retval
65 " Unescape a LyXLex escaped word."
66 if len(word) > 1 and word[0] == '"' and word[-1] == '"':
75 if word[i] == '\\' and i < stop - 1:
77 retval = retval + word[i]
def readsymbols(input):
    """Build the symbol list from the unicodesymbols file plus hardcoded extras.

    Every line of *input* is split on whitespace.  Empty lines and lines whose
    first token starts with ``#`` are ignored.  For the remaining lines the
    second token (the LyXLex-escaped LaTeX command) is unescaped, and a first
    token written as ``0x...`` is converted to an integer code point.  Any
    further tokens (e.g. a flags field containing ``combining``) are kept
    unchanged.

    Args:
        input: Open file object for the unicodesymbols file.  It may be opened
            in binary or text mode; byte lines are decoded as UTF-8.

    Returns:
        A list of token lists ``[code_point, command, ...]`` followed by the
        hardcoded special cases.
    """
    symbols = []
    for line in input:
        # Decode once at the I/O boundary so that all tokens are text,
        # regardless of how the file was opened.
        if isinstance(line, bytes):
            line = line.decode('utf8')
        # split() without arguments also discards the line terminator.
        tokens = line.split()
        if len(tokens) > 0 and tokens[0][0] != '#':
            tokens[1] = unescape(tokens[1])
            if tokens[0][0:2] == "0x":
                tokens[0] = int(tokens[0][2:], 16)
                # Only entries with a valid numeric code point are kept:
                # consumers pass element 0 to unichr().
                symbols.append(tokens)
    # special cases from .cdef files (e.g. duplicates with different commands)
    # U+00A0 is NO-BREAK SPACE (U+00A1 would be the inverted exclamation mark).
    symbols.append([0x00a0, '\\nobreakspace'])
    symbols.append([0x00a7, '\\S'])
    symbols.append([0x00a9, '\\copyright'])
    symbols.append([0x00b1, '$\\pm$'])
    symbols.append([0x00b5, '$\\mu$'])
    symbols.append([0x00b7, '$\\cdot$'])
    symbols.append([0x00b9, '$\\mathonesuperior$'])
    symbols.append([0x00d7, '$\\times$'])
    symbols.append([0x00d7, '\\times'])
    symbols.append([0x00f7, '\\div'])
    symbols.append([0x20ac, '\\euro'])
    # special caron, see lib/lyx2lyx/lyx_1_5.py for an explanation
    symbols.append([0x030c, '\\q', '', 'combining'])
    return symbols
def write(output, lines):
    """Write *lines* to *output* as UTF-8 with native line endings.

    Args:
        output: Binary-mode file object (anything accepting ``write(bytes)``).
        lines: Iterable of text strings without line terminators.
    """
    for line in lines:
        # Append the terminator as text and encode once: concatenating the
        # str os.linesep to already-encoded bytes raises TypeError on Python 3.
        output.write((line + os.linesep).encode('utf8'))
# NOTE(review): several statements of this function are not visible in this
# listing (e.g. the result of a direct match, the use of `try_combining` and
# the fall-through return values), and indentation has been lost.  The
# comments below describe only the visible lines -- confirm against the
# complete file before changing any logic.
#
# Arguments, as used by the visible code:
#   unicodesymbols -- sequence of entries indexed as i[0] (integer passed to
#                     unichr), i[1] (LaTeX command) and, when present, i[3]
#                     (text searched for the word 'combining')
#   symbol         -- the LaTeX command to translate
#   try_combining  -- defaults to True; the recursive lookup of a base
#                     character passes False
120 def translate_symbol(unicodesymbols, symbol, try_combining = True):
121 " Translate a symbol from LaTeX to unicode."
# Matches text whose first character is not an ASCII letter.
122 re_combining = re.compile(r'^[^a-zA-Z]')
# First pass: look for an entry whose command equals `symbol` exactly.
125 for i in unicodesymbols:
126 # Play safe and don't try combining symbols (not sure if this is
128 if i[1] == symbol and (len(i) < 4 or i[3].find('combining') < 0):
131 # no direct match, see whether this is a combining sequence
132 for i in unicodesymbols:
# Candidate: an entry flagged 'combining' whose command is a prefix of `symbol`.
133 if len(i) > 3 and i[3].find('combining') >= 0 and symbol.find(i[1]) == 0:
134 # Test whether this is really a combining sequence, e.g.
135 # \"o or \d{o}, and not a symbol like \dh that shares the
136 # beginning with a combining symbol
# Whatever follows the combining command is its argument (the base).
137 translated = symbol[len(i[1]):]
138 if translated != '' and re_combining.match(translated):
139 # Really a combining sequence
140 if len(translated) > 1 and translated[0] == '{' and translated[-1] == '}':
141 # Strip braces from things like \d{o}
142 translated = translated[1:-1]
144 # for some strange reason \\'\\i does not get
145 # correctly combined, so we try the braced form \\'{\\i} which has an
146 # entry in unicodesymbols
147 combined = translate_symbol(unicodesymbols, u'%s{%s}' % (i[1], translated))
150 if len(translated) > 1:
151 # The base character may be a symbol itself, e.g \"{\i}
# The base is looked up with try_combining=False.
152 translated = translate_symbol(unicodesymbols, translated, False)
153 # Play safe and only translate combining sequences with
# a single-character base; for '\\q' only the bases t, d, l and L are accepted.
155 if len(translated) == 1 and (i[1] != '\\q' or translated in ['t', 'd', 'l', 'L']):
# Append the combining code point to the base and normalize with NFKC.
156 return unicodedata.normalize("NFKC", translated + unichr(i[0]))
158 # we found a combining character, but could not convert the argument to a single character
def convert(lines, unicodesymbols):
    """Translate all symbols in *lines* from LaTeX to unicode, in place.

    Only kmap and kxmod entries are touched, whether active or commented
    out; every other line is left as it is.  An entry whose symbol cannot be
    translated keeps its original symbol text.

    Args:
        lines: Mutable list of text lines without line terminators; matching
            entries are replaced in this list.
        unicodesymbols: Symbol list as produced by readsymbols().
    """
    # convert both commented and active entries
    re_kmap = re.compile(r'^(#?\s*\\kmap\s+\S+\s+)([^\s]+)(.*)$')
    re_kxmod = re.compile(r'^(#?\s*\\kxmod\s+\S+\s+\S+\s+)([^\s]+)(.*)$')
    for index, line in enumerate(lines):
        match = re_kmap.match(line)
        if not match:
            match = re_kxmod.match(line)
        if not match:
            # Neither kind of entry: nothing to translate on this line.
            continue
        symbol = unescape(match.group(2))
        if len(symbol) > 2 and symbol[-2:] == '{}':
            # The unicodesymbols file does not include the trailing delimiter {}
            symbol = symbol[0:-2]
        translated = translate_symbol(unicodesymbols, symbol)
        if translated:
            replacement = escape(translated)
        else:
            # No translation available: keep the symbol exactly as written.
            replacement = match.group(2)
        lines[index] = u'%s%s%s' % (match.group(1), replacement, match.group(3))
192 input = open(argv[2], 'rb')
193 output = open(argv[3], 'wb')
195 error(usage(argv[0]))
196 unicodesymbols = open(argv[1], 'rb')
199 symbols = readsymbols(unicodesymbols)
201 convert(lines, symbols)
212 if __name__ == "__main__":