# file html2latexwrapper.py
# This file is part of LyX, the document processor.
# Licence details can be found in the file COPYING.
# Full author contact details are available in file CREDITS.
# Usage: html2latexwrapper.py <converter> <from file> <to file>
# This script will call <converter> -s <from file> > <to file>
# and add a \usepackage{inputenc} line if needed.
16 import os, string, sys, re
18 from lyxpreview_tools import error, run_command
# NOTE(review): only the return statement of this function survived in the
# reviewed text; the name `usage` is inferred from the message - confirm.
def usage(prog_name):
    '''Return the one-line command line synopsis for this script.

    prog_name is the name shown as the program in the synopsis.
    '''
    return "Usage: %s <converter> <from file> <to file>" % prog_name
def get_encoding(from_file_name):
    '''Read the encoding from an HTML or XHTML file.

    The file is scanned line by line for either an XML declaration
    (<?xml ... encoding="...">) or a <meta ... charset="..."> tag that
    starts a line (leading whitespace allowed). Only double-quoted values
    are recognised.

    Returns the declared encoding name in lower case, or '' if no
    declaration was found.
    '''
    regexpxml = re.compile(r'^\s*<\?xml\s+.*?encoding\s*=\s*"([^"]+)"', re.IGNORECASE)
    regexptype = re.compile(r'^\s*<meta\s+.*?charset\s*=\s*"([^"]+)"', re.IGNORECASE)

    # The real encoding is not known yet, so decode leniently: the
    # declarations searched for are plain ASCII, undecodable bytes are
    # replaced instead of raising, and a UTF-8 byte order mark in front of
    # the XML declaration is dropped by the 'utf-8-sig' codec.
    with open(from_file_name, encoding='utf-8-sig', errors='replace') as from_file:
        for line in from_file:
            # The XML declaration takes precedence over a meta tag.
            m = regexpxml.match(line) or regexptype.match(line)
            if m:
                return m.group(1).lower()
    return ''
# The HTML encodings were taken from http://www.iana.org/assignments/character-sets/character-sets.xml.
# Only those with inputenc support were added, and only the most important aliases.
# List of encodings that have the same name in HTML (may be as an alias) and inputenc
_SAME_ENC = frozenset([
    'cp437', 'cp850', 'cp852', 'cp855', 'cp858', 'cp862', 'cp865', 'cp866',
    'cp1250', 'cp1251', 'cp1252', 'cp1255', 'cp1256', 'cp1257',
    'koi8-r', 'koi8-u', 'pt154', 'pt254',
    'latin1', 'latin2', 'latin3', 'latin4', 'latin5', 'latin9', 'latin10',
])

# Translation table from HTML encoding names to inputenc encoding names
# NOTE(review): the numbering embedded in the reviewed text jumps between the
# 'iso-8859-16' and 'ibm1250' entries, so some entries may be missing - confirm.
_HTML_TO_INPUTENC = {
    'utf-8': 'utf8', 'csutf8': 'utf8',
    'iso-8859-1': 'latin1', 'cp819': 'latin1',
    'iso-8859-2': 'latin2',
    'iso-8859-3': 'latin3',
    'iso-8859-4': 'latin4',
    'iso-8859-5': 'iso88595', 'cyrillic': 'iso88595',
    'iso-8859-6': '8859-6', 'arabic': '8859-6',
    'iso-8859-7': 'iso-8859-7', 'greek': 'iso-8859-7',
    'iso-8859-8': '8859-8', 'hebrew': '8859-8',
    'iso-8859-9': 'latin5',
    'iso-8859-13': 'l7xenc',
    'iso-8859-15': 'latin9',
    'iso-8859-16': 'latin10',
    'ibm1250': 'cp1250',
    'ibm1251': 'cp1251',
    'ibm1255': 'cp1255',
    'ibm1256': 'cp1256',
    'ibm1257': 'cp1257',
    'macintosh': 'applemac', 'mac': 'applemac', 'csmacintosh': 'applemac',
}

# Matches a \usepackage line that loads inputenc, CJK or CJKutf8, with or
# without an option list. The alternatives are grouped inside the braces and
# the option list may have any length; the previous pattern accepted only a
# one-character option and, because of the ungrouped '|', never recognised
# the CJK packages.
_INPUTENC_LOADED = re.compile(
    r'^\s*\\usepackage\s*(\[[^]]*\])?\s*\{(inputenc|CJK|CJKutf8)\}')


def _inputenc_name(html_enc):
    '''Map an HTML encoding name to its inputenc name, or '' if unsupported.'''
    html_enc = html_enc.replace('iso_8859', 'iso-8859')
    if html_enc in _HTML_TO_INPUTENC:
        return _HTML_TO_INPUTENC[html_enc]
    return html_enc if html_enc in _SAME_ENC else ''


# NOTE(review): the `def` line of this function was not present in the
# reviewed text; the name `main` and the parameter `argv` (which the body
# indexes) are reconstructed - confirm.
def main(argv):
    '''Convert an HTML file to LaTeX and add an inputenc call if needed.

    argv is the full argument vector:
    [program, <converter>, <from file>, <to file>].

    The command "<converter> -s <from file>" is run, and its output is
    written to <to file>. If the HTML file declares an encoding that
    inputenc supports, and the output does not already load inputenc, CJK
    or CJKutf8, a \\usepackage[<encoding>]{inputenc} line is inserted
    directly after every line starting with \\documentclass.

    Returns 0 on completion and 1 if the argument count is wrong.
    '''
    # Parse and manipulate the command line arguments.
    if len(argv) != 4:
        error("Usage: %s <converter> <from file> <to file>" % argv[0])
        # error() may not return; this return guards the case where it does.
        return 1
    converter, from_file_name, to_file_name = argv[1:4]

    # NOTE(review): both names are interpolated unquoted into the command
    # string, and the returned status is not checked here; the semantics of
    # run_command are defined in lyxpreview_tools - confirm.
    cmd = f'{converter} -s {from_file_name}'
    _status, output = run_command(cmd, False)

    # Determine encoding of HTML file
    enc = _inputenc_name(get_encoding(from_file_name))

    # Read conversion result
    lines = output.split('\n')

    # Do not add the inputenc call if inputenc or CJK is already loaded
    add_inputenc = enc != '' and not any(
        _INPUTENC_LOADED.match(line) for line in lines)

    # Write output file and insert inputenc call if needed
    with open(to_file_name, 'w') as to_file:
        for line in lines:
            to_file.write(line + '\n')
            if add_inputenc and line.startswith('\\documentclass'):
                to_file.write('\\usepackage[%s]{inputenc}\n' % enc)

    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))