lib/scripts/html2latexwrapper.py

   1 # file html2latexwrapper.py
   2 # This file is part of LyX, the document processor.
   3 # Licence details can be found in the file COPYING.
   4
   5 # author Georg Baum
   6
   7 # Full author contact details are available in file CREDITS
   8
   9 # Usage:
  10 # html2latexwrapper.py <converter> <from file> <to file>
  11
  12 # This script will call <converter> -s <from file> > <to file>
  13 # and add a \usepackage{inputenc} line if needed.
  14
  15
  16 import os, string, sys, re
  17
  18 from lyxpreview_tools import error, run_command
  19
  20
  21 def usage(prog_name):
  22     return "Usage: %s <converter> <from file> <to file>" % prog_name
  23
  24
  25 def get_encoding(from_file_name):
  26     '''Read the encoding from a HTML or XHTML file'''
  27     try:
  28         from_file = open(from_file_name)
  29         regexpxml = re.compile(r'^\s?<\?xml\s+.*?encoding\s*=\s*"([^"]+)"', re.IGNORECASE)
  30         regexptype = re.compile(r'^\s?<meta\s+.*?charset\s*=\s*"([^"]+)"', re.IGNORECASE)
  31         for line in from_file.readlines():
  32             m = regexpxml.match(line)
  33             if not m:
  34                 m = regexptype.match(line)
  35             if m:
  36                 from_file.close()
  37                 return m.group(1).lower()
  38         from_file.close()
  39     except:
  40         pass
  41     return ''
  42
  43
  44 def main(argv):
  45     # Parse and manipulate the command line arguments.
  46     if len(argv) != 4:
  47         error(usage(argv[0]))
  48
  49     converter = argv[1]
  50     from_file_name = argv[2]
  51     to_file_name = argv[3]
  52
  53     # Run gnuhtml2latex
  54     cmd = f'{converter} -s {from_file_name}'
  55     (ret, output) = run_command(cmd, False)
  56
  57     # Determine encoding of HTML file
  58     enc = get_encoding(from_file_name).replace('iso_8859', 'iso-8859')
  59     # The HTML encodings were taken from http://www.iana.org/assignments/character-sets/character-sets.xml.
  60     # Only those with inputenc support were added, and only thge most important aliases.
  61     # List of encodings that have the same name in HTML (may be as an alias) and inputenc
  62     same_enc = ['cp437', 'cp850', 'cp852', 'cp855', 'cp858', 'cp862', 'cp865', 'cp866', \
  63                 'cp1250', 'cp1251', 'cp1252', 'cp1255', 'cp1256', 'cp1257', \
  64                 'koi8-r', 'koi8-u', 'pt154', 'pt254', \
  65                 'latin1', 'latin2', 'latin3', 'latin4', 'latin5', 'latin9', 'latin10']
  66     # Translation table from HTML encoding names to inputenc encoding names
  67     encodings = {'utf-8' : 'utf8', 'csutf8' : 'utf8', \
  68                  'iso-8859-1' : 'latin1', 'cp819' : 'latin1', \
  69                  'iso-8859-2' : 'latin2', \
  70                  'iso-8859-3' : 'latin3', \
  71                  'iso-8859-4' : 'latin4', \
  72                  'iso-8859-5' : 'iso88595', 'cyrillic' : 'iso88595', \
  73                  'iso-8859-6' : '8859-6', 'arabic' : '8859-6', \
  74                  'iso-8859-7' : 'iso-8859-7', 'greek' : 'iso-8859-7', \
  75                  'iso-8859-8' : '8859-8', 'hebrew' : '8859-8', \
  76                  'iso-8859-9' : 'latin5', \
  77                  'iso-8859-13' : 'l7xenc', \
  78                  'iso-8859-15' : 'latin9', \
  79                  'iso-8859-16' : 'latin10', \
  80                  'ibm437' : 'cp437', \
  81                  'ibm850' : 'cp850', \
  82                  'ibm852' : 'cp852', \
  83                  'ibm855' : 'cp855', \
  84                  'ibm858' : 'cp858', \
  85                  'ibm862' : 'cp862', \
  86                  'ibm865' : 'cp865', \
  87                  'ibm866' : 'cp866', \
  88                  'ibm1250' : 'cp1250', \
  89                  'ibm1251' : 'cp1251', \
  90                  'ibm1255' : 'cp1255', \
  91                  'ibm1256' : 'cp1256', \
  92                  'ibm1257' : 'cp1257', \
  93                  'macintosh' : 'applemac', 'mac' : 'applemac', 'csmacintosh' : 'applemac'}
  94     if enc != '':
  95         if enc in encodings.keys():
  96             enc = encodings[enc]
  97         elif enc not in same_enc:
  98             enc = ''
  99
 100     # Read conversion result
 101     lines = output.split('\n')
 102
 103     # Do not add the inputenc call if inputenc or CJK is already loaded
 104     add_inputenc = (enc != '')
 105     if add_inputenc:
 106         regexp = re.compile(r'^\s?\\usepackage\s?(\[[^]+]\])?\s?{(inputenc)|(CJK)|(CJKutf8)}')
 107         for line in lines:
 108             if regexp.match(line):
 109                 add_inputenc = False
 110                 break
 111
 112     # Write output file and insert inputenc call if needed
 113     to_file = open(to_file_name, 'w')
 114     for line in lines:
 115         to_file.write(line + '\n')
 116         if add_inputenc and line.find('\\documentclass') == 0:
 117             to_file.write('\\usepackage[%s]{inputenc}\n' % enc)
 118     to_file.close()
 119
 120     return ret
 121
 122
 123 if __name__ == "__main__":
 124     main(sys.argv)