Work around gnuhtml2latex encoding bug

author Georg Baum <baum@lyx.org>

Fri, 12 Apr 2013 19:35:50 +0000 (21:35 +0200)

committer Georg Baum <baum@lyx.org>

Fri, 12 Apr 2013 19:35:50 +0000 (21:35 +0200)
author Georg Baum <baum@lyx.org>
Fri, 12 Apr 2013 19:35:50 +0000 (21:35 +0200)
committer Georg Baum <baum@lyx.org>
Fri, 12 Apr 2013 19:35:50 +0000 (21:35 +0200)
diff --git a/lib/Makefile.am b/lib/Makefile.am

index 42f0167ed7754706d9ec059acdb249646b8d9634..cf147d55bba22ae9c0f550f559b93e85428448eb 100644 (file)
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@@ -2043,6 +2043,7 @@ dist_scripts_DATA += \
         scripts/fig2pdftex.py \
         scripts/fig2pstex.py \
         scripts/fig_copy.py \
+       scripts/html2latexwrapper.py \
         scripts/include_bib.py \
         scripts/layout2layout.py \
         scripts/legacy_lyxpreview2ppm.py \
diff --git a/lib/configure.py b/lib/configure.py

index 62ed2b9ec2826ca292b551c7d56e20f8d4acf099..acd4999a54e00330a8292597d91911d4a9fa0e68 100644 (file)
--- a/lib/configure.py
+++ b/lib/configure.py
@@ -679,12 +679,15 @@ def checkConverterEntries():
      checkProg('a Sweave -> R/S code converter', ['Rscript --verbose --no-save --no-restore $$s/scripts/lyxstangle.R $$i $$e $$r'], 
          rc_entry = [ r'\converter sweave      r      "%%"    ""' ])
      #
-    checkProg('a knitr -> R/S code converter', ['Rscript --verbose --no-save --no-restore $$s/scripts/lyxknitr.R $$p$$i $$p$$o $$e $$r tangle'], 
+    checkProg('a knitr -> R/S code converter', ['Rscript --verbose --no-save --no-restore $$s/scripts/lyxknitr.R $$p$$i $$p$$o $$e $$r tangle'],
          rc_entry = [ r'\converter knitr      r      "%%"    ""' ])
      #
-    checkProg('an HTML -> LaTeX converter', ['html2latex $$i', 'gnuhtml2latex -s $$i > $$o',
+    checkProg('an HTML -> LaTeX converter', ['html2latex $$i', 'gnuhtml2latex',
          'htmltolatex -input $$i -output $$o', 'htmltolatex.jar -input $$i -output $$o'],
-        rc_entry = [ r'\converter html       latex      "%%"   ""' ])
+        rc_entry = [ r'\converter html       latex      "%%"   ""', \
+                     r'\converter html       latex      "python -tt $$s/scripts/html2latexwrapper.py %% $$i $$o"       ""', \
+                     r'\converter html       latex      "%%"   ""', \
+                     r'\converter html       latex      "%%"   ""', '' ])
      #
      checkProg('an MS Word -> LaTeX converter', ['wvCleanLatex $$i $$o'],
          rc_entry = [ r'\converter word       latex      "%%"   ""' ])
@@ -801,14 +804,13 @@ def checkConverterEntries():
  \converter fig        pstex      "python -tt $$s/scripts/fig2pstex.py $$i $$o" ""''')
      #
      checkProg('a TIFF -> PS converter', ['tiff2ps $$i > $$o'],
-        rc_entry = [ r'\converter tiff       eps        "%%"   ""', ''])
+        rc_entry = [ r'\converter tiff       eps        "%%"   ""'])
      #
      checkProg('a TGIF -> EPS/PPM converter', ['tgif'],
          rc_entry = [
              r'''\converter tgif       eps        "tgif -print -color -eps -stdout $$i > $$o"   ""
  \converter tgif       png        "tgif -print -color -png -o $$d $$i"  ""
-\converter tgif       pdf6       "tgif -print -color -pdf -stdout $$i > $$o"   ""''',
-            ''])
+\converter tgif       pdf6       "tgif -print -color -pdf -stdout $$i > $$o"   ""'''])
      #
      checkProg('a WMF -> EPS converter', ['metafile2eps $$i $$o', 'wmf2eps -o $$o $$i'],
          rc_entry = [ r'\converter wmf        eps        "%%"   ""'])
@@ -817,10 +819,10 @@ def checkConverterEntries():
          rc_entry = [ r'\converter emf        eps        "%%"   ""'])
      # Only define a converter to pdf6 for graphics
      checkProg('an EPS -> PDF converter', ['epstopdf'],
-        rc_entry = [ r'\converter eps        pdf6       "epstopdf --outfile=$$o $$i"   ""', ''])
+        rc_entry = [ r'\converter eps        pdf6       "epstopdf --outfile=$$o $$i"   ""'])
      #
      checkProg('an EPS -> PNG converter', ['convert $$i $$o'],
-        rc_entry = [ r'\converter eps        png        "%%"   ""', ''])
+        rc_entry = [ r'\converter eps        png        "%%"   ""'])
      #
      # no agr -> pdf6 converter, since the pdf library used by gracebat is not
      # free software and therefore not compiled in in many installations.
@@ -831,14 +833,12 @@ def checkConverterEntries():
              r'''\converter agr        eps        "gracebat -hardcopy -printfile $$o -hdevice EPS $$i 2>/dev/null"      ""
  \converter agr        png        "gracebat -hardcopy -printfile $$o -hdevice PNG $$i 2>/dev/null"      ""
  \converter agr        jpg        "gracebat -hardcopy -printfile $$o -hdevice JPEG $$i 2>/dev/null"     ""
-\converter agr        ppm        "gracebat -hardcopy -printfile $$o -hdevice PNM $$i 2>/dev/null"      ""''',
-            ''])
+\converter agr        ppm        "gracebat -hardcopy -printfile $$o -hdevice PNM $$i 2>/dev/null"      ""'''])
      #
      checkProg('a Dot -> Image converter', ['dot'],
          rc_entry = [
              r'''\converter dot        eps        "dot -Teps $$i -o $$o"        ""
-\converter dot        png        "dot -Tpng $$i -o $$o"        ""''',
-            ''])
+\converter dot        png        "dot -Tpng $$i -o $$o"        ""'''])
      #
      checkProg('a Dia -> PNG converter', ['dia -e $$o -t png $$i'],
          rc_entry = [ r'\converter dia        png        "%%"   ""'])
@@ -866,8 +866,7 @@ def checkConverterEntries():
      checkProg('a spreadsheet -> latex converter', ['ssconvert'],
         rc_entry = [ r'''\converter gnumeric latex "ssconvert --export-type=Gnumeric_html:latex $$i $$o" ""
  \converter oocalc latex "ssconvert --export-type=Gnumeric_html:latex $$i $$o" ""
-\converter excel  latex "ssconvert --export-type=Gnumeric_html:latex $$i $$o" ""''',
-''])
+\converter excel  latex "ssconvert --export-type=Gnumeric_html:latex $$i $$o" ""'''])
  
      path, lilypond = checkProg('a LilyPond -> EPS/PDF/PNG converter', ['lilypond'])
      if (lilypond != ''):
@@ -918,7 +917,7 @@ def checkConverterEntries():
              logger.info('+  found LilyPond-book, but could not extract version number.')
      #
      checkProg('a Noteedit -> LilyPond converter', ['noteedit --export-lilypond $$i'],
-        rc_entry = [ r'\converter noteedit   lilypond   "%%"   ""', ''])
+        rc_entry = [ r'\converter noteedit   lilypond   "%%"   ""' ])
      #
      # Currently, lyxpak outputs a gzip compressed tar archive on *nix
      # and a zip archive on Windows.
diff --git a/lib/scripts/html2latexwrapper.py b/lib/scripts/html2latexwrapper.py

new file mode 100644 (file)

index 0000000..d97942c
--- /dev/null
+++ b/lib/scripts/html2latexwrapper.py
@@ -0,0 +1,127 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# file html2latexwrapper.py
+# This file is part of LyX, the document processor.
+# Licence details can be found in the file COPYING.
+
+# author Georg Baum
+
+# Full author contact details are available in file CREDITS
+
+# Usage:
+# html2latexwrapper.py <converter> <from file> <to file>
+
+# This script will call <converter> -s <from file> > <to file>
+# and add a \usepackage[<encoding>]{inputenc} line if needed.
+
+
+import os, string, sys, re
+
+from lyxpreview_tools import error, run_command
+
+
+def usage(prog_name):
+    return "Usage: %s <converter> <from file> <to file>" % prog_name
+
+
+def get_encoding(from_file_name):
+    '''Read the encoding from a HTML or XHTML file'''
+    try:
+        from_file = open(from_file_name, 'rt')
+        regexpxml = re.compile(r'^\s?<\?xml\s+.*?encoding\s*=\s*"([^"]+)"', re.IGNORECASE)
+        regexptype = re.compile(r'^\s?<meta\s+.*?charset\s*=\s*"([^"]+)"', re.IGNORECASE)
+        for line in from_file.readlines():
+            m = regexpxml.match(line)
+            if not m:
+                m = regexptype.match(line)
+            if m:
+                from_file.close()
+                return m.group(1).lower()
+        from_file.close()
+    except:
+        pass
+    return ''
+
+
+def main(argv):
+    # Parse and manipulate the command line arguments.
+    if len(argv) != 4:
+        error(usage(argv[0]))
+
+    converter = argv[1]
+    from_file_name = argv[2]
+    to_file_name = argv[3]
+
+    # Run gnuhtml2latex
+    cmd = '%s -s %s' % (converter, from_file_name)
+    (ret, output) = run_command(cmd, False)
+
+    # Determine encoding of HTML file
+    enc = get_encoding(from_file_name).replace('iso_8859', 'iso-8859')
+    # The HTML encodings were taken from http://www.iana.org/assignments/character-sets/character-sets.xml.
+    # Only those with inputenc support were added, and only the most important aliases.
+    # List of encodings that have the same name in HTML (possibly as an alias) and in inputenc
+    same_enc = ['cp437', 'cp850', 'cp852', 'cp855', 'cp858', 'cp862', 'cp865', 'cp866', \
+                'cp1250', 'cp1251', 'cp1252', 'cp1255', 'cp1256', 'cp1257', \
+                'koi8-r', 'koi8-u', 'pt154', 'pt254', \
+                'latin1', 'latin2', 'latin3', 'latin4', 'latin5', 'latin9', 'latin10']
+    # Translation table from HTML encoding names to inputenc encoding names
+    encodings = {'utf-8' : 'utf8', 'csutf8' : 'utf8', \
+                 'iso-8859-1' : 'latin1', 'cp819' : 'latin1', \
+                 'iso-8859-2' : 'latin2', \
+                 'iso-8859-3' : 'latin3', \
+                 'iso-8859-4' : 'latin4', \
+                 'iso-8859-5' : 'iso88595', 'cyrillic' : 'iso88595', \
+                 'iso-8859-6' : '8859-6', 'arabic' : '8859-6', \
+                 'iso-8859-7' : 'iso-8859-7', 'greek' : 'iso-8859-7', \
+                 'iso-8859-8' : '8859-8', 'hebrew' : '8859-8', \
+                 'iso-8859-9' : 'latin5', \
+                 'iso-8859-13' : 'l7xenc', \
+                 'iso-8859-15' : 'latin9', \
+                 'iso-8859-16' : 'latin10', \
+                 'ibm437' : 'cp437', \
+                 'ibm850' : 'cp850', \
+                 'ibm852' : 'cp852', \
+                 'ibm855' : 'cp855', \
+                 'ibm858' : 'cp858', \
+                 'ibm862' : 'cp862', \
+                 'ibm865' : 'cp865', \
+                 'ibm866' : 'cp866', \
+                 'ibm1250' : 'cp1250', \
+                 'ibm1251' : 'cp1251', \
+                 'ibm1255' : 'cp1255', \
+                 'ibm1256' : 'cp1256', \
+                 'ibm1257' : 'cp1257', \
+                 'macintosh' : 'applemac', 'mac' : 'applemac', 'csmacintosh' : 'applemac'}
+    if enc != '':
+        if enc in encodings.keys():
+            enc = encodings[enc]
+        elif enc not in same_enc:
+            enc = ''
+
+    # Read conversion result
+    lines = output.split('\n')
+
+    # Do not add the inputenc call if inputenc or CJK is already loaded
+    add_inputenc = (enc != '')
+    if add_inputenc:
+        regexp = re.compile(r'^\s?\\usepackage\s?(\[[^]+]\])?\s?{(inputenc)|(CJK)|(CJKutf8)}')
+        for line in lines:
+            if regexp.match(line):
+                add_inputenc = False
+                break
+
+    # Write output file and insert inputenc call if needed
+    to_file = open(to_file_name, 'wt')
+    for line in lines:
+        to_file.write(line + '\n')
+        if add_inputenc and line.find('\\documentclass') == 0:
+            to_file.write('\\usepackage[%s]{inputenc}\n' % enc)
+    to_file.close()
+
+    return ret
+
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/lib/scripts/lyxpreview_tools.py b/lib/scripts/lyxpreview_tools.py

index 8408b3a08fd854219c06cf3605e66397cb3dacb4..8cb356c66213cecb9567d7dddbdabcf3c69b0731 100644 (file)
--- a/lib/scripts/lyxpreview_tools.py
+++ b/lib/scripts/lyxpreview_tools.py
@@ -116,14 +116,21 @@ def find_exe_or_terminate(candidates):
      return exe
  
  
-def run_command_popen(cmd):
+def run_command_popen(cmd, stderr2stdout):
      if os.name == 'nt':
          unix = False
      else:
          unix = True
-    pipe = subprocess.Popen(cmd, shell=unix, close_fds=unix, stdin=subprocess.PIPE, \
-        stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
-    cmd_stdout = pipe.communicate()[0]
+    if stderr2stdout:
+        pipe = subprocess.Popen(cmd, shell=unix, close_fds=unix, stdin=subprocess.PIPE, \
+            stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
+        cmd_stdout = pipe.communicate()[0]
+    else:
+        pipe = subprocess.Popen(cmd, shell=unix, close_fds=unix, stdin=subprocess.PIPE, \
+            stdout=subprocess.PIPE, universal_newlines=True)
+        (cmd_stdout, cmd_stderr) = pipe.communicate()
+        if cmd_stderr:
+            sys.stderr.write(cmd_stderr)
      cmd_status = pipe.returncode
  
      global debug
@@ -182,12 +189,12 @@ def run_command_win32(cmd):
      return 0, data
  
  
-def run_command(cmd):
+def run_command(cmd, stderr2stdout = True):
      progress("Running %s" % cmd)
      if use_win32_modules:
          return run_command_win32(cmd)
      else:
-        return run_command_popen(cmd)
+        return run_command_popen(cmd, stderr2stdout)
  
  
  def get_version_info():
author	Georg Baum <baum@lyx.org>
	Fri, 12 Apr 2013 19:35:50 +0000 (21:35 +0200)
committer	Georg Baum <baum@lyx.org>
	Fri, 12 Apr 2013 19:35:50 +0000 (21:35 +0200)
lib/Makefile.am		patch \| blob \| history
lib/configure.py		patch \| blob \| history
lib/scripts/html2latexwrapper.py	[new file with mode: 0644]	patch \| blob
lib/scripts/lyxpreview_tools.py		patch \| blob \| history