lib/scripts/docbook2epub.py

   1 # -*- coding: utf-8 -*-
   2
   3 # file docbook2epub.py
   4 # This file is part of LyX, the document processor.
   5 # Licence details can be found in the file COPYING.
   6 #
   7 # \author Thibaut Cuvelier
   8 #
   9 # Full author contact details are available in file CREDITS
  10
  11 # Usage:
  12 #   python docbook2epub.py java_binary in.docbook out.epub
  13
  14 from __future__ import print_function
  15
  16 import glob
  17 import os
  18 import shutil
  19 import sys
  20 import tempfile
  21 import zipfile
  22
  23
  24 def parse_arguments():
  25     if len(sys.argv) != 4:
  26         sys.exit(1)
  27     own_path, java_path, input, output = sys.argv
  28     script_folder = os.path.dirname(own_path) + '/../'
  29
  30     print('Generating ePub with the following parameters:')
  31     print(own_path)
  32     print(java_path)
  33     print(input)
  34     print(output)
  35
  36     return java_path, input, output, script_folder
  37
  38
  39 def create_temporary_folder():
  40     output_dir = tempfile.mkdtemp().replace('\\', '/')
  41     print('Temporary output directory:')
  42     print(output_dir)
  43     return output_dir
  44
  45
  46 def start_xslt_transformation(input, output_dir, script_folder, java_path):
  47     xslt = script_folder + 'docbook/epub3/chunk.xsl'
  48     saxon_jar = script_folder + 'scripts/saxon6.5.5.jar'
  49     saxon_params = 'base.dir=%s' % output_dir
  50     command = '"' + java_path + '" -jar "' + saxon_jar + '" "' + input + '" "' + xslt + '" "' + saxon_params + '"'
  51
  52     print('XSLT style sheet to use:')
  53     print(xslt)
  54     print('Command to execute:')
  55     print(command)
  56
  57     quoted_command = command
  58     if os.name == 'nt':
  59         # On Windows, it is typical to have spaces in folder names, and that requires to wrap the whole command
  60         # in quotes. On Linux, this might create errors when starting the command.
  61         quoted_command = '"' + command + '"'
  62     # This could be simplified by using subprocess.run, but this requires Python 3.5.
  63
  64     if os.system(quoted_command) != 0:
  65         print('docbook2epub fails')
  66         shutil.rmtree(output_dir, ignore_errors=True)
  67         sys.exit(1)
  68
  69     print('Generated ePub contents.')
  70
  71
  72 def get_images_from_package_opf(package_opf):
  73     images = []
  74
  75     # Example in the OPF file:
  76     #     <item id="d436e1" href="D:/LyX/lib/images/buffer-view.svgz" media-type="image/SVGZ"/>
  77     # The XHTML files are also <item> tags:
  78     #     <item id="id-d0e2" href="index.xhtml" media-type="application/xhtml+xml"/>
  79     try:
  80         with open(package_opf, 'r') as f:
  81             for line in f.readlines():
  82                 if '<item' in line and 'media-type="image' in line:
  83                     images.append(line.split('href="')[1].split('"')[0])
  84     except FileNotFoundError:
  85         print('The package.opf file was not found, probably due to a DocBook error. The ePub file will be corrupt.')
  86
  87     return images
  88
  89
  90 def change_image_paths(file, renamed):
  91     # This could be optimised, as the same operation is performed a zillion times on many files:
  92     # https://www.oreilly.com/library/view/python-cookbook/0596001673/ch03s15.html
  93     with open(file, 'r', encoding='utf8') as f:
  94         contents = list(f)
  95
  96     with open(file, 'w', encoding='utf8') as f:
  97         for line in contents:
  98             for (old, new) in renamed.items():
  99                 line = line.replace(old, new)
 100             f.write(line)
 101
 102
 103 def copy_images(output_dir):
 104     # Copy the assets to the OEBPS/images/. All paths are available in OEBPS/package.opf, but they must also be changed
 105     # in the XHTML files. Typically, the current paths are absolute.
 106
 107     # First, get the mapping old file => file in the ePub archive.
 108     original_images = get_images_from_package_opf(output_dir + '/OEBPS/package.opf')
 109     renamed = {img: 'images/' + os.path.basename(img) for img in original_images}
 110
 111     # Then, transform all paths (both OPF and XHTML files).
 112     change_image_paths(output_dir + '/OEBPS/package.opf', renamed)
 113     for file in glob.glob(output_dir + '/OEBPS/*.xhtml'):
 114         change_image_paths(file, renamed)
 115
 116     # Ensure that the destination path exists.
 117     if not os.path.exists(output_dir + '/OEBPS/images/'):
 118         os.mkdir(output_dir + '/OEBPS/images/')
 119
 120     # Finally, actually copy the image files.
 121     for (old, new) in renamed.items():
 122         shutil.copyfile(old, output_dir + '/OEBPS/' + new)
 123
 124
 125 def create_zip_archive(output, output_dir):
 126     with zipfile.ZipFile(output, 'w', zipfile.ZIP_DEFLATED) as zip:
 127         # Python 3.5 brings the `recursive` argument. For older versions, this trick is required...
 128         # for file in glob.glob(output_dir + '/**/*', recursive=True):
 129         for file in [os.path.join(dp, f) for dp, dn, filenames in os.walk(output_dir) for f in filenames]:
 130             zip.write(file, os.path.relpath(file, output_dir), compress_type=zipfile.ZIP_STORED)
 131
 132     shutil.rmtree(output_dir)
 133     print('Generated ePub.')
 134
 135
 136 if __name__ == '__main__':
 137     java_path, input, output, script_folder = parse_arguments()
 138     output_dir = create_temporary_folder()
 139     start_xslt_transformation(input, output_dir, script_folder, java_path)
 140     copy_images(output_dir)
 141     create_zip_archive(output, output_dir)