lib/scripts/docbook2epub.py

   1 # -*- coding: utf-8 -*-
   2
   3 # file docbook2epub.py
   4 # This file is part of LyX, the document processor.
   5 # Licence details can be found in the file COPYING.
   6 #
   7 # \author Thibaut Cuvelier
   8 #
   9 # Full author contact details are available in file CREDITS
  10
  11 # Usage:
  12 #   python docbook2epub.py java_binary xsltproc_binary xslt_path in.docbook out.epub
  13
  14 from __future__ import print_function
  15
  16 import glob
  17 import os
  18 import shutil
  19 import sys
  20 import tempfile
  21 import zipfile
  22
  23
  24 class DocBookToEpub:
  25     def __init__(self, args=None):
  26         if args is None:
  27             args = sys.argv
  28
  29         if len(args) != 6:
  30             print('Six arguments are expected, only %s found: %s.' % (len(args), args))
  31             sys.exit(1)
  32
  33         self.own_path = sys.argv[0]
  34         self.java_path = sys.argv[1] if sys.argv[1] != '' and sys.argv[1] != 'none' else None
  35         self.xsltproc_path = sys.argv[2] if sys.argv[2] != '' and sys.argv[2] != 'none' else None
  36         self.xslt_path = sys.argv[3] if sys.argv[3] != '' and sys.argv[3] != 'none' else None
  37         self.input = sys.argv[4]
  38         self.output = sys.argv[5]
  39         self.script_folder = os.path.dirname(self.own_path) + '/../'
  40
  41         print('Generating ePub with the following parameters:')
  42         print(self.own_path)
  43         print(self.java_path)
  44         print(self.xsltproc_path)
  45         print(self.input)
  46         print(self.output)
  47
  48         # Precompute paths that will be used later.
  49         self.output_dir = tempfile.mkdtemp().replace('\\', '/')
  50         self.package_opf = self.output_dir + '/OEBPS/package.opf'  # Does not exist yet,
  51         print('Temporary output directory: %s' % self.output_dir)
  52
  53         if self.xslt_path is None:
  54             self.xslt = self.script_folder + 'docbook/epub3/chunk.xsl'
  55         else:
  56             self.xslt = self.xslt_path + '/epub3/chunk.xsl'
  57         print('XSLT style sheet to use:')
  58         print(self.xslt)
  59
  60         # These will be filled during the execution of the script.
  61         self.renamed = None
  62
  63     def gracefully_fail(self, reason):
  64         print('docbook2epub fails: %s' % reason)
  65         shutil.rmtree(self.output_dir, ignore_errors=True)
  66         sys.exit(1)
  67
  68     def start_xslt_transformation(self):
  69         command = None
  70         if self.xsltproc_path is not None:
  71             command = self.start_xslt_transformation_xsltproc()
  72         elif self.java_path is not None:
  73             command = self.start_xslt_transformation_saxon6()
  74
  75         if command is None:
  76             self.gracefully_fail('no XSLT processor available')
  77
  78         print('Command to execute:')
  79         print(command)
  80
  81         quoted_command = command
  82         if os.name == 'nt':
  83             # On Windows, it is typical to have spaces in folder names, and that requires to wrap the whole command
  84             # in quotes. On Linux, this might create errors when starting the command.
  85             quoted_command = '"' + command + '"'
  86         # This could be simplified by using subprocess.run, but this requires Python 3.5.
  87
  88         if os.system(quoted_command) != 0:
  89             self.gracefully_fail('error from the XSLT processor')
  90
  91         print('Generated ePub contents.')
  92
  93     def start_xslt_transformation_xsltproc(self):
  94         params = '-stringparam base.dir "' + self.output_dir + '"'
  95         return '"' + self.xsltproc_path + '" ' + params + ' "' + self.xslt + '" "' + self.input + '"'
  96
  97     def start_xslt_transformation_saxon6(self):
  98         saxon_jar = self.script_folder + 'scripts/saxon6.5.5.jar'
  99         params = 'base.dir=%s' % self.output_dir
 100         executable = '"' + self.java_path + '" -jar "' + saxon_jar + '"'
 101         return executable + ' "' + self.input + '" "' + self.xslt + '" "' + params + '"'
 102
 103     def get_images_from_package_opf(self):
 104         images = []
 105
 106         # Example in the OPF file:
 107         #     <item id="d436e1" href="D:/LyX/lib/images/buffer-view.svgz" media-type="image/SVGZ"/>
 108         # The XHTML files are also <item> tags:
 109         #     <item id="id-d0e2" href="index.xhtml" media-type="application/xhtml+xml"/>
 110         try:
 111             with open(self.package_opf, 'r') as f:
 112                 for line in f.readlines():
 113                     if '<item' in line and 'media-type="image' in line:
 114                         images.append(line.split('href="')[1].split('"')[0])
 115         except FileNotFoundError:
 116             print('The package.opf file was not found, probably due to a DocBook error. The ePub file will be corrupt.')
 117
 118         return images
 119
 120     def change_image_paths(self, file):
 121         # This could be optimised, as the same operation is performed a zillion times on many files:
 122         # https://www.oreilly.com/library/view/python-cookbook/0596001673/ch03s15.html
 123         with open(file, 'r', encoding='utf8') as f:
 124             contents = list(f)
 125
 126         with open(file, 'w', encoding='utf8') as f:
 127             for line in contents:
 128                 for (old, new) in self.renamed.items():
 129                     line = line.replace(old, new)
 130                 f.write(line)
 131
 132     def copy_images(self):
 133         # Copy the assets to the OEBPS/images/. All paths are available in OEBPS/package.opf, but they must also be
 134         # changed in the XHTML files. Typically, the current paths are absolute.
 135
 136         # First, get the mapping old file => file in the ePub archive.
 137         original_images = self.get_images_from_package_opf()
 138         self.renamed = {img: 'images/' + os.path.basename(img) for img in original_images}
 139
 140         # Then, transform all paths (both OPF and XHTML files).
 141         self.change_image_paths(self.output_dir + '/OEBPS/package.opf')
 142         for file in glob.glob(self.output_dir + '/OEBPS/*.xhtml'):
 143             self.change_image_paths(file)
 144
 145         # Ensure that the destination path exists. OEBPS exists due to the DocBook-to-ePub transformation.
 146         if not os.path.exists(self.output_dir + '/OEBPS/images/'):
 147             os.mkdir(self.output_dir + '/OEBPS/images/')
 148
 149         # Finally, actually copy the image files.
 150         for (old, new) in self.renamed.items():
 151             shutil.copyfile(old, self.output_dir + '/OEBPS/' + new)
 152
 153     def create_zip_archive(self):
 154         with zipfile.ZipFile(self.output, 'w', zipfile.ZIP_DEFLATED) as zip:
 155             # Python 3.5 brings the `recursive` argument. For older versions, this trick is required...
 156             # for file in glob.glob(output_dir + '/**/*', recursive=True):
 157             for file in [os.path.join(dp, f) for dp, dn, filenames in os.walk(self.output_dir) for f in filenames]:
 158                 zip.write(file, os.path.relpath(file, self.output_dir), compress_type=zipfile.ZIP_STORED)
 159
 160         shutil.rmtree(self.output_dir)
 161         print('Generated ePub.')
 162
 163     def transform(self):
 164         self.start_xslt_transformation()
 165         self.copy_images()
 166         self.create_zip_archive()
 167
 168
 169 if __name__ == '__main__':
 170     DocBookToEpub(sys.argv).transform()