lib/scripts/docbook2epub.py

   1 # file docbook2epub.py
   2 # This file is part of LyX, the document processor.
   3 # Licence details can be found in the file COPYING.
   4 #
   5 # \author Thibaut Cuvelier
   6 #
   7 # Full author contact details are available in file CREDITS
   8
   9 # Usage:
  10 #   python docbook2epub.py java_binary saxon_path xsltproc_path xslt_path in.docbook in.orig.path out.epub
  11
  12
  13 import glob
  14 import os
  15 import shutil
  16 import sys
  17 import tempfile
  18 import zipfile
  19
  20
  21 def _parse_nullable_argument(arg):
  22     return arg if arg != '' and arg != 'none' else None
  23
  24
  25 class ImageRename:
  26     def __init__(self, opf_path, local_path, epub_path):
  27         self.opf_path = opf_path
  28         self.local_path = local_path
  29         self.epub_path = epub_path
  30
  31
  32 class DocBookToEpub:
  33     def __init__(self, args=None):
  34         if args is None:
  35             args = sys.argv
  36
  37         if len(args) != 8:
  38             print(f'Exactly eight arguments are expected, only {len(args)} found: {args}.')
  39             sys.exit(1)
  40
  41         self.own_path = sys.argv[0]
  42         self.java_path = _parse_nullable_argument(sys.argv[1])
  43         self.saxon_path = _parse_nullable_argument(sys.argv[2])
  44         self.xsltproc_path = _parse_nullable_argument(sys.argv[3])
  45         self.xslt_path = _parse_nullable_argument(sys.argv[4])
  46         self.input = sys.argv[5]
  47         self.input_path = sys.argv[6]
  48         self.output = sys.argv[7]
  49         self.script_folder = os.path.dirname(self.own_path) + '/../'
  50
  51         print('Generating ePub with the following parameters:')
  52         print(self.own_path)
  53         print(self.java_path)
  54         print(self.saxon_path)
  55         print(self.xsltproc_path)
  56         print(self.xslt_path)
  57         print(self.input)
  58         print(self.input_path)
  59         print(self.output)
  60
  61         # Precompute paths that will be used later.
  62         self.output_dir = tempfile.mkdtemp().replace('\\', '/')
  63         self.package_opf = self.output_dir + '/OEBPS/package.opf'  # Does not exist yet,
  64         print('Temporary output directory: %s' % self.output_dir)
  65
  66         if self.xslt_path is None:
  67             self.xslt = self.script_folder + 'docbook/epub3/chunk.xsl'
  68         else:
  69             self.xslt = self.xslt_path + '/epub3/chunk.xsl'
  70         print('XSLT style sheet to use:')
  71         print(self.xslt)
  72
  73         if self.saxon_path is None:
  74             self.saxon_path = self.script_folder + 'scripts/saxon6.5.5.jar'
  75
  76         # These will be filled during the execution of the script.
  77         self.renamed = None
  78
  79     def gracefully_fail(self, reason):
  80         print('docbook2epub fails: %s' % reason)
  81         shutil.rmtree(self.output_dir, ignore_errors=True)
  82         sys.exit(1)
  83
  84     def start_xslt_transformation(self):
  85         command = None
  86         if self.xsltproc_path is not None:
  87             command = self.start_xslt_transformation_xsltproc()
  88         elif self.java_path is not None:
  89             command = self.start_xslt_transformation_saxon6()
  90
  91         if command is None:
  92             self.gracefully_fail('no XSLT processor available')
  93
  94         print('Command to execute:')
  95         print(command)
  96
  97         quoted_command = command
  98         if os.name == 'nt':
  99             # On Windows, it is typical to have spaces in folder names, and that requires to wrap the whole command
 100             # in quotes. On Linux, this might create errors when starting the command.
 101             quoted_command = '"' + command + '"'
 102         # This could be simplified by using subprocess.run, but this requires Python 3.5.
 103
 104         if os.system(quoted_command) != 0:
 105             self.gracefully_fail('error from the XSLT processor')
 106
 107         print('Generated ePub contents.')
 108
 109     def start_xslt_transformation_xsltproc(self):
 110         params = '-stringparam base.dir "' + self.output_dir + '"'
 111         return '"' + self.xsltproc_path + '" ' + params + ' "' + self.xslt + '" "' + self.input + '"'
 112
 113     def start_xslt_transformation_saxon6(self):
 114         params = 'base.dir=%s' % self.output_dir
 115         executable = '"' + self.java_path + '" -jar "' + self.saxon_path + '"'
 116         return executable + ' "' + self.input + '" "' + self.xslt + '" "' + params + '"'
 117
 118     def get_images_from_package_opf(self):
 119         images = []
 120
 121         # Example in the OPF file:
 122         #     <item id="d436e1" href="D:/LyX/lib/images/buffer-view.svgz" media-type="image/SVGZ"/>
 123         # The XHTML files are also <item> tags:
 124         #     <item id="id-d0e2" href="index.xhtml" media-type="application/xhtml+xml"/>
 125         try:
 126             with open(self.package_opf) as f:
 127                 for line in f.readlines():
 128                     if '<item' in line and 'media-type="image' in line:
 129                         images.append(line.split('href="')[1].split('"')[0])
 130         except FileNotFoundError:
 131             print('The package.opf file was not found, probably due to a DocBook error. The ePub file will be corrupt.')
 132
 133         return images
 134
 135     def get_image_changes(self):
 136         epub_folder = 'images/'
 137
 138         changes = []
 139         for image in self.get_images_from_package_opf():
 140             if os.path.exists(image):
 141                 file_system_path = image
 142             elif os.path.exists(self.input_path + image):
 143                 file_system_path = self.input_path + image
 144             else:
 145                 file_system_path = ''
 146
 147             changes.append(ImageRename(image, file_system_path, epub_folder + os.path.basename(image)))
 148         return changes
 149
 150     def change_image_paths(self, file):
 151         # This could be optimised, as the same operation is performed a zillion times on many files:
 152         # https://www.oreilly.com/library/view/python-cookbook/0596001673/ch03s15.html
 153         with open(file, encoding='utf8') as f:
 154             contents = list(f)
 155
 156         with open(file, 'w', encoding='utf8') as f:
 157             for line in contents:
 158                 for change in self.renamed:
 159                     line = line.replace(change.opf_path, change.epub_path)
 160                 f.write(line)
 161
 162     def copy_images(self):
 163         # Copy the assets to the OEBPS/images/. All paths are available in OEBPS/package.opf, but they must also be
 164         # changed in the XHTML files. Typically, the current paths are absolute.
 165
 166         # First, get the mapping old file => file in the ePub archive.
 167         self.renamed = self.get_image_changes()
 168
 169         # Then, transform all paths (both OPF and XHTML files).
 170         self.change_image_paths(self.output_dir + '/OEBPS/package.opf')
 171         for file in glob.glob(self.output_dir + '/OEBPS/*.xhtml'):
 172             self.change_image_paths(file)
 173
 174         # Ensure that the destination path exists. OEBPS exists due to the DocBook-to-ePub transformation.
 175         if not os.path.exists(self.output_dir + '/OEBPS/images/'):
 176             os.mkdir(self.output_dir + '/OEBPS/images/')
 177
 178         # Finally, actually copy the image files.
 179         for change in self.renamed:
 180             shutil.copyfile(change.local_path, self.output_dir + '/OEBPS/' + change.epub_path)
 181
 182     def create_zip_archive(self):
 183         with zipfile.ZipFile(self.output, 'w', zipfile.ZIP_DEFLATED) as zip:
 184             # Python 3.5 brings the `recursive` argument. For older versions, this trick is required...
 185             # for file in glob.glob(output_dir + '/**/*', recursive=True):
 186             for file in [os.path.join(dp, f) for dp, dn, filenames in os.walk(self.output_dir) for f in filenames]:
 187                 zip.write(file, os.path.relpath(file, self.output_dir), compress_type=zipfile.ZIP_STORED)
 188
 189         shutil.rmtree(self.output_dir)
 190         print('Generated ePub.')
 191
 192     def transform(self):
 193         self.start_xslt_transformation()
 194         self.copy_images()
 195         self.create_zip_archive()
 196
 197
 198 if __name__ == '__main__':
 199     DocBookToEpub(sys.argv).transform()