Amend 47f1fec9

[lyx.git] / lib / lyx2lyx / LyX.py
diff --git a/lib/lyx2lyx/LyX.py b/lib/lyx2lyx/LyX.py

index 7e0276199b07d9f3c5b60fb27927942b076d0c1d..3d62da5d4b786985d3ca8c8b523ac1190d4e1ead 100644 (file)
--- a/lib/lyx2lyx/LyX.py
+++ b/lib/lyx2lyx/LyX.py
@@ -1,6 +1,6 @@
  # This file is part of lyx2lyx
  # -*- coding: utf-8 -*-
-# Copyright (C) 2002-2011 The LyX Team
+# Copyright (C) 2002-2018 The LyX Team
  # Copyright (C) 2002-2004 Dekel Tsur <dekel@lyx.org>
  # Copyright (C) 2002-2006 José Matos <jamatos@lyx.org>
  #
@@ -20,23 +20,31 @@
  
  " The LyX module has all the rules related with different lyx file formats."
  
-from parser_tools import get_value, check_token, find_token, \
-     find_tokens, find_end_of
+from parser_tools import (get_value, check_token, find_token, find_tokens,
+                          find_end_of, find_complete_lines)
  import os.path
  import gzip
  import locale
  import sys
  import re
  import time
+import io
+import codecs
  
  try:
      import lyx2lyx_version
      version__ = lyx2lyx_version.version
+    stable_version = True
  except: # we are running from build directory so assume the last version
-    version__ = '2.1'
+    version__ = '2.4'
+    stable_version = False
  
  default_debug__ = 2
  
+# Provide support for both python 2 and 3
+PY2 = sys.version_info[0] == 2
+# End of code to support both python 2 and 3
+
  ####################################################################
  # Private helper functions
  
@@ -63,7 +71,8 @@ def minor_versions(major, last_minor_version):
  # Regular expressions used
  format_re = re.compile(r"(\d)[\.,]?(\d\d)")
  fileformat = re.compile(r"\\lyxformat\s*(\S*)")
-original_version = re.compile(r".*?LyX ([\d.]*)")
+original_version = re.compile(b".*?LyX ([\\d.]*)")
+original_tex2lyx_version = re.compile(b".*?tex2lyx ([\\d.]*)")
  
  ##
  # file format information:
@@ -79,11 +88,15 @@ format_relation = [("0_06",    [200], minor_versions("0.6" , 4)),
                     ("1_1_6_3", [218], ["1.1", "1.1.6.3","1.1.6.4"]),
                     ("1_2",     [220], minor_versions("1.2" , 4)),
                     ("1_3",     [221], minor_versions("1.3" , 7)),
-                   ("1_4", range(222,246), minor_versions("1.4" , 5)),
-                   ("1_5", range(246,277), minor_versions("1.5" , 7)),
-                   ("1_6", range(277,346), minor_versions("1.6" , 10)),
-                   ("2_0", range(347,414), minor_versions("2.0", 0)),
-                   ("2_1",     [], minor_versions("2.1", 0))
+                   # Note that range(i,j) is up to j *excluded*.
+                   ("1_4", list(range(222,246)), minor_versions("1.4" , 5)),
+                   ("1_5", list(range(246,277)), minor_versions("1.5" , 7)),
+                   ("1_6", list(range(277,346)), minor_versions("1.6" , 10)),
+                   ("2_0", list(range(346,414)), minor_versions("2.0" , 8)),
+                   ("2_1", list(range(414,475)), minor_versions("2.1" , 5)),
+                   ("2_2", list(range(475,509)), minor_versions("2.2" , 4)),
+                   ("2_3", list(range(509,545)), minor_versions("2.3" , 0)),
+                   ("2_4", (), minor_versions("2.4" , 0))
                    ]
  
  ####################################################################
@@ -109,19 +122,29 @@ def formats_list():
  
  
  def format_info():
-    " Returns a list with supported file formats."
-    out = """Major version:
-       minor versions
-       formats
+    " Returns a list with the supported file formats."
+    template = """
+%s\tstable format:       %s
+  \tstable versions:     %s
+  \tdevelopment formats: %s
  """
+
+    out = "version: formats and versions"
      for version in format_relation:
          major = str(version[2][0])
          versions = str(version[2][1:])
          if len(version[1]) == 1:
              formats = str(version[1][0])
+            stable_format = str(version[1][0])
+        elif not stable_version and major == version__:
+            stable_format = "-- not yet --"
+            versions = "-- not yet --"
+            formats = "%s - %s" % (version[1][0], version[1][-1])
          else:
-            formats = "%s - %s" % (version[1][-1], version[1][0])
-        out += "%s\n\t%s\n\t%s\n\n" % (major, versions, formats)
+            formats = "%s - %s" % (version[1][0], version[1][-2])
+            stable_format = str(version[1][-1])
+
+        out += template % (major, stable_format, versions, formats)
      return out + '\n'
  
  
@@ -145,12 +168,26 @@ def get_backend(textclass):
  
  def trim_eol(line):
      " Remove end of line char(s)."
+    if line[-1] != '\n' and line[-1] != '\r':
+        # May happen for the last line of a document
+        return line
      if line[-2:-1] == '\r':
          return line[:-2]
      else:
          return line[:-1]
  
  
+def trim_eol_binary(line):
+    " Remove end of line char(s) from a bytes line (b'\\n', b'\\r' or b'\\r\\n')."
+    if line[-1] != 10 and line[-1] != 13:
+        # May happen for the last line of a document
+        return line
+    if line[-2:-1] == b'\r': # slicing bytes gives bytes, never the int 13
+        return line[:-2]
+    else:
+        return line[:-1]
+
+
  def get_encoding(language, inputencoding, format, cjk_encoding):
      " Returns enconding of the lyx file"
      if format > 248:
@@ -181,9 +218,10 @@ def get_encoding(language, inputencoding, format, cjk_encoding):
  class LyX_base:
      """This class carries all the information of the LyX file."""
  
-    def __init__(self, end_format = 0, input = "", output = "", error = "",
-                 debug = default_debug__, try_hard = 0, cjk_encoding = '',
-                 final_version = "", language = "english", encoding = "auto"):
+    def __init__(self, end_format = 0, input = u'', output = u'', error = u'',
+                 debug = default_debug__, try_hard = 0, cjk_encoding = u'',
+                 final_version = u'', systemlyxdir = u'', language = u'english',
+                 encoding = u'auto'):
  
          """Arguments:
          end_format: final format that the file should be converted. (integer)
@@ -192,7 +230,8 @@ class LyX_base:
          error: the name of the error file, if empty use the standard error.
          debug: debug level, O means no debug, as its value increases be more verbose.
          """
-        self.choose_io(input, output)
+        self.choose_input(input)
+        self.output = output
  
          if error:
              self.err = open(error, "w")
@@ -248,6 +287,7 @@ class LyX_base:
          self.status = 0
          self.encoding = encoding
          self.language = language
+        self.systemlyxdir = systemlyxdir
  
  
      def warning(self, message, debug_level= default_debug__):
@@ -271,23 +311,50 @@ class LyX_base:
          """Reads a file into the self.header and
          self.body parts, from self.input."""
  
+        # First pass: Read header to determine file encoding
+        # If we are running under python3 then all strings are binary in this
+        # pass. In some cases we need to convert binary to unicode in order to
+        # use our parser tools. Since we do not know the true encoding yet we
+        # use latin1. This works since a) the parts we are interested in are
+        # pure ASCII (subset of latin1) and b) in contrast to pure ascii or
+        # utf8, one can decode any 8-bit string using latin1.
+        first_line = True
          while True:
              line = self.input.readline()
              if not line:
-                self.error("Invalid LyX file.")
+                # eof found before end of header
+                self.error("Invalid LyX file: Missing body.")
+
+            if first_line:
+                # Remove UTF8 BOM marker if present
+                if line.startswith(codecs.BOM_UTF8):
+                    line = line[len(codecs.BOM_UTF8):]
+
+                first_line = False
  
-            line = trim_eol(line)
-            if check_token(line, '\\begin_preamble'):
-                while 1:
+            if PY2:
+                line = trim_eol(line)
+                decoded = line
+            else:
+                line = trim_eol_binary(line)
+                decoded = line.decode('latin1')
+            if check_token(decoded, '\\begin_preamble'):
+                while True:
                      line = self.input.readline()
                      if not line:
-                        self.error("Invalid LyX file.")
+                        # eof found before end of header
+                        self.error("Invalid LyX file: Missing body.")
  
-                    line = trim_eol(line)
-                    if check_token(line, '\\end_preamble'):
+                    if PY2:
+                        line = trim_eol(line)
+                        decoded = line
+                    else:
+                        line = trim_eol_binary(line)
+                        decoded = line.decode('latin1')
+                    if check_token(decoded, '\\end_preamble'):
                          break
  
-                    if line.split()[:0] in ("\\layout",
+                    if decoded.split()[:0] in ("\\layout",
                                              "\\begin_layout", "\\begin_body"):
  
                          self.warning("Malformed LyX file:"
@@ -297,33 +364,49 @@ class LyX_base:
  
                      self.preamble.append(line)
  
-            if check_token(line, '\\end_preamble'):
+            if check_token(decoded, '\\end_preamble'):
                  continue
  
-            line = line.strip()
+            line = line.rstrip()
              if not line:
                  continue
  
-            if line.split()[0] in ("\\layout", "\\begin_layout",
+            if decoded.split()[0] in ("\\layout", "\\begin_layout",
                                     "\\begin_body", "\\begin_deeper"):
                  self.body.append(line)
                  break
  
              self.header.append(line)
  
-        i = find_token(self.header, '\\textclass', 0)
+        if PY2:
+            i = find_token(self.header, '\\textclass', 0)
+        else:
+            i = find_token(self.header, b'\\textclass', 0)
          if i == -1:
              self.warning("Malformed LyX file: Missing '\\textclass'.")
-            i = find_token(self.header, '\\lyxformat', 0) + 1
-            self.header[i:i] = ['\\textclass article']
-
-        self.textclass = get_value(self.header, "\\textclass", 0)
-        self.backend = get_backend(self.textclass)
-        self.format  = self.read_format()
-        self.language = get_value(self.header, "\\language", 0,
-                                  default = "english")
-        self.inputencoding = get_value(self.header, "\\inputencoding",
-                                       0, default = "auto")
+            if PY2:
+                i = find_token(self.header, '\\lyxformat', 0) + 1
+                self.header[i:i] = ['\\textclass article']
+            else:
+                i = find_token(self.header, b'\\lyxformat', 0) + 1
+                self.header[i:i] = [b'\\textclass article']
+
+        if PY2:
+            self.textclass = get_value(self.header, "\\textclass", 0,
+                                       default = "")
+            self.language = get_value(self.header, "\\language", 0,
+                                      default = "english")
+            self.inputencoding = get_value(self.header, "\\inputencoding", 0,
+                                           default = "auto")
+        else:
+            self.textclass = get_value(self.header, b"\\textclass", 0,
+                                       default = b"")
+            self.language = get_value(self.header, b"\\language", 0,
+                                      default = b"english").decode('ascii')
+            self.inputencoding = get_value(self.header, b"\\inputencoding", 0,
+                                           default = b"auto").decode('ascii')
+        self.format = self.read_format()
+        self.initial_format = self.format
          self.encoding = get_encoding(self.language,
                                       self.inputencoding, self.format,
                                       self.cjk_encoding)
@@ -332,13 +415,16 @@ class LyX_base:
          # Second pass over header and preamble, now we know the file encoding
          # Do not forget the textclass (Debian bug #700828)
          self.textclass = self.textclass.decode(self.encoding)
+        self.backend = get_backend(self.textclass)
          for i in range(len(self.header)):
              self.header[i] = self.header[i].decode(self.encoding)
          for i in range(len(self.preamble)):
              self.preamble[i] = self.preamble[i].decode(self.encoding)
+        for i in range(len(self.body)):
+            self.body[i] = self.body[i].decode(self.encoding)
  
          # Read document body
-        while 1:
+        while True:
              line = self.input.readline().decode(self.encoding)
              if not line:
                  break
@@ -347,6 +433,7 @@ class LyX_base:
  
      def write(self):
          " Writes the LyX file to self.output."
+        self.choose_output(self.output)
          self.set_version()
          self.set_format()
          self.set_textclass()
@@ -360,30 +447,54 @@ class LyX_base:
          else:
              header = self.header
  
-        for line in header + [''] + self.body:
-            self.output.write(line.encode(self.encoding)+"\n")
+        for line in header + [u''] + self.body:
+            self.output.write(line+u'\n')
  
  
-    def choose_io(self, input, output):
-        """Choose input and output streams, dealing transparently with
+    def choose_output(self, output):
+        """Choose output streams dealing transparently with
          compressed files."""
  
-        if output:
-            self.output = open(output, "wb")
+        # This is a bit complicated, because we need to be compatible both with
+        # python 2 and python 3. Therefore we handle the encoding here and not
+        # when writing individual lines and may need up to 3 layered file like
+        # interfaces.
+        if self.compressed:
+            if output:
+                outputfileobj = open(output, 'wb')
+            else:
+                # We cannot use stdout directly since it needs text, not bytes in python 3
+                outputfileobj = os.fdopen(sys.stdout.fileno(), 'wb')
+            # We cannot use gzip.open() since it is not supported by python 2
+            zipbuffer = gzip.GzipFile(mode='wb', fileobj=outputfileobj)
+            # We do not want to use different newlines on different OSes inside zipped files
+            self.output = io.TextIOWrapper(zipbuffer, encoding=self.encoding, newline='\n')
          else:
-            self.output = sys.stdout
+            if output:
+                self.output = io.open(output, 'w', encoding=self.encoding)
+            else:
+                self.output = io.open(sys.stdout.fileno(), 'w', encoding=self.encoding)
+
  
-        if input and input != '-':
+    def choose_input(self, input):
+        """Choose input stream, dealing transparently with
+        compressed files."""
+
+        # Since we do not know the encoding yet we need to read the input as
+        # bytes in binary mode, and convert later to unicode.
+        if input and input != u'-':
              self.dir = os.path.dirname(os.path.abspath(input))
              try:
                  gzip.open(input).readline()
                  self.input = gzip.open(input)
-                self.output = gzip.GzipFile(mode="wb", fileobj=self.output)
+                self.compressed = True
              except:
-                self.input = open(input)
+                self.input = open(input, 'rb')
+                self.compressed = False
          else:
-            self.dir = ''
-            self.input = sys.stdin
+            self.dir = u''
+            self.input = os.fdopen(sys.stdin.fileno(), 'rb')
+            self.compressed = False
  
  
      def lyxformat(self, format):
@@ -408,29 +519,47 @@ class LyX_base:
          file, returns the most likely value, or None otherwise."""
  
          for line in self.header:
-            if line[0] != "#":
+            if line[0:1] != b"#":
                  return None
  
-            line = line.replace("fix",".")
-            result = original_version.match(line)
+            line = line.replace(b"fix",b".")
+            # need to test original_tex2lyx_version first because tex2lyx
+            # writes "#LyX file created by tex2lyx 2.2"
+            result = original_tex2lyx_version.match(line)
+            if not result:
+                result = original_version.match(line)
+                if result:
+                    # Special known cases: reLyX and KLyX
+                    if line.find(b"reLyX") != -1 or line.find(b"KLyX") != -1:
+                        return "0.12"
              if result:
-                # Special know cases: reLyX and KLyX
-                if line.find("reLyX") != -1 or line.find("KLyX") != -1:
-                    return "0.12"
-
                  res = result.group(1)
                  if not res:
                      self.warning(line)
                  #self.warning("Version %s" % result.group(1))
-                return res
+                return res.decode('ascii') if not PY2 else res
          self.warning(str(self.header[:2]))
          return None
  
  
      def set_version(self):
          " Set the header with the version used."
-        self.header[0] = " ".join(["#LyX %s created this file." % version__,
-                                  "For more info see http://www.lyx.org/"])
+
+        initial_comment = " ".join(["#LyX %s created this file." % version__,
+                                    "For more info see https://www.lyx.org/"])
+
+        # Simple heuristic to determine the comment that always starts
+        # a lyx file
+        if self.header[0].startswith("#"):
+            self.header[0] = initial_comment
+        else:
+            self.header.insert(0, initial_comment)
+
+        # Old lyx files had a two lines comment header:
+        # 1) the first line had the user who had created it
+        # 2) the second line had the lyx version used
+        # later we decided that 1) was a privacy risk for no gain
+        # here we remove the second line effectively erasing 1)
          if self.header[1][0] == '#':
              del self.header[1]
  
@@ -438,11 +567,14 @@ class LyX_base:
      def read_format(self):
          " Read from the header the fileformat of the present LyX file."
          for line in self.header:
-            result = fileformat.match(line)
+            if PY2:
+                result = fileformat.match(line)
+            else:
+                result = fileformat.match(line.decode('ascii'))
              if result:
                  return self.lyxformat(result.group(1))
          else:
-            self.error("Invalid LyX File.")
+            self.error("Invalid LyX File: Missing format.")
          return None
  
  
@@ -463,6 +595,7 @@ class LyX_base:
  
      #Note that the module will be added at the END of the extant ones
      def add_module(self, module):
+      " Append module to the modules list."
        i = find_token(self.header, "\\begin_modules", 0)
        if i == -1:
          #No modules yet included
@@ -483,7 +616,16 @@ class LyX_base:
        self.header.insert(j, module)
  
  
+    def del_module(self, module):
+        " Delete `module` from module list, return success."
+        modlist = self.get_module_list()
+        if module not in modlist:
+            return False
+        self.set_module_list([line for line in modlist if line != module])
+        return True
+
      def get_module_list(self):
+      " Return list of modules."
        i = find_token(self.header, "\\begin_modules", 0)
        if (i == -1):
          return []
@@ -492,23 +634,23 @@ class LyX_base:
  
  
      def set_module_list(self, mlist):
-      modbegin = find_token(self.header, "\\begin_modules", 0)
-      newmodlist = ['\\begin_modules'] + mlist + ['\\end_modules']
-      if (modbegin == -1):
+      i = find_token(self.header, "\\begin_modules", 0)
+      if (i == -1):
          #No modules yet included
          tclass = find_token(self.header, "\\textclass", 0)
          if tclass == -1:
            self.warning("Malformed LyX document: No \\textclass!!")
            return
-        modbegin = tclass + 1
-        self.header[modbegin:modbegin] = newmodlist
-        return
-      modend = find_token(self.header, "\\end_modules", modbegin)
-      if modend == -1:
-        self.warning("(set_module_list)Malformed LyX document: No \\end_modules.")
-        return
-      newmodlist = ['\\begin_modules'] + mlist + ['\\end_modules']
-      self.header[modbegin:modend + 1] = newmodlist
+        i = j = tclass + 1
+      else:
+        j = find_token(self.header, "\\end_modules", i)
+        if j == -1:
+            self.warning("(set_module_list) Malformed LyX document: No \\end_modules.")
+            return
+        j += 1
+      if mlist:
+          mlist = ['\\begin_modules'] + mlist + ['\\end_modules']
+      self.header[i:j] = mlist
  
  
      def set_parameter(self, param, value):
@@ -530,6 +672,11 @@ class LyX_base:
  
      def convert(self):
          "Convert from current (self.format) to self.end_format."
+        if self.format == self.end_format:
+            self.warning("No conversion needed: Target format %s "
+                "same as current format!" % self.format, default_debug__)
+            return
+
          mode, conversion_chain = self.chain()
          self.warning("conversion chain: " + str(conversion_chain), 3)
  
@@ -575,7 +722,6 @@ class LyX_base:
          conversion are taken.  It returns a list of modules needed to
          convert the LyX file from self.format to self.end_format"""
  
-        self.start =  self.format
          format = self.format
          correct_version = 0
  
@@ -609,7 +755,7 @@ class LyX_base:
  
          # Convertion mode, back or forth
          steps = []
-        if (initial_step, self.start) < (final_step, self.end_format):
+        if (initial_step, self.initial_format) < (final_step, self.end_format):
              mode = "convert"
              full_steps = []
              for step in format_relation:
@@ -637,6 +783,53 @@ class LyX_base:
          return mode, steps
  
  
+    def append_local_layout(self, new_layout):
+        " Append `new_layout` to the local layouts."
+        # new_layout may be a string or a list of strings (lines)
+        try:
+            new_layout = new_layout.splitlines()
+        except AttributeError:
+            pass
+        i = find_token(self.header, "\\begin_local_layout", 0)
+        if i == -1:
+            k = find_token(self.header, "\\language", 0)
+            if k == -1:
+                # this should not happen
+                self.warning("Malformed LyX document! No \\language header found!")
+                return
+            self.header[k : k] = ["\\begin_local_layout", "\\end_local_layout"]
+            i = k
+
+        j = find_end_of(self.header, i, "\\begin_local_layout", "\\end_local_layout")
+        if j == -1:
+            # this should not happen
+            self.warning("Malformed LyX document: Can't find end of local layout!")
+            return
+
+        self.header[i+1 : i+1] = new_layout
+
+    def del_local_layout(self, layout_def):
+        " Delete `layout_def` from local layouts, return success."
+        i = find_complete_lines(self.header, layout_def)
+        if i == -1:
+            return False
+        j = i+len(layout_def)
+        if (self.header[i-1] == "\\begin_local_layout" and
+            self.header[j] == "\\end_local_layout"):
+            i -=1
+            j +=1
+        self.header[i:j] = []
+        return True
+
+    def del_from_header(self, lines):
+        " Delete `lines` from the document header, return success."
+        i = find_complete_lines(self.header, lines)
+        if i == -1:
+            return False
+        j = i + len(lines)
+        self.header[i:j] = []
+        return True
+
  # Part of an unfinished attempt to make lyx2lyx gave a more
  # structured view of the document.
  #    def get_toc(self, depth = 4):
@@ -653,7 +846,7 @@ class LyX_base:
  
  #        toc_par = []
  #        i = 0
-#        while 1:
+#        while True:
  #            i = find_tokens(self.body, sections, i)
  #            if i == -1:
  #                break
@@ -703,14 +896,16 @@ class LyX_base:
  class File(LyX_base):
      " This class reads existing LyX files."
  
-    def __init__(self, end_format = 0, input = "", output = "", error = "",
-                 debug = default_debug__, try_hard = 0, cjk_encoding = '',
-                 final_version = ''):
+    def __init__(self, end_format = 0, input = u'', output = u'', error = u'',
+                 debug = default_debug__, try_hard = 0, cjk_encoding = u'',
+                 final_version = u'', systemlyxdir = u''):
          LyX_base.__init__(self, end_format, input, output, error,
-                          debug, try_hard, cjk_encoding, final_version)
+                          debug, try_hard, cjk_encoding, final_version,
+                          systemlyxdir)
          self.read()
  
  
+# FIXME: header settings are completely outdated, don't use like this
  #class NewFile(LyX_base):
  #    " This class is to create new LyX files."
  #    def set_header(self, **params):
@@ -739,6 +934,7 @@ class File(LyX_base):
  #            "\\use_amsmath 1",
  #            "\\cite_engine basic",
  #            "\\use_bibtopic false",
+#            "\\use_indices false",
  #            "\\paperorientation portrait",
  #            "\\secnumdepth 3",
  #            "\\tocdepth 3",