Fix lyx2lyx conversion of dashes.

author Günter Milde <milde@lyx.org>

Sat, 30 Sep 2017 21:26:02 +0000 (23:26 +0200)

committer Günter Milde <milde@lyx.org>

Mon, 1 Jan 2018 21:47:56 +0000 (22:47 +0100)
author Günter Milde <milde@lyx.org>
Sat, 30 Sep 2017 21:26:02 +0000 (23:26 +0200)
committer Günter Milde <milde@lyx.org>
Mon, 1 Jan 2018 21:47:56 +0000 (22:47 +0100)
diff --git a/lib/RELEASE-NOTES b/lib/RELEASE-NOTES

index 14bf985153ddd92df88e5ba725e32297cf4674c4..dccd1f10ed42ba337107ee75984fcccd2f3b8edd 100644 (file)
--- a/lib/RELEASE-NOTES
+++ b/lib/RELEASE-NOTES
@@ -14,10 +14,9 @@
    if needed, as usual.
  
  * The new setting
-  "Document->Settings->Fonts->Disallow line breaks after dashes" forces
-  output of en- and em-dashes as \textendash and \textemdash when exporting
-  to LaTeX. It is is "false" by default but "true" when opening documents
-  edited with LyX 2.2.
+  "Document->Settings->Fonts->Disallow line breaks after dashes"
+  turns off the conversion of em- and en-dash characters to --- and --
+  respectively for LaTeX export. It is "false" by default.
    See chapter 3.9.1.1 "Dashes and Line Breaks" of the User Guide and
    "Caveats when upgrading from earlier versions to 2.3.x" below.
  
@@ -214,27 +213,15 @@
    the external_templates file, you will have to move the modifications to
    the respective *.xtemplate file manually.
  
-* By default, LyX 2.3 outputs en- and em-dashes after which a line break can
-  occur in the output. Sometimes, this results in undesired line breaks.
+* By default, LyX 2.3 outputs en- and em-dashes as -- and --- respectively,
+  so that a line break can occur in the output immediately after the dash.
+  Sometimes, this results in undesired line breaks or overfull lines due to
+  suppression of hyphenation in the word preceding the dash.
    Select "Document->Settings->Fonts->Disallow line breaks after dashes"
-  to keep the LyX 2.2 behaviour, where such line breaks have been generally
-  suppressed. See chapter 3.9.1.1, "Dashes and Line Breaks", of the User Guide
+  to keep the LyX 2.2 behaviour.
+  See chapter 3.9.1.1, "Dashes and Line Breaks", of the User Guide
    for details.
  
-* ZWSP characters (u200b) following literal em- and en-dashes are deleted by
-  lyx2lyx when converting to 2.3 format. If you used them as optional line
-  breaks after dashes, convert them to space insets before opening your
-  document with LyX 2.3 or the optional line breaks will be lost!
-
-* If using TeX fonts and en- and em-dashes are output in breakable form,
-  when exporting documents containing en- and em-dashes to the format of
-  LyX 2.0 or earlier, the following line has to be manually added to the
-  unicodesymbols file of that LyX version:<br>
-  0x200b "\\hspace{0pt}" "" "" "" "" # ZERO WIDTH SPACE<br>
-  This avoids "uncodable character" issues if the document is actually
-  loaded by that LyX version. LyX 2.1 and later versions already have the
-  necessary definition in their unicodesymbols file.
-
  * If trying to compile documents using R scripts and sweave/knitr, LyX
    2.3.x would not allow for re-running the R scripts, unless the user:
    1) explicitly disables the "Forbid use of needauth converters"
diff --git a/lib/lyx2lyx/lyx_2_2.py b/lib/lyx2lyx/lyx_2_2.py

index 996c22684ef1be04a76505f5d69c940e3f7fe249..2f4ef3ac2a9c3a03b6958308d6603f02b82d6da7 100644 (file)
--- a/lib/lyx2lyx/lyx_2_2.py
+++ b/lib/lyx2lyx/lyx_2_2.py
@@ -659,6 +659,12 @@ def convert_dashes(document):
  def revert_dashes(document):
      "convert \\twohyphens and \\threehyphens to -- and ---"
  
+    # if present, remove the preamble code added by the 2.3->2.2 conversion:
+    for i, line in enumerate(document.preamble):
+        if i > 1 and line == r'\renewcommand{\textemdash}{---}':
+            if (document.preamble[i-1] == r'\renewcommand{\textendash}{--}'
+                and document.preamble[i-2] == '% Added by lyx2lyx'):
+                del document.preamble[i-2:i+1]
      i = 0
      while i < len(document.body):
          words = document.body[i].split()
diff --git a/lib/lyx2lyx/lyx_2_3.py b/lib/lyx2lyx/lyx_2_3.py

index edc5b1ffa9384c83ec41f840ea6c8a38a3121a41..140f57fb36bd14e1f663dd531d9d5d0a9aa1811f 100644 (file)
--- a/lib/lyx2lyx/lyx_2_3.py
+++ b/lib/lyx2lyx/lyx_2_3.py
@@ -1841,58 +1841,63 @@ def revert_chapterbib(document):
  
  
  def convert_dashligatures(document):
-    " Remove a zero-length space (U+200B) after en- and em-dashes. "
-
-    i = find_token(document.header, "\\use_microtype", 0)
-    if i != -1:
-        if document.initial_format > 474 and document.initial_format < 509:
-            # This was created by LyX 2.2
-            document.header[i+1:i+1] = ["\\use_dash_ligatures false"]
-        else:
-            # This was created by LyX 2.1 or earlier
-            document.header[i+1:i+1] = ["\\use_dash_ligatures true"]
-
-    i = 0
-    while i < len(document.body):
-        words = document.body[i].split()
-        # Skip some document parts where dashes are not converted
-        if len(words) > 1 and words[0] == "\\begin_inset" and \
-           words[1] in ["CommandInset", "ERT", "External", "Formula", \
-                        "FormulaMacro", "Graphics", "IPA", "listings"]:
-            j = find_end_of_inset(document.body, i)
-            if j == -1:
-                document.warning("Malformed LyX document: Can't find end of " \
-                                 + words[1] + " inset at line " + str(i))
-                i += 1
-            else:
-                i = j
-            continue
-        if len(words) > 0 and words[0] in ["\\leftindent", \
-                "\\paragraph_spacing", "\\align", "\\labelwidthstring"]:
-            i += 1
-            continue
-
-        start = 0
-        while True:
-            j = document.body[i].find(u"\u2013", start) # en-dash
-            k = document.body[i].find(u"\u2014", start) # em-dash
-            if j == -1 and k == -1:
-                break
-            if j == -1 or (k != -1 and k < j):
-                j = k
-            after = document.body[i][j+1:]
-            if after.startswith(u"\u200B"):
-                document.body[i] = document.body[i][:j+1] + after[1:]
-            else:
-                if len(after) == 0 and document.body[i+1].startswith(u"\u200B"):
-                    document.body[i+1] = document.body[i+1][1:]
-                    break
-            start = j+1
-        i += 1
-
+    "Set 'use_dash_ligatures' according to content."
+    use_dash_ligatures = None
+    # if present, remove the preamble code added by the 2.3->2.2 conversion:
+    for i, line in enumerate(document.preamble):
+        if i > 1 and line == r'\renewcommand{\textemdash}{---}':
+            if (document.preamble[i-1] == r'\renewcommand{\textendash}{--}'
+                and document.preamble[i-2] == '% Added by lyx2lyx'):
+                del document.preamble[i-2:i+1]
+                use_dash_ligatures = True
+    if use_dash_ligatures is None:
+        # Look for dashes:
+        # (Documents by LyX 2.1 or older have "\twohyphens\n" or "\threehyphens\n"
+        # as interim representation for dash ligatures in 2.2.)
+        has_literal_dashes = False
+        has_ligature_dashes = False
+        j = 0
+        for i, line in enumerate(document.body):
+            # Skip some document parts where dashes are not converted
+            if (i < j) or line.startswith("\\labelwidthstring"):
+                continue
+            words = line.split()
+            if len(words) > 1 and words[0] == "\\begin_inset" and \
+            words[1] in ["CommandInset", "ERT", "External", "Formula",
+                         "FormulaMacro", "Graphics", "IPA", "listings"]:
+                j = find_end_of_inset(document.body, i)
+                if j == -1:
+                    document.warning("Malformed LyX document: "
+                        "Can't find end of %s inset at line %d" % (words[1],i))
+                continue
+            # literal dash followed by a word or no-break space:
+            if re.search(u"[\u2013\u2014]([\w\u00A0]|$)", line,
+                         flags=re.UNICODE):
+                has_literal_dashes = True
+            # ligature dash followed by word or no-break space on next line:
+            if re.search(u"(\\\\twohyphens|\\\\threehyphens)", line,
+                            flags=re.UNICODE) and re.match(u"[\w\u00A0]",
+                            document.body[i+1], flags=re.UNICODE):
+                has_ligature_dashes = True
+        if has_literal_dashes and has_ligature_dashes:
+            # TODO: insert a warning note in the document?
+            document.warning('This document contained both literal and '
+                '"ligature" dashes.\n Line breaks may have changed. '
+                'See UserGuide chapter 3.9.1 for details.')
+        elif has_literal_dashes:
+            use_dash_ligatures = False
+        elif has_ligature_dashes:
+            use_dash_ligatures = True
+    # insert the setting if there is a preferred value
+    if use_dash_ligatures is not None:
+        i = find_token(document.header, "\\use_microtype", 0)
+        if i != -1:
+            document.header.insert(i+1, "\\use_dash_ligatures %s"
+                                % str(use_dash_ligatures).lower())
  
  def revert_dashligatures(document):
-    " Remove font ligature settings for en- and em-dashes. "
+    """Remove font ligature settings for en- and em-dashes.
+    Convert literal dashes back to \\twohyphens or \\threehyphens."""
      i = find_token(document.header, "\\use_dash_ligatures", 0)
      if i == -1:
          return
@@ -1902,42 +1907,34 @@ def revert_dashligatures(document):
      i = find_token(document.header, "\\use_non_tex_fonts", 0)
      if i != -1:
          use_non_tex_fonts = get_bool_value(document.header, "\\use_non_tex_fonts", i)
-    if not use_dash_ligatures or use_non_tex_fonts:
+    if not use_dash_ligatures or document.backend != "latex":
          return
  
-    # Add a zero-length space (U+200B) after en- and em-dashes
-    i = 0
-    while i < len(document.body):
-        words = document.body[i].split()
+    j = 0
+    new_body = []
+    for i, line in enumerate(document.body):
          # Skip some document parts where dashes are not converted
+        if (i < j) or line.startswith("\\labelwidthstring"):
+            new_body.append(line)
+            continue
+        words = line.split()
          if len(words) > 1 and words[0] == "\\begin_inset" and \
-           words[1] in ["CommandInset", "ERT", "External", "Formula", \
+           words[1] in ["CommandInset", "ERT", "External", "Formula",
                          "FormulaMacro", "Graphics", "IPA", "listings"]:
              j = find_end_of_inset(document.body, i)
              if j == -1:
-                document.warning("Malformed LyX document: Can't find end of " \
+                document.warning("Malformed LyX document: Can't find end of "
                                   + words[1] + " inset at line " + str(i))
-                i += 1
-            else:
-                i = j
-            continue
-        if len(words) > 0 and words[0] in ["\\leftindent", \
-                "\\paragraph_spacing", "\\align", "\\labelwidthstring"]:
-            i += 1
+            new_body.append(line)
              continue
-
-        start = 0
-        while True:
-            j = document.body[i].find(u"\u2013", start) # en-dash
-            k = document.body[i].find(u"\u2014", start) # em-dash
-            if j == -1 and k == -1:
-                break
-            if j == -1 or (k != -1 and k < j):
-                j = k
-            after = document.body[i][j+1:]
-            document.body[i] = document.body[i][:j+1] + u"\u200B" + after
-            start = j+1
-        i += 1
+        line = line.replace(u'\u2013', '\\twohyphens\n')
+        line = line.replace(u'\u2014', '\\threehyphens\n')
+        lines = line.split('\n')
+        new_body.extend(line.split('\n'))
+    document.body = new_body
+    # redefine the dash LICRs to use ligature dashes:
+    add_to_preamble(document, [r'\renewcommand{\textendash}{--}',
+                               r'\renewcommand{\textemdash}{---}'])
  
  
  def revert_noto(document):
@@ -2228,7 +2225,7 @@ def revert_mathnumberingname(document):
          else:
              l = find_token(document.header, "\\use_default_options", 0)
              document.header.insert(l, "\\options reqno")
-    # add the math_number_before tag   
+    # add the math_number_before tag
      regexp = re.compile(r'(\\math_numbering_side default)')
      i = find_re(document.header, regexp, 0)
      if i != -1:
author	Günter Milde <milde@lyx.org>
	Sat, 30 Sep 2017 21:26:02 +0000 (23:26 +0200)
committer	Günter Milde <milde@lyx.org>
	Mon, 1 Jan 2018 21:47:56 +0000 (22:47 +0100)
lib/RELEASE-NOTES		patch \| blob \| history
lib/lyx2lyx/lyx_2_2.py		patch \| blob \| history
lib/lyx2lyx/lyx_2_3.py		patch \| blob \| history