Fix character classification functions by using Qt (bugs like 3270 and 1247)

author Georg Baum <Georg.Baum@post.rwth-aachen.de>

Mon, 26 Feb 2007 09:03:21 +0000 (09:03 +0000)

committer Georg Baum <Georg.Baum@post.rwth-aachen.de>

Mon, 26 Feb 2007 09:03:21 +0000 (09:03 +0000)
author Georg Baum <Georg.Baum@post.rwth-aachen.de>
Mon, 26 Feb 2007 09:03:21 +0000 (09:03 +0000)
committer Georg Baum <Georg.Baum@post.rwth-aachen.de>
Mon, 26 Feb 2007 09:03:21 +0000 (09:03 +0000)
diff --git a/development/scons/scons_manifest.py b/development/scons/scons_manifest.py

index edd5f126c7a0d08841723062f429a8752b7795a9..3e6d9a49b031667f3abfbbcd8363e7f3602402ae 100644 (file)
--- a/development/scons/scons_manifest.py
+++ b/development/scons/scons_manifest.py
@@ -160,6 +160,7 @@ src_support_files = Split('''
      socktools.C
      systemcall.C
      tempname.C
+    textutils.C
      unicode.C
      unlink.C
      userinfo.C
diff --git a/src/support/Makefile.am b/src/support/Makefile.am

index bb8e7a45c1263187a5966faeb9038db85f067884..20955224561581cdb54d16484d73e41288da8746 100644 (file)
--- a/src/support/Makefile.am
+++ b/src/support/Makefile.am
@@ -78,6 +78,7 @@ libsupport_la_SOURCES = \
         systemcall.C \
         systemcall.h \
         tempname.C \
+       textutils.C \
         textutils.h \
         translator.h \
         types.h \
diff --git a/src/support/lstrings.C b/src/support/lstrings.C

index 973b260d7cc93ef10501efc7c94200ebeb09a883..f1333eee0bd5d5faa4152a4168bf1013ba6ed82d 100644 (file)
--- a/src/support/lstrings.C
+++ b/src/support/lstrings.C
@@ -14,6 +14,7 @@
  #include "support/lstrings.h"
  #include "support/lyxlib.h"
  #include "support/convert.h"
+#include "support/qstring_helpers.h"
  
  #include "debug.h"
  
@@ -32,17 +33,6 @@
  #include <algorithm>
  #include <sstream>
  
-#ifdef LIBC_WCTYPE_USES_UCS4
-// We can use the libc ctype functions because we unset the LC_CTYPE
-// category of the current locale in gettext.C
-#include <wctype.h>
-#else
-// Steal some code from somewhere else, e.g. glib (look at gunicode.h)
-// The code that we currently use does not really work.
-#endif
-
-
-using lyx::docstring;
  
  using std::transform;
  using std::string;
@@ -321,38 +311,21 @@ char uppercase(char c)
  }
  
  
-// FIXME UNICODE
-// for lowercase() and uppercase() function below when wchar_t is not used:
-// 1) std::tolower() and std::toupper() are templates that
-// compile fine with char_type. With the test (c >= 256) we
-// do not trust these function to do the right thing with
-// unicode char.
-// 2) these functions use the current locale, which is wrong
-// if it is not latin1 based (latin1 is a subset of UCS4).
-
  char_type lowercase(char_type c)
  {
-#ifdef LIBC_WCTYPE_USES_UCS4
-       return towlower(c);
-#else
-       if (c >= 256)
+       if (!is_utf16(c))
+               // We don't know how to lowercase a non-utf16 char
                 return c;
-
-       return tolower(c);
-#endif
+       return qchar_to_ucs4(ucs4_to_qchar(c).toLower());
  }
  
  
  char_type uppercase(char_type c)
  {
-#ifdef LIBC_WCTYPE_USES_UCS4
-       return towupper(c);
-#else
-       if (c >= 256)
+       if (!is_utf16(c))
+               // We don't know how to uppercase a non-utf16 char
                 return c;
-
-       return toupper(c);
-#endif
+       return qchar_to_ucs4(ucs4_to_qchar(c).toUpper());
  }
  
  
@@ -361,10 +334,16 @@ namespace {
  // since we cannot use std::tolower and std::toupper directly in the
  // calls to std::transform yet, we use these helper classes. (Lgb)
  
-template<typename Char> struct local_lowercase {
-       Char operator()(Char c) const {
+struct local_lowercase {
+       char operator()(char c) const {
                 return tolower(c);
         }
+       char_type operator()(char_type c) const {
+               if (!is_utf16(c))
+                       // We don't know how to lowercase a non-utf16 char
+                       return c;
+               return qchar_to_ucs4(ucs4_to_qchar(c).toLower());
+       }
  };
  
  struct local_uppercase {
@@ -384,7 +363,7 @@ template<typename Char> struct local_ascii_lowercase {
  string const lowercase(string const & a)
  {
         string tmp(a);
-       transform(tmp.begin(), tmp.end(), tmp.begin(), local_lowercase<char>());
+       transform(tmp.begin(), tmp.end(), tmp.begin(), local_lowercase());
         return tmp;
  }
  
@@ -392,7 +371,7 @@ string const lowercase(string const & a)
  docstring const lowercase(docstring const & a)
  {
         docstring tmp(a);
-       transform(tmp.begin(), tmp.end(), tmp.begin(), local_lowercase<char_type>());
+       transform(tmp.begin(), tmp.end(), tmp.begin(), local_lowercase());
         return tmp;
  }
  
diff --git a/src/support/lstrings.h b/src/support/lstrings.h

index 5f9b559dc7de77d25660011590b82fe8ce139c5b..77b4ee5c3c620eb0bf3c50eb101abea7e2c82b12 100644 (file)
--- a/src/support/lstrings.h
+++ b/src/support/lstrings.h
@@ -24,17 +24,22 @@
  namespace lyx {
  namespace support {
  
-///
+/// Compare \p s and \p s2, ignoring the case.
+/// Caution: Depends on the locale
  int compare_no_case(std::string const & s, std::string const & s2);
+
+/// Compare \p s and \p s2, ignoring the case.
+/// Does not depend on the locale.
  int compare_no_case(docstring const & s, docstring const & s2);
  
-///
+/// Compare \p s and \p s2, ignoring the case of ASCII characters only.
  int compare_ascii_no_case(std::string const & s, std::string const & s2);
  
-///
+/// Compare \p s and \p s2, ignoring the case of ASCII characters only.
  int compare_ascii_no_case(docstring const & s, docstring const & s2);
  
-///
+/// Compare the first \p len characters of \p s and \p s2, ignoring the case.
+/// Caution: Depends on the locale
  int compare_no_case(std::string const & s, std::string const & s2, unsigned int len);
  
  ///
@@ -75,28 +80,37 @@ int hexToInt(lyx::docstring const & str);
  /// is \p str pure ascii?
  bool isAscii(docstring const & str);
  
-///
+/// Changes the case of \p c to lowercase.
+/// Caution: Depends on the locale
  char lowercase(char c);
  
-///
+/// Changes the case of \p c to uppercase.
+/// Caution: Depends on the locale
  char uppercase(char c);
  
-/// changes the case only if c is a one-byte char
+/// Changes the case of \p c to lowercase.
+/// Does not depend on the locale.
  char_type lowercase(char_type c);
  
-/// changes the case only if c is a one-byte char
+/// Changes the case of \p c to uppercase.
+/// Does not depend on the locale.
  char_type uppercase(char_type c);
  
  /// same as lowercase(), but ignores locale
  std::string const ascii_lowercase(std::string const &);
  docstring const ascii_lowercase(docstring const &);
  
-///
-std::string const lowercase(std::string const &);
-docstring const lowercase(docstring const &);
+/// Changes the case of \p s to lowercase.
+/// Caution: Depends on the locale
+std::string const lowercase(std::string const & s);
  
-///
-std::string const uppercase(std::string const &);
+/// Changes the case of \p s to lowercase.
+/// Does not depend on the locale.
+docstring const lowercase(docstring const & s);
+
+/// Changes the case of \p s to uppercase.
+/// Caution: Depends on the locale
+std::string const uppercase(std::string const & s);
  
  /// Does the string start with this prefix?
  bool prefixIs(docstring const &, char_type);
diff --git a/src/support/qstring_helpers.C b/src/support/qstring_helpers.C

index d1a2ef8a6dbc3d7803aa5af83297af9158705e0b..5b9fa1548115edab05a6bd213669c68311ee6a1e 100644 (file)
--- a/src/support/qstring_helpers.C
+++ b/src/support/qstring_helpers.C
@@ -24,6 +24,7 @@ using std::string;
  // We use QString::fromUcs4 in Qt 4.2 and higher
  QString const toqstr(docstring const & str)
  {
+       // This does not properly convert surrogate pairs
         QString s;
         int i = static_cast<int>(str.size()); 
         s.resize(i);
@@ -44,7 +45,7 @@ docstring const qstring_to_ucs4(QString const & qstr)
         int const ls = qstr.size();
         docstring ucs4;
         for (int i = 0; i < ls; ++i)
-               ucs4 += static_cast<char_type>(qstr[i].unicode());
+               ucs4 += qchar_to_ucs4(qstr[i].unicode());
         return ucs4;
  #endif
  }
diff --git a/src/support/qstring_helpers.h b/src/support/qstring_helpers.h

index d1525de8f4b9536bfe24af0898c3b72f0e2f4ac7..dfb22e6c1bf802ab86ef5163d74329287d0c945a 100644 (file)
--- a/src/support/qstring_helpers.h
+++ b/src/support/qstring_helpers.h
@@ -45,6 +45,14 @@ inline QString const toqstr(std::string const & str)
  }
  
  
+/// Can \p c be stored in a single UTF-16 code unit, i.e. is it below 0x10000 and not a surrogate?
+inline bool is_utf16(char_type c)
+{
+       // 0xd800 ... 0xdfff are the surrogate code units (the halves of surrogate pairs); 0x10000 and above do not fit in 16 bits.
+       return c < 0xd800 || (c > 0xdfff && c < 0x10000);
+}
+
+
  /**
   * Convert a QChar into a UCS4 character.
   * This is a hack (it does only make sense for the common part of the UCS4
@@ -54,6 +62,7 @@ inline QString const toqstr(std::string const & str)
   */
  inline char_type const qchar_to_ucs4(QChar const & qchar)
  {
+       BOOST_ASSERT(is_utf16(static_cast<char_type>(qchar.unicode())));
         return static_cast<char_type>(qchar.unicode());
  }
  
@@ -71,7 +80,7 @@ inline QChar const ucs4_to_qchar(char_type const ucs4)
         // for the ucs2 subrange of unicode. Instead of an assertion we should
         // return some special characters that indicates that its display is
         // not supported.
-       BOOST_ASSERT(ucs4 < 65536);
+       BOOST_ASSERT(is_utf16(ucs4));
         return QChar(static_cast<unsigned short>(ucs4));
  }
  
diff --git a/src/support/textutils.h b/src/support/textutils.h

index 3d70c69df121a5c60c435f3873fbed7b95f26f9d..283db0316bc3011134f5b90c08f195a0894a16a6 100644 (file)
--- a/src/support/textutils.h
+++ b/src/support/textutils.h
@@ -17,15 +17,6 @@
  
  #include "support/types.h"
  
-#ifdef LIBC_WCTYPE_USES_UCS4
-// We can use the libc ctype functions because we unset the LC_CTYPE
-// category of the current locale in gettext.C
-#include <wctype.h>
-#else
-// Steal some code from somewhere else, e.g. glib (look at gunicode.h)
-// The code that we currently use does not really work.
-#endif
-
  
  namespace lyx {
  
@@ -36,61 +27,17 @@ bool isLineSeparatorChar(char_type c)
         return c == ' ';
  }
  
-
  /// return true if a char is alphabetical (including accented chars)
-inline
-bool isLetterChar(char_type c)
-{
-#ifdef LIBC_WCTYPE_USES_UCS4
-       return iswalpha(c);
-#else
-       // FIXME UNICODE This is wrong!
-       return (c >= 'A' && c <= 'Z')
-               || (c >= 'a' && c <= 'z')
-               || (c >= 192 && c < 256); // in iso-8859-x these are accented chars
-#endif
-}
-
+bool isLetterChar(char_type c);
  
  /// return true if the char is printable
-inline
-bool isPrintable(char_type c)
-{
-#ifdef LIBC_WCTYPE_USES_UCS4
-       return iswprint(c);
-#else
-       // FIXME UNICODE This is wrong!
-       return (c & 127) >= ' ';
-#endif
-}
-
+bool isPrintable(char_type c);
  
  /// return true if the char is printable and not a space
-inline
-bool isPrintableNonspace(char_type c)
-{
-#ifdef LIBC_WCTYPE_USES_UCS4
-       return iswprint(c) && !iswspace(c);
-#else
-       // FIXME UNICODE This is wrong!
-       return (c & 127) > ' ';
-#endif
-}
-
+bool isPrintableNonspace(char_type c);
  
  /// return true if a unicode char is a digit.
-inline
-bool isDigit(char_type c)
-{
-#ifdef LIBC_WCTYPE_USES_UCS4
-       return iswdigit(c);
-#else
-       // FIXME UNICODE This is wrong!
-       return c >= '0' && c <= '9';
-#endif
-}
-
-
+bool isDigit(char_type c);
  
  } // namespace lyx
author	Georg Baum <Georg.Baum@post.rwth-aachen.de>
	Mon, 26 Feb 2007 09:03:21 +0000 (09:03 +0000)
committer	Georg Baum <Georg.Baum@post.rwth-aachen.de>
	Mon, 26 Feb 2007 09:03:21 +0000 (09:03 +0000)
development/scons/scons_manifest.py		patch \| blob \| history
src/support/Makefile.am		patch \| blob \| history
src/support/lstrings.C		patch \| blob \| history
src/support/lstrings.h		patch \| blob \| history
src/support/qstring_helpers.C		patch \| blob \| history
src/support/qstring_helpers.h		patch \| blob \| history
src/support/textutils.h		patch \| blob \| history