From 0418d1470468f626868895e7f30a7d0de394c5e8 Mon Sep 17 00:00:00 2001 From: Georg Baum Date: Mon, 26 Feb 2007 09:03:21 +0000 Subject: [PATCH] Fix character classification functions by using qt (bugs like 3270 and 1247) * src/support/lstrings.C (uppercase): Use qt instead of non working libc/home grown solution (lowercase): ditto (local_lowercase): Use qt instead of libc tolower for ucs4 chars * src/support/qstring_helpers.C (qstring_to_ucs4): Use qchar_to_ucs4 because of the assertion * src/support/lstrings.h: Add some documentation * src/support/qstring_helpers.h (is_utf16): New function: Tests whether an ucs4 character is also a valid utf16 character (qchar_to_ucs4): Assert on is_utf16() (ucs4_to_qchar): Replace old assertion with better is_utf16() * src/support/textutils.h (isLetterChar): Delete non-working implementation (isPrintable): Ditto (isPrintableNonspace): Ditto (isDigit): * src/support/textutils.C: New file, contains new implementations using qt of the functions in textutils.h * src/support/Makefile.am: Add textutils.C * development/scons/scons_manifest.py: ditto git-svn-id: svn://svn.lyx.org/lyx/lyx-devel/trunk@17354 a592a061-630c-0410-9148-cb99ea01b6c8 --- development/scons/scons_manifest.py | 1 + src/support/Makefile.am | 1 + src/support/lstrings.C | 55 ++++++++------------------ src/support/lstrings.h | 40 +++++++++++++------ src/support/qstring_helpers.C | 3 +- src/support/qstring_helpers.h | 11 +++++- src/support/textutils.h | 61 ++--------------------------- 7 files changed, 62 insertions(+), 110 deletions(-) diff --git a/development/scons/scons_manifest.py b/development/scons/scons_manifest.py index edd5f126c7..3e6d9a49b0 100644 --- a/development/scons/scons_manifest.py +++ b/development/scons/scons_manifest.py @@ -160,6 +160,7 @@ src_support_files = Split(''' socktools.C systemcall.C tempname.C + textutils.C unicode.C unlink.C userinfo.C diff --git a/src/support/Makefile.am b/src/support/Makefile.am index bb8e7a45c1..2095522456 100644 --- a/src/support/Makefile.am +++ b/src/support/Makefile.am @@ -78,6 +78,7 @@ libsupport_la_SOURCES = \ systemcall.C \ systemcall.h \ tempname.C \ + textutils.C \ textutils.h \ translator.h \ types.h \ diff --git a/src/support/lstrings.C b/src/support/lstrings.C index 973b260d7c..f1333eee0b 100644 --- a/src/support/lstrings.C +++ b/src/support/lstrings.C @@ -14,6 +14,7 @@ #include "support/lstrings.h" #include "support/lyxlib.h" #include "support/convert.h" +#include "support/qstring_helpers.h" #include "debug.h" @@ -32,17 +33,6 @@ #include #include -#ifdef LIBC_WCTYPE_USES_UCS4 -// We can use the libc ctype functions because we unset the LC_CTYPE -// category of the current locale in gettext.C -#include -#else -// Steal some code from somewhere else, e.g. glib (look at gunicode.h) -// The code that we currently use does not really work. -#endif - - -using lyx::docstring; using std::transform; using std::string; @@ -321,38 +311,21 @@ char uppercase(char c) } -// FIXME UNICODE -// for lowercase() and uppercase() function below when wchar_t is not used: -// 1) std::tolower() and std::toupper() are templates that -// compile fine with char_type. With the test (c >= 256) we -// do not trust these function to do the right thing with -// unicode char. -// 2) these functions use the current locale, which is wrong -// if it is not latin1 based (latin1 is a subset of UCS4). - char_type lowercase(char_type c) { -#ifdef LIBC_WCTYPE_USES_UCS4 - return towlower(c); -#else - if (c >= 256) + if (!is_utf16(c)) + // We don't know how to lowercase a non-utf16 char return c; - - return tolower(c); -#endif + return qchar_to_ucs4(ucs4_to_qchar(c).toLower()); } char_type uppercase(char_type c) { -#ifdef LIBC_WCTYPE_USES_UCS4 - return towupper(c); -#else - if (c >= 256) + if (!is_utf16(c)) + // We don't know how to uppercase a non-utf16 char return c; - - return toupper(c); -#endif + return qchar_to_ucs4(ucs4_to_qchar(c).toUpper()); } @@ -361,10 +334,16 @@ namespace { // since we cannot use std::tolower and std::toupper directly in the // calls to std::transform yet, we use these helper clases. (Lgb) -template struct local_lowercase { - Char operator()(Char c) const { +struct local_lowercase { + char operator()(char c) const { return tolower(c); } + char_type operator()(char_type c) const { + if (!is_utf16(c)) + // We don't know how to lowercase a non-utf16 char + return c; + return qchar_to_ucs4(ucs4_to_qchar(c).toLower()); + } }; struct local_uppercase { @@ -384,7 +363,7 @@ template struct local_ascii_lowercase { string const lowercase(string const & a) { string tmp(a); - transform(tmp.begin(), tmp.end(), tmp.begin(), local_lowercase()); + transform(tmp.begin(), tmp.end(), tmp.begin(), local_lowercase()); return tmp; } @@ -392,7 +371,7 @@ string const lowercase(string const & a) docstring const lowercase(docstring const & a) { docstring tmp(a); - transform(tmp.begin(), tmp.end(), tmp.begin(), local_lowercase()); + transform(tmp.begin(), tmp.end(), tmp.begin(), local_lowercase()); return tmp; } diff --git a/src/support/lstrings.h b/src/support/lstrings.h index 5f9b559dc7..77b4ee5c3c 100644 --- a/src/support/lstrings.h +++ b/src/support/lstrings.h @@ -24,17 +24,22 @@ namespace lyx { namespace support { -/// +/// Compare \p s and \p s2, ignoring the case. +/// Caution: Depends on the locale int compare_no_case(std::string const & s, std::string const & s2); + +/// Compare \p s and \p s2, ignoring the case. +/// Does not depend on the locale. int compare_no_case(docstring const & s, docstring const & s2); -/// +/// Compare \p s and \p s2, ignoring the case of ASCII characters only. int compare_ascii_no_case(std::string const & s, std::string const & s2); -/// +/// Compare \p s and \p s2, ignoring the case of ASCII characters only. int compare_ascii_no_case(docstring const & s, docstring const & s2); -/// +/// Compare the first \p len characters of \p s and \p s2, ignoring the case. +/// Caution: Depends on the locale int compare_no_case(std::string const & s, std::string const & s2, unsigned int len); /// @@ -75,28 +80,37 @@ int hexToInt(lyx::docstring const & str); /// is \p str pure ascii? bool isAscii(docstring const & str); -/// +/// Changes the case of \p c to lowercase. +/// Caution: Depends on the locale char lowercase(char c); -/// +/// Changes the case of \p c to uppercase. +/// Caution: Depends on the locale char uppercase(char c); -/// changes the case only if c is a one-byte char +/// Changes the case of \p c to lowercase. +/// Does not depend on the locale. char_type lowercase(char_type c); -/// changes the case only if c is a one-byte char +/// Changes the case of \p c to uppercase. +/// Does not depend on the locale. char_type uppercase(char_type c); /// same as lowercase(), but ignores locale std::string const ascii_lowercase(std::string const &); docstring const ascii_lowercase(docstring const &); -/// -std::string const lowercase(std::string const &); -docstring const lowercase(docstring const &); +/// Changes the case of \p s to lowercase. +/// Caution: Depends on the locale +std::string const lowercase(std::string const & s); -/// -std::string const uppercase(std::string const &); +/// Changes the case of \p s to lowercase. +/// Does not depend on the locale. +docstring const lowercase(docstring const & s); + +/// Changes the case of \p s to uppercase. +/// Caution: Depends on the locale +std::string const uppercase(std::string const & s); /// Does the string start with this prefix? bool prefixIs(docstring const &, char_type); diff --git a/src/support/qstring_helpers.C b/src/support/qstring_helpers.C index d1a2ef8a6d..5b9fa15481 100644 --- a/src/support/qstring_helpers.C +++ b/src/support/qstring_helpers.C @@ -24,6 +24,7 @@ using std::string; // We use QString::fromUcs4 in Qt 4.2 and higher QString const toqstr(docstring const & str) { + // This does not properly convert surrogate pairs QString s; int i = static_cast(str.size()); s.resize(i); @@ -44,7 +45,7 @@ docstring const qstring_to_ucs4(QString const & qstr) int const ls = qstr.size(); docstring ucs4; for (int i = 0; i < ls; ++i) - ucs4 += static_cast(qstr[i].unicode()); + ucs4 += qchar_to_ucs4(qstr[i].unicode()); return ucs4; #endif } diff --git a/src/support/qstring_helpers.h b/src/support/qstring_helpers.h index d1525de8f4..dfb22e6c1b 100644 --- a/src/support/qstring_helpers.h +++ b/src/support/qstring_helpers.h @@ -45,6 +45,14 @@ inline QString const toqstr(std::string const & str) } +/// Is \p c a valid utf16 char? +inline bool is_utf16(char_type c) +{ + // 0xd800 ... 0xdfff is the range of surrogate pairs. + return c < 0xd800 || (c > 0xdfff && c < 0x10000); +} + + /** * Convert a QChar into a UCS4 character. * This is a hack (it does only make sense for the common part of the UCS4 @@ -54,6 +62,7 @@ inline QString const toqstr(std::string const & str) */ inline char_type const qchar_to_ucs4(QChar const & qchar) { + BOOST_ASSERT(is_utf16(static_cast(qchar.unicode()))); return static_cast(qchar.unicode()); } @@ -71,7 +80,7 @@ inline QChar const ucs4_to_qchar(char_type const ucs4) // for the ucs2 subrange of unicode. Instead of an assertion we should // return some special characters that indicates that its display is // not supported. - BOOST_ASSERT(ucs4 < 65536); + BOOST_ASSERT(is_utf16(ucs4)); return QChar(static_cast(ucs4)); } diff --git a/src/support/textutils.h b/src/support/textutils.h index 3d70c69df1..283db0316b 100644 --- a/src/support/textutils.h +++ b/src/support/textutils.h @@ -17,15 +17,6 @@ #include "support/types.h" -#ifdef LIBC_WCTYPE_USES_UCS4 -// We can use the libc ctype functions because we unset the LC_CTYPE -// category of the current locale in gettext.C -#include -#else -// Steal some code from somewhere else, e.g. glib (look at gunicode.h) -// The code that we currently use does not really work. -#endif - namespace lyx { @@ -36,61 +27,17 @@ bool isLineSeparatorChar(char_type c) return c == ' '; } - /// return true if a char is alphabetical (including accented chars) -inline -bool isLetterChar(char_type c) -{ -#ifdef LIBC_WCTYPE_USES_UCS4 - return iswalpha(c); -#else - // FIXME UNICODE This is wrong! - return (c >= 'A' && c <= 'Z') - || (c >= 'a' && c <= 'z') - || (c >= 192 && c < 256); // in iso-8859-x these are accented chars -#endif -} - +bool isLetterChar(char_type c); /// return true if the char is printable -inline -bool isPrintable(char_type c) -{ -#ifdef LIBC_WCTYPE_USES_UCS4 - return iswprint(c); -#else - // FIXME UNICODE This is wrong! - return (c & 127) >= ' '; -#endif -} - +bool isPrintable(char_type c); /// return true if the char is printable and not a space -inline -bool isPrintableNonspace(char_type c) -{ -#ifdef LIBC_WCTYPE_USES_UCS4 - return iswprint(c) && !iswspace(c); -#else - // FIXME UNICODE This is wrong! - return (c & 127) > ' '; -#endif -} - +bool isPrintableNonspace(char_type c); /// return true if a unicode char is a digit. -inline -bool isDigit(char_type c) -{ -#ifdef LIBC_WCTYPE_USES_UCS4 - return iswdigit(c); -#else - // FIXME UNICODE This is wrong! - return c >= '0' && c <= '9'; -#endif -} - - +bool isDigit(char_type c); } // namespace lyx -- 2.39.5