* src/encoding.C (latexChar,read):

[lyx.git] / src / support / lstrings.C
diff --git a/src/support/lstrings.C b/src/support/lstrings.C

index f3023313325a4dc4c7321e778472c10d9d7c2f26..fe161be9913257702d9375a4be697dae9de562db 100644 (file)
--- a/src/support/lstrings.C
+++ b/src/support/lstrings.C
@@ -14,6 +14,7 @@
  #include "support/lstrings.h"
  #include "support/lyxlib.h"
  #include "support/convert.h"
+#include "support/qstring_helpers.h"
  
  #include "debug.h"
  
@@ -32,7 +33,6 @@
  #include <algorithm>
  #include <sstream>
  
-using lyx::docstring;
  
  using std::transform;
  using std::string;
@@ -48,36 +48,14 @@ using std::toupper;
  namespace lyx {
  namespace support {
  
-int compare_no_case(string const & s, string const & s2)
-{
-       string::const_iterator p = s.begin();
-       string::const_iterator p2 = s2.begin();
-
-       while (p != s.end() && p2 != s2.end()) {
-               int const lc1 = tolower(*p);
-               int const lc2 = tolower(*p2);
-               if (lc1 != lc2)
-                       return (lc1 < lc2) ? -1 : 1;
-               ++p;
-               ++p2;
-       }
-
-       if (s.size() == s2.size())
-               return 0;
-       if (s.size() < s2.size())
-               return -1;
-       return 1;
-}
-
-
  int compare_no_case(docstring const & s, docstring const & s2)
  {
         docstring::const_iterator p = s.begin();
         docstring::const_iterator p2 = s2.begin();
  
         while (p != s.end() && p2 != s2.end()) {
-               int const lc1 = tolower(*p);
-               int const lc2 = tolower(*p2);
+               char_type const lc1 = lowercase(*p);
+               char_type const lc2 = lowercase(*p2);
                 if (lc1 != lc2)
                         return (lc1 < lc2) ? -1 : 1;
                 ++p;
@@ -94,18 +72,20 @@ int compare_no_case(docstring const & s, docstring const & s2)
  
  namespace {
  
-int ascii_tolower(int c) {
+template<typename Char>
+Char ascii_tolower(Char c) {
         if (c >= 'A' && c <= 'Z')
                 return c - 'A' + 'a';
         return c;
  }
  
+}
+
  
-template<typename String> inline
-int do_compare_ascii_no_case(String const & s, String const & s2)
+int compare_ascii_no_case(string const & s, string const & s2)
  {
-       typename String::const_iterator p = s.begin();
-       typename String::const_iterator p2 = s2.begin();
+       string::const_iterator p = s.begin();
+       string::const_iterator p2 = s2.begin();
  
         while (p != s.end() && p2 != s2.end()) {
                 int const lc1 = ascii_tolower(*p);
@@ -123,37 +103,22 @@ int do_compare_ascii_no_case(String const & s, String const & s2)
         return 1;
  }
  
-}
-
-
-int compare_ascii_no_case(string const & s, string const & s2)
-{
-       return do_compare_ascii_no_case(s, s2);
-}
-
  
  int compare_ascii_no_case(docstring const & s, docstring const & s2)
  {
-       return do_compare_ascii_no_case(s, s2);
-}
-
+       docstring::const_iterator p = s.begin();
+       docstring::const_iterator p2 = s2.begin();
  
-int compare_no_case(string const & s, string const & s2, unsigned int len)
-{
-       string::const_iterator p = s.begin();
-       string::const_iterator p2 = s2.begin();
-       unsigned int i = 0;
-       while (i < len && p != s.end() && p2 != s2.end()) {
-               int const lc1 = tolower(*p);
-               int const lc2 = tolower(*p2);
+       while (p != s.end() && p2 != s2.end()) {
+               char_type const lc1 = ascii_tolower(*p);
+               char_type const lc2 = ascii_tolower(*p2);
                 if (lc1 != lc2)
                         return (lc1 < lc2) ? -1 : 1;
-               ++i;
                 ++p;
                 ++p2;
         }
  
-       if (s.size() >= len && s2.size() >= len)
+       if (s.size() == s2.size())
                 return 0;
         if (s.size() < s2.size())
                 return -1;
@@ -226,40 +191,98 @@ bool isStrDbl(string const & str)
  }
  
  
+namespace {
+
+/// Tell whether \p c is one of the hexadecimal digit characters
+/// 0-9, a-f or A-F.
+inline
+bool isHexChar(char_type c)
+{
+       // Each of the three groups is a contiguous run of character
+       // codes in ASCII-compatible character sets, so range checks
+       // accept exactly the characters an explicit list would.
+       bool const decimal = c >= '0' && c <= '9';
+       bool const lower = c >= 'a' && c <= 'f';
+       bool const upper = c >= 'A' && c <= 'F';
+       return decimal || lower || upper;
+}
+
+} // anon namespace
+
+
+/// Return true if every character of \p str is a hexadecimal digit.
+/// A leading "0x" or "0X" is skipped, but only when at least one
+/// more character follows it. An empty string yields true.
+bool isHex(docstring const & str)
+{
+       docstring::size_type const size = str.length();
+
+       // A bare "0x" is not treated as a prefix and is therefore
+       // rejected below because of the 'x'.
+       bool const prefixed = size > 2 && str[0] == '0' &&
+               (str[1] == 'x' || str[1] == 'X');
+
+       for (docstring::size_type pos = prefixed ? 2 : 0; pos < size; ++pos)
+               if (!isHexChar(str[pos]))
+                       return false;
+       return true;
+}
+
+
+/// Convert the hexadecimal number at the start of \p str (with or
+/// without a leading "0x"/"0X") to an integer.
+/// \return the parsed value, or 0 if \p str does not start with a
+/// hexadecimal number (this includes the empty string).
+int hexToInt(docstring const & str)
+{
+       string const s = to_ascii(str);
+       // "%x" stores through an unsigned int pointer, and sscanf()
+       // leaves its target untouched when nothing matches, so start
+       // from a defined value instead of returning an indeterminate one.
+       unsigned int h = 0;
+       sscanf(s.c_str(), "%x", &h);
+       return static_cast<int>(h);
+}
+
+
+/// Return true if \p str contains no character whose value is
+/// 0x80 or above. An empty string yields true.
+bool isAscii(docstring const & str)
+{
+       docstring::const_iterator const end = str.end();
+       for (docstring::const_iterator it = str.begin(); it != end; ++it) {
+               if (*it >= 0x80)
+                       return false;
+       }
+       return true;
+}
+
+
  char lowercase(char c)
  {
+       BOOST_ASSERT(static_cast<unsigned char>(c) < 0x80);
         return char(tolower(c));
  }
  
  
  char uppercase(char c)
  {
+       BOOST_ASSERT(static_cast<unsigned char>(c) < 0x80);
         return char(toupper(c));
  }
  
-// FIXME for lowercase() and uppercase() function below:
-// 1) std::tolower() and std::toupper() are templates that
-// compile fine with char_type. With the test (c >= 256) we
-// do not trust these function to do the right thing with
-// unicode char.
-// 2) these functions use the current locale, which is wrong
-// if it is not latin1 based (latin1 is a subset of UCS4).
  
  char_type lowercase(char_type c)
  {
-       if (c >= 256)
+       if (!is_utf16(c))
+               // We don't know how to lowercase a non-utf16 char
                 return c;
-
-       return tolower(c);
+       return qchar_to_ucs4(ucs4_to_qchar(c).toLower());
  }
  
  
  char_type uppercase(char_type c)
  {
-       if (c >= 256)
+       if (!is_utf16(c))
+               // We don't know how to uppercase a non-utf16 char
                 return c;
-
-       return toupper(c);
+       return qchar_to_ucs4(ucs4_to_qchar(c).toUpper());
  }
  
  
@@ -269,35 +292,42 @@ namespace {
  // calls to std::transform yet, we use these helper clases. (Lgb)
  
  struct local_lowercase {
-       char operator()(char c) const {
-               return tolower(c);
+       char_type operator()(char_type c) const {
+               if (!is_utf16(c))
+                       // We don't know how to lowercase a non-utf16 char
+                       return c;
+               return qchar_to_ucs4(ucs4_to_qchar(c).toLower());
         }
  };
  
  struct local_uppercase {
-       char operator()(char c) const {
-               return toupper(c);
+       char_type operator()(char_type c) const {
+               if (!is_utf16(c))
+                       // We don't know how to uppercase a non-utf16 char
+                       return c;
+               return qchar_to_ucs4(ucs4_to_qchar(c).toUpper());
         }
  };
  
-struct local_ascii_lowercase {
-       char operator()(char c) const {
+template<typename Char> struct local_ascii_lowercase {
+       Char operator()(Char c) const {
                 return ascii_tolower(c);
         }
  };
  
  } // end of anon namespace
  
-string const lowercase(string const & a)
+docstring const lowercase(docstring const & a)
  {
-       string tmp(a);
+       docstring tmp(a);
         transform(tmp.begin(), tmp.end(), tmp.begin(), local_lowercase());
         return tmp;
  }
  
-string const uppercase(string const & a)
+
+docstring const uppercase(docstring const & a)
  {
-       string tmp(a);
+       docstring tmp(a);
         transform(tmp.begin(), tmp.end(), tmp.begin(), local_uppercase());
         return tmp;
  }
@@ -307,11 +337,28 @@ string const ascii_lowercase(string const & a)
  {
         string tmp(a);
         transform(tmp.begin(), tmp.end(), tmp.begin(),
-                 local_ascii_lowercase());
+                 local_ascii_lowercase<char>());
+       return tmp;
+}
+
+
+docstring const ascii_lowercase(docstring const & a)
+{
+       docstring tmp(a);
+       transform(tmp.begin(), tmp.end(), tmp.begin(),
+                 local_ascii_lowercase<char_type>());
         return tmp;
  }
  
  
+/// Return true if \p a is non-empty and its first character is \p c.
+bool prefixIs(docstring const & a, char_type c)
+{
+       return !a.empty() && a[0] == c;
+}
+
+
  bool prefixIs(string const & a, string const & pre)
  {
         string::size_type const prelen = pre.length();
@@ -348,6 +395,14 @@ bool suffixIs(string const & a, char c)
  }
  
  
+/// Return true if \p a is non-empty and its last character is \p c.
+bool suffixIs(docstring const & a, char_type c)
+{
+       return !a.empty() && a[a.length() - 1] == c;
+}
+
+
  bool suffixIs(string const & a, string const & suf)
  {
         string::size_type const suflen = suf.length();
@@ -555,22 +610,46 @@ string const rtrim(string const & a, char const * p)
  }
  
  
-string const ltrim(string const & a, char const * p)
+docstring const rtrim(docstring const & a, char const * p)
  {
         BOOST_ASSERT(p);
  
         if (a.empty() || !*p)
                 return a;
  
-       string::size_type l = a.find_first_not_of(p);
+       docstring::size_type r = a.find_last_not_of(from_ascii(p));
+
+       // Is this test really needed? (Lgb)
+       if (r == docstring::npos)
+               return docstring();
  
+       return a.substr(0, r + 1);
+}
+
+
+string const ltrim(string const & a, char const * p)
+{
+       BOOST_ASSERT(p);
+       if (a.empty() || !*p)
+               return a;
+       string::size_type l = a.find_first_not_of(p);
         if (l == string::npos)
                 return string();
-
         return a.substr(l, string::npos);
  }
  
  
+/// Strip from the front of \p a every leading character that occurs
+/// in \p p. \p p must not be null; it is converted with from_ascii()
+/// before the search.
+docstring const ltrim(docstring const & a, char const * p)
+{
+       BOOST_ASSERT(p);
+
+       // Nothing to strip from an empty string or with an empty set.
+       if (a.empty() || !*p)
+               return a;
+
+       docstring::size_type const first = a.find_first_not_of(from_ascii(p));
+       // npos means that every character of a belongs to the set.
+       return first == docstring::npos ? docstring() : a.substr(first);
+}
+
  namespace {
  
  template<typename String, typename Char> inline
@@ -632,21 +711,27 @@ string const rsplit(string const & a, string & piece, char delim)
  }
  
  
-// This function escapes 8-bit characters and other problematic
-// characters that cause problems in latex labels.
  docstring const escape(docstring const & lab)
  {
-       lyx::char_type hexdigit[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
-                             '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
+       char_type hexdigit[16] = { '0', '1', '2', '3', '4', '5', '6', '7',
+                                  '8', '9', 'A', 'B', 'C', 'D', 'E', 'F' };
         docstring enc;
         for (docstring::size_type i = 0; i < lab.length(); ++i) {
-               lyx::char_type c= lab[i];
-               // FIXME We must change the following algorithm for UCS4
-               // chars, but that will be a file format change.
+               char_type c = lab[i];
                 if (c >= 128 || c == '=' || c == '%') {
+                       // Although char_type is a 32 bit type we know that
+                       // UCS4 occupies only 21 bits, so we don't need to
+                       // encode bigger values. Test for 2^24 because we
+                       // can encode that with the 6 hex digits that are
+                       // needed for 21 bits anyway.
+                       BOOST_ASSERT(c < (1 << 24));
                         enc += '=';
-                       enc += hexdigit[c>>4];
-                       enc += hexdigit[c & 15];
+                       enc += hexdigit[(c>>20) & 15];
+                       enc += hexdigit[(c>>16) & 15];
+                       enc += hexdigit[(c>>12) & 15];
+                       enc += hexdigit[(c>> 8) & 15];
+                       enc += hexdigit[(c>> 4) & 15];
+                       enc += hexdigit[ c      & 15];
                 } else {
                         enc += c;
                 }
@@ -655,38 +740,57 @@ docstring const escape(docstring const & lab)
  }
  
  
-/// gives a vector of stringparts which have the delimiter delim
-vector<string> const getVectorFromString(string const & str,
-                                        string const & delim)
+namespace {
+
+template<typename String> vector<String> const
+getVectorFromStringT(String const & str, String const & delim)
  {
  // Lars would like this code to go, but for now his replacement (below)
  // doesn't fullfil the same function. I have, therefore, reactivated the
  // old code for now. Angus 11 Nov 2002.
  #if 1
-       vector<string> vec;
+       vector<String> vec;
         if (str.empty())
                 return vec;
-       string keys = rtrim(str);
+       String keys = rtrim(str);
         for(;;) {
-               string::size_type const idx = keys.find(delim);
-               if (idx == string::npos) {
+               typename String::size_type const idx = keys.find(delim);
+               if (idx == String::npos) {
                         vec.push_back(ltrim(keys));
                         break;
                 }
-               string const key = trim(keys.substr(0, idx));
+               String const key = trim(keys.substr(0, idx));
                 if (!key.empty())
                         vec.push_back(key);
-               string::size_type const start = idx + delim.size();
+               typename String::size_type const start = idx + delim.size();
                 keys = keys.substr(start);
         }
         return vec;
  #else
-       boost::char_separator<char> sep(delim.c_str());
-       boost::tokenizer<boost::char_separator<char> > tokens(str, sep);
-       return vector<string>(tokens.begin(), tokens.end());
+       typedef boost::char_separator<typename String::value_type> Separator;
+       typedef boost::tokenizer<Separator, typename String::const_iterator, String> Tokenizer;
+       Separator sep(delim.c_str());
+       Tokenizer tokens(str, sep);
+       return vector<String>(tokens.begin(), tokens.end());
  #endif
  }
  
+}
+
+
+/// Split \p str into the pieces separated by \p delim.
+/// This std::string overload forwards to getVectorFromStringT().
+vector<string> const getVectorFromString(string const & str,
+                                         string const & delim)
+{
+       // The string type is deduced from the arguments.
+       return getVectorFromStringT(str, delim);
+}
+
+
+/// Split \p str into the pieces separated by \p delim.
+/// This docstring overload forwards to getVectorFromStringT().
+vector<docstring> const getVectorFromString(docstring const & str,
+                                            docstring const & delim)
+{
+       // The string type is deduced from the arguments.
+       return getVectorFromStringT(str, delim);
+}
+
  
  // the same vice versa
  string const getStringFromVector(vector<string> const & vec,