src/support/docstring.C

   1 /**
   2  * \file docstring.C
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Georg Baum
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  */
  10
  #include <config.h>

  #include "docstring.h"
  #include "unicode.h"

  #include <cctype>
  #include <locale>

  #include <boost/assert.hpp>
  19
  20
  21 namespace lyx {
  22
  23 docstring const from_ascii(char const * ascii)
  24 {
  25         docstring s;
  26         for (char const * c = ascii; *c; ++c) {
  27                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
  28                 s.push_back(*c);
  29         }
  30         return s;
  31 }
  32
  33
  34 docstring const from_ascii(std::string const & ascii)
  35 {
  36         int const len = ascii.length();
  37         for (int i = 0; i < len; ++i)
  38                 BOOST_ASSERT(static_cast<unsigned char>(ascii[i]) < 0x80);
  39         return docstring(ascii.begin(), ascii.end());
  40 }
  41
  42
  43 docstring const from_utf8(std::string const & utf8)
  44 {
  45         std::vector<lyx::char_type> const ucs4 =
  46                 utf8_to_ucs4(utf8.data(), utf8.size());
  47         return docstring(ucs4.begin(), ucs4.end());
  48 }
  49
  50
  51 std::string const to_utf8(docstring const & ucs4)
  52 {
  53         std::vector<char> const utf8 =
  54                 ucs4_to_utf8(ucs4.data(), ucs4.size());
  55         return std::string(utf8.begin(), utf8.end());
  56 }
  57
  58 }
  59
  60
  61 bool operator==(lyx::docstring const & l, char const * r)
  62 {
  63         int const len = l.length();
  64         for (int i = 0; i < len; ++i) {
  65                 BOOST_ASSERT(static_cast<unsigned char>(r[i]) < 0x80);
  66                 if (!r[i])
  67                         return false;
  68                 if (l[i] != lyx::docstring::value_type(r[i]))
  69                         return false;
  70         }
  71         return r[len] == '\0';
  72 }
  73
  74
  75 lyx::docstring operator+(lyx::docstring const & l, char const * r)
  76 {
  77         lyx::docstring s(l);
  78         for (char const * c = r; *c; ++c) {
  79                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
  80                 s.push_back(*c);
  81         }
  82         return s;
  83 }
  84
  85
  86 lyx::docstring operator+(char const * l, lyx::docstring const & r)
  87 {
  88         lyx::docstring s;
  89         for (char const * c = l; *c; ++c) {
  90                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
  91                 s.push_back(*c);
  92         }
  93         s += r;
  94         return s;
  95 }
  96
  97
  98 lyx::docstring operator+(lyx::docstring const & l, char r)
  99 {
 100         BOOST_ASSERT(static_cast<unsigned char>(r) < 0x80);
 101         return l + lyx::docstring::value_type(r);
 102 }
 103
 104
 105 lyx::docstring operator+(char l, lyx::docstring const & r)
 106 {
 107         BOOST_ASSERT(static_cast<unsigned char>(l) < 0x80);
 108         return lyx::docstring::value_type(l) + r;
 109 }
 110
 111
 112 #if (!defined(HAVE_WCHAR_T) || SIZEOF_WCHAR_T != 4) && defined(__GNUC__)
 113
 114 // gcc does not have proper locale facets for lyx::char_type if
 115 // sizeof(wchar_t) == 2, so we have to implement them on our own.
 116
 117
 118 // We get undefined references to these virtual methods. This looks like
 119 // a bug in gcc. The implementation here does not do anything useful, since
 120 // it is overriden in ascii_ctype_facet.
 121 namespace std {
 122 template<> ctype<lyx::char_type>::~ctype() {}
 123 template<> bool
 124 ctype<lyx::char_type>::do_is(ctype<lyx::char_type>::mask, lyx::char_type) const { return false; }
 125 template<> lyx::char_type const *
 126 ctype<lyx::char_type>::do_is(const lyx::char_type *, const lyx::char_type *, ctype<lyx::char_type>::mask *) const { return 0; }
 127 template<> const lyx::char_type *
 128 ctype<lyx::char_type>::do_scan_is(ctype<lyx::char_type>::mask, const lyx::char_type *, const lyx::char_type *) const { return 0; }
 129 template<> const lyx::char_type *
 130 ctype<lyx::char_type>::do_scan_not(ctype<lyx::char_type>::mask, const lyx::char_type *, const lyx::char_type *) const { return 0; }
 131 template<> lyx::char_type ctype<lyx::char_type>::do_toupper(lyx::char_type) const { return 0; }
 132 template<> const lyx::char_type * ctype<lyx::char_type>::do_toupper(lyx::char_type *, lyx::char_type const *) const { return 0; }
 133 template<> lyx::char_type ctype<lyx::char_type>::do_tolower(lyx::char_type) const { return 0; }
 134 template<> const lyx::char_type * ctype<lyx::char_type>::do_tolower(lyx::char_type *, lyx::char_type const *) const { return 0; }
 135 template<> lyx::char_type ctype<lyx::char_type>::do_widen(char) const { return 0; }
 136 template<> const char *
 137 ctype<lyx::char_type>::do_widen(const char *, const char *, lyx::char_type *) const { return 0; }
 138 template<> char
 139 ctype<lyx::char_type>::do_narrow(const lyx::char_type, char) const { return 0; }
 140 template<> const lyx::char_type *
 141 ctype<lyx::char_type>::do_narrow(const lyx::char_type *, const lyx::char_type *, char, char *) const { return 0; }
 142 }
 143
 144
 145 namespace lyx {
 146
 /// Exception thrown by ascii_ctype_facet when it is asked to handle a
 /// character outside of the ASCII range.
 class ctype_failure : public std::bad_cast {
 public:
         ctype_failure() throw() {}

         virtual ~ctype_failure() throw() {}

         /// \return a fixed description of the failure.
         virtual const char* what() const throw()
         {
                 static char const message[] =
                         "The ctype<lyx::char_type> locale facet does only support ASCII characters on this platform.";
                 return message;
         }
 };
 156
 157
 158 /// ctype facet for UCS4 characters. The implementation does only support pure
 159 /// ASCII, since we do not need anything else for now.
 160 /// The code is partly stolen from std::ctype<wchar_t> from gcc.
 161 class ascii_ctype_facet : public std::ctype<lyx::char_type>
 162 {
 163 public:
 164         typedef lyx::char_type char_type;
 165         typedef wctype_t wmask_type;
 166         explicit ascii_ctype_facet(size_t refs = 0) : std::ctype<char_type>(refs)
 167         {
 168                 M_initialize_ctype();
 169         }
 170 protected:
 171         bool       M_narrow_ok;
 172         char       M_narrow[128];
 173         wint_t     M_widen[1 + static_cast<unsigned char>(-1)];
 174         mask       M_bit[16];
 175         wmask_type M_wmask[16];
 176         wmask_type M_convert_to_wmask(const mask m) const
 177         {
 178                 wmask_type ret;
 179                 switch (m) {
 180                         case space:  ret = wctype("space");  break;
 181                         case print:  ret = wctype("print");  break;
 182                         case cntrl:  ret = wctype("cntrl");  break;
 183                         case upper:  ret = wctype("upper");  break;
 184                         case lower:  ret = wctype("lower");  break;
 185                         case alpha:  ret = wctype("alpha");  break;
 186                         case digit:  ret = wctype("digit");  break;
 187                         case punct:  ret = wctype("punct");  break;
 188                         case xdigit: ret = wctype("xdigit"); break;
 189                         case alnum:  ret = wctype("alnum");  break;
 190                         case graph:  ret = wctype("graph");  break;
 191                         default:     ret = wmask_type();
 192                 }
 193                 return ret;
 194         }
 195         void M_initialize_ctype()
 196         {
 197                 wint_t i;
 198                 for (i = 0; i < 128; ++i) {
 199                         const int c = wctob(i);
 200                         if (c == EOF)
 201                                 break;
 202                         else
 203                                 M_narrow[i] = static_cast<char>(c);
 204                 }
 205                 if (i == 128)
 206                         M_narrow_ok = true;
 207                 else
 208                         M_narrow_ok = false;
 209                 for (size_t i = 0; i < sizeof(M_widen) / sizeof(wint_t); ++i)
 210                         M_widen[i] = btowc(i);
 211
 212                 for (size_t i = 0; i <= 15; ++i) {
 213                         M_bit[i] = static_cast<mask>(1 << i);
 214                         M_wmask[i] = M_convert_to_wmask(M_bit[i]);
 215                 }
 216         }
 217         virtual ~ascii_ctype_facet() {}
 218         char_type do_toupper(char_type c) const
 219         {
 220                 if (c >= 0x80)
 221                         throw ctype_failure();
 222                 return toupper(static_cast<int>(c));
 223         }
 224         char_type const * do_toupper(char_type * lo, char_type const * hi) const
 225         {
 226                 while (lo < hi) {
 227                         if (*lo >= 0x80)
 228                                 throw ctype_failure();
 229                         *lo = toupper(static_cast<int>(*lo));
 230                         ++lo;
 231                 }
 232                 return hi;
 233         }
 234         char_type do_tolower(char_type c) const
 235         {
 236                 if (c >= 0x80)
 237                         throw ctype_failure();
 238                 return tolower(c);
 239         }
 240         char_type const * do_tolower(char_type * lo, char_type const * hi) const
 241         {
 242                 while (lo < hi) {
 243                         if (*lo >= 0x80)
 244                                 throw ctype_failure();
 245                         *lo = tolower(*lo);
 246                         ++lo;
 247                 }
 248                 return hi;
 249         }
 250         bool do_is(mask m, char_type c) const
 251         {
 252                 if (c >= 0x80)
 253                         throw ctype_failure();
 254                 // The code below works because c is in the ASCII range.
 255                 // We could not use iswctype() which is designed for a 2byte
 256                 // whar_t without encoding conversion otherwise.
 257                 bool ret = false;
 258                 // Generically, 15 (instead of 10) since we don't know the numerical
 259                 // encoding of the various categories in /usr/include/ctype.h.
 260                 const size_t bitmasksize = 15;
 261                 for (size_t bitcur = 0; bitcur <= bitmasksize; ++bitcur)
 262                         if (m & M_bit[bitcur] &&
 263                             iswctype(static_cast<int>(c), M_wmask[bitcur])) {
 264                                 ret = true;
 265                                 break;
 266                         }
 267                 return ret;
 268         }
 269         char_type const * do_is(char_type const * lo, char_type const * hi, mask * vec) const
 270         {
 271                 for (;lo < hi; ++vec, ++lo) {
 272                         if (*lo >= 0x80)
 273                                 throw ctype_failure();
 274                         // The code below works because c is in the ASCII range.
 275                         // We could not use iswctype() which is designed for a 2byte
 276                         // whar_t without encoding conversion otherwise.
 277                         // Generically, 15 (instead of 10) since we don't know the numerical
 278                         // encoding of the various categories in /usr/include/ctype.h.
 279                         const size_t bitmasksize = 15;
 280                         mask m = 0;
 281                         for (size_t bitcur = 0; bitcur <= bitmasksize; ++bitcur)
 282                                 if (iswctype(static_cast<int>(*lo), M_wmask[bitcur]))
 283                                         m |= M_bit[bitcur];
 284                         *vec = m;
 285                 }
 286                 return hi;
 287         }
 288         char_type const * do_scan_is(mask m, char_type const * lo, char_type const * hi) const
 289         {
 290                 while (lo < hi && !this->do_is(m, *lo))
 291                         ++lo;
 292                 return lo;
 293         }
 294         char_type const * do_scan_not(mask m, char_type const * lo, char_type const * hi) const
 295         {
 296                 while (lo < hi && this->do_is(m, *lo) != 0)
 297                         ++lo;
 298                 return lo;
 299         }
 300         char_type do_widen(char c) const
 301         {
 302                 if (static_cast<unsigned char>(c) < 0x80)
 303                         return c;
 304                 throw ctype_failure();
 305         }
 306         const char* do_widen(const char* lo, const char* hi, char_type* dest) const
 307         {
 308                 while (lo < hi) {
 309                         if (static_cast<unsigned char>(*lo) >= 0x80)
 310                                 throw ctype_failure();
 311                         *dest = *lo;
 312                         ++lo;
 313                         ++dest;
 314                 }
 315                 return hi;
 316         }
 317         char do_narrow(char_type wc, char) const
 318         {
 319                 if (wc < 0x80)
 320                         return static_cast<char>(wc);
 321                 throw ctype_failure();
 322         }
 323         const char_type * do_narrow(const char_type * lo, const char_type * hi, char, char * dest) const
 324         {
 325                 while (lo < hi) {
 326                         if (*lo < 0x80)
 327                                 *dest = static_cast<char>(*lo);
 328                         else
 329                                 throw ctype_failure();
 330                         ++lo;
 331                         ++dest;
 332                 }
 333                 return hi;
 334         }
 335 };
 336
 337
 338 /// class to add our ascii_ctype_facet to the global locale
 339 class locale_initializer {
 340 public:
 341         locale_initializer()
 342         {
 343                 std::locale global;
 344                 std::locale const loc(global, new ascii_ctype_facet);
 345                 std::locale::global(loc);
 346         }
 347 };
 348
 349
 350 namespace {
 351
 352 /// make sure that our ascii_ctype_facet gets used
 353 static locale_initializer initializer;
 354
 355 }
 356 }
 357 #endif