src/support/docstring.C

   1 /**
   2  * \file docstring.C
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Georg Baum
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  */
  10
  11 #include <config.h>
  12
  13 #include "docstring.h"
  14 #include "unicode.h"
  15
  16 #include <locale>
  17
  18 #include <boost/assert.hpp>
  19
  20
  21 namespace lyx {
  22
  23
  24 docstring const from_ascii(char const * ascii)
  25 {
  26         docstring s;
  27         for (char const * c = ascii; *c; ++c) {
  28                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
  29                 s.push_back(*c);
  30         }
  31         return s;
  32 }
  33
  34
  35 docstring const from_ascii(std::string const & ascii)
  36 {
  37         int const len = ascii.length();
  38         for (int i = 0; i < len; ++i)
  39                 BOOST_ASSERT(static_cast<unsigned char>(ascii[i]) < 0x80);
  40         return docstring(ascii.begin(), ascii.end());
  41 }
  42
  43
  44 std::string const to_ascii(docstring const & ucs4)
  45 {
  46         int const len = ucs4.length();
  47         std::string ascii;
  48         ascii.resize(len);
  49         for (int i = 0; i < len; ++i) {
  50                 BOOST_ASSERT(ucs4[i] < 0x80);
  51                 ascii[i] = static_cast<char>(ucs4[i]);
  52         }
  53         return ascii;
  54 }
  55
  56
  57 void utf8_to_ucs4(std::string const & utf8, docstring & ucs4)
  58 {
  59         static IconvProcessor iconv(ucs4_codeset, "UTF-8");
  60
  61         size_t n = utf8.size();
  62         // as utf8 is a multi-byte encoding, there would be at most
  63         // n characters:
  64         ucs4.resize(n);
  65         if (n == 0)
  66                 return;
  67
  68         int maxoutsize = n * 4;
  69         int cd = -1;
  70         // basic_string::data() is not recognized by some old gcc version
  71         // so we use &(ucs4[0]) instead.
  72         char * outbuf = (char *)(&(ucs4[0]));
  73         int bytes = iconv.convert(utf8.c_str(), n, outbuf, maxoutsize);
  74
  75         // adjust to the real converted size
  76         ucs4.resize(bytes/4);
  77 }
  78
  79
  80 docstring const from_utf8(std::string const & utf8)
  81 {
  82         docstring ucs4;
  83         utf8_to_ucs4(utf8, ucs4);
  84         return ucs4;
  85 }
  86
  87
  88 std::string const to_utf8(docstring const & ucs4)
  89 {
  90         std::vector<char> const utf8 =
  91                 ucs4_to_utf8(ucs4.data(), ucs4.size());
  92         return std::string(utf8.begin(), utf8.end());
  93 }
  94
  95
  96 bool operator==(lyx::docstring const & l, char const * r)
  97 {
  98         int const len = l.length();
  99         for (int i = 0; i < len; ++i) {
 100                 BOOST_ASSERT(static_cast<unsigned char>(r[i]) < 0x80);
 101                 if (!r[i])
 102                         return false;
 103                 if (l[i] != lyx::docstring::value_type(r[i]))
 104                         return false;
 105         }
 106         return r[len] == '\0';
 107 }
 108
 109
 110 lyx::docstring operator+(lyx::docstring const & l, char const * r)
 111 {
 112         lyx::docstring s(l);
 113         for (char const * c = r; *c; ++c) {
 114                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 115                 s.push_back(*c);
 116         }
 117         return s;
 118 }
 119
 120
 121 lyx::docstring operator+(char const * l, lyx::docstring const & r)
 122 {
 123         lyx::docstring s;
 124         for (char const * c = l; *c; ++c) {
 125                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 126                 s.push_back(*c);
 127         }
 128         s += r;
 129         return s;
 130 }
 131
 132
 133 lyx::docstring operator+(lyx::docstring const & l, char r)
 134 {
 135         BOOST_ASSERT(static_cast<unsigned char>(r) < 0x80);
 136         return l + lyx::docstring::value_type(r);
 137 }
 138
 139
 140 lyx::docstring operator+(char l, lyx::docstring const & r)
 141 {
 142         BOOST_ASSERT(static_cast<unsigned char>(l) < 0x80);
 143         return lyx::docstring::value_type(l) + r;
 144 }
 145
 146
 147 lyx::docstring & operator+=(lyx::docstring & l, char const * r)
 148 {
 149         for (char const * c = r; *c; ++c) {
 150                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 151                 l.push_back(*c);
 152         }
 153         return l;
 154 }
 155
 156
 157 lyx::docstring & operator+=(lyx::docstring & l, char r)
 158 {
 159         BOOST_ASSERT(static_cast<unsigned char>(r) < 0x80);
 160         l.push_back(r);
 161         return l;
 162 }
 163
 164 } // namespace lyx
 165
 166 #if (!defined(HAVE_WCHAR_T) || SIZEOF_WCHAR_T != 4) && defined(__GNUC__)
 167
 168 // gcc does not have proper locale facets for lyx::char_type if
 169 // sizeof(wchar_t) == 2, so we have to implement them on our own.
 170
 171
 172 // We get undefined references to these virtual methods. This looks like
 173 // a bug in gcc. The implementation here does not do anything useful, since
 174 // it is overriden in ascii_ctype_facet.
 175 namespace std {
 176 template<> ctype<lyx::char_type>::~ctype() {}
 177 template<> bool
 178 ctype<lyx::char_type>::do_is(ctype<lyx::char_type>::mask, lyx::char_type) const { return false; }
 179 template<> lyx::char_type const *
 180 ctype<lyx::char_type>::do_is(const lyx::char_type *, const lyx::char_type *, ctype<lyx::char_type>::mask *) const { return 0; }
 181 template<> const lyx::char_type *
 182 ctype<lyx::char_type>::do_scan_is(ctype<lyx::char_type>::mask, const lyx::char_type *, const lyx::char_type *) const { return 0; }
 183 template<> const lyx::char_type *
 184 ctype<lyx::char_type>::do_scan_not(ctype<lyx::char_type>::mask, const lyx::char_type *, const lyx::char_type *) const { return 0; }
 185 template<> lyx::char_type ctype<lyx::char_type>::do_toupper(lyx::char_type) const { return 0; }
 186 template<> const lyx::char_type * ctype<lyx::char_type>::do_toupper(lyx::char_type *, lyx::char_type const *) const { return 0; }
 187 template<> lyx::char_type ctype<lyx::char_type>::do_tolower(lyx::char_type) const { return 0; }
 188 template<> const lyx::char_type * ctype<lyx::char_type>::do_tolower(lyx::char_type *, lyx::char_type const *) const { return 0; }
 189 template<> lyx::char_type ctype<lyx::char_type>::do_widen(char) const { return 0; }
 190 template<> const char *
 191 ctype<lyx::char_type>::do_widen(const char *, const char *, lyx::char_type *) const { return 0; }
 192 template<> char
 193 ctype<lyx::char_type>::do_narrow(const lyx::char_type, char) const { return 0; }
 194 template<> const lyx::char_type *
 195 ctype<lyx::char_type>::do_narrow(const lyx::char_type *, const lyx::char_type *, char, char *) const { return 0; }
 196 }
 197
 198
 199 namespace lyx {
 200
 201 class ctype_failure : public std::bad_cast {
 202 public:
 203         ctype_failure() throw() : std::bad_cast() {}
 204         virtual ~ctype_failure() throw() {}
 205         virtual const char* what() const throw()
 206         {
 207                 return "The ctype<lyx::char_type> locale facet does only support ASCII characters on this platform.";
 208         }
 209 };
 210
 211
 212 /// ctype facet for UCS4 characters. The implementation does only support pure
 213 /// ASCII, since we do not need anything else for now.
 214 /// The code is partly stolen from std::ctype<wchar_t> from gcc.
 215 class ascii_ctype_facet : public std::ctype<lyx::char_type>
 216 {
 217 public:
 218         typedef lyx::char_type char_type;
 219         typedef wctype_t wmask_type;
 220         explicit ascii_ctype_facet(size_t refs = 0) : std::ctype<char_type>(refs)
 221         {
 222                 M_initialize_ctype();
 223         }
 224 protected:
 225         bool       M_narrow_ok;
 226         char       M_narrow[128];
 227         wint_t     M_widen[1 + static_cast<unsigned char>(-1)];
 228         mask       M_bit[16];
 229         wmask_type M_wmask[16];
 230         wmask_type M_convert_to_wmask(const mask m) const
 231         {
 232                 wmask_type ret;
 233                 switch (m) {
 234                         case space:  ret = wctype("space");  break;
 235                         case print:  ret = wctype("print");  break;
 236                         case cntrl:  ret = wctype("cntrl");  break;
 237                         case upper:  ret = wctype("upper");  break;
 238                         case lower:  ret = wctype("lower");  break;
 239                         case alpha:  ret = wctype("alpha");  break;
 240                         case digit:  ret = wctype("digit");  break;
 241                         case punct:  ret = wctype("punct");  break;
 242                         case xdigit: ret = wctype("xdigit"); break;
 243                         case alnum:  ret = wctype("alnum");  break;
 244                         case graph:  ret = wctype("graph");  break;
 245                         default:     ret = wmask_type();
 246                 }
 247                 return ret;
 248         }
 249         void M_initialize_ctype()
 250         {
 251                 wint_t i;
 252                 for (i = 0; i < 128; ++i) {
 253                         const int c = wctob(i);
 254                         if (c == EOF)
 255                                 break;
 256                         else
 257                                 M_narrow[i] = static_cast<char>(c);
 258                 }
 259                 if (i == 128)
 260                         M_narrow_ok = true;
 261                 else
 262                         M_narrow_ok = false;
 263                 for (size_t i = 0; i < sizeof(M_widen) / sizeof(wint_t); ++i)
 264                         M_widen[i] = btowc(i);
 265
 266                 for (size_t i = 0; i <= 15; ++i) {
 267                         M_bit[i] = static_cast<mask>(1 << i);
 268                         M_wmask[i] = M_convert_to_wmask(M_bit[i]);
 269                 }
 270         }
 271         virtual ~ascii_ctype_facet() {}
 272         char_type do_toupper(char_type c) const
 273         {
 274                 if (c >= 0x80)
 275                         throw ctype_failure();
 276                 return toupper(static_cast<int>(c));
 277         }
 278         char_type const * do_toupper(char_type * lo, char_type const * hi) const
 279         {
 280                 while (lo < hi) {
 281                         if (*lo >= 0x80)
 282                                 throw ctype_failure();
 283                         *lo = toupper(static_cast<int>(*lo));
 284                         ++lo;
 285                 }
 286                 return hi;
 287         }
 288         char_type do_tolower(char_type c) const
 289         {
 290                 if (c >= 0x80)
 291                         throw ctype_failure();
 292                 return tolower(c);
 293         }
 294         char_type const * do_tolower(char_type * lo, char_type const * hi) const
 295         {
 296                 while (lo < hi) {
 297                         if (*lo >= 0x80)
 298                                 throw ctype_failure();
 299                         *lo = tolower(*lo);
 300                         ++lo;
 301                 }
 302                 return hi;
 303         }
 304         bool do_is(mask m, char_type c) const
 305         {
 306                 if (c >= 0x80)
 307                         throw ctype_failure();
 308                 // The code below works because c is in the ASCII range.
 309                 // We could not use iswctype() which is designed for a 2byte
 310                 // whar_t without encoding conversion otherwise.
 311                 bool ret = false;
 312                 // Generically, 15 (instead of 10) since we don't know the numerical
 313                 // encoding of the various categories in /usr/include/ctype.h.
 314                 const size_t bitmasksize = 15;
 315                 for (size_t bitcur = 0; bitcur <= bitmasksize; ++bitcur)
 316                         if (m & M_bit[bitcur] &&
 317                             iswctype(static_cast<int>(c), M_wmask[bitcur])) {
 318                                 ret = true;
 319                                 break;
 320                         }
 321                 return ret;
 322         }
 323         char_type const * do_is(char_type const * lo, char_type const * hi, mask * vec) const
 324         {
 325                 for (;lo < hi; ++vec, ++lo) {
 326                         if (*lo >= 0x80)
 327                                 throw ctype_failure();
 328                         // The code below works because c is in the ASCII range.
 329                         // We could not use iswctype() which is designed for a 2byte
 330                         // whar_t without encoding conversion otherwise.
 331                         // Generically, 15 (instead of 10) since we don't know the numerical
 332                         // encoding of the various categories in /usr/include/ctype.h.
 333                         const size_t bitmasksize = 15;
 334                         mask m = 0;
 335                         for (size_t bitcur = 0; bitcur <= bitmasksize; ++bitcur)
 336                                 if (iswctype(static_cast<int>(*lo), M_wmask[bitcur]))
 337                                         m |= M_bit[bitcur];
 338                         *vec = m;
 339                 }
 340                 return hi;
 341         }
 342         char_type const * do_scan_is(mask m, char_type const * lo, char_type const * hi) const
 343         {
 344                 while (lo < hi && !this->do_is(m, *lo))
 345                         ++lo;
 346                 return lo;
 347         }
 348         char_type const * do_scan_not(mask m, char_type const * lo, char_type const * hi) const
 349         {
 350                 while (lo < hi && this->do_is(m, *lo) != 0)
 351                         ++lo;
 352                 return lo;
 353         }
 354         char_type do_widen(char c) const
 355         {
 356                 if (static_cast<unsigned char>(c) < 0x80)
 357                         return c;
 358                 throw ctype_failure();
 359         }
 360         const char* do_widen(const char* lo, const char* hi, char_type* dest) const
 361         {
 362                 while (lo < hi) {
 363                         if (static_cast<unsigned char>(*lo) >= 0x80)
 364                                 throw ctype_failure();
 365                         *dest = *lo;
 366                         ++lo;
 367                         ++dest;
 368                 }
 369                 return hi;
 370         }
 371         char do_narrow(char_type wc, char) const
 372         {
 373                 if (wc < 0x80)
 374                         return static_cast<char>(wc);
 375                 throw ctype_failure();
 376         }
 377         const char_type * do_narrow(const char_type * lo, const char_type * hi, char, char * dest) const
 378         {
 379                 while (lo < hi) {
 380                         if (*lo < 0x80)
 381                                 *dest = static_cast<char>(*lo);
 382                         else
 383                                 throw ctype_failure();
 384                         ++lo;
 385                         ++dest;
 386                 }
 387                 return hi;
 388         }
 389 };
 390
 391
 392 /// class to add our ascii_ctype_facet to the global locale
 393 class locale_initializer {
 394 public:
 395         locale_initializer()
 396         {
 397                 std::locale global;
 398                 std::locale const loc(global, new ascii_ctype_facet);
 399                 std::locale::global(loc);
 400         }
 401 };
 402
 403
 404 namespace {
 405
 406 /// make sure that our ascii_ctype_facet gets used
 407 static locale_initializer initializer;
 408
 409 }
 410 }
 411 #endif