src/support/docstring.C

   1 /**
   2  * \file docstring.C
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Georg Baum
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  */
  10
  11 #include <config.h>
  12
  13 #include "docstring.h"
  14 #include "unicode.h"
  15
  16 #include <locale>
  17 #include <iostream>
  18
  19 #include <boost/assert.hpp>
  20
  21
  22 namespace lyx {
  23
  24
  25 docstring const from_ascii(char const * ascii)
  26 {
  27         docstring s;
  28         for (char const * c = ascii; *c; ++c) {
  29                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
  30                 s.push_back(*c);
  31         }
  32         return s;
  33 }
  34
  35
  36 docstring const from_ascii(std::string const & ascii)
  37 {
  38         int const len = ascii.length();
  39         for (int i = 0; i < len; ++i)
  40                 BOOST_ASSERT(static_cast<unsigned char>(ascii[i]) < 0x80);
  41         return docstring(ascii.begin(), ascii.end());
  42 }
  43
  44
  45 std::string const to_ascii(docstring const & ucs4)
  46 {
  47         int const len = ucs4.length();
  48         std::string ascii;
  49         ascii.resize(len);
  50         for (int i = 0; i < len; ++i) {
  51                 BOOST_ASSERT(ucs4[i] < 0x80);
  52                 ascii[i] = static_cast<char>(ucs4[i]);
  53         }
  54         return ascii;
  55 }
  56
  57
  58 void utf8_to_ucs4(std::string const & utf8, docstring & ucs4)
  59 {
  60         size_t n = utf8.size();
  61         // as utf8 is a multi-byte encoding, there would be at most
  62         // n characters:
  63         ucs4.resize(n);
  64         if (n == 0)
  65                 return;
  66
  67         int maxoutsize = n * 4;
  68         // basic_string::data() is not recognized by some old gcc version
  69         // so we use &(ucs4[0]) instead.
  70         char * outbuf = (char *)(&(ucs4[0]));
  71         int bytes = utf8ToUcs4().convert(utf8.c_str(), n, outbuf, maxoutsize);
  72
  73         // adjust to the real converted size
  74         ucs4.resize(bytes/4);
  75 }
  76
  77
  78 docstring const from_utf8(std::string const & utf8)
  79 {
  80         docstring ucs4;
  81         utf8_to_ucs4(utf8, ucs4);
  82         return ucs4;
  83 }
  84
  85
  86 std::string const to_utf8(docstring const & ucs4)
  87 {
  88         std::vector<char> const utf8 =
  89                 ucs4_to_utf8(ucs4.data(), ucs4.size());
  90         return std::string(utf8.begin(), utf8.end());
  91 }
  92
  93
  94 bool operator==(lyx::docstring const & l, char const * r)
  95 {
  96         int const len = l.length();
  97         for (int i = 0; i < len; ++i) {
  98                 BOOST_ASSERT(static_cast<unsigned char>(r[i]) < 0x80);
  99                 if (!r[i])
 100                         return false;
 101                 if (l[i] != lyx::docstring::value_type(r[i]))
 102                         return false;
 103         }
 104         return r[len] == '\0';
 105 }
 106
 107
 108 lyx::docstring operator+(lyx::docstring const & l, char const * r)
 109 {
 110         lyx::docstring s(l);
 111         for (char const * c = r; *c; ++c) {
 112                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 113                 s.push_back(*c);
 114         }
 115         return s;
 116 }
 117
 118
 119 lyx::docstring operator+(char const * l, lyx::docstring const & r)
 120 {
 121         lyx::docstring s;
 122         for (char const * c = l; *c; ++c) {
 123                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 124                 s.push_back(*c);
 125         }
 126         s += r;
 127         return s;
 128 }
 129
 130
 131 lyx::docstring operator+(lyx::docstring const & l, char r)
 132 {
 133         BOOST_ASSERT(static_cast<unsigned char>(r) < 0x80);
 134         return l + lyx::docstring::value_type(r);
 135 }
 136
 137
 138 lyx::docstring operator+(char l, lyx::docstring const & r)
 139 {
 140         BOOST_ASSERT(static_cast<unsigned char>(l) < 0x80);
 141         return lyx::docstring::value_type(l) + r;
 142 }
 143
 144
 145 lyx::docstring & operator+=(lyx::docstring & l, char const * r)
 146 {
 147         for (char const * c = r; *c; ++c) {
 148                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 149                 l.push_back(*c);
 150         }
 151         return l;
 152 }
 153
 154
 155 lyx::docstring & operator+=(lyx::docstring & l, char r)
 156 {
 157         BOOST_ASSERT(static_cast<unsigned char>(r) < 0x80);
 158         l.push_back(r);
 159         return l;
 160 }
 161
 162 } // namespace lyx
 163
 164 #if (!defined(HAVE_WCHAR_T) || SIZEOF_WCHAR_T != 4) && defined(__GNUC__)
 165
 166 // gcc does not have proper locale facets for lyx::char_type if
 167 // sizeof(wchar_t) == 2, so we have to implement them on our own.
 168
 169
 170 // We get undefined references to these virtual methods. This looks like
 171 // a bug in gcc. The implementation here does not do anything useful, since
 172 // it is overriden in ascii_ctype_facet.
 173 namespace std {
 174 template<> ctype<lyx::char_type>::~ctype() {}
 175 template<> bool
 176 ctype<lyx::char_type>::do_is(ctype<lyx::char_type>::mask, lyx::char_type) const { return false; }
 177 template<> lyx::char_type const *
 178 ctype<lyx::char_type>::do_is(const lyx::char_type *, const lyx::char_type *, ctype<lyx::char_type>::mask *) const { return 0; }
 179 template<> const lyx::char_type *
 180 ctype<lyx::char_type>::do_scan_is(ctype<lyx::char_type>::mask, const lyx::char_type *, const lyx::char_type *) const { return 0; }
 181 template<> const lyx::char_type *
 182 ctype<lyx::char_type>::do_scan_not(ctype<lyx::char_type>::mask, const lyx::char_type *, const lyx::char_type *) const { return 0; }
 183 template<> lyx::char_type ctype<lyx::char_type>::do_toupper(lyx::char_type) const { return 0; }
 184 template<> const lyx::char_type * ctype<lyx::char_type>::do_toupper(lyx::char_type *, lyx::char_type const *) const { return 0; }
 185 template<> lyx::char_type ctype<lyx::char_type>::do_tolower(lyx::char_type) const { return 0; }
 186 template<> const lyx::char_type * ctype<lyx::char_type>::do_tolower(lyx::char_type *, lyx::char_type const *) const { return 0; }
 187 template<> lyx::char_type ctype<lyx::char_type>::do_widen(char) const { return 0; }
 188 template<> const char *
 189 ctype<lyx::char_type>::do_widen(const char *, const char *, lyx::char_type *) const { return 0; }
 190 template<> char
 191 ctype<lyx::char_type>::do_narrow(const lyx::char_type, char) const { return 0; }
 192 template<> const lyx::char_type *
 193 ctype<lyx::char_type>::do_narrow(const lyx::char_type *, const lyx::char_type *, char, char *) const { return 0; }
 194 }
 195
 196
 197 namespace lyx {
 198
 199 class ctype_failure : public std::bad_cast {
 200 public:
 201         ctype_failure() throw() : std::bad_cast() {}
 202         virtual ~ctype_failure() throw() {}
 203         virtual const char* what() const throw()
 204         {
 205                 return "The ctype<lyx::char_type> locale facet does only support ASCII characters on this platform.";
 206         }
 207 };
 208
 209
 210 class num_put_failure : public std::bad_cast {
 211 public:
 212         num_put_failure() throw() : std::bad_cast() {}
 213         virtual ~num_put_failure() throw() {}
 214         virtual const char* what() const throw()
 215         {
 216                 return "The num_put locale facet does only support ASCII characters on this platform.";
 217         }
 218 };
 219
 220
 221 /// ctype facet for UCS4 characters. The implementation does only support pure
 222 /// ASCII, since we do not need anything else for now.
 223 /// The code is partly stolen from std::ctype<wchar_t> from gcc.
 224 class ascii_ctype_facet : public std::ctype<lyx::char_type>
 225 {
 226 public:
 227         typedef lyx::char_type char_type;
 228         typedef wctype_t wmask_type;
 229         explicit ascii_ctype_facet(size_t refs = 0) : std::ctype<char_type>(refs)
 230         {
 231                 M_initialize_ctype();
 232         }
 233 protected:
 234         bool       M_narrow_ok;
 235         char       M_narrow[128];
 236         wint_t     M_widen[1 + static_cast<unsigned char>(-1)];
 237         mask       M_bit[16];
 238         wmask_type M_wmask[16];
 239         wmask_type M_convert_to_wmask(const mask m) const
 240         {
 241                 wmask_type ret;
 242                 switch (m) {
 243                         case space:  ret = wctype("space");  break;
 244                         case print:  ret = wctype("print");  break;
 245                         case cntrl:  ret = wctype("cntrl");  break;
 246                         case upper:  ret = wctype("upper");  break;
 247                         case lower:  ret = wctype("lower");  break;
 248                         case alpha:  ret = wctype("alpha");  break;
 249                         case digit:  ret = wctype("digit");  break;
 250                         case punct:  ret = wctype("punct");  break;
 251                         case xdigit: ret = wctype("xdigit"); break;
 252                         case alnum:  ret = wctype("alnum");  break;
 253                         case graph:  ret = wctype("graph");  break;
 254                         default:     ret = wmask_type();
 255                 }
 256                 return ret;
 257         }
 258         void M_initialize_ctype()
 259         {
 260                 wint_t i;
 261                 for (i = 0; i < 128; ++i) {
 262                         const int c = wctob(i);
 263                         if (c == EOF)
 264                                 break;
 265                         else
 266                                 M_narrow[i] = static_cast<char>(c);
 267                 }
 268                 if (i == 128)
 269                         M_narrow_ok = true;
 270                 else
 271                         M_narrow_ok = false;
 272                 for (size_t i = 0; i < sizeof(M_widen) / sizeof(wint_t); ++i)
 273                         M_widen[i] = btowc(i);
 274
 275                 for (size_t i = 0; i <= 15; ++i) {
 276                         M_bit[i] = static_cast<mask>(1 << i);
 277                         M_wmask[i] = M_convert_to_wmask(M_bit[i]);
 278                 }
 279         }
 280         virtual ~ascii_ctype_facet() {}
 281         char_type do_toupper(char_type c) const
 282         {
 283                 if (c >= 0x80)
 284                         throw ctype_failure();
 285                 return toupper(static_cast<int>(c));
 286         }
 287         char_type const * do_toupper(char_type * lo, char_type const * hi) const
 288         {
 289                 while (lo < hi) {
 290                         if (*lo >= 0x80)
 291                                 throw ctype_failure();
 292                         *lo = toupper(static_cast<int>(*lo));
 293                         ++lo;
 294                 }
 295                 return hi;
 296         }
 297         char_type do_tolower(char_type c) const
 298         {
 299                 if (c >= 0x80)
 300                         throw ctype_failure();
 301                 return tolower(c);
 302         }
 303         char_type const * do_tolower(char_type * lo, char_type const * hi) const
 304         {
 305                 while (lo < hi) {
 306                         if (*lo >= 0x80)
 307                                 throw ctype_failure();
 308                         *lo = tolower(*lo);
 309                         ++lo;
 310                 }
 311                 return hi;
 312         }
 313         bool do_is(mask m, char_type c) const
 314         {
 315                 if (c >= 0x80)
 316                         throw ctype_failure();
 317                 // The code below works because c is in the ASCII range.
 318                 // We could not use iswctype() which is designed for a 2byte
 319                 // whar_t without encoding conversion otherwise.
 320                 bool ret = false;
 321                 // Generically, 15 (instead of 10) since we don't know the numerical
 322                 // encoding of the various categories in /usr/include/ctype.h.
 323                 const size_t bitmasksize = 15;
 324                 for (size_t bitcur = 0; bitcur <= bitmasksize; ++bitcur)
 325                         if (m & M_bit[bitcur] &&
 326                             iswctype(static_cast<int>(c), M_wmask[bitcur])) {
 327                                 ret = true;
 328                                 break;
 329                         }
 330                 return ret;
 331         }
 332         char_type const * do_is(char_type const * lo, char_type const * hi, mask * vec) const
 333         {
 334                 for (;lo < hi; ++vec, ++lo) {
 335                         if (*lo >= 0x80)
 336                                 throw ctype_failure();
 337                         // The code below works because c is in the ASCII range.
 338                         // We could not use iswctype() which is designed for a 2byte
 339                         // whar_t without encoding conversion otherwise.
 340                         // Generically, 15 (instead of 10) since we don't know the numerical
 341                         // encoding of the various categories in /usr/include/ctype.h.
 342                         const size_t bitmasksize = 15;
 343                         mask m = 0;
 344                         for (size_t bitcur = 0; bitcur <= bitmasksize; ++bitcur)
 345                                 if (iswctype(static_cast<int>(*lo), M_wmask[bitcur]))
 346                                         m |= M_bit[bitcur];
 347                         *vec = m;
 348                 }
 349                 return hi;
 350         }
 351         char_type const * do_scan_is(mask m, char_type const * lo, char_type const * hi) const
 352         {
 353                 while (lo < hi && !this->do_is(m, *lo))
 354                         ++lo;
 355                 return lo;
 356         }
 357         char_type const * do_scan_not(mask m, char_type const * lo, char_type const * hi) const
 358         {
 359                 while (lo < hi && this->do_is(m, *lo) != 0)
 360                         ++lo;
 361                 return lo;
 362         }
 363         char_type do_widen(char c) const
 364         {
 365                 if (static_cast<unsigned char>(c) < 0x80)
 366                         return c;
 367                 throw ctype_failure();
 368         }
 369         const char* do_widen(const char* lo, const char* hi, char_type* dest) const
 370         {
 371                 while (lo < hi) {
 372                         if (static_cast<unsigned char>(*lo) >= 0x80)
 373                                 throw ctype_failure();
 374                         *dest = *lo;
 375                         ++lo;
 376                         ++dest;
 377                 }
 378                 return hi;
 379         }
 380         char do_narrow(char_type wc, char) const
 381         {
 382                 if (wc < 0x80)
 383                         return static_cast<char>(wc);
 384                 throw ctype_failure();
 385         }
 386         const char_type * do_narrow(const char_type * lo, const char_type * hi, char, char * dest) const
 387         {
 388                 while (lo < hi) {
 389                         if (*lo < 0x80)
 390                                 *dest = static_cast<char>(*lo);
 391                         else
 392                                 throw ctype_failure();
 393                         ++lo;
 394                         ++dest;
 395                 }
 396                 return hi;
 397         }
 398 };
 399
 400
 401 /// Facet for outputting numbers to odocstreams as ascii.
 402 /// Here we simply need defining the virtual do_put functions.
 403 class ascii_num_put_facet : public std::num_put<lyx::char_type, std::ostreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > >
 404 {
 405         typedef std::ostreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > iter_type;
 406 public:
 407         ascii_num_put_facet(size_t refs = 0) : std::num_put<lyx::char_type, iter_type>(refs) {}
 408
 409         /// Facet for converting numbers to ascii strings.
 410         class string_num_put_facet : public std::num_put<char, std::basic_string<char>::iterator>
 411         {
 412         public:
 413                 string_num_put_facet() : std::num_put<char, std::basic_string<char>::iterator>(1) {}
 414         };
 415
 416 protected:
 417         iter_type
 418         do_put(iter_type oit, std::ios_base & b, char_type fill, long v) const
 419         {
 420                 if (fill >= 0x80)
 421                         throw num_put_failure();
 422
 423                 std::string s;
 424                 // 64 is large enough
 425                 s.resize(64);
 426                 string_num_put_facet f;
 427                 std::string::const_iterator cit = s.begin();
 428                 std::string::const_iterator end =
 429                         f.put(s.begin(), b, fill, v);
 430                 for (; cit != end; ++cit, ++oit)
 431                         *oit = *cit;
 432
 433                 return oit;
 434         }
 435 };
 436
 437
 438 /// Facet for inputting ascii representations of numbers from idocstreams.
 439 /// Here we simply need defining the virtual do_get functions.
 440 class ascii_num_get_facet : public std::num_get<lyx::char_type, std::istreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > >
 441 {
 442         typedef std::istreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > iter_type;
 443 public:
 444         ascii_num_get_facet(size_t refs = 0) : std::num_get<lyx::char_type, iter_type>(refs) {}
 445
 446         /// Facet for converting ascii representation of numbers to a value.
 447         class string_num_get_facet : public std::num_get<char, std::basic_string<char>::iterator>
 448         {
 449         public:
 450                 string_num_get_facet() : std::num_get<char, std::basic_string<char>::iterator>(1) {}
 451         };
 452
 453 private:
 454         bool isNumpunct(lyx::char_type const c) const
 455         {
 456                 /// Only account for the standard numpunct "C" locale facet.
 457                 return c < 0x80 && (c == '-' || c == '+' || isdigit(c)
 458                         || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
 459                         || c == 'x' || c == 'X');
 460         }
 461
 462 protected:
 463         iter_type
 464         do_get(iter_type iit, iter_type eit, std::ios_base & b,
 465                 std::ios_base::iostate & err, long & v) const
 466         {
 467                 std::string s;
 468                 s.reserve(64);
 469                 for (; iit != eit && isNumpunct(*iit); ++iit)
 470                         s += static_cast<char>(*iit);
 471                 // We add another character, not part of the numpunct facet,
 472                 // in order to avoid setting the eofbit in the stream state,
 473                 // which would prevent any further read. The space seems a
 474                 // good choice here.
 475                 s += ' ';
 476                 string_num_get_facet f;
 477                 f.get(s.begin(), s.end(), b, err, v);
 478
 479                 return iit;
 480         }
 481 };
 482
 483
 484 /// class to add our facets to the global locale
 485 class locale_initializer {
 486 public:
 487         locale_initializer()
 488         {
 489                 std::locale global;
 490                 std::locale const loc1(global, new ascii_ctype_facet);
 491                 std::locale const loc2(loc1, new ascii_num_put_facet);
 492                 std::locale const loc3(loc2, new ascii_num_get_facet);
 493                 std::locale::global(loc3);
 494         }
 495 };
 496
 497
 498 namespace {
 499
 500 /// make sure that our facets get used
 501 static locale_initializer initializer;
 502
 503 }
 504 }
 505 #endif