src/support/docstring.C

   1 /**
   2  * \file docstring.C
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Georg Baum
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  */
  10
  11 #include <config.h>
  12
  13 #include "docstring.h"
  14 #include "unicode.h"
  15
  16 #include <locale>
  17 #include <iostream>
  18
  19 #include <boost/assert.hpp>
  20
  21
  22 namespace lyx {
  23
  24
  25 docstring const from_ascii(char const * ascii)
  26 {
  27         docstring s;
  28         for (char const * c = ascii; *c; ++c) {
  29                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
  30                 s.push_back(*c);
  31         }
  32         return s;
  33 }
  34
  35
  36 docstring const from_ascii(std::string const & ascii)
  37 {
  38         int const len = ascii.length();
  39         for (int i = 0; i < len; ++i)
  40                 BOOST_ASSERT(static_cast<unsigned char>(ascii[i]) < 0x80);
  41         return docstring(ascii.begin(), ascii.end());
  42 }
  43
  44
  45 std::string const to_ascii(docstring const & ucs4)
  46 {
  47         int const len = ucs4.length();
  48         std::string ascii;
  49         ascii.resize(len);
  50         for (int i = 0; i < len; ++i) {
  51                 BOOST_ASSERT(ucs4[i] < 0x80);
  52                 ascii[i] = static_cast<char>(ucs4[i]);
  53         }
  54         return ascii;
  55 }
  56
  57
  58 void utf8_to_ucs4(std::string const & utf8, docstring & ucs4)
  59 {
  60         // FIXME (Abdel 17/11/06): static data are evil!
  61         // This function cannot be used in the final exit process on Mac because
  62         // static data are already destroyed at this stage.
  63         // One solution would be to instantiate the utf8 to ucs4 IconvProcessor as a
  64         // singleton inside the LyX main class to ensure that it does not get
  65         // destroyed too early.
  66         static IconvProcessor iconv(ucs4_codeset, "UTF-8");
  67
  68         size_t n = utf8.size();
  69         // as utf8 is a multi-byte encoding, there would be at most
  70         // n characters:
  71         ucs4.resize(n);
  72         if (n == 0)
  73                 return;
  74
  75         int maxoutsize = n * 4;
  76         // basic_string::data() is not recognized by some old gcc version
  77         // so we use &(ucs4[0]) instead.
  78         char * outbuf = (char *)(&(ucs4[0]));
  79         int bytes = iconv.convert(utf8.c_str(), n, outbuf, maxoutsize);
  80
  81         // adjust to the real converted size
  82         ucs4.resize(bytes/4);
  83 }
  84
  85
  86 docstring const from_utf8(std::string const & utf8)
  87 {
  88         docstring ucs4;
  89         utf8_to_ucs4(utf8, ucs4);
  90         return ucs4;
  91 }
  92
  93
  94 std::string const to_utf8(docstring const & ucs4)
  95 {
  96         std::vector<char> const utf8 =
  97                 ucs4_to_utf8(ucs4.data(), ucs4.size());
  98         return std::string(utf8.begin(), utf8.end());
  99 }
 100
 101
 102 bool operator==(lyx::docstring const & l, char const * r)
 103 {
 104         int const len = l.length();
 105         for (int i = 0; i < len; ++i) {
 106                 BOOST_ASSERT(static_cast<unsigned char>(r[i]) < 0x80);
 107                 if (!r[i])
 108                         return false;
 109                 if (l[i] != lyx::docstring::value_type(r[i]))
 110                         return false;
 111         }
 112         return r[len] == '\0';
 113 }
 114
 115
 116 lyx::docstring operator+(lyx::docstring const & l, char const * r)
 117 {
 118         lyx::docstring s(l);
 119         for (char const * c = r; *c; ++c) {
 120                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 121                 s.push_back(*c);
 122         }
 123         return s;
 124 }
 125
 126
 127 lyx::docstring operator+(char const * l, lyx::docstring const & r)
 128 {
 129         lyx::docstring s;
 130         for (char const * c = l; *c; ++c) {
 131                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 132                 s.push_back(*c);
 133         }
 134         s += r;
 135         return s;
 136 }
 137
 138
 139 lyx::docstring operator+(lyx::docstring const & l, char r)
 140 {
 141         BOOST_ASSERT(static_cast<unsigned char>(r) < 0x80);
 142         return l + lyx::docstring::value_type(r);
 143 }
 144
 145
 146 lyx::docstring operator+(char l, lyx::docstring const & r)
 147 {
 148         BOOST_ASSERT(static_cast<unsigned char>(l) < 0x80);
 149         return lyx::docstring::value_type(l) + r;
 150 }
 151
 152
 153 lyx::docstring & operator+=(lyx::docstring & l, char const * r)
 154 {
 155         for (char const * c = r; *c; ++c) {
 156                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 157                 l.push_back(*c);
 158         }
 159         return l;
 160 }
 161
 162
 163 lyx::docstring & operator+=(lyx::docstring & l, char r)
 164 {
 165         BOOST_ASSERT(static_cast<unsigned char>(r) < 0x80);
 166         l.push_back(r);
 167         return l;
 168 }
 169
 170 } // namespace lyx
 171
 172 #if (!defined(HAVE_WCHAR_T) || SIZEOF_WCHAR_T != 4) && defined(__GNUC__)
 173
 174 // gcc does not have proper locale facets for lyx::char_type if
 175 // sizeof(wchar_t) == 2, so we have to implement them on our own.
 176
 177
// We get undefined references to these virtual methods. This looks like
// a bug in gcc. The implementations here do not do anything useful, since
// they are overridden in ascii_ctype_facet.
 181 namespace std {
 182 template<> ctype<lyx::char_type>::~ctype() {}
 183 template<> bool
 184 ctype<lyx::char_type>::do_is(ctype<lyx::char_type>::mask, lyx::char_type) const { return false; }
 185 template<> lyx::char_type const *
 186 ctype<lyx::char_type>::do_is(const lyx::char_type *, const lyx::char_type *, ctype<lyx::char_type>::mask *) const { return 0; }
 187 template<> const lyx::char_type *
 188 ctype<lyx::char_type>::do_scan_is(ctype<lyx::char_type>::mask, const lyx::char_type *, const lyx::char_type *) const { return 0; }
 189 template<> const lyx::char_type *
 190 ctype<lyx::char_type>::do_scan_not(ctype<lyx::char_type>::mask, const lyx::char_type *, const lyx::char_type *) const { return 0; }
 191 template<> lyx::char_type ctype<lyx::char_type>::do_toupper(lyx::char_type) const { return 0; }
 192 template<> const lyx::char_type * ctype<lyx::char_type>::do_toupper(lyx::char_type *, lyx::char_type const *) const { return 0; }
 193 template<> lyx::char_type ctype<lyx::char_type>::do_tolower(lyx::char_type) const { return 0; }
 194 template<> const lyx::char_type * ctype<lyx::char_type>::do_tolower(lyx::char_type *, lyx::char_type const *) const { return 0; }
 195 template<> lyx::char_type ctype<lyx::char_type>::do_widen(char) const { return 0; }
 196 template<> const char *
 197 ctype<lyx::char_type>::do_widen(const char *, const char *, lyx::char_type *) const { return 0; }
 198 template<> char
 199 ctype<lyx::char_type>::do_narrow(const lyx::char_type, char) const { return 0; }
 200 template<> const lyx::char_type *
 201 ctype<lyx::char_type>::do_narrow(const lyx::char_type *, const lyx::char_type *, char, char *) const { return 0; }
 202 }
 203
 204
 205 namespace lyx {
 206
 207 class ctype_failure : public std::bad_cast {
 208 public:
 209         ctype_failure() throw() : std::bad_cast() {}
 210         virtual ~ctype_failure() throw() {}
 211         virtual const char* what() const throw()
 212         {
 213                 return "The ctype<lyx::char_type> locale facet does only support ASCII characters on this platform.";
 214         }
 215 };
 216
 217
 218 class num_put_failure : public std::bad_cast {
 219 public:
 220         num_put_failure() throw() : std::bad_cast() {}
 221         virtual ~num_put_failure() throw() {}
 222         virtual const char* what() const throw()
 223         {
 224                 return "The num_put locale facet does only support ASCII characters on this platform.";
 225         }
 226 };
 227
 228
 229 /// ctype facet for UCS4 characters. The implementation does only support pure
 230 /// ASCII, since we do not need anything else for now.
 231 /// The code is partly stolen from std::ctype<wchar_t> from gcc.
 232 class ascii_ctype_facet : public std::ctype<lyx::char_type>
 233 {
 234 public:
 235         typedef lyx::char_type char_type;
 236         typedef wctype_t wmask_type;
 237         explicit ascii_ctype_facet(size_t refs = 0) : std::ctype<char_type>(refs)
 238         {
 239                 M_initialize_ctype();
 240         }
 241 protected:
 242         bool       M_narrow_ok;
 243         char       M_narrow[128];
 244         wint_t     M_widen[1 + static_cast<unsigned char>(-1)];
 245         mask       M_bit[16];
 246         wmask_type M_wmask[16];
 247         wmask_type M_convert_to_wmask(const mask m) const
 248         {
 249                 wmask_type ret;
 250                 switch (m) {
 251                         case space:  ret = wctype("space");  break;
 252                         case print:  ret = wctype("print");  break;
 253                         case cntrl:  ret = wctype("cntrl");  break;
 254                         case upper:  ret = wctype("upper");  break;
 255                         case lower:  ret = wctype("lower");  break;
 256                         case alpha:  ret = wctype("alpha");  break;
 257                         case digit:  ret = wctype("digit");  break;
 258                         case punct:  ret = wctype("punct");  break;
 259                         case xdigit: ret = wctype("xdigit"); break;
 260                         case alnum:  ret = wctype("alnum");  break;
 261                         case graph:  ret = wctype("graph");  break;
 262                         default:     ret = wmask_type();
 263                 }
 264                 return ret;
 265         }
 266         void M_initialize_ctype()
 267         {
 268                 wint_t i;
 269                 for (i = 0; i < 128; ++i) {
 270                         const int c = wctob(i);
 271                         if (c == EOF)
 272                                 break;
 273                         else
 274                                 M_narrow[i] = static_cast<char>(c);
 275                 }
 276                 if (i == 128)
 277                         M_narrow_ok = true;
 278                 else
 279                         M_narrow_ok = false;
 280                 for (size_t i = 0; i < sizeof(M_widen) / sizeof(wint_t); ++i)
 281                         M_widen[i] = btowc(i);
 282
 283                 for (size_t i = 0; i <= 15; ++i) {
 284                         M_bit[i] = static_cast<mask>(1 << i);
 285                         M_wmask[i] = M_convert_to_wmask(M_bit[i]);
 286                 }
 287         }
 288         virtual ~ascii_ctype_facet() {}
 289         char_type do_toupper(char_type c) const
 290         {
 291                 if (c >= 0x80)
 292                         throw ctype_failure();
 293                 return toupper(static_cast<int>(c));
 294         }
 295         char_type const * do_toupper(char_type * lo, char_type const * hi) const
 296         {
 297                 while (lo < hi) {
 298                         if (*lo >= 0x80)
 299                                 throw ctype_failure();
 300                         *lo = toupper(static_cast<int>(*lo));
 301                         ++lo;
 302                 }
 303                 return hi;
 304         }
 305         char_type do_tolower(char_type c) const
 306         {
 307                 if (c >= 0x80)
 308                         throw ctype_failure();
 309                 return tolower(c);
 310         }
 311         char_type const * do_tolower(char_type * lo, char_type const * hi) const
 312         {
 313                 while (lo < hi) {
 314                         if (*lo >= 0x80)
 315                                 throw ctype_failure();
 316                         *lo = tolower(*lo);
 317                         ++lo;
 318                 }
 319                 return hi;
 320         }
 321         bool do_is(mask m, char_type c) const
 322         {
 323                 if (c >= 0x80)
 324                         throw ctype_failure();
 325                 // The code below works because c is in the ASCII range.
 326                 // We could not use iswctype() which is designed for a 2byte
 327                 // whar_t without encoding conversion otherwise.
 328                 bool ret = false;
 329                 // Generically, 15 (instead of 10) since we don't know the numerical
 330                 // encoding of the various categories in /usr/include/ctype.h.
 331                 const size_t bitmasksize = 15;
 332                 for (size_t bitcur = 0; bitcur <= bitmasksize; ++bitcur)
 333                         if (m & M_bit[bitcur] &&
 334                             iswctype(static_cast<int>(c), M_wmask[bitcur])) {
 335                                 ret = true;
 336                                 break;
 337                         }
 338                 return ret;
 339         }
 340         char_type const * do_is(char_type const * lo, char_type const * hi, mask * vec) const
 341         {
 342                 for (;lo < hi; ++vec, ++lo) {
 343                         if (*lo >= 0x80)
 344                                 throw ctype_failure();
 345                         // The code below works because c is in the ASCII range.
 346                         // We could not use iswctype() which is designed for a 2byte
 347                         // whar_t without encoding conversion otherwise.
 348                         // Generically, 15 (instead of 10) since we don't know the numerical
 349                         // encoding of the various categories in /usr/include/ctype.h.
 350                         const size_t bitmasksize = 15;
 351                         mask m = 0;
 352                         for (size_t bitcur = 0; bitcur <= bitmasksize; ++bitcur)
 353                                 if (iswctype(static_cast<int>(*lo), M_wmask[bitcur]))
 354                                         m |= M_bit[bitcur];
 355                         *vec = m;
 356                 }
 357                 return hi;
 358         }
 359         char_type const * do_scan_is(mask m, char_type const * lo, char_type const * hi) const
 360         {
 361                 while (lo < hi && !this->do_is(m, *lo))
 362                         ++lo;
 363                 return lo;
 364         }
 365         char_type const * do_scan_not(mask m, char_type const * lo, char_type const * hi) const
 366         {
 367                 while (lo < hi && this->do_is(m, *lo) != 0)
 368                         ++lo;
 369                 return lo;
 370         }
 371         char_type do_widen(char c) const
 372         {
 373                 if (static_cast<unsigned char>(c) < 0x80)
 374                         return c;
 375                 throw ctype_failure();
 376         }
 377         const char* do_widen(const char* lo, const char* hi, char_type* dest) const
 378         {
 379                 while (lo < hi) {
 380                         if (static_cast<unsigned char>(*lo) >= 0x80)
 381                                 throw ctype_failure();
 382                         *dest = *lo;
 383                         ++lo;
 384                         ++dest;
 385                 }
 386                 return hi;
 387         }
 388         char do_narrow(char_type wc, char) const
 389         {
 390                 if (wc < 0x80)
 391                         return static_cast<char>(wc);
 392                 throw ctype_failure();
 393         }
 394         const char_type * do_narrow(const char_type * lo, const char_type * hi, char, char * dest) const
 395         {
 396                 while (lo < hi) {
 397                         if (*lo < 0x80)
 398                                 *dest = static_cast<char>(*lo);
 399                         else
 400                                 throw ctype_failure();
 401                         ++lo;
 402                         ++dest;
 403                 }
 404                 return hi;
 405         }
 406 };
 407
 408
 409 /// Facet for outputting numbers to odocstreams as ascii.
 410 /// Here we simply need defining the virtual do_put functions.
 411 class ascii_num_put_facet : public std::num_put<lyx::char_type, std::ostreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > >
 412 {
 413         typedef std::ostreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > iter_type;
 414 public:
 415         ascii_num_put_facet(size_t refs = 0) : std::num_put<lyx::char_type, iter_type>(refs) {}
 416
 417         /// Facet for converting numbers to ascii strings.
 418         class string_num_put_facet : public std::num_put<char, std::basic_string<char>::iterator>
 419         {
 420         public:
 421                 string_num_put_facet() : std::num_put<char, std::basic_string<char>::iterator>(1) {}
 422         };
 423
 424 protected:
 425         iter_type
 426         do_put(iter_type oit, std::ios_base & b, char_type fill, long v) const
 427         {
 428                 if (fill >= 0x80)
 429                         throw num_put_failure();
 430
 431                 std::string s;
 432                 // 64 is large enough
 433                 s.resize(64);
 434                 string_num_put_facet f;
 435                 std::string::const_iterator cit = s.begin();
 436                 std::string::const_iterator end =
 437                         f.put(s.begin(), b, fill, v);
 438                 for (; cit != end; ++cit, ++oit)
 439                         *oit = *cit;
 440
 441                 return oit;
 442         }
 443 };
 444
 445
 446 /// Facet for inputting ascii representations of numbers from idocstreams.
 447 /// Here we simply need defining the virtual do_get functions.
 448 class ascii_num_get_facet : public std::num_get<lyx::char_type, std::istreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > >
 449 {
 450         typedef std::istreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > iter_type;
 451 public:
 452         ascii_num_get_facet(size_t refs = 0) : std::num_get<lyx::char_type, iter_type>(refs) {}
 453
 454         /// Facet for converting ascii representation of numbers to a value.
 455         class string_num_get_facet : public std::num_get<char, std::basic_string<char>::iterator>
 456         {
 457         public:
 458                 string_num_get_facet() : std::num_get<char, std::basic_string<char>::iterator>(1) {}
 459         };
 460
 461 private:
 462         bool isNumpunct(lyx::char_type const c) const
 463         {
 464                 /// Only account for the standard numpunct "C" locale facet.
 465                 return c < 0x80 && (c == '-' || c == '+' || isdigit(c)
 466                         || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
 467                         || c == 'x' || c == 'X');
 468         }
 469
 470 protected:
 471         iter_type
 472         do_get(iter_type iit, iter_type eit, std::ios_base & b,
 473                 std::ios_base::iostate & err, long & v) const
 474         {
 475                 std::string s;
 476                 s.resize(64);
 477                 for (int i = 0; iit != eit && isNumpunct(*iit); ++i, ++iit)
 478                         s[i] = static_cast<char>(*iit);
 479                 string_num_get_facet f;
 480                 f.get(s.begin(), s.end(), b, err, v);
 481
 482                 return iit;
 483         }
 484 };
 485
 486
 487 /// class to add our facets to the global locale
 488 class locale_initializer {
 489 public:
 490         locale_initializer()
 491         {
 492                 std::locale global;
 493                 std::locale const loc1(global, new ascii_ctype_facet);
 494                 std::locale const loc2(loc1, new ascii_num_put_facet);
 495                 std::locale const loc3(loc2, new ascii_num_get_facet);
 496                 std::locale::global(loc3);
 497         }
 498 };
 499
 500
 501 namespace {
 502
 503 /// make sure that our facets get used
 504 static locale_initializer initializer;
 505
 506 }
 507 }
 508 #endif