src/support/docstring.cpp

   1 /**
   2  * \file docstring.cpp
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Georg Baum
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  */
  10
  11 #include <config.h>
  12
  13 #include "docstring.h"
  14 #include "qstring_helpers.h"
  15 #include "unicode.h"
  16
  17 #include <locale>
  18 #include <iostream>
  19
  20 #include <QFile>
  21
  22 #include <boost/assert.hpp>
  23
  24
  25 namespace lyx {
  26
  27
  28 docstring const from_ascii(char const * ascii)
  29 {
  30         docstring s;
  31         for (char const * c = ascii; *c; ++c) {
  32                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
  33                 s.push_back(*c);
  34         }
  35         return s;
  36 }
  37
  38
  39 docstring const from_ascii(std::string const & ascii)
  40 {
  41         int const len = ascii.length();
  42         for (int i = 0; i < len; ++i)
  43                 BOOST_ASSERT(static_cast<unsigned char>(ascii[i]) < 0x80);
  44         return docstring(ascii.begin(), ascii.end());
  45 }
  46
  47
  48 std::string const to_ascii(docstring const & ucs4)
  49 {
  50         int const len = ucs4.length();
  51         std::string ascii;
  52         ascii.resize(len);
  53         for (int i = 0; i < len; ++i) {
  54                 BOOST_ASSERT(ucs4[i] < 0x80);
  55                 ascii[i] = static_cast<char>(ucs4[i]);
  56         }
  57         return ascii;
  58 }
  59
  60
  61 IconvProcessor & utf8ToUcs4()
  62 {
  63         static IconvProcessor iconv(ucs4_codeset, "UTF-8");
  64         return iconv;
  65 }
  66
  67
  68
  69 void utf8_to_ucs4(std::string const & utf8, docstring & ucs4)
  70 {
  71         size_t n = utf8.size();
  72         // as utf8 is a multi-byte encoding, there would be at most
  73         // n characters:
  74         ucs4.resize(n);
  75         if (n == 0)
  76                 return;
  77
  78         int maxoutsize = n * 4;
  79         // basic_string::data() is not recognized by some old gcc version
  80         // so we use &(ucs4[0]) instead.
  81         char * outbuf = (char *)(&(ucs4[0]));
  82         int bytes = utf8ToUcs4().convert(utf8.c_str(), n, outbuf, maxoutsize);
  83
  84         // adjust to the real converted size
  85         ucs4.resize(bytes/4);
  86 }
  87
  88
  89 docstring const from_utf8(std::string const & utf8)
  90 {
  91         docstring ucs4;
  92         utf8_to_ucs4(utf8, ucs4);
  93         return ucs4;
  94 }
  95
  96
  97 std::string const to_utf8(docstring const & ucs4)
  98 {
  99         std::vector<char> const utf8 =
 100                 ucs4_to_utf8(ucs4.data(), ucs4.size());
 101         return std::string(utf8.begin(), utf8.end());
 102 }
 103
 104
 105 docstring const from_local8bit(std::string const & s)
 106 {
 107         return qstring_to_ucs4(QString::fromLocal8Bit(s.data(), s.length()));
 108 }
 109
 110
 111 const char* to_local8bit_failure::what() const throw()
 112 {
 113         return "A string could not be converted from unicode to the local 8 bit encoding.";
 114 }
 115
 116
 117 std::string const to_local8bit(docstring const & s)
 118 {
 119         // This conversion can fail, depending on input.
 120         if (s.empty())
 121                 return std::string();
 122         QByteArray const local = toqstr(s).toLocal8Bit();
 123         if (local.size() == 0)
 124                 throw to_local8bit_failure();
 125         return std::string(local.begin(), local.end());
 126 }
 127
 128
 129 docstring const from_filesystem8bit(std::string const & s)
 130 {
 131         QByteArray const encoded(s.c_str(), s.length());
 132         return qstring_to_ucs4(QFile::decodeName(encoded));
 133 }
 134
 135
 136 std::string const to_filesystem8bit(docstring const & s)
 137 {
 138         QByteArray const encoded = QFile::encodeName(toqstr(s));
 139         return std::string(encoded.begin(), encoded.end());
 140 }
 141
 142
 143 docstring const normalize_kc(docstring const & s)
 144 {
 145         return qstring_to_ucs4(toqstr(s).normalized(QString::NormalizationForm_KC));
 146 }
 147
 148
 149 bool operator==(lyx::docstring const & l, char const * r)
 150 {
 151         lyx::docstring::const_iterator it = l.begin();
 152         lyx::docstring::const_iterator end = l.end();
 153         for (; it != end; ++it, ++r) {
 154                 BOOST_ASSERT(static_cast<unsigned char>(*r) < 0x80);
 155                 if (!*r)
 156                         return false;
 157                 if (*it != static_cast<lyx::docstring::value_type>(*r))
 158                         return false;
 159         }
 160         return *r == '\0';
 161 }
 162
 163
 164 lyx::docstring operator+(lyx::docstring const & l, char const * r)
 165 {
 166         lyx::docstring s(l);
 167         for (char const * c = r; *c; ++c) {
 168                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 169                 s.push_back(*c);
 170         }
 171         return s;
 172 }
 173
 174
 175 lyx::docstring operator+(char const * l, lyx::docstring const & r)
 176 {
 177         lyx::docstring s;
 178         for (char const * c = l; *c; ++c) {
 179                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 180                 s.push_back(*c);
 181         }
 182         s += r;
 183         return s;
 184 }
 185
 186
 187 lyx::docstring operator+(lyx::docstring const & l, char r)
 188 {
 189         BOOST_ASSERT(static_cast<unsigned char>(r) < 0x80);
 190         return l + lyx::docstring::value_type(r);
 191 }
 192
 193
 194 lyx::docstring operator+(char l, lyx::docstring const & r)
 195 {
 196         BOOST_ASSERT(static_cast<unsigned char>(l) < 0x80);
 197         return lyx::docstring::value_type(l) + r;
 198 }
 199
 200
 201 lyx::docstring & operator+=(lyx::docstring & l, char const * r)
 202 {
 203         for (char const * c = r; *c; ++c) {
 204                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 205                 l.push_back(*c);
 206         }
 207         return l;
 208 }
 209
 210
 211 lyx::docstring & operator+=(lyx::docstring & l, char r)
 212 {
 213         BOOST_ASSERT(static_cast<unsigned char>(r) < 0x80);
 214         l.push_back(r);
 215         return l;
 216 }
 217
 218 } // namespace lyx
 219
 220 #if (!defined(HAVE_WCHAR_T) || SIZEOF_WCHAR_T != 4) && defined(__GNUC__)
 221
 222 // gcc does not have proper locale facets for lyx::char_type if
 223 // sizeof(wchar_t) == 2, so we have to implement them on our own.
 224
 225
// We get undefined references to these virtual methods. This looks like
// a bug in gcc. The implementation here does not do anything useful, since
// it is overridden in ascii_ctype_facet.
// Explicit specializations of the std::ctype<lyx::char_type> destructor and
// protected virtuals. Each one is a placeholder that returns a zero, false
// or null value.
namespace std {
// Destructor: nothing to release.
template<> ctype<lyx::char_type>::~ctype() {}
// Single-character classification: always reports "does not match".
template<> bool
ctype<lyx::char_type>::do_is(ctype<lyx::char_type>::mask, lyx::char_type) const { return false; }
// Range classification: writes nothing and returns a null pointer.
template<> lyx::char_type const *
ctype<lyx::char_type>::do_is(const lyx::char_type *, const lyx::char_type *, ctype<lyx::char_type>::mask *) const { return 0; }
// Scan for first matching character: returns a null pointer.
template<> const lyx::char_type *
ctype<lyx::char_type>::do_scan_is(ctype<lyx::char_type>::mask, const lyx::char_type *, const lyx::char_type *) const { return 0; }
// Scan for first non-matching character: returns a null pointer.
template<> const lyx::char_type *
ctype<lyx::char_type>::do_scan_not(ctype<lyx::char_type>::mask, const lyx::char_type *, const lyx::char_type *) const { return 0; }
// Case conversion placeholders: return 0 / a null pointer.
template<> lyx::char_type ctype<lyx::char_type>::do_toupper(lyx::char_type) const { return 0; }
template<> const lyx::char_type * ctype<lyx::char_type>::do_toupper(lyx::char_type *, lyx::char_type const *) const { return 0; }
template<> lyx::char_type ctype<lyx::char_type>::do_tolower(lyx::char_type) const { return 0; }
template<> const lyx::char_type * ctype<lyx::char_type>::do_tolower(lyx::char_type *, lyx::char_type const *) const { return 0; }
// Widening placeholders: return 0 / a null pointer.
template<> lyx::char_type ctype<lyx::char_type>::do_widen(char) const { return 0; }
template<> const char *
ctype<lyx::char_type>::do_widen(const char *, const char *, lyx::char_type *) const { return 0; }
// Narrowing placeholders: return 0 / a null pointer.
template<> char
ctype<lyx::char_type>::do_narrow(const lyx::char_type, char) const { return 0; }
template<> const lyx::char_type *
ctype<lyx::char_type>::do_narrow(const lyx::char_type *, const lyx::char_type *, char, char *) const { return 0; }
}
 251
 252
 253 namespace lyx {
 254
 255 class ctype_failure : public std::bad_cast {
 256 public:
 257         ctype_failure() throw() : std::bad_cast() {}
 258         virtual ~ctype_failure() throw() {}
 259         virtual const char* what() const throw()
 260         {
 261                 return "The ctype<lyx::char_type> locale facet does only support ASCII characters on this platform.";
 262         }
 263 };
 264
 265
 266 class num_put_failure : public std::bad_cast {
 267 public:
 268         num_put_failure() throw() : std::bad_cast() {}
 269         virtual ~num_put_failure() throw() {}
 270         virtual const char* what() const throw()
 271         {
 272                 return "The num_put locale facet does only support ASCII characters on this platform.";
 273         }
 274 };
 275
 276
/// ctype facet for UCS4 characters. The implementation does only support pure
/// ASCII, since we do not need anything else for now.
/// The code is partly stolen from std::ctype<wchar_t> from gcc.
///
/// Every character-level operation throws ctype_failure when it is given a
/// character that is not below 0x80.
class ascii_ctype_facet : public std::ctype<lyx::char_type>
{
public:
        typedef lyx::char_type char_type;
        typedef wctype_t wmask_type;
        /// \param refs forwarded unchanged to the std::ctype base class.
        explicit ascii_ctype_facet(size_t refs = 0) : std::ctype<char_type>(refs)
        {
                M_initialize_ctype();
        }
protected:
        /// True if wctob() succeeded for every value in [0, 128).
        bool       M_narrow_ok;
        /// wctob() results for the values [0, 128); entries after a failed
        /// wctob() call are left unset. Not read by the methods of this class.
        char       M_narrow[128];
        /// btowc() results, one entry per unsigned char value. Not read by the
        /// methods of this class.
        wint_t     M_widen[1 + static_cast<unsigned char>(-1)];
        /// Single-bit masks: M_bit[i] == 1 << i.
        mask       M_bit[16];
        /// wctype() descriptor corresponding to M_bit[i], or a
        /// value-initialized wmask_type if the bit names no known category.
        wmask_type M_wmask[16];
        /// Maps a ctype mask value to the wctype() descriptor of the category
        /// with the same name. Unknown values map to wmask_type().
        wmask_type M_convert_to_wmask(const mask m) const
        {
                wmask_type ret;
                switch (m) {
                        case space:  ret = wctype("space");  break;
                        case print:  ret = wctype("print");  break;
                        case cntrl:  ret = wctype("cntrl");  break;
                        case upper:  ret = wctype("upper");  break;
                        case lower:  ret = wctype("lower");  break;
                        case alpha:  ret = wctype("alpha");  break;
                        case digit:  ret = wctype("digit");  break;
                        case punct:  ret = wctype("punct");  break;
                        case xdigit: ret = wctype("xdigit"); break;
                        case alnum:  ret = wctype("alnum");  break;
                        case graph:  ret = wctype("graph");  break;
                        default:     ret = wmask_type();
                }
                return ret;
        }
        /// Fills the M_narrow, M_narrow_ok, M_widen, M_bit and M_wmask tables.
        /// Called once from the constructor.
        void M_initialize_ctype()
        {
                wint_t i;
                // Stop at the first value that has no single-byte representation.
                for (i = 0; i < 128; ++i) {
                        const int c = wctob(i);
                        if (c == EOF)
                                break;
                        else
                                M_narrow[i] = static_cast<char>(c);
                }
                // The loop ran to completion only if every wctob() call succeeded.
                if (i == 128)
                        M_narrow_ok = true;
                else
                        M_narrow_ok = false;
                // Note: this i (and the one below) shadows the wint_t i above.
                for (size_t i = 0; i < sizeof(M_widen) / sizeof(wint_t); ++i)
                        M_widen[i] = btowc(i);

                for (size_t i = 0; i <= 15; ++i) {
                        M_bit[i] = static_cast<mask>(1 << i);
                        M_wmask[i] = M_convert_to_wmask(M_bit[i]);
                }
        }
        virtual ~ascii_ctype_facet() {}
        /// Upper-cases \p c with toupper(); throws ctype_failure if c >= 0x80.
        char_type do_toupper(char_type c) const
        {
                if (c >= 0x80)
                        throw ctype_failure();
                return toupper(static_cast<int>(c));
        }
        /// Upper-cases [lo, hi) in place and returns \p hi.
        /// Throws ctype_failure at the first character >= 0x80; characters
        /// before it have already been converted at that point.
        char_type const * do_toupper(char_type * lo, char_type const * hi) const
        {
                while (lo < hi) {
                        if (*lo >= 0x80)
                                throw ctype_failure();
                        *lo = toupper(static_cast<int>(*lo));
                        ++lo;
                }
                return hi;
        }
        /// Lower-cases \p c with tolower(); throws ctype_failure if c >= 0x80.
        char_type do_tolower(char_type c) const
        {
                if (c >= 0x80)
                        throw ctype_failure();
                return tolower(c);
        }
        /// Lower-cases [lo, hi) in place and returns \p hi.
        /// Throws ctype_failure at the first character >= 0x80; characters
        /// before it have already been converted at that point.
        char_type const * do_tolower(char_type * lo, char_type const * hi) const
        {
                while (lo < hi) {
                        if (*lo >= 0x80)
                                throw ctype_failure();
                        *lo = tolower(*lo);
                        ++lo;
                }
                return hi;
        }
        /// Returns true if \p c belongs to at least one of the categories
        /// whose bit is set in \p m. Throws ctype_failure if c >= 0x80.
        bool do_is(mask m, char_type c) const
        {
                if (c >= 0x80)
                        throw ctype_failure();
                // The code below works because c is in the ASCII range.
                // We could not use iswctype() which is designed for a 2byte
                // wchar_t without encoding conversion otherwise.
                bool ret = false;
                // Generically, 15 (instead of 10) since we don't know the numerical
                // encoding of the various categories in /usr/include/ctype.h.
                const size_t bitmasksize = 15;
                for (size_t bitcur = 0; bitcur <= bitmasksize; ++bitcur)
                        if (m & M_bit[bitcur] &&
                            iswctype(static_cast<int>(c), M_wmask[bitcur])) {
                                ret = true;
                                break;
                        }
                return ret;
        }
        /// For every character in [lo, hi) stores in the corresponding \p vec
        /// entry the union of all single-bit masks whose category matches the
        /// character. Returns \p hi. Throws ctype_failure at the first
        /// character >= 0x80.
        char_type const * do_is(char_type const * lo, char_type const * hi, mask * vec) const
        {
                for (;lo < hi; ++vec, ++lo) {
                        if (*lo >= 0x80)
                                throw ctype_failure();
                        // The code below works because c is in the ASCII range.
                        // We could not use iswctype() which is designed for a 2byte
                        // wchar_t without encoding conversion otherwise.
                        // Generically, 15 (instead of 10) since we don't know the numerical
                        // encoding of the various categories in /usr/include/ctype.h.
                        const size_t bitmasksize = 15;
                        mask m = 0;
                        for (size_t bitcur = 0; bitcur <= bitmasksize; ++bitcur)
                                if (iswctype(static_cast<int>(*lo), M_wmask[bitcur]))
                                        m |= M_bit[bitcur];
                        *vec = m;
                }
                return hi;
        }
        /// Returns a pointer to the first character in [lo, hi) for which
        /// do_is(m, c) is true, or \p hi if there is none.
        char_type const * do_scan_is(mask m, char_type const * lo, char_type const * hi) const
        {
                while (lo < hi && !this->do_is(m, *lo))
                        ++lo;
                return lo;
        }
        /// Returns a pointer to the first character in [lo, hi) for which
        /// do_is(m, c) is false, or \p hi if there is none.
        char_type const * do_scan_not(mask m, char_type const * lo, char_type const * hi) const
        {
                while (lo < hi && this->do_is(m, *lo) != 0)
                        ++lo;
                return lo;
        }
        /// Widens an ASCII char to char_type; throws ctype_failure for bytes
        /// that are not below 0x80.
        char_type do_widen(char c) const
        {
                if (static_cast<unsigned char>(c) < 0x80)
                        return c;
                throw ctype_failure();
        }
        /// Widens [lo, hi) into \p dest and returns \p hi. Throws
        /// ctype_failure at the first byte that is not below 0x80.
        const char* do_widen(const char* lo, const char* hi, char_type* dest) const
        {
                while (lo < hi) {
                        if (static_cast<unsigned char>(*lo) >= 0x80)
                                throw ctype_failure();
                        *dest = *lo;
                        ++lo;
                        ++dest;
                }
                return hi;
        }
        /// Narrows an ASCII char_type to char. The default-character argument
        /// is ignored: non-ASCII input throws ctype_failure instead.
        char do_narrow(char_type wc, char) const
        {
                if (wc < 0x80)
                        return static_cast<char>(wc);
                throw ctype_failure();
        }
        /// Narrows [lo, hi) into \p dest and returns \p hi. The
        /// default-character argument is ignored: the first character that is
        /// not below 0x80 throws ctype_failure instead.
        const char_type * do_narrow(const char_type * lo, const char_type * hi, char, char * dest) const
        {
                while (lo < hi) {
                        if (*lo < 0x80)
                                *dest = static_cast<char>(*lo);
                        else
                                throw ctype_failure();
                        ++lo;
                        ++dest;
                }
                return hi;
        }
};
 455
 456
/// Facet for outputting numbers to odocstreams as ascii.
/// Here we simply need defining the virtual do_put functions.
///
/// Every do_put overload forwards to do_put_helper(), which formats the
/// value with a char based num_put facet and copies the result to the
/// output iterator.
class ascii_num_put_facet : public std::num_put<lyx::char_type, std::ostreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > >
{
        typedef std::ostreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > iter_type;
public:
        /// \param refs forwarded unchanged to the std::num_put base class.
        ascii_num_put_facet(size_t refs = 0) : std::num_put<lyx::char_type, iter_type>(refs) {}

        /// Facet for converting numbers to ascii strings.
        /// It is constructed with refs == 1 and used as a local object in
        /// do_put_helper().
        class string_num_put_facet : public std::num_put<char, std::basic_string<char>::iterator>
        {
        public:
                string_num_put_facet() : std::num_put<char, std::basic_string<char>::iterator>(1) {}
        };

protected:
        /// Outputs a bool.
        iter_type
        do_put(iter_type oit, std::ios_base & b, char_type fill, bool v) const
        {
                return do_put_helper(oit, b, fill, v);
        }

        /// Outputs a long.
        iter_type
        do_put(iter_type oit, std::ios_base & b, char_type fill, long v) const
        {
                return do_put_helper(oit, b, fill, v);
        }

        /// Outputs an unsigned long.
        iter_type
        do_put(iter_type oit, std::ios_base & b, char_type fill, unsigned long v) const
        {
                return do_put_helper(oit, b, fill, v);
        }

#ifdef _GLIBCXX_USE_LONG_LONG
        /// Outputs a long long.
        iter_type
        do_put(iter_type oit, std::ios_base & b, char_type fill, long long v) const
        {
                return do_put_helper(oit, b, fill, v);
        }

        /// Outputs an unsigned long long.
        iter_type
        do_put(iter_type oit, std::ios_base & b, char_type fill, unsigned long long v) const
        {
                return do_put_helper(oit, b, fill, v);
        }
#endif

        /// Outputs a double.
        iter_type
        do_put(iter_type oit, std::ios_base & b, char_type fill, double v) const
        {
                return do_put_helper(oit, b, fill, v);
        }

        /// Outputs a long double.
        iter_type
        do_put(iter_type oit, std::ios_base & b, char_type fill, long double v) const
        {
                return do_put_helper(oit, b, fill, v);
        }

        /// Outputs a pointer value.
        iter_type
        do_put(iter_type oit, std::ios_base & b, char_type fill, void const * v) const
        {
                return do_put_helper(oit, b, fill, v);
        }

private:
        /// Formats \p v into a temporary std::string using
        /// string_num_put_facet and the formatting state of \p b, then copies
        /// the produced characters to \p oit.
        /// \return the output iterator after the last written character.
        /// \throws num_put_failure if \p fill is not below 0x80.
        template <typename ValueType>
        iter_type
        do_put_helper(iter_type oit, std::ios_base & b, char_type fill, ValueType v) const
        {
                if (fill >= 0x80)
                        throw num_put_failure();

                // The larger of the stream's width and precision.
                std::streamsize const sz = b.width() > b.precision() ?
                                           b.width() : b.precision();
                // 64 is large enough, unless width or precision are bigger
                // NOTE(review): put() writes through a raw string iterator
                // without bounds checking, so this relies on the output
                // never exceeding wd characters -- confirm this holds for
                // fixed-notation floating point values of large magnitude.
                std::streamsize const wd = (sz > 56 ? sz : 56) + 8;
                std::string s(wd, '\0');
                string_num_put_facet f;
                std::string::const_iterator cit = s.begin();
                // put() returns the position just past the last character it wrote.
                std::string::const_iterator end =
                        f.put(s.begin(), b, fill, v);
                for (; cit != end; ++cit, ++oit)
                        *oit = *cit;

                return oit;
        }
};
 546
 547
 548 /// Facet for inputting ascii representations of numbers from idocstreams.
 549 /// Here we simply need defining the virtual do_get functions.
 550 class ascii_num_get_facet : public std::num_get<lyx::char_type, std::istreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > >
 551 {
 552         typedef std::istreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > iter_type;
 553 public:
 554         ascii_num_get_facet(size_t refs = 0) : std::num_get<lyx::char_type, iter_type>(refs) {}
 555
 556         /// Facet for converting ascii representation of numbers to a value.
 557         class string_num_get_facet : public std::num_get<char, std::basic_string<char>::iterator>
 558         {
 559         public:
 560                 string_num_get_facet() : std::num_get<char, std::basic_string<char>::iterator>(1) {}
 561         };
 562
 563 private:
 564         bool isNumpunct(lyx::char_type const c) const
 565         {
 566                 /// Only account for the standard numpunct "C" locale facet.
 567                 return c < 0x80 && (c == '-' || c == '+' || isdigit(c)
 568                         || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
 569                         || c == 'x' || c == 'X');
 570         }
 571
 572 protected:
 573         iter_type
 574         do_get(iter_type iit, iter_type eit, std::ios_base & b,
 575                 std::ios_base::iostate & err, long & v) const
 576         {
 577                 std::string s;
 578                 s.reserve(64);
 579                 for (; iit != eit && isNumpunct(*iit); ++iit)
 580                         s += static_cast<char>(*iit);
 581                 // We add another character, not part of the numpunct facet,
 582                 // in order to avoid setting the eofbit in the stream state,
 583                 // which would prevent any further read. The space seems a
 584                 // good choice here.
 585                 s += ' ';
 586                 string_num_get_facet f;
 587                 f.get(s.begin(), s.end(), b, err, v);
 588
 589                 return iit;
 590         }
 591 };
 592
 593
 594 /// class to add our facets to the global locale
 595 class locale_initializer {
 596 public:
 597         locale_initializer()
 598         {
 599                 std::locale global;
 600                 std::locale const loc1(global, new ascii_ctype_facet);
 601                 std::locale const loc2(loc1, new ascii_num_put_facet);
 602                 std::locale const loc3(loc2, new ascii_num_get_facet);
 603                 std::locale::global(loc3);
 604         }
 605 };
 606
 607
 608 namespace {
 609
 610 /// make sure that our facets get used
 611 static locale_initializer initializer;
 612
 613 }
 614 }
 615 #endif