src/support/docstring.C

   1 /**
   2  * \file docstring.C
   3  * This file is part of LyX, the document processor.
   4  * Licence details can be found in the file COPYING.
   5  *
   6  * \author Georg Baum
   7  *
   8  * Full author contact details are available in file CREDITS.
   9  */
  10
#include <config.h>

#include "docstring.h"
#include "qstring_helpers.h"
#include "unicode.h"

#include <iostream>
#include <limits>
#include <locale>

#include <QFile>

#include <boost/assert.hpp>
  23
  24
  25 namespace lyx {
  26
  27
  28 docstring const from_ascii(char const * ascii)
  29 {
  30         docstring s;
  31         for (char const * c = ascii; *c; ++c) {
  32                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
  33                 s.push_back(*c);
  34         }
  35         return s;
  36 }
  37
  38
  39 docstring const from_ascii(std::string const & ascii)
  40 {
  41         int const len = ascii.length();
  42         for (int i = 0; i < len; ++i)
  43                 BOOST_ASSERT(static_cast<unsigned char>(ascii[i]) < 0x80);
  44         return docstring(ascii.begin(), ascii.end());
  45 }
  46
  47
  48 std::string const to_ascii(docstring const & ucs4)
  49 {
  50         int const len = ucs4.length();
  51         std::string ascii;
  52         ascii.resize(len);
  53         for (int i = 0; i < len; ++i) {
  54                 BOOST_ASSERT(ucs4[i] < 0x80);
  55                 ascii[i] = static_cast<char>(ucs4[i]);
  56         }
  57         return ascii;
  58 }
  59
  60
  61 void utf8_to_ucs4(std::string const & utf8, docstring & ucs4)
  62 {
  63         size_t n = utf8.size();
  64         // as utf8 is a multi-byte encoding, there would be at most
  65         // n characters:
  66         ucs4.resize(n);
  67         if (n == 0)
  68                 return;
  69
  70         int maxoutsize = n * 4;
  71         // basic_string::data() is not recognized by some old gcc version
  72         // so we use &(ucs4[0]) instead.
  73         char * outbuf = (char *)(&(ucs4[0]));
  74         int bytes = utf8ToUcs4().convert(utf8.c_str(), n, outbuf, maxoutsize);
  75
  76         // adjust to the real converted size
  77         ucs4.resize(bytes/4);
  78 }
  79
  80
  81 docstring const from_utf8(std::string const & utf8)
  82 {
  83         docstring ucs4;
  84         utf8_to_ucs4(utf8, ucs4);
  85         return ucs4;
  86 }
  87
  88
  89 std::string const to_utf8(docstring const & ucs4)
  90 {
  91         std::vector<char> const utf8 =
  92                 ucs4_to_utf8(ucs4.data(), ucs4.size());
  93         return std::string(utf8.begin(), utf8.end());
  94 }
  95
  96
  97 docstring const from_local8bit(std::string const & s)
  98 {
  99         return qstring_to_ucs4(QString::fromLocal8Bit(s.data(), s.length()));
 100 }
 101
 102
 103 const char* to_local8bit_failure::what() const throw()
 104 {
 105         return "A string could not be converted from unicode to the local 8 bit encoding.";
 106 }
 107
 108
 109 std::string const to_local8bit(docstring const & s)
 110 {
 111         // This conversion can fail, depending on input.
 112         if (s.empty())
 113                 return std::string();
 114         QByteArray const local = toqstr(s).toLocal8Bit();
 115         if (local.size() == 0)
 116                 throw to_local8bit_failure();
 117         return std::string(local.begin(), local.end());
 118 }
 119
 120
 121 docstring const from_filesystem8bit(std::string const & s)
 122 {
 123         QByteArray const encoded(s.c_str(), s.length());
 124         return qstring_to_ucs4(QFile::decodeName(encoded));
 125 }
 126
 127
 128 std::string const to_filesystem8bit(docstring const & s)
 129 {
 130         QByteArray const encoded = QFile::encodeName(toqstr(s));
 131         return std::string(encoded.begin(), encoded.end());
 132 }
 133
 134
 135 bool operator==(lyx::docstring const & l, char const * r)
 136 {
 137         int const len = l.length();
 138         for (int i = 0; i < len; ++i) {
 139                 BOOST_ASSERT(static_cast<unsigned char>(r[i]) < 0x80);
 140                 if (!r[i])
 141                         return false;
 142                 if (l[i] != lyx::docstring::value_type(r[i]))
 143                         return false;
 144         }
 145         return r[len] == '\0';
 146 }
 147
 148
 149 lyx::docstring operator+(lyx::docstring const & l, char const * r)
 150 {
 151         lyx::docstring s(l);
 152         for (char const * c = r; *c; ++c) {
 153                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 154                 s.push_back(*c);
 155         }
 156         return s;
 157 }
 158
 159
 160 lyx::docstring operator+(char const * l, lyx::docstring const & r)
 161 {
 162         lyx::docstring s;
 163         for (char const * c = l; *c; ++c) {
 164                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 165                 s.push_back(*c);
 166         }
 167         s += r;
 168         return s;
 169 }
 170
 171
 172 lyx::docstring operator+(lyx::docstring const & l, char r)
 173 {
 174         BOOST_ASSERT(static_cast<unsigned char>(r) < 0x80);
 175         return l + lyx::docstring::value_type(r);
 176 }
 177
 178
 179 lyx::docstring operator+(char l, lyx::docstring const & r)
 180 {
 181         BOOST_ASSERT(static_cast<unsigned char>(l) < 0x80);
 182         return lyx::docstring::value_type(l) + r;
 183 }
 184
 185
 186 lyx::docstring & operator+=(lyx::docstring & l, char const * r)
 187 {
 188         for (char const * c = r; *c; ++c) {
 189                 BOOST_ASSERT(static_cast<unsigned char>(*c) < 0x80);
 190                 l.push_back(*c);
 191         }
 192         return l;
 193 }
 194
 195
 196 lyx::docstring & operator+=(lyx::docstring & l, char r)
 197 {
 198         BOOST_ASSERT(static_cast<unsigned char>(r) < 0x80);
 199         l.push_back(r);
 200         return l;
 201 }
 202
 203 } // namespace lyx
 204
 205 #if (!defined(HAVE_WCHAR_T) || SIZEOF_WCHAR_T != 4) && defined(__GNUC__)
 206
 207 // gcc does not have proper locale facets for lyx::char_type if
 208 // sizeof(wchar_t) == 2, so we have to implement them on our own.
 209
 210
 211 // We get undefined references to these virtual methods. This looks like
 212 // a bug in gcc. The implementation here does not do anything useful, since
 213 // it is overriden in ascii_ctype_facet.
 214 namespace std {
 215 template<> ctype<lyx::char_type>::~ctype() {}
 216 template<> bool
 217 ctype<lyx::char_type>::do_is(ctype<lyx::char_type>::mask, lyx::char_type) const { return false; }
 218 template<> lyx::char_type const *
 219 ctype<lyx::char_type>::do_is(const lyx::char_type *, const lyx::char_type *, ctype<lyx::char_type>::mask *) const { return 0; }
 220 template<> const lyx::char_type *
 221 ctype<lyx::char_type>::do_scan_is(ctype<lyx::char_type>::mask, const lyx::char_type *, const lyx::char_type *) const { return 0; }
 222 template<> const lyx::char_type *
 223 ctype<lyx::char_type>::do_scan_not(ctype<lyx::char_type>::mask, const lyx::char_type *, const lyx::char_type *) const { return 0; }
 224 template<> lyx::char_type ctype<lyx::char_type>::do_toupper(lyx::char_type) const { return 0; }
 225 template<> const lyx::char_type * ctype<lyx::char_type>::do_toupper(lyx::char_type *, lyx::char_type const *) const { return 0; }
 226 template<> lyx::char_type ctype<lyx::char_type>::do_tolower(lyx::char_type) const { return 0; }
 227 template<> const lyx::char_type * ctype<lyx::char_type>::do_tolower(lyx::char_type *, lyx::char_type const *) const { return 0; }
 228 template<> lyx::char_type ctype<lyx::char_type>::do_widen(char) const { return 0; }
 229 template<> const char *
 230 ctype<lyx::char_type>::do_widen(const char *, const char *, lyx::char_type *) const { return 0; }
 231 template<> char
 232 ctype<lyx::char_type>::do_narrow(const lyx::char_type, char) const { return 0; }
 233 template<> const lyx::char_type *
 234 ctype<lyx::char_type>::do_narrow(const lyx::char_type *, const lyx::char_type *, char, char *) const { return 0; }
 235 }
 236
 237
 238 namespace lyx {
 239
 240 class ctype_failure : public std::bad_cast {
 241 public:
 242         ctype_failure() throw() : std::bad_cast() {}
 243         virtual ~ctype_failure() throw() {}
 244         virtual const char* what() const throw()
 245         {
 246                 return "The ctype<lyx::char_type> locale facet does only support ASCII characters on this platform.";
 247         }
 248 };
 249
 250
 251 class num_put_failure : public std::bad_cast {
 252 public:
 253         num_put_failure() throw() : std::bad_cast() {}
 254         virtual ~num_put_failure() throw() {}
 255         virtual const char* what() const throw()
 256         {
 257                 return "The num_put locale facet does only support ASCII characters on this platform.";
 258         }
 259 };
 260
 261
 262 /// ctype facet for UCS4 characters. The implementation does only support pure
 263 /// ASCII, since we do not need anything else for now.
 264 /// The code is partly stolen from std::ctype<wchar_t> from gcc.
 265 class ascii_ctype_facet : public std::ctype<lyx::char_type>
 266 {
 267 public:
 268         typedef lyx::char_type char_type;
 269         typedef wctype_t wmask_type;
 270         explicit ascii_ctype_facet(size_t refs = 0) : std::ctype<char_type>(refs)
 271         {
 272                 M_initialize_ctype();
 273         }
 274 protected:
 275         bool       M_narrow_ok;
 276         char       M_narrow[128];
 277         wint_t     M_widen[1 + static_cast<unsigned char>(-1)];
 278         mask       M_bit[16];
 279         wmask_type M_wmask[16];
 280         wmask_type M_convert_to_wmask(const mask m) const
 281         {
 282                 wmask_type ret;
 283                 switch (m) {
 284                         case space:  ret = wctype("space");  break;
 285                         case print:  ret = wctype("print");  break;
 286                         case cntrl:  ret = wctype("cntrl");  break;
 287                         case upper:  ret = wctype("upper");  break;
 288                         case lower:  ret = wctype("lower");  break;
 289                         case alpha:  ret = wctype("alpha");  break;
 290                         case digit:  ret = wctype("digit");  break;
 291                         case punct:  ret = wctype("punct");  break;
 292                         case xdigit: ret = wctype("xdigit"); break;
 293                         case alnum:  ret = wctype("alnum");  break;
 294                         case graph:  ret = wctype("graph");  break;
 295                         default:     ret = wmask_type();
 296                 }
 297                 return ret;
 298         }
 299         void M_initialize_ctype()
 300         {
 301                 wint_t i;
 302                 for (i = 0; i < 128; ++i) {
 303                         const int c = wctob(i);
 304                         if (c == EOF)
 305                                 break;
 306                         else
 307                                 M_narrow[i] = static_cast<char>(c);
 308                 }
 309                 if (i == 128)
 310                         M_narrow_ok = true;
 311                 else
 312                         M_narrow_ok = false;
 313                 for (size_t i = 0; i < sizeof(M_widen) / sizeof(wint_t); ++i)
 314                         M_widen[i] = btowc(i);
 315
 316                 for (size_t i = 0; i <= 15; ++i) {
 317                         M_bit[i] = static_cast<mask>(1 << i);
 318                         M_wmask[i] = M_convert_to_wmask(M_bit[i]);
 319                 }
 320         }
 321         virtual ~ascii_ctype_facet() {}
 322         char_type do_toupper(char_type c) const
 323         {
 324                 if (c >= 0x80)
 325                         throw ctype_failure();
 326                 return toupper(static_cast<int>(c));
 327         }
 328         char_type const * do_toupper(char_type * lo, char_type const * hi) const
 329         {
 330                 while (lo < hi) {
 331                         if (*lo >= 0x80)
 332                                 throw ctype_failure();
 333                         *lo = toupper(static_cast<int>(*lo));
 334                         ++lo;
 335                 }
 336                 return hi;
 337         }
 338         char_type do_tolower(char_type c) const
 339         {
 340                 if (c >= 0x80)
 341                         throw ctype_failure();
 342                 return tolower(c);
 343         }
 344         char_type const * do_tolower(char_type * lo, char_type const * hi) const
 345         {
 346                 while (lo < hi) {
 347                         if (*lo >= 0x80)
 348                                 throw ctype_failure();
 349                         *lo = tolower(*lo);
 350                         ++lo;
 351                 }
 352                 return hi;
 353         }
 354         bool do_is(mask m, char_type c) const
 355         {
 356                 if (c >= 0x80)
 357                         throw ctype_failure();
 358                 // The code below works because c is in the ASCII range.
 359                 // We could not use iswctype() which is designed for a 2byte
 360                 // whar_t without encoding conversion otherwise.
 361                 bool ret = false;
 362                 // Generically, 15 (instead of 10) since we don't know the numerical
 363                 // encoding of the various categories in /usr/include/ctype.h.
 364                 const size_t bitmasksize = 15;
 365                 for (size_t bitcur = 0; bitcur <= bitmasksize; ++bitcur)
 366                         if (m & M_bit[bitcur] &&
 367                             iswctype(static_cast<int>(c), M_wmask[bitcur])) {
 368                                 ret = true;
 369                                 break;
 370                         }
 371                 return ret;
 372         }
 373         char_type const * do_is(char_type const * lo, char_type const * hi, mask * vec) const
 374         {
 375                 for (;lo < hi; ++vec, ++lo) {
 376                         if (*lo >= 0x80)
 377                                 throw ctype_failure();
 378                         // The code below works because c is in the ASCII range.
 379                         // We could not use iswctype() which is designed for a 2byte
 380                         // whar_t without encoding conversion otherwise.
 381                         // Generically, 15 (instead of 10) since we don't know the numerical
 382                         // encoding of the various categories in /usr/include/ctype.h.
 383                         const size_t bitmasksize = 15;
 384                         mask m = 0;
 385                         for (size_t bitcur = 0; bitcur <= bitmasksize; ++bitcur)
 386                                 if (iswctype(static_cast<int>(*lo), M_wmask[bitcur]))
 387                                         m |= M_bit[bitcur];
 388                         *vec = m;
 389                 }
 390                 return hi;
 391         }
 392         char_type const * do_scan_is(mask m, char_type const * lo, char_type const * hi) const
 393         {
 394                 while (lo < hi && !this->do_is(m, *lo))
 395                         ++lo;
 396                 return lo;
 397         }
 398         char_type const * do_scan_not(mask m, char_type const * lo, char_type const * hi) const
 399         {
 400                 while (lo < hi && this->do_is(m, *lo) != 0)
 401                         ++lo;
 402                 return lo;
 403         }
 404         char_type do_widen(char c) const
 405         {
 406                 if (static_cast<unsigned char>(c) < 0x80)
 407                         return c;
 408                 throw ctype_failure();
 409         }
 410         const char* do_widen(const char* lo, const char* hi, char_type* dest) const
 411         {
 412                 while (lo < hi) {
 413                         if (static_cast<unsigned char>(*lo) >= 0x80)
 414                                 throw ctype_failure();
 415                         *dest = *lo;
 416                         ++lo;
 417                         ++dest;
 418                 }
 419                 return hi;
 420         }
 421         char do_narrow(char_type wc, char) const
 422         {
 423                 if (wc < 0x80)
 424                         return static_cast<char>(wc);
 425                 throw ctype_failure();
 426         }
 427         const char_type * do_narrow(const char_type * lo, const char_type * hi, char, char * dest) const
 428         {
 429                 while (lo < hi) {
 430                         if (*lo < 0x80)
 431                                 *dest = static_cast<char>(*lo);
 432                         else
 433                                 throw ctype_failure();
 434                         ++lo;
 435                         ++dest;
 436                 }
 437                 return hi;
 438         }
 439 };
 440
 441
 442 /// Facet for outputting numbers to odocstreams as ascii.
 443 /// Here we simply need defining the virtual do_put functions.
 444 class ascii_num_put_facet : public std::num_put<lyx::char_type, std::ostreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > >
 445 {
 446         typedef std::ostreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > iter_type;
 447 public:
 448         ascii_num_put_facet(size_t refs = 0) : std::num_put<lyx::char_type, iter_type>(refs) {}
 449
 450         /// Facet for converting numbers to ascii strings.
 451         class string_num_put_facet : public std::num_put<char, std::basic_string<char>::iterator>
 452         {
 453         public:
 454                 string_num_put_facet() : std::num_put<char, std::basic_string<char>::iterator>(1) {}
 455         };
 456
 457 protected:
 458         iter_type
 459         do_put(iter_type oit, std::ios_base & b, char_type fill, bool v) const
 460         {
 461                 return do_put_helper(oit, b, fill, v);
 462         }
 463
 464         iter_type
 465         do_put(iter_type oit, std::ios_base & b, char_type fill, long v) const
 466         {
 467                 return do_put_helper(oit, b, fill, v);
 468         }
 469
 470         iter_type
 471         do_put(iter_type oit, std::ios_base & b, char_type fill, unsigned long v) const
 472         {
 473                 return do_put_helper(oit, b, fill, v);
 474         }
 475
 476 #ifdef _GLIBCXX_USE_LONG_LONG
 477         iter_type
 478         do_put(iter_type oit, std::ios_base & b, char_type fill, long long v) const
 479         {
 480                 return do_put_helper(oit, b, fill, v);
 481         }
 482
 483         iter_type
 484         do_put(iter_type oit, std::ios_base & b, char_type fill, unsigned long long v) const
 485         {
 486                 return do_put_helper(oit, b, fill, v);
 487         }
 488 #endif
 489
 490         iter_type
 491         do_put(iter_type oit, std::ios_base & b, char_type fill, double v) const
 492         {
 493                 return do_put_helper(oit, b, fill, v);
 494         }
 495
 496         iter_type
 497         do_put(iter_type oit, std::ios_base & b, char_type fill, long double v) const
 498         {
 499                 return do_put_helper(oit, b, fill, v);
 500         }
 501
 502         iter_type
 503         do_put(iter_type oit, std::ios_base & b, char_type fill, void const * v) const
 504         {
 505                 return do_put_helper(oit, b, fill, v);
 506         }
 507
 508 private:
 509         template <typename ValueType>
 510         iter_type
 511         do_put_helper(iter_type oit, std::ios_base & b, char_type fill, ValueType v) const
 512         {
 513                 if (fill >= 0x80)
 514                         throw num_put_failure();
 515
 516                 std::streamsize const sz = b.width() > b.precision() ?
 517                                            b.width() : b.precision();
 518                 // 64 is large enough, unless width or precision are bigger
 519                 std::streamsize const wd = (sz > 56 ? sz : 56) + 8;
 520                 std::string s(wd, '\0');
 521                 string_num_put_facet f;
 522                 std::string::const_iterator cit = s.begin();
 523                 std::string::const_iterator end =
 524                         f.put(s.begin(), b, fill, v);
 525                 for (; cit != end; ++cit, ++oit)
 526                         *oit = *cit;
 527
 528                 return oit;
 529         }
 530 };
 531
 532
 533 /// Facet for inputting ascii representations of numbers from idocstreams.
 534 /// Here we simply need defining the virtual do_get functions.
 535 class ascii_num_get_facet : public std::num_get<lyx::char_type, std::istreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > >
 536 {
 537         typedef std::istreambuf_iterator<lyx::char_type, std::char_traits<lyx::char_type> > iter_type;
 538 public:
 539         ascii_num_get_facet(size_t refs = 0) : std::num_get<lyx::char_type, iter_type>(refs) {}
 540
 541         /// Facet for converting ascii representation of numbers to a value.
 542         class string_num_get_facet : public std::num_get<char, std::basic_string<char>::iterator>
 543         {
 544         public:
 545                 string_num_get_facet() : std::num_get<char, std::basic_string<char>::iterator>(1) {}
 546         };
 547
 548 private:
 549         bool isNumpunct(lyx::char_type const c) const
 550         {
 551                 /// Only account for the standard numpunct "C" locale facet.
 552                 return c < 0x80 && (c == '-' || c == '+' || isdigit(c)
 553                         || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
 554                         || c == 'x' || c == 'X');
 555         }
 556
 557 protected:
 558         iter_type
 559         do_get(iter_type iit, iter_type eit, std::ios_base & b,
 560                 std::ios_base::iostate & err, long & v) const
 561         {
 562                 std::string s;
 563                 s.reserve(64);
 564                 for (; iit != eit && isNumpunct(*iit); ++iit)
 565                         s += static_cast<char>(*iit);
 566                 // We add another character, not part of the numpunct facet,
 567                 // in order to avoid setting the eofbit in the stream state,
 568                 // which would prevent any further read. The space seems a
 569                 // good choice here.
 570                 s += ' ';
 571                 string_num_get_facet f;
 572                 f.get(s.begin(), s.end(), b, err, v);
 573
 574                 return iit;
 575         }
 576 };
 577
 578
 579 /// class to add our facets to the global locale
 580 class locale_initializer {
 581 public:
 582         locale_initializer()
 583         {
 584                 std::locale global;
 585                 std::locale const loc1(global, new ascii_ctype_facet);
 586                 std::locale const loc2(loc1, new ascii_num_put_facet);
 587                 std::locale const loc3(loc2, new ascii_num_get_facet);
 588                 std::locale::global(loc3);
 589         }
 590 };
 591
 592
 593 namespace {
 594
 595 /// make sure that our facets get used
 596 static locale_initializer initializer;
 597
 598 }
 599 }
 600 #endif