6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
13 * LOCATION: see http://www.boost.org for most recent version.
14 * FILE unicode_iterator.hpp
15 * VERSION see <boost/version.hpp>
16 * DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
19 /****************************************************************************
24 1) Read Only, Input Adapters:
25 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 template <class BaseIterator, class U8Type = ::boost::uint8_t>
28 class u32_to_u8_iterator;
30 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
32 template <class BaseIterator, class U32Type = ::boost::uint32_t>
33 class u8_to_u32_iterator;
35 Adapts sequence of UTF-8 code units to "look like" a sequence of UTF-32.
37 template <class BaseIterator, class U16Type = ::boost::uint16_t>
38 class u32_to_u16_iterator;
40 Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
42 template <class BaseIterator, class U32Type = ::boost::uint32_t>
43 class u16_to_u32_iterator;
45 Adapts sequence of UTF-16 code units to "look like" a sequence of UTF-32.
47 2) Single pass output iterator adapters:
49 template <class BaseIterator>
50 class utf8_output_iterator;
52 Accepts UTF-32 code points and forwards them on as UTF-8 code units.
54 template <class BaseIterator>
55 class utf16_output_iterator;
57 Accepts UTF-32 code points and forwards them on as UTF-16 code units.
59 ****************************************************************************/
61 #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
62 #define BOOST_REGEX_UNICODE_ITERATOR_HPP
63 #include <boost/cstdint.hpp>
64 #include <boost/assert.hpp>
65 #include <boost/iterator/iterator_facade.hpp>
66 #include <boost/static_assert.hpp>
67 #include <boost/throw_exception.hpp>
69 #ifndef BOOST_NO_STD_LOCALE
73 #include <limits.h> // CHAR_BIT
// Constants used when converting between code points and UTF-16 surrogate pairs.
// high_surrogate_base equals 0xD800 - (0x10000 >> 10): adding it to (v >> 10) for a
// value v in 0x10000..0x10FFFF gives 0xD800..0xDBFF without first subtracting 0x10000.
79 static const ::boost::uint16_t high_surrogate_base = 0xD7C0u;
// First value of the low-surrogate range; the low ten bits of the code point are added to it.
80 static const ::boost::uint16_t low_surrogate_base = 0xDC00u;
// Selects the low ten bits of a value (the payload carried by a low surrogate).
81 static const ::boost::uint32_t ten_bit_mask = 0x3FFu;
// Returns true when v is a UTF-16 high (leading) surrogate: clearing the low ten
// bits must leave 0xD800, which holds exactly for 0xD800..0xDBFF.
83 inline bool is_high_surrogate(::boost::uint16_t v)
85 return (v & 0xFC00u) == 0xd800u;
// Returns true when v is a UTF-16 low (trailing) surrogate: clearing the low ten
// bits must leave 0xDC00, which holds exactly for 0xDC00..0xDFFF.
87 inline bool is_low_surrogate(::boost::uint16_t v)
89 return (v & 0xFC00u) == 0xdc00u;
// Returns true when v is a UTF-16 surrogate value, i.e. lies in 0xD800..0xDFFF.
//
// T may be a 16-bit code unit or a 32-bit code point. The mask therefore covers
// every bit from bit 11 up to bit 31: a mask of only 0xF800 would look at bits
// 11..15 alone and would report values such as 0x1D800 or 0x10DC00 as surrogates,
// which makes decoders reject valid supplementary-plane code points.
template <class T>
inline bool is_surrogate(T v)
{
   return (v & 0xFFFFF800u) == 0xD800u;
}
// Reports the length in bytes of the UTF-8 sequence introduced by lead byte c.
// The return statement guarantees a result in the range 1..4.
97 inline unsigned utf8_byte_count(boost::uint8_t c)
99 // if the most significant bit with a zero in it is in position
100 // 8-N then there are N bytes in this UTF-8 sequence:
// start the scan at the top bit of the byte
101 boost::uint8_t mask = 0x80u;
// a count of 0 is reported as a 1-byte sequence, and anything above 4 is clamped to 4
// (presumably result holds the number of leading one-bits found in c - TODO confirm)
108 return (result == 0) ? 1 : ((result > 4) ? 4 : result);
// Returns how many bytes follow lead byte c in its UTF-8 sequence: the total
// sequence length from utf8_byte_count minus the lead byte itself.
111 inline unsigned utf8_trailing_byte_count(boost::uint8_t c)
113 return utf8_byte_count(c) - 1;
// Reports an unusable UTF-32 value: builds a std::out_of_range and passes it to
// boost::throw_exception.
//   val: the offending value; it appears in the message only in the stream-based branch.
// NOTE(review): the message text names UTF-16, yet the UTF-8 encoders in this file
// call this function as well.
116 inline void invalid_utf32_code_point(::boost::uint32_t val)
118 #ifndef BOOST_NO_STD_LOCALE
// stream-based message: std::showbase with std::hex writes the value with a 0x prefix
119 std::stringstream ss;
120 ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
121 std::out_of_range e(ss.str());
// fixed-text message; presumably the branch used when BOOST_NO_STD_LOCALE is defined
123 std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-16 sequence");
125 boost::throw_exception(e);
129 } // namespace detail
// Read-only bidirectional iterator adapter that presents a sequence of 32-bit
// code points as UTF-16: each base element is exposed either as a single
// U16Type value or as a high/low surrogate pair.
//   BaseIterator: iterator whose value_type is 32 bits wide (checked below).
//   U16Type:      16-bit type of the values produced (checked below).
// Values that cannot be encoded are reported through detail::invalid_utf32_code_point.
131 template <class BaseIterator, class U16Type = ::boost::uint16_t>
132 class u32_to_u16_iterator
133 : public boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type>
135 typedef boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type;
137 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
138 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
// compile-time checks on the widths of the input and output element types:
140 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
141 BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16);
// dereference: yields the cached UTF-16 value at the current offset
145 typename base_type::reference
150 return m_values[m_current];
// equality: same base position and compatible offsets within the code point
152 bool equal(const u32_to_u16_iterator& that)const
154 if(m_position == that.m_position)
156 // Both m_currents must be equal, or both even
157 // this is the same as saying their sum must be even:
// (offset 2 is the value set by the BaseIterator constructor and appears to mean
// that nothing has been decoded yet, so offsets 0 and 2 denote the same element)
158 return (m_current + that.m_current) & 1u ? false : true;
164 // if we have a pending read then read now, so that we know whether
165 // to skip a position, or move to a low-surrogate:
171 // move to the next surrogate position:
173 // if we've reached the end skip a position:
// a zero entry in m_values is treated as the end of the values for this code point
174 if(m_values[m_current] == 0)
184 // decrementing an iterator always leads to a valid position:
// land on the second value when one was produced, otherwise on the only value
187 m_current = m_values[1] ? 1 : 0;
// access to the underlying iterator (returned by value)
194 BaseIterator base()const
// default construction: value-initialised base position, offset 0
199 u32_to_u16_iterator() : m_position(), m_current(0)
// construction from a base iterator: starts at offset 2 (see the note in equal)
205 u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
// Fills m_values from the code point at m_position. Declared const; it can still
// write the cache because m_values is mutable.
213 void extract_current()const
215 // begin by checking for a code point out of range:
216 ::boost::uint32_t v = *m_position;
220 detail::invalid_utf32_code_point(*m_position);
221 // split into two surrogates:
222 m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
223 m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
// sanity checks on the pair just produced:
225 BOOST_ASSERT(detail::is_high_surrogate(m_values[0]));
226 BOOST_ASSERT(detail::is_low_surrogate(m_values[1]));
230 // 16-bit code point:
231 m_values[0] = static_cast<U16Type>(*m_position);
234 // value must not be a surrogate:
235 if(detail::is_surrogate(m_values[0]))
236 detail::invalid_utf32_code_point(*m_position);
// position in the underlying 32-bit sequence
239 BaseIterator m_position;
// cached UTF-16 values for the code point at m_position
240 mutable U16Type m_values[3];
// index into m_values of the value currently referred to
241 mutable unsigned m_current;
// Read-only bidirectional iterator adapter that presents a UTF-16 sequence as
// 32-bit code points, combining a high surrogate with the following low
// surrogate into one value.
//   BaseIterator: iterator whose value_type is 16 bits wide (checked below).
//   U32Type:      32-bit type of the values produced (checked below).
// Misplaced surrogates are reported through invalid_code_point (std::out_of_range).
244 template <class BaseIterator, class U32Type = ::boost::uint32_t>
245 class u16_to_u32_iterator
246 : public boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
248 typedef boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
249 // special values for pending iterator reads:
// m_value holds this sentinel whenever the current element has not been decoded yet
250 BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
252 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
253 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
// compile-time checks on the widths of the input and output element types:
255 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16);
256 BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
// dereference: the sentinel check below is what triggers the lazy decode
260 typename base_type::reference
263 if(m_value == pending_read)
// equality depends on the base position only; the cached value is not compared
267 bool equal(const u16_to_u32_iterator& that)const
269 return m_position == that.m_position;
273 // skip high surrogate first if there is one:
274 if(detail::is_high_surrogate(*m_position)) ++m_position;
// invalidate the cache so the next dereference decodes the new position
276 m_value = pending_read;
281 // if we have a low surrogate then go back one more:
282 if(detail::is_low_surrogate(*m_position))
// invalidate the cache so the next dereference decodes the new position
284 m_value = pending_read;
// access to the underlying iterator (returned by value)
286 BaseIterator base()const
// both constructors start with an undecoded value
291 u16_to_u32_iterator() : m_position()
293 m_value = pending_read;
295 u16_to_u32_iterator(BaseIterator b) : m_position(b)
297 m_value = pending_read;
// Reports a misplaced surrogate: builds a std::out_of_range and passes it to
// boost::throw_exception. val appears in the message only in the stream-based branch.
300 static void invalid_code_point(::boost::uint16_t val)
302 #ifndef BOOST_NO_STD_LOCALE
303 std::stringstream ss;
304 ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
305 std::out_of_range e(ss.str());
// fixed-text message; presumably the branch used when BOOST_NO_STD_LOCALE is defined
307 std::out_of_range e("Misplaced UTF-16 surrogate encountered while trying to encode UTF-32 sequence");
309 boost::throw_exception(e);
// Decodes the element at m_position into m_value. Declared const; it can still
// write the cache because m_value is mutable.
311 void extract_current()const
// start from the 16-bit value at the current position, zero-extended to U32Type
313 m_value = static_cast<U32Type>(static_cast< ::boost::uint16_t>(*m_position));
314 // if the last value is a high surrogate then adjust m_position and m_value as needed:
315 if(detail::is_high_surrogate(*m_position))
317 // precondition; next value must be a low-surrogate:
// NOTE(review): the element after m_position is read with no visible end-of-range
// check - confirm that callers guarantee a following element exists.
318 BaseIterator next(m_position);
319 ::boost::uint16_t t = *++next;
// same range test as detail::is_low_surrogate
320 if((t & 0xFC00u) != 0xDC00u)
321 invalid_code_point(t);
// high surrogate supplies the upper bits, low surrogate the low ten bits
322 m_value = (m_value - detail::high_surrogate_base) << 10;
323 m_value |= (static_cast<U32Type>(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask);
325 // postcondition; result must not be a surrogate:
326 if(detail::is_surrogate(m_value))
327 invalid_code_point(static_cast< ::boost::uint16_t>(m_value));
// position in the underlying 16-bit sequence
329 BaseIterator m_position;
// cached decoded value, or pending_read when not yet decoded
330 mutable U32Type m_value;
// Read-only bidirectional iterator adapter that presents a sequence of 32-bit
// code points as UTF-8: each base element is exposed as one to four U8Type bytes.
//   BaseIterator: iterator whose value_type is 32 bits wide (checked below).
//   U8Type:       8-bit type of the bytes produced (checked below).
// Values that cannot be encoded are reported through detail::invalid_utf32_code_point.
333 template <class BaseIterator, class U8Type = ::boost::uint8_t>
334 class u32_to_u8_iterator
335 : public boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type>
337 typedef boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type> base_type;
339 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
340 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
// compile-time checks on the widths of the input and output element types:
342 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
343 BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8);
// dereference: yields the cached UTF-8 byte at the current offset
347 typename base_type::reference
352 return m_values[m_current];
// equality: same base position and compatible offsets within the code point
354 bool equal(const u32_to_u8_iterator& that)const
356 if(m_position == that.m_position)
358 // either the m_current values must be equal, or one must be 0 and
359 // the other 4: which means neither may have either of its two lowest bits set:
// (offset 4 is the value set by the BaseIterator constructor and appears to mean
// that nothing has been decoded yet, so offsets 0 and 4 denote the same element)
360 return (m_current == that.m_current)
361 || (((m_current | that.m_current) & 3) == 0);
367 // if we have a pending read then read now, so that we know whether
368 // to skip a position, or move to the next UTF-8 byte:
374 // move to the next byte position:
376 // if we've reached the end skip a position:
// a zero entry in m_values is treated as the end of the bytes for this code point
377 if(m_values[m_current] == 0)
// offset 0 or 4: already at the first byte (or nothing decoded), so going back
// presumably has to move to the previous code point - TODO confirm
385 if((m_current & 3) == 0)
// walk back from the end of the cache to the last non-zero byte
390 while(m_current && (m_values[m_current] == 0))
// access to the underlying iterator (returned by value)
396 BaseIterator base()const
// default construction: value-initialised base position, offset 0
401 u32_to_u8_iterator() : m_position(), m_current(0)
// construction from a base iterator: starts at offset 4 (see the note in equal)
409 u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
// Fills m_values with the UTF-8 encoding of the code point at m_position, padding
// unused slots with zero. Declared const; the cache is mutable.
// Note that the bytes are cast to unsigned char rather than to U8Type.
419 void extract_current()const
421 boost::uint32_t c = *m_position;
423 detail::invalid_utf32_code_point(c);
// one-byte form: the value itself
426 m_values[0] = static_cast<unsigned char>(c);
427 m_values[1] = static_cast<unsigned char>(0u);
428 m_values[2] = static_cast<unsigned char>(0u);
429 m_values[3] = static_cast<unsigned char>(0u);
// two-byte form: lead byte 0xC0 plus the upper bits, then one 0x80 continuation byte
433 m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
434 m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
435 m_values[2] = static_cast<unsigned char>(0u);
436 m_values[3] = static_cast<unsigned char>(0u);
// three-byte form: lead byte 0xE0, then two continuation bytes of six bits each
438 else if(c < 0x10000u)
440 m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
441 m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
442 m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
443 m_values[3] = static_cast<unsigned char>(0u);
// four-byte form: lead byte 0xF0, then three continuation bytes of six bits each
447 m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
448 m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
449 m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
450 m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
// position in the underlying 32-bit sequence
454 BaseIterator m_position;
// cached UTF-8 bytes for the code point at m_position (one more slot than the longest encoding)
455 mutable U8Type m_values[5];
// index into m_values of the byte currently referred to
456 mutable unsigned m_current;
// Read-only bidirectional iterator adapter that presents a UTF-8 byte sequence
// as 32-bit code points, decoding one multi-byte sequence per element.
//   BaseIterator: iterator whose value_type is 8 bits wide (checked below).
//   U32Type:      32-bit type of the values produced (checked below).
// Malformed input is reported through invalid_sequnce (std::out_of_range).
459 template <class BaseIterator, class U32Type = ::boost::uint32_t>
460 class u8_to_u32_iterator
461 : public boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
463 typedef boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
464 // special values for pending iterator reads:
// m_value holds this sentinel whenever the current element has not been decoded yet
465 BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
467 #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
468 typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
// compile-time checks on the widths of the input and output element types:
470 BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8);
471 BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
// dereference: the sentinel check below is what triggers the lazy decode
475 typename base_type::reference
478 if(m_value == pending_read)
// equality depends on the base position only; the cached value is not compared
482 bool equal(const u8_to_u32_iterator& that)const
484 return m_position == that.m_position;
488 // step over every byte of the current UTF-8 sequence:
489 unsigned c = detail::utf8_byte_count(*m_position);
490 std::advance(m_position, c);
// invalidate the cache so the next dereference decodes the new position
491 m_value = pending_read;
495 // Keep backtracking until we don't have a trailing character:
// (bytes of the form 10xxxxxx are continuation bytes; count how many were skipped)
497 while((*--m_position & 0xC0u) == 0x80u) ++count;
498 // now check that the sequence was valid:
// the lead byte reached must announce exactly the number of continuation bytes skipped
499 if(count != detail::utf8_trailing_byte_count(*m_position))
501 m_value = pending_read;
// access to the underlying iterator (returned by value)
503 BaseIterator base()const
// both constructors start with an undecoded value
508 u8_to_u32_iterator() : m_position()
510 m_value = pending_read;
512 u8_to_u32_iterator(BaseIterator b) : m_position(b)
514 m_value = pending_read;
// Reports malformed UTF-8: builds a std::out_of_range and passes it to boost::throw_exception.
// NOTE(review): the identifier is a misspelling of invalid_sequence; renaming it
// requires updating every caller at the same time.
517 static void invalid_sequnce()
519 std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
520 boost::throw_exception(e);
// Decodes the sequence starting at m_position into m_value. Declared const; it
// can still write the cache because m_value is mutable.
522 void extract_current()const
// start from the lead byte, zero-extended to U32Type
524 m_value = static_cast<U32Type>(static_cast< ::boost::uint8_t>(*m_position));
525 // we must not have a continuation character:
526 if((m_value & 0xC0u) == 0x80u)
528 // see how many extra bytes we have:
529 unsigned extra = detail::utf8_trailing_byte_count(*m_position);
530 // extract the extra bits, 6 from each extra byte:
531 BaseIterator next(m_position);
532 for(unsigned c = 0; c < extra; ++c)
// fold in the six payload bits of this continuation byte
536 m_value += static_cast<boost::uint8_t>(*next) & 0x3Fu;
538 // we now need to remove a few of the leftmost bits, but how many depends
539 // upon how many extra bytes we've extracted:
// one mask per possible number of extra bytes (0..3), indexed by extra below
540 static const boost::uint32_t masks[4] =
547 m_value &= masks[extra];
// anything above 0x10FFFF is outside the Unicode code point range
549 if(m_value > static_cast<U32Type>(0x10FFFFu))
// position in the underlying 8-bit sequence
552 BaseIterator m_position;
// cached decoded value, or pending_read when not yet decoded
553 mutable U32Type m_value;
// Single-pass output iterator adapter: accepts 32-bit code points and writes
// them through the wrapped iterator as UTF-16 (one value, or a surrogate pair).
//   BaseIterator: output iterator that receives the 16-bit values.
// Values that cannot be encoded are reported through detail::invalid_utf32_code_point.
556 template <class BaseIterator>
557 class utf16_output_iterator
// iterator typedefs; category is std::output_iterator_tag
560 typedef void difference_type;
561 typedef void value_type;
562 typedef boost::uint32_t* pointer;
563 typedef boost::uint32_t& reference;
564 typedef std::output_iterator_tag iterator_category;
// wraps the destination iterator b
566 utf16_output_iterator(const BaseIterator& b)
// copy construction and assignment copy the wrapped position
568 utf16_output_iterator(const utf16_output_iterator& that)
569 : m_position(that.m_position){}
570 utf16_output_iterator& operator=(const utf16_output_iterator& that)
572 m_position = that.m_position;
// dereference yields the iterator type itself, so that assigning a code point to
// the result resolves to operator=(boost::uint32_t) below
575 const utf16_output_iterator& operator*()const
// assignment of a code point; const, which is why m_position is mutable
579 void operator=(boost::uint32_t val)const
// both increment forms return a reference to the iterator type
583 utf16_output_iterator& operator++()
587 utf16_output_iterator& operator++(int)
// access to the underlying iterator (returned by value)
591 BaseIterator base()const
// Encodes v as UTF-16 and writes the result through m_position, advancing it
// once per value written.
596 void push(boost::uint32_t v)const
600 // begin by checking for a code point out of range:
602 detail::invalid_utf32_code_point(v);
603 // split into two surrogates:
604 *m_position++ = static_cast<boost::uint16_t>(v >> 10) + detail::high_surrogate_base;
605 *m_position++ = static_cast<boost::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
609 // 16-bit code point:
610 // value must not be a surrogate:
611 if(detail::is_surrogate(v))
612 detail::invalid_utf32_code_point(v);
613 *m_position++ = static_cast<boost::uint16_t>(v);
// destination iterator; mutable so that the const members above can advance it
616 mutable BaseIterator m_position;
// Single-pass output iterator adapter: accepts 32-bit code points and writes
// them through the wrapped iterator as UTF-8 (one to four bytes each).
//   BaseIterator: output iterator that receives the bytes.
// Values that cannot be encoded are reported through detail::invalid_utf32_code_point.
619 template <class BaseIterator>
620 class utf8_output_iterator
// iterator typedefs; category is std::output_iterator_tag
623 typedef void difference_type;
624 typedef void value_type;
625 typedef boost::uint32_t* pointer;
626 typedef boost::uint32_t& reference;
627 typedef std::output_iterator_tag iterator_category;
// wraps the destination iterator b
629 utf8_output_iterator(const BaseIterator& b)
// copy construction and assignment copy the wrapped position
631 utf8_output_iterator(const utf8_output_iterator& that)
632 : m_position(that.m_position){}
633 utf8_output_iterator& operator=(const utf8_output_iterator& that)
635 m_position = that.m_position;
// dereference yields the iterator type itself, so that assigning a code point to
// the result resolves to operator=(boost::uint32_t) below
638 const utf8_output_iterator& operator*()const
// assignment of a code point; const, which is why m_position is mutable
642 void operator=(boost::uint32_t val)const
// both increment forms return a reference to the iterator type
646 utf8_output_iterator& operator++()
650 utf8_output_iterator& operator++(int)
// access to the underlying iterator (returned by value)
654 BaseIterator base()const
// Encodes c as UTF-8 and writes the bytes through m_position, advancing it once
// per byte written.
659 void push(boost::uint32_t c)const
662 detail::invalid_utf32_code_point(c);
// one-byte form: the value itself
665 *m_position++ = static_cast<unsigned char>(c);
// two-byte form: lead byte 0xC0 plus the upper bits, then one 0x80 continuation byte
669 *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
670 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
// three-byte form: lead byte 0xE0, then two continuation bytes of six bits each
672 else if(c < 0x10000u)
674 *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
675 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
676 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
// four-byte form: lead byte 0xF0, then three continuation bytes of six bits each
680 *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
681 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
682 *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
683 *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
// destination iterator; mutable so that the const members above can advance it
686 mutable BaseIterator m_position;
691 #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP