diff options
Diffstat (limited to 'source/core/StarUnicode.hpp')
-rw-r--r-- | source/core/StarUnicode.hpp | 229 |
1 files changed, 229 insertions, 0 deletions
diff --git a/source/core/StarUnicode.hpp b/source/core/StarUnicode.hpp new file mode 100644 index 0000000..845259f --- /dev/null +++ b/source/core/StarUnicode.hpp @@ -0,0 +1,229 @@ +#ifndef STAR_UTF8_HPP +#define STAR_UTF8_HPP + +#include "StarByteArray.hpp" +#include "StarMaybe.hpp" + +namespace Star { + +STAR_EXCEPTION(UnicodeException, StarException); + +typedef char Utf8Type; +typedef char32_t Utf32Type; + +#define STAR_UTF32_REPLACEMENT_CHAR 0x000000b7L + +void throwInvalidUtf8Sequence(); +void throwMissingUtf8End(); +void throwInvalidUtf32CodePoint(Utf32Type val); + +// If passed NPos as a size, assumes modified UTF-8 and stops on NULL byte. +// Otherwise, ignores NULL. +size_t utf8Length(Utf8Type const* utf8, size_t size = NPos); +// Encode up to six utf8 bytes into a utf32 character. If passed NPos as len, +// assumes modified UTF-8 and stops on NULL, otherwise ignores. +size_t utf8DecodeChar(Utf8Type const* utf8, Utf32Type* utf32, size_t len = NPos); +// Encode single utf32 char into up to 6 utf8 characters. +size_t utf8EncodeChar(Utf8Type* utf8, Utf32Type utf32, size_t len = 6); + +Utf32Type hexStringToUtf32(std::string const& codepoint, Maybe<Utf32Type> previousCodepoint = {}); +std::string hexStringFromUtf32(Utf32Type character); + +bool isUtf16LeadSurrogate(Utf32Type codepoint); +bool isUtf16TrailSurrogate(Utf32Type codepoint); + +Utf32Type utf32FromUtf16SurrogatePair(Utf32Type lead, Utf32Type trail); +pair<Utf32Type, Maybe<Utf32Type>> utf32ToUtf16SurrogatePair(Utf32Type codepoint); + +// Bidirectional iterator that can make utf8 appear as utf32 +template <class BaseIterator, class U32Type = Utf32Type> +class U8ToU32Iterator { +public: + typedef ptrdiff_t difference_type; + typedef U32Type value_type; + typedef U32Type* pointer; + typedef U32Type& reference; + typedef std::bidirectional_iterator_tag iterator_category; + + U8ToU32Iterator() : m_position(), m_value(pending_read) {} + + U8ToU32Iterator(BaseIterator b) : m_position(b), m_value(pending_read) {} + + U32Type const& operator*() const { + if (m_value == pending_read) + extract_current(); + return m_value; + } + + U8ToU32Iterator const& operator++() { + increment(); + return *this; + } + + U8ToU32Iterator operator++(int) { + U8ToU32Iterator clone(*this); + increment(); + return clone; + } + + U8ToU32Iterator const& operator--() { + decrement(); + return *this; + } + + U8ToU32Iterator operator--(int) { + U8ToU32Iterator clone(*this); + decrement(); + return clone; + } + + bool operator==(U8ToU32Iterator const& that) const { + return equal(that); + } + + bool operator!=(U8ToU32Iterator const& that) const { + return !equal(that); + } + +private: + // special values for pending iterator reads: + static U32Type const pending_read = 0xffffffffu; + + static void invalid_sequence() { + throwInvalidUtf8Sequence(); + } + + static unsigned utf8_byte_count(Utf8Type c) { + // if the most significant bit with a zero in it is in position + // 8-N then there are N bytes in this UTF-8 sequence: + uint8_t mask = 0x80u; + unsigned result = 0; + while (c & mask) { + ++result; + mask >>= 1; + } + return (result == 0) ? 1 : ((result > 4) ? 4 : result); + } + + static unsigned utf8_trailing_byte_count(Utf8Type c) { + return utf8_byte_count(c) - 1; + } + + void increment() { + // skip high surrogate first if there is one: + unsigned c = utf8_byte_count(*m_position); + std::advance(m_position, c); + m_value = pending_read; + } + + void decrement() { + // Keep backtracking until we don't have a trailing character: + unsigned count = 0; + while (((uint8_t) * --m_position & 0xC0u) == 0x80u) + ++count; + // now check that the sequence was valid: + if (count != utf8_trailing_byte_count(*m_position)) + invalid_sequence(); + m_value = pending_read; + } + + bool equal(const U8ToU32Iterator& that) const { + return m_position == that.m_position; + } + + void extract_current() const { + m_value = static_cast<Utf8Type>(*m_position); + // we must not have a continuation character: + if (((uint8_t)m_value & 0xC0u) == 0x80u) + invalid_sequence(); + // see how many extra byts we have: + unsigned extra = utf8_trailing_byte_count(*m_position); + // extract the extra bits, 6 from each extra byte: + BaseIterator next(m_position); + for (unsigned c = 0; c < extra; ++c) { + ++next; + m_value <<= 6; + auto entry = static_cast<uint8_t>(*next); + if ((c > 0) && ((entry & 0xC0u) != 0x80u)) + invalid_sequence(); + m_value += entry & 0x3Fu; + } + // we now need to remove a few of the leftmost bits, but how many depends + // upon how many extra bytes we've extracted: + static const Utf32Type masks[4] = { + 0x7Fu, 0x7FFu, 0xFFFFu, 0x1FFFFFu, + }; + m_value &= masks[extra]; + // check the result: + if ((uint32_t)m_value > (uint32_t)0x10FFFFu) + invalid_sequence(); + } + + BaseIterator m_position; + mutable U32Type m_value; +}; + +// Output iterator +template <class BaseIterator, class U32Type = Utf32Type> +class Utf8OutputIterator { +public: + typedef void difference_type; + typedef void value_type; + typedef U32Type* pointer; + typedef U32Type& reference; + + Utf8OutputIterator(const BaseIterator& b) : m_position(b) {} + Utf8OutputIterator(const Utf8OutputIterator& that) : m_position(that.m_position) {} + Utf8OutputIterator& operator=(const Utf8OutputIterator& that) { + m_position = that.m_position; + return *this; + } + + const Utf8OutputIterator& operator*() const { + return *this; + } + + void operator=(U32Type val) const { + push(val); + } + + Utf8OutputIterator& operator++() { + return *this; + } + + Utf8OutputIterator& operator++(int) { + return *this; + } + +private: + static void invalid_utf32_code_point(U32Type val) { + throwInvalidUtf32CodePoint(val); + } + + void push(U32Type c) const { + if (c > 0x10FFFFu) + invalid_utf32_code_point(c); + + if ((uint32_t)c < 0x80u) { + *m_position++ = static_cast<Utf8Type>((uint32_t)c); + } else if ((uint32_t)c < 0x800u) { + *m_position++ = static_cast<Utf8Type>(0xC0u + ((uint32_t)c >> 6)); + *m_position++ = static_cast<Utf8Type>(0x80u + ((uint32_t)c & 0x3Fu)); + } else if ((uint32_t)c < 0x10000u) { + *m_position++ = static_cast<Utf8Type>(0xE0u + ((uint32_t)c >> 12)); + *m_position++ = static_cast<Utf8Type>(0x80u + (((uint32_t)c >> 6) & 0x3Fu)); + *m_position++ = static_cast<Utf8Type>(0x80u + ((uint32_t)c & 0x3Fu)); + } else { + *m_position++ = static_cast<Utf8Type>(0xF0u + ((uint32_t)c >> 18)); + *m_position++ = static_cast<Utf8Type>(0x80u + (((uint32_t)c >> 12) & 0x3Fu)); + *m_position++ = static_cast<Utf8Type>(0x80u + (((uint32_t)c >> 6) & 0x3Fu)); + *m_position++ = static_cast<Utf8Type>(0x80u + ((uint32_t)c & 0x3Fu)); + } + } + + mutable BaseIterator m_position; +}; + +} + +#endif |