1 files changed, 229 insertions, 0 deletions
diff --git a/source/core/StarUnicode.hpp b/source/core/StarUnicode.hpp
new file mode 100644
index 0000000..845259f
--- /dev/null
+++ b/source/core/StarUnicode.hpp
@@ -0,0 +1,229 @@
+#ifndef STAR_UTF8_HPP
+#define STAR_UTF8_HPP
+
+#include "StarByteArray.hpp"
+#include "StarMaybe.hpp"
+
+namespace Star {
+
+STAR_EXCEPTION(UnicodeException, StarException); // NOTE(review): macro is defined elsewhere; presumably declares UnicodeException derived from StarException
+
+typedef char Utf8Type; // one utf8 code unit, i.e. a single byte of an encoded sequence
+typedef char32_t Utf32Type; // one utf32 code unit, i.e. a whole code point
+
+#define STAR_UTF32_REPLACEMENT_CHAR 0x000000b7L // U+00B7 MIDDLE DOT
+
+void throwInvalidUtf8Sequence(); // called by U8ToU32Iterator below on malformed utf8
+void throwMissingUtf8End(); // not called anywhere in this header
+void throwInvalidUtf32CodePoint(Utf32Type val); // called by Utf8OutputIterator below for values above 0x10FFFF
+
+// If passed NPos as a size, assumes modified UTF-8 and stops on the first NUL
+// byte.  Otherwise NUL bytes are not treated as a terminator.
+size_t utf8Length(Utf8Type const* utf8, size_t size = NPos);
+// Decode up to six utf8 bytes into a single utf32 character.  If passed NPos
+// as len, assumes modified UTF-8 and stops on NUL, otherwise ignores NUL.
+size_t utf8DecodeChar(Utf8Type const* utf8, Utf32Type* utf32, size_t len = NPos);
+// Encode a single utf32 char into up to 6 utf8 bytes.
+size_t utf8EncodeChar(Utf8Type* utf8, Utf32Type utf32, size_t len = 6);
+
+Utf32Type hexStringToUtf32(std::string const& codepoint, Maybe<Utf32Type> previousCodepoint = {});
+std::string hexStringFromUtf32(Utf32Type character);
+
+bool isUtf16LeadSurrogate(Utf32Type codepoint);
+bool isUtf16TrailSurrogate(Utf32Type codepoint);
+
+Utf32Type utf32FromUtf16SurrogatePair(Utf32Type lead, Utf32Type trail);
+pair<Utf32Type, Maybe<Utf32Type>> utf32ToUtf16SurrogatePair(Utf32Type codepoint); // NOTE(review): second is presumably empty when no trail surrogate is needed — confirm
+
+// Bidirectional iterator adaptor that makes a utf8 byte sequence appear as
+// utf32 code points.  Malformed input is reported via throwInvalidUtf8Sequence().
+// The end of the underlying range is not known to the iterator, so it must only
+// be used on ranges that hold complete utf8 sequences.
+template <class BaseIterator, class U32Type = Utf32Type>
+class U8ToU32Iterator {
+public:
+  typedef ptrdiff_t difference_type;
+  typedef U32Type value_type;
+  typedef U32Type* pointer;
+  typedef U32Type& reference;
+  typedef std::bidirectional_iterator_tag iterator_category;
+
+  U8ToU32Iterator() : m_position(), m_value(pending_read) {}
+
+  U8ToU32Iterator(BaseIterator b) : m_position(b), m_value(pending_read) {}
+
+  U32Type const& operator*() const {
+    if (m_value == pending_read) // decode lazily; the result is cached until the iterator moves
+      extract_current();
+    return m_value;
+  }
+
+  U8ToU32Iterator const& operator++() {
+    increment();
+    return *this;
+  }
+
+  U8ToU32Iterator operator++(int) {
+    U8ToU32Iterator clone(*this);
+    increment();
+    return clone;
+  }
+
+  U8ToU32Iterator const& operator--() {
+    decrement();
+    return *this;
+  }
+
+  U8ToU32Iterator operator--(int) {
+    U8ToU32Iterator clone(*this);
+    decrement();
+    return clone;
+  }
+
+  bool operator==(U8ToU32Iterator const& that) const {
+    return equal(that);
+  }
+
+  bool operator!=(U8ToU32Iterator const& that) const {
+    return !equal(that);
+  }
+
+private:
+  // Sentinel kept in m_value while the current position has not been decoded yet:
+  static U32Type const pending_read = 0xffffffffu;
+
+  static void invalid_sequence() {
+    throwInvalidUtf8Sequence();
+  }
+
+  static unsigned utf8_byte_count(Utf8Type c) {
+    // N leading one bits in the lead byte mean an N byte sequence.  No leading
+    // ones (ASCII) counts as 1 byte, and anything above 4 is clamped to 4:
+    uint8_t mask = 0x80u;
+    unsigned result = 0;
+    while (c & mask) {
+      ++result;
+      mask >>= 1;
+    }
+    return (result == 0) ? 1 : ((result > 4) ? 4 : result);
+  }
+
+  static unsigned utf8_trailing_byte_count(Utf8Type c) {
+    return utf8_byte_count(c) - 1;
+  }
+
+  void increment() {
+    // Step over the whole sequence announced by the byte at the current position:
+    unsigned c = utf8_byte_count(*m_position);
+    std::advance(m_position, c);
+    m_value = pending_read;
+  }
+
+  void decrement() {
+    // Keep backtracking until we don't have a trailing character:
+    unsigned count = 0;
+    while ((static_cast<uint8_t>(*--m_position) & 0xC0u) == 0x80u)
+      ++count;
+    // now check that the lead byte announces exactly the trailing bytes we skipped:
+    if (count != utf8_trailing_byte_count(*m_position))
+      invalid_sequence();
+    m_value = pending_read;
+  }
+
+  bool equal(const U8ToU32Iterator& that) const {
+    return m_position == that.m_position;
+  }
+
+  void extract_current() const {
+    // Decode into a local so that m_value stays pending_read if decoding throws:
+    U32Type value = static_cast<uint8_t>(*m_position);
+    // we must not have a continuation character:
+    if ((value & 0xC0u) == 0x80u)
+      invalid_sequence();
+    unsigned extra = utf8_trailing_byte_count(*m_position); // trailing bytes announced by the lead byte
+    // extract the extra bits, 6 from each extra byte:
+    BaseIterator next(m_position);
+    for (unsigned c = 0; c < extra; ++c) {
+      ++next;
+      value <<= 6;
+      auto entry = static_cast<uint8_t>(*next);
+      if ((entry & 0xC0u) != 0x80u) // every trailing byte, the first included, must be 10xxxxxx
+        invalid_sequence();
+      value += entry & 0x3Fu;
+    }
+    // strip the length marker bits of the lead byte; how many depends on extra:
+    static const Utf32Type masks[4] = {0x7Fu, 0x7FFu, 0xFFFFu, 0x1FFFFFu};
+    value &= masks[extra];
+    if (static_cast<uint32_t>(value) > static_cast<uint32_t>(0x10FFFFu)) // beyond the last Unicode code point
+      invalid_sequence();
+    m_value = value;
+  }
+
+  BaseIterator m_position;
+  mutable U32Type m_value;
+};
+
+// Output iterator adaptor: every utf32 code point assigned through it is
+// encoded as 1-4 utf8 bytes that are written to the wrapped BaseIterator.
+// Values above 0x10FFFF are reported via throwInvalidUtf32CodePoint().
+template <class BaseIterator, class U32Type = Utf32Type>
+class Utf8OutputIterator {
+public:
+  typedef void difference_type;
+  typedef void value_type;
+  typedef U32Type* pointer;
+  typedef U32Type& reference;
+  typedef std::output_iterator_tag iterator_category; // needed for std::iterator_traits to be usable
+
+  Utf8OutputIterator(const BaseIterator& b) : m_position(b) {}
+  Utf8OutputIterator(const Utf8OutputIterator& that) = default;
+  Utf8OutputIterator& operator=(const Utf8OutputIterator& that) = default;
+
+  const Utf8OutputIterator& operator*() const { // yields the iterator itself so `*it = c` reaches operator=(U32Type)
+    return *this;
+  }
+
+  void operator=(U32Type val) const {
+    push(val);
+  }
+
+  Utf8OutputIterator& operator++() { // no-op: the wrapped iterator is advanced inside push()
+    return *this;
+  }
+
+  Utf8OutputIterator& operator++(int) {
+    return *this;
+  }
+
+private:
+  static void invalid_utf32_code_point(U32Type val) {
+    throwInvalidUtf32CodePoint(val);
+  }
+
+  void push(U32Type c) const {
+    if (c > 0x10FFFFu) // beyond the last Unicode code point
+      invalid_utf32_code_point(c);
+    // Lead byte carries the length marker plus the top bits; each trailing byte is 10xxxxxx with 6 payload bits:
+    if ((uint32_t)c < 0x80u) {
+      *m_position++ = static_cast<Utf8Type>((uint32_t)c);
+    } else if ((uint32_t)c < 0x800u) {
+      *m_position++ = static_cast<Utf8Type>(0xC0u + ((uint32_t)c >> 6));
+      *m_position++ = static_cast<Utf8Type>(0x80u + ((uint32_t)c & 0x3Fu));
+    } else if ((uint32_t)c < 0x10000u) {
+      *m_position++ = static_cast<Utf8Type>(0xE0u + ((uint32_t)c >> 12));
+      *m_position++ = static_cast<Utf8Type>(0x80u + (((uint32_t)c >> 6) & 0x3Fu));
+      *m_position++ = static_cast<Utf8Type>(0x80u + ((uint32_t)c & 0x3Fu));
+    } else {
+      *m_position++ = static_cast<Utf8Type>(0xF0u + ((uint32_t)c >> 18));
+      *m_position++ = static_cast<Utf8Type>(0x80u + (((uint32_t)c >> 12) & 0x3Fu));
+      *m_position++ = static_cast<Utf8Type>(0x80u + (((uint32_t)c >> 6) & 0x3Fu));
+      *m_position++ = static_cast<Utf8Type>(0x80u + ((uint32_t)c & 0x3Fu));
+    }
+  }
+
+  mutable BaseIterator m_position;
+};
+
+}
+
+#endif