diff options
Diffstat (limited to 'source/core/StarJsonParser.hpp')
-rw-r--r-- | source/core/StarJsonParser.hpp | 733 |
1 files changed, 733 insertions, 0 deletions
diff --git a/source/core/StarJsonParser.hpp b/source/core/StarJsonParser.hpp new file mode 100644 index 0000000..87a4bd9 --- /dev/null +++ b/source/core/StarJsonParser.hpp @@ -0,0 +1,733 @@ +#ifndef STAR_JSON_PARSER_HPP +#define STAR_JSON_PARSER_HPP + +#include <vector> + +#include "StarUnicode.hpp" + +namespace Star { + +struct JsonStream { + virtual ~JsonStream() {} + + virtual void beginObject() = 0; + virtual void objectKey(char32_t const*, size_t) = 0; + virtual void endObject() = 0; + + virtual void beginArray() = 0; + virtual void endArray() = 0; + + virtual void putString(char32_t const*, size_t) = 0; + virtual void putDouble(char32_t const*, size_t) = 0; + virtual void putInteger(char32_t const*, size_t) = 0; + virtual void putBoolean(bool) = 0; + virtual void putNull() = 0; + + virtual void putWhitespace(char32_t const*, size_t) = 0; + virtual void putColon() = 0; + virtual void putComma() = 0; +}; + +// Will parse JSON and output to a given JsonStream. Parses an *extension* to +// the JSON format that includes comments. +template <typename InputIterator> +class JsonParser { +public: + JsonParser(JsonStream& stream) + : m_line(0), m_column(0), m_stream(stream) {} + virtual ~JsonParser() {} + + // Does not throw. On error, returned iterator will not be equal to end, and + // error() will be non-null. Set fragment to true to parse any JSON type + // rather than just object or array. + InputIterator parse(InputIterator begin, InputIterator end, bool fragment = false) { + init(begin, end); + + try { + white(); + if (fragment) + value(); + else + top(); + white(); + } catch (ParsingException const&) { + } + + return m_current; + } + + // Human readable parsing error, does not include line or column info. + char const* error() const { + if (m_error.empty()) + return nullptr; + else + return m_error.c_str(); + } + + size_t line() const { + return m_line + 1; + } + + size_t column() const { + return m_column + 1; + } + +private: + typedef std::basic_string<char32_t> CharArray; + + // Thrown internally to abort parsing. + class ParsingException {}; + + void top() { + switch (m_char) { + case '{': + object(); + break; + case '[': + array(); + break; + default: + error("expected JSON object or array at top level"); + return; + } + } + + void value() { + switch (m_char) { + case '{': + object(); + break; + case '[': + array(); + break; + case '"': + string(); + break; + case '-': + number(); + break; + case 0: + error("unexpected end of stream parsing value"); + return; + default: + m_char >= '0' && m_char <= '9' ? number() : word(); + break; + } + } + + void object() { + if (m_char != '{') + error("bad object, should be '{'"); + + next(); + m_stream.beginObject(); + + white(); + if (m_char == '}') { + next(); + m_stream.endObject(); + return; + } + + while (true) { + CharArray s = parseString(); + m_stream.objectKey(s.c_str(), s.length()); + + white(); + if (m_char != ':') + error("bad object, should be ':'"); + next(); + m_stream.putColon(); + white(); + + value(); + + white(); + if (m_char == '}') { + next(); + m_stream.endObject(); + return; + } else if (m_char == ',') { + next(); + m_stream.putComma(); + white(); + } else if (m_char == 0) { + error("unexpected end of stream parsing object."); + } else { + error("bad object, should be '}' or ','"); + } + } + } + + void array() { + if (m_char == '[') { + next(); + m_stream.beginArray(); + white(); + if (m_char == ']') { + next(); + m_stream.endArray(); + } else { + while (true) { + value(); + white(); + if (m_char == ']') { + next(); + m_stream.endArray(); + break; + } else if (m_char == ',') { + next(); + m_stream.putComma(); + white(); + } else if (m_char == 0) { + error("unexpected end of stream parsing array."); + } else { + error("bad array, should be ',' or ']'"); + } + } + } + } else { + error("bad array"); + } + } + + void string() { + CharArray s = parseString(); + m_stream.putString(s.c_str(), s.length()); + } + + void number() { + std::basic_string<char32_t> buffer; + bool hasDot = false; + + if (m_char == '-') { + buffer += '-'; + next(); + } + + if (m_char == '0') { + buffer += '0'; + next(); + } else if (m_char > '0' && m_char <= '9') { + while (m_char >= '0' && m_char <= '9') { + buffer += m_char; + next(); + } + } else { + error("bad number, must start with digit"); + } + + if (m_char == '.') { + hasDot = true; + buffer += '.'; + next(); + while (m_char >= '0' && m_char <= '9') { + buffer += m_char; + next(); + } + } + + if (m_char == 'e' || m_char == 'E') { + buffer += m_char; + next(); + if (m_char == '-' || m_char == '+') { + buffer += m_char; + next(); + } + while (m_char >= '0' && m_char <= '9') { + buffer += m_char; + next(); + } + } + + if (hasDot) { + try { + m_stream.putDouble(buffer.c_str(), buffer.length()); + } catch (std::exception const& e) { + error(std::string("Bad double: ") + e.what()); + } + } else { + try { + m_stream.putInteger(buffer.c_str(), buffer.length()); + } catch (std::exception const& e) { + error(std::string("Bad integer: ") + e.what()); + } + } + } + + // true, false, or null + void word() { + switch (m_char) { + case 't': + next(); + check('r'); + check('u'); + check('e'); + m_stream.putBoolean(true); + break; + case 'f': + next(); + check('a'); + check('l'); + check('s'); + check('e'); + m_stream.putBoolean(false); + break; + case 'n': + next(); + check('u'); + check('l'); + check('l'); + m_stream.putNull(); + break; + default: + error("unexpected character parsing word"); + return; + } + } + + CharArray parseString() { + if (m_char != '"') + error("bad string, should be '\"'"); + next(); + + CharArray str; + + while (true) { + if (m_char == '\\') { + next(); + if (m_char == 'u') { + std::string hexString; + next(); + for (int i = 0; i < 4; ++i) { + hexString.push_back(m_char); + next(); + } + char32_t codepoint = hexStringToUtf32(hexString); + if (isUtf16LeadSurrogate(codepoint)) { + check('\\'); + check('u'); + hexString.clear(); + for (int i = 0; i < 4; ++i) { + hexString.push_back(m_char); + next(); + } + codepoint = hexStringToUtf32(hexString, codepoint); + } + str += codepoint; + } else { + switch (m_char) { + case '"': + str += '"'; + break; + case '\\': + str += '\\'; + break; + case '/': + str += '/'; + break; + case 'b': + str += '\b'; + break; + case 'f': + str += '\f'; + break; + case 'n': + str += '\n'; + break; + case 'r': + str += '\r'; + break; + case 't': + str += '\t'; + break; + default: + error("bad string escape character"); + break; + } + next(); + } + } else if (m_char == '\"') { + next(); + return str; + } else if (m_char == 0) { + error("unexpected end of stream reading string!"); + } else { + str += m_char; + next(); + } + } + error("parser bug"); + return {}; + } + + // Checks current char then moves on to the next one + void check(char32_t c) { + if (m_char == 0) + error("unexpected end of stream parsing word"); + if (m_char != c) + error("unexpected character in word"); + next(); + } + + void init(InputIterator begin, InputIterator end) { + m_current = begin; + m_end = end; + m_line = 0; + m_column = 0; + + if (m_current != m_end) + m_char = *m_current; + else + m_char = 0; + } + + // Consumes next character. + void next() { + if (m_current == m_end) + return; + + if (m_char == '\n') { + ++m_line; + m_column = 0; + } else { + ++m_column; + } + ++m_current; + + if (m_current != m_end) + m_char = *m_current; + else + m_char = 0; + } + + // Will skip whitespace and comments between tokens. + void white() { + CharArray buffer; + while (m_current != m_end) { + if (m_char == '/') { + // Always consume '/' found in whitespace, because that is never valid + // JSON (other than comments) + buffer += m_char; + next(); + if (m_current != m_end && m_char == '/') { + // eat "/" + buffer += m_char; + next(); + + // Read '//' style comments up until eol/eof. + while (m_current != m_end && m_char != '\n') { + buffer += m_char; + next(); + } + } else if (m_current != m_end && m_char == '*') { + // eat "*" + buffer += m_char; + next(); + + // Read '/*' style comments up until '*/'. + while (m_current != m_end) { + if (m_char == '*') { + buffer += m_char; + next(); + if (m_char == '/') { + buffer += m_char; + next(); + break; + } + } else { + buffer += m_char; + next(); + if (m_current == m_end) + error("/* comment has no matching */"); + } + } + } else { + // The only allowed characters following / in whitespace are / and * + error("/ character in whitespace is not follwed by '/' or '*', invalid comment"); + return; + } + } else if (isSpace(m_char)) { + buffer += m_char; + next(); + } else { + if (buffer.size() != 0) + m_stream.putWhitespace(buffer.c_str(), buffer.length()); + return; + } + } + if (buffer.size() != 0) + m_stream.putWhitespace(buffer.c_str(), buffer.length()); + } + + void error(std::string msg) { + m_error = move(msg); + throw ParsingException(); + } + + bool isSpace(char32_t c) { + // Only whitespace allowed by JSON + return c == 0x20 || // space + c == 0x09 || // horizontal tab + c == 0x0a || // newline + c == 0x0d || // carriage return + c == 0xfeff; // BOM or ZWNBSP + } + + char32_t m_char; + InputIterator m_current; + InputIterator m_end; + size_t m_line; + size_t m_column; + std::string m_error; + JsonStream& m_stream; +}; + +// Write JSON through JsonStream interface. +template <typename OutputIterator> +class JsonWriter : public JsonStream { +public: + JsonWriter(OutputIterator out, unsigned pretty = 0) + : m_out(out), m_pretty(pretty) {} + + void beginObject() { + startValue(); + pushState(Object); + write('{'); + } + + void objectKey(char32_t const* s, size_t len) { + if (currentState() == ObjectElement) { + if (m_pretty) + write('\n'); + indent(); + } else { + pushState(ObjectElement); + if (m_pretty) + write('\n'); + indent(); + } + + write('"'); + char32_t c = *s; + while (c && len) { + write(c); + c = *++s; + --len; + } + write('"'); + if (m_pretty) + write(' '); + } + + void endObject() { + popState(Object); + + if (m_pretty) + write('\n'); + indent(); + write('}'); + } + + void beginArray() { + startValue(); + pushState(Array); + write('['); + } + + void endArray() { + popState(Array); + write(']'); + } + + void putString(char32_t const* s, size_t len) { + startValue(); + + write('"'); + char32_t c = *s; + while (c && (len > 0)) { + if (!isPrintable(c)) { + switch (c) { + case '"': + write('\\'); + write('"'); + break; + case '\\': + write('\\'); + write('\\'); + break; + case '\b': + write('\\'); + write('b'); + break; + case '\f': + write('\\'); + write('f'); + break; + case '\n': + write('\\'); + write('n'); + break; + case '\r': + write('\\'); + write('r'); + break; + case '\t': + write('\\'); + write('t'); + break; + default: + auto hex = hexStringFromUtf32(c); + if (hex.size() == 4) { + write('\\'); + write('u'); + for (auto c : hex) { + write(c); + } + } else if (hex.size() == 8) { + write('\\'); + write('u'); + for (auto c : hex.substr(0, 4)) { + write(c); + } + write('\\'); + write('u'); + for (auto c : hex.substr(4)) { + write(c); + } + } else { + throw UnicodeException("Internal Error: Received invalid unicode hex from hexStringFromUtf32."); + } + break; + } + } else { + write(c); + } + c = *++s; + --len; + } + write('"'); + } + + void putDouble(char32_t const* s, size_t len) { + startValue(); + for (size_t i = 0; i < len; ++i) + write(s[i]); + } + + void putInteger(char32_t const* s, size_t len) { + startValue(); + for (size_t i = 0; i < len; ++i) + write(s[i]); + } + + void putBoolean(bool b) { + startValue(); + if (b) { + write('t'); + write('r'); + write('u'); + write('e'); + } else { + write('f'); + write('a'); + write('l'); + write('s'); + write('e'); + } + } + + void putNull() { + startValue(); + write('n'); + write('u'); + write('l'); + write('l'); + } + + void putWhitespace(char32_t const* s, size_t len) { + // If m_pretty is true, extra spurious whitespace will be inserted. + for (size_t i = 0; i < len; ++i) + write(s[i]); + } + + void putColon() { + write(':'); + if (m_pretty) + write(' '); + } + + void putComma() { + write(','); + } + +private: + enum State { + Top, + Object, + ObjectElement, + Array, + ArrayElement + }; + + // Handles separating array elements if currently adding to an array + void startValue() { + if (currentState() == ArrayElement) { + if (m_pretty) + write(' '); + } else if (currentState() == Array) { + pushState(ArrayElement); + } + } + + void indent() { + for (unsigned i = 0; i < m_state.size() / 2; ++i) { + for (unsigned j = 0; j < m_pretty; ++j) { + write(' '); + } + } + } + + // Push state onto stack. + void pushState(State state) { + m_state.push_back(state); + } + + // Pop state stack down to given state. + void popState(State state) { + while (true) { + if (m_state.empty()) + return; + + State last = currentState(); + m_state.pop_back(); + if (last == state) + return; + } + } + + State currentState() { + if (m_state.empty()) + return Top; + else + return *prev(m_state.end()); + } + + void write(char32_t c) { + *m_out = c; + ++m_out; + } + + // Only chars that are unescaped according to JSON spec. + bool isPrintable(char32_t c) { + return (c >= 0x20 && c <= 0x21) || (c >= 0x23 && c <= 0x5b) || (c >= 0x5d && c <= 0x10ffff); + } + + OutputIterator m_out; + unsigned m_pretty; + std::vector<State> m_state; +}; + +} + +#endif |