#include "utf8.hh"
#include "utf_error.hh"

namespace utf {
namespace {

// True when `c` is at most 0x10ffff and lies outside the UTF-16 surrogate
// range 0xd800..0xdfff.
inline bool valid_codepoint(uint32_t c) {
  return (c < 0xd800) || (c > 0xdfff && c <= 0x10ffff);
}

}  // namespace

/// @brief Decodes one UTF-8 encoded code point from `data` starting at `offset`.
///
/// @param data    Bytes to decode from.
/// @param offset  Index of the first byte of the sequence. On success it is
///                advanced past the decoded sequence; on NEED_MORE or INVALID
///                it is left unchanged.
/// @return The decoded code point, or one of the sentinels NEED_MORE / INVALID
///         (declared outside this file; presumably in utf_error.hh -- confirm):
///         - NEED_MORE when `offset` is at or past the end of `data`, or when
///           fewer bytes remain than the lead byte announces. This check runs
///           before the continuation bytes are inspected.
///         - INVALID for a stray continuation byte or a lead byte of
///           0xf8..0xff, a malformed continuation byte, an overlong encoding,
///           a surrogate, or a value above 0x10ffff.
uint32_t read8(std::string_view data, std::size_t& offset) {
  if (offset >= data.size()) return NEED_MORE;

  // Work on the unsigned value of the lead byte: `char` may be signed, and a
  // negative value shifted right would never match the cases below.
  const uint8_t lead = static_cast<uint8_t>(data[offset]);

  uint32_t ret;
  uint8_t size;
  switch (lead >> 4) {
    case 15:  // 1111xxxx: only 11110xxx starts a (4-byte) sequence
      if (lead & 0x08) return INVALID;
      ret = static_cast<uint32_t>(lead & 0x07) << 18;
      size = 4;
      break;
    case 14:  // 1110xxxx: 3-byte sequence
      ret = static_cast<uint32_t>(lead & 0x0f) << 12;
      size = 3;
      break;
    case 13:
    case 12:  // 110xxxxx: 2-byte sequence
      ret = static_cast<uint32_t>(lead & 0x1f) << 6;
      size = 2;
      break;
    default:
      // 10xxxxxx is a continuation byte and cannot start a sequence.
      if (lead & 0x80) return INVALID;
      // Plain single-byte value (0x00..0x7f).
      ++offset;
      return lead;
  }

  // `offset < data.size()` was established above, so this cannot underflow.
  if (data.size() - offset < size) return NEED_MORE;

  for (uint8_t i = 1; i < size; ++i) {
    const uint8_t cont = static_cast<uint8_t>(data[offset + i]);
    if ((cont & 0xc0) != 0x80) return INVALID;
    // Each continuation byte carries 6 payload bits; the last one lands in
    // the lowest 6 bits of the result.
    ret |= static_cast<uint32_t>(cont & 0x3f) << (size - i - 1) * 6;
  }

  if (!valid_codepoint(ret)) return INVALID;

  // Reject overlong encodings: the value must need a sequence of this length.
  switch (size) {
    case 4:
      if (ret < 0x10000) return INVALID;
      break;
    case 3:
      if (ret < 0x800) return INVALID;
      break;
    case 2:
      if (ret < 0x80) return INVALID;
      break;
  }

  offset += size;
  return ret;
}

}  // namespace utf