#include "utf8.hh"

#include "utf_error.hh"

// NOTE(review): this file had been collapsed onto a single line and its
// template arguments were missing (`std::span data`, `static_cast(...)`).
// They are reconstructed below as `std::span<const uint8_t>`,
// `std::span<uint8_t>` and `static_cast<uint32_t>`, which is what the code
// requires (an unsigned byte element type for the `>> 4` lead-byte switch and
// a 32-bit accumulator). Confirm against the declarations in utf8.hh.

namespace utf {

namespace {

/// Returns true when `c` is at most 0x10ffff and lies outside the
/// 0xd800..0xdfff (UTF-16 surrogate) range.
inline bool valid_codepoint(uint32_t c) {
    return (c < 0xd800) || (c > 0xdfff && c <= 0x10ffff);
}

}  // namespace

/// Decodes a single UTF-8 sequence from `data`, starting at `offset`.
///
/// @param data    Bytes to decode.
/// @param offset  Index of the lead byte. Advanced past the decoded sequence
///                on success; left untouched when NEED_MORE or INVALID is
///                returned.
/// @return The decoded code point, or one of the sentinels NEED_MORE / INVALID
///         (defined outside this file, presumably in utf_error.hh):
///         - NEED_MORE: `offset` is at or past the end of `data`, or `data`
///           ends before the sequence announced by the lead byte is complete.
///         - INVALID: lead byte 0xf8..0xff, a continuation byte (0x80..0xbf)
///           in lead position, a trailing byte that is not a continuation
///           byte, a surrogate or value above 0x10ffff, or an overlong
///           encoding.
uint32_t read8(std::span<const uint8_t> data, std::size_t& offset) {
    if (offset >= data.size()) return NEED_MORE;

    uint32_t ret;
    uint8_t size;

    // The high nibble of the lead byte selects the sequence length.
    switch (data[offset] >> 4) {
        case 15:
            // 0b11111xxx is never a valid lead byte.
            if (data[offset] & 0x08) return INVALID;
            ret = static_cast<uint32_t>(data[offset] & 0x07) << 18;
            size = 4;
            break;
        case 14:
            ret = static_cast<uint32_t>(data[offset] & 0x0f) << 12;
            size = 3;
            break;
        case 13:
        case 12:
            ret = static_cast<uint32_t>(data[offset] & 0x1f) << 6;
            size = 2;
            break;
        default:
            // 0b10xxxxxx: a continuation byte cannot start a sequence.
            if (data[offset] & 0x80) return INVALID;
            // Plain 7-bit value: consume one byte and return it as is.
            return data[offset++];
    }

    // The length check comes before the trailing bytes are inspected, so a
    // truncated sequence reports NEED_MORE even if the bytes that are present
    // would turn out to be malformed.
    if (data.size() - offset < size) return NEED_MORE;

    for (uint8_t i = 1; i < size; ++i) {
        if ((data[offset + i] & 0xc0) != 0x80) return INVALID;
        // Each trailing byte contributes 6 bits; the last one lands at bit 0.
        ret |= static_cast<uint32_t>(data[offset + i] & 0x3f)
               << ((size - i - 1) * 6);
    }

    if (!valid_codepoint(ret)) return INVALID;

    // Reject overlong forms: the value must need as many bytes as were used.
    switch (size) {
        case 4:
            if (ret < 0x10000) return INVALID;
            break;
        case 3:
            if (ret < 0x800) return INVALID;
            break;
        case 2:
            if (ret < 0x80) return INVALID;
            break;
    }

    offset += size;
    return ret;
}

/// Encodes `codepoint` as UTF-8 into `data`, starting at `offset`.
///
/// @param codepoint  Value to encode.
/// @param data       Destination buffer.
/// @param offset     Index of the first byte to write. Advanced by the number
///                   of bytes written (1 to 4) on success; left untouched
///                   when the function returns false.
/// @return true when the sequence was written, false when the space remaining
///         in `data` from `offset` is too small for it.
///
/// NOTE(review): `codepoint` is not validated here. Surrogates and values
/// above 0x10ffff are encoded as given, whereas read8() rejects them when
/// decoding. Rejecting them here would change what `false` means to callers,
/// so the behaviour is left as is; confirm whether validation is wanted.
bool write8(uint32_t codepoint, std::span<uint8_t> data, std::size_t& offset) {
    if (offset >= data.size()) UNLIKELY { return false; }

    if (codepoint < 0x80) {
        data[offset++] = codepoint;
    } else if (codepoint < 0x800) {
        if (data.size() - offset < 2) UNLIKELY { return false; }
        data[offset++] = 0xc0 | (codepoint >> 6);
        data[offset++] = 0x80 | (codepoint & 0x3f);
    } else if (codepoint < 0x10000) {
        if (data.size() - offset < 3) UNLIKELY { return false; }
        data[offset++] = 0xe0 | (codepoint >> 12);
        data[offset++] = 0x80 | ((codepoint >> 6) & 0x3f);
        data[offset++] = 0x80 | (codepoint & 0x3f);
    } else {
        if (data.size() - offset < 4) UNLIKELY { return false; }
        data[offset++] = 0xf0 | (codepoint >> 18);
        data[offset++] = 0x80 | ((codepoint >> 12) & 0x3f);
        data[offset++] = 0x80 | ((codepoint >> 6) & 0x3f);
        data[offset++] = 0x80 | (codepoint & 0x3f);
    }

    return true;
}

}  // namespace utf