From fc4547b412e28164af1bf8981234c6af959ccc0b Mon Sep 17 00:00:00 2001
From: Joel Klinghed
Date: Tue, 13 Jun 2023 10:07:16 +0200
Subject: WIP

---
Reviewer notes (free-form text, not part of the commit):

* Recovery. This patch was recovered from a copy in which all line breaks
  were collapsed and every angle-bracketed span was removed. Line breaks
  are restored so that each file matches its hunk size (67 / 43 / 68
  lines). Indentation is a best guess. The static_cast template arguments
  are inferred from the declared type of the value being built (uint16_t
  for UTF-16 code units, uint32_t for code points, uint8_t for the lead
  byte switch in read8) - TODO confirm against the repository. The author
  address on the From: line was lost as well and is not reconstructed.

* NEED_MORE and INVALID are not defined in this patch; presumably they come
  from utf_error.hh - TODO confirm.

* All readers take the input bytes and an in/out byte offset. The offset is
  advanced only when a code point is returned; on NEED_MORE or INVALID it
  is left unchanged.

* read16be / read16le: return NEED_MORE when fewer than 2 bytes remain, or
  fewer than 4 when the first unit is a high surrogate (0xd800-0xdbff).
  Return INVALID for a lone low surrogate (0xdc00-0xdfff) or a high
  surrogate that is not followed by a low one. A valid pair is combined
  into a code point at or above 0x10000 and consumes 4 bytes.

* read32be / read32le: return NEED_MORE when fewer than 4 bytes remain and
  INVALID when the value is in 0xd800-0xdfff or above 0x10ffff.

* read8: the sequence length (1-4) is taken from the top four bits of the
  lead byte. Lead bytes 0x80-0xbf and 0xf8-0xff are INVALID. The
  remaining-length check runs before the continuation bytes are inspected,
  so a truncated sequence reports NEED_MORE even if a continuation byte
  already present is malformed. After assembly, surrogates, values above
  0x10ffff and overlong encodings (below 0x80 / 0x800 / 0x10000 for 2 / 3 /
  4 byte forms) are INVALID.

 utf/src/utf16.cc | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 utf/src/utf32.cc | 43 +++++++++++++++++++++++++++++++++++
 utf/src/utf8.cc  | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 178 insertions(+)
 create mode 100644 utf/src/utf16.cc
 create mode 100644 utf/src/utf32.cc
 create mode 100644 utf/src/utf8.cc

(limited to 'utf/src')

diff --git a/utf/src/utf16.cc b/utf/src/utf16.cc
new file mode 100644
index 0000000..43595bf
--- /dev/null
+++ b/utf/src/utf16.cc
@@ -0,0 +1,67 @@
+#include "utf16.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+inline bool is_high_surrogate(uint16_t c) {
+  return c >= 0xd800 && c <= 0xdbff;
+}
+
+inline bool is_low_surrogate(uint16_t c) {
+  return c >= 0xdc00 && c <= 0xdfff;
+}
+
+}  // namespace
+
+uint32_t read16be(std::string_view data, std::size_t& offset) {
+  if (offset > data.size() || data.size() - offset < 2)
+    return NEED_MORE;
+  uint16_t c = static_cast<uint16_t>(data[offset]) << 8
+    | static_cast<uint16_t>(data[offset + 1] & 0xff);
+  if (is_high_surrogate(c)) {
+    if (data.size() - offset < 4)
+      return NEED_MORE;
+    uint16_t d = static_cast<uint16_t>(data[offset + 2]) << 8
+      | static_cast<uint16_t>(data[offset + 3] & 0xff);
+    if (is_low_surrogate(d)) {
+      offset += 4;
+      return 0x10000 +
+        (static_cast<uint32_t>(c & 0x3ff) << 10
+         | (d & 0x3ff));
+    }
+    return INVALID;
+  } else if (is_low_surrogate(c)) {
+    return INVALID;
+  }
+  offset += 2;
+  return c;
+}
+
+uint32_t read16le(std::string_view data, std::size_t& offset) {
+  if (offset > data.size() || data.size() - offset < 2)
+    return NEED_MORE;
+  uint16_t c = static_cast<uint16_t>(data[offset + 1]) << 8
+    | static_cast<uint16_t>(data[offset] & 0xff);
+  if (is_high_surrogate(c)) {
+    if (data.size() - offset < 4)
+      return NEED_MORE;
+    uint16_t d = static_cast<uint16_t>(data[offset + 3]) << 8
+      | static_cast<uint16_t>(data[offset + 2] & 0xff);
+    if (is_low_surrogate(d)) {
+      offset += 4;
+      return 0x10000 +
+        (static_cast<uint32_t>(c & 0x3ff) << 10
+         | (d & 0x3ff));
+    }
+    return INVALID;
+  } else if (is_low_surrogate(c)) {
+    return INVALID;
+  }
+  offset += 2;
+  return c;
+}
+
+}  // namespace utf
diff --git a/utf/src/utf32.cc b/utf/src/utf32.cc
new file mode 100644
index 0000000..cfa29b6
--- /dev/null
+++ b/utf/src/utf32.cc
@@ -0,0 +1,43 @@
+#include "utf32.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+inline bool valid_codepoint(uint32_t c) {
+  return (c < 0xd800) || (c > 0xdfff && c <= 0x10ffff);
+}
+
+}  // namespace
+
+uint32_t read32be(std::string_view data, std::size_t& offset) {
+  if (offset > data.size() || data.size() - offset < 4)
+    return NEED_MORE;
+  uint32_t c = static_cast<uint32_t>(data[offset]) << 24
+    | static_cast<uint32_t>(data[offset + 1] & 0xff) << 16
+    | static_cast<uint32_t>(data[offset + 2] & 0xff) << 8
+    | static_cast<uint32_t>(data[offset + 3] & 0xff);
+  if (valid_codepoint(c)) {
+    offset += 4;
+    return c;
+  }
+  return INVALID;
+}
+
+uint32_t read32le(std::string_view data, std::size_t& offset) {
+  if (offset > data.size() || data.size() - offset < 4)
+    return NEED_MORE;
+  uint32_t c = static_cast<uint32_t>(data[offset + 3]) << 24
+    | static_cast<uint32_t>(data[offset + 2] & 0xff) << 16
+    | static_cast<uint32_t>(data[offset + 1] & 0xff) << 8
+    | static_cast<uint32_t>(data[offset] & 0xff);
+  if (valid_codepoint(c)) {
+    offset += 4;
+    return c;
+  }
+  return INVALID;
+}
+
+}  // namespace utf
diff --git a/utf/src/utf8.cc b/utf/src/utf8.cc
new file mode 100644
index 0000000..54b0296
--- /dev/null
+++ b/utf/src/utf8.cc
@@ -0,0 +1,68 @@
+#include "utf8.hh"
+
+#include "utf_error.hh"
+
+namespace utf {
+
+namespace {
+
+inline bool valid_codepoint(uint32_t c) {
+  return (c < 0xd800) || (c > 0xdfff && c <= 0x10ffff);
+}
+
+}  // namespace
+
+uint32_t read8(std::string_view data, std::size_t& offset) {
+  if (offset >= data.size())
+    return NEED_MORE;
+  uint32_t ret;
+  uint8_t size;
+  switch (static_cast<uint8_t>(data[offset]) >> 4) {
+    case 15:
+      if (data[offset] & 0x08)
+        return INVALID;
+      ret = static_cast<uint32_t>(data[offset] & 0x07) << 18;
+      size = 4;
+      break;
+    case 14:
+      ret = static_cast<uint32_t>(data[offset] & 0x0f) << 12;
+      size = 3;
+      break;
+    case 13:
+    case 12:
+      ret = static_cast<uint32_t>(data[offset] & 0x1f) << 6;
+      size = 2;
+      break;
+    default:
+      if (data[offset] & 0x80)
+        return INVALID;
+      return data[offset++];
+  }
+  if (data.size() - offset < size)
+    return NEED_MORE;
+  for (uint8_t i = 1; i < size; ++i) {
+    if ((data[offset + i] & 0xc0) != 0x80)
+      return INVALID;
+    ret |= static_cast<uint32_t>(data[offset + i] & 0x3f) << (size - i - 1) * 6;
+  }
+  if (!valid_codepoint(ret))
+    return INVALID;
+  switch (size) {
+    case 4:
+      if (ret < 0x10000)
+        return INVALID;
+      break;
+    case 3:
+      if (ret < 0x800)
+        return INVALID;
+      break;
+    case 2:
+      if (ret < 0x80)
+        return INVALID;
+      break;
+  }
+  offset += size;
+  return ret;
+}
+
+}  // namespace utf
-- 
cgit v1.2.3-70-g09d2