diff options
| author | Joel Klinghed <the_jk@spawned.biz> | 2023-06-13 10:07:16 +0200 |
|---|---|---|
| committer | Joel Klinghed <the_jk@spawned.biz> | 2023-06-13 10:07:16 +0200 |
| commit | fc4547b412e28164af1bf8981234c6af959ccc0b (patch) | |
| tree | 061253e7a4f6abaca282223b36d10f0bed8cad23 /utf | |
WIP
Diffstat (limited to 'utf')
| -rw-r--r-- | utf/inc/utf16.hh | 31 | ||||
| -rw-r--r-- | utf/inc/utf32.hh | 29 | ||||
| -rw-r--r-- | utf/inc/utf8.hh | 22 | ||||
| -rw-r--r-- | utf/inc/utf_error.hh | 13 | ||||
| -rw-r--r-- | utf/meson.build | 38 | ||||
| -rw-r--r-- | utf/src/utf16.cc | 67 | ||||
| -rw-r--r-- | utf/src/utf32.cc | 43 | ||||
| -rw-r--r-- | utf/src/utf8.cc | 68 | ||||
| -rw-r--r-- | utf/tst/test_utf16.cc | 157 | ||||
| -rw-r--r-- | utf/tst/test_utf32.cc | 145 | ||||
| -rw-r--r-- | utf/tst/test_utf8.cc | 188 |
11 files changed, 801 insertions, 0 deletions
diff --git a/utf/inc/utf16.hh b/utf/inc/utf16.hh new file mode 100644 index 0000000..344b1a2 --- /dev/null +++ b/utf/inc/utf16.hh @@ -0,0 +1,31 @@ +#ifndef UTF_UTF16_HH +#define UTF_UTF16_HH + +#include "macros.hh" + +#include <cstdint> +#include <string_view> + +namespace utf { + +/* Read one unicode codepoint from UTF-16 BigEndian encoded data if possible. + * On success offset is advanced to the next codepoint; on failure it is unchanged. + * Will fail: + * - not enough data is left in data given offset, returns NEED_MORE. + * - data is not valid UTF-16, ie. invalid or incomplete surrogate pairs, + * returns INVALID. + */ +uint32_t HIDDEN read16be(std::string_view data, std::size_t& offset); + +/* Read one unicode codepoint from UTF-16 LittleEndian encoded data if possible. + * On success offset is advanced to the next codepoint; on failure it is unchanged. + * Will fail: + * - not enough data is left in data given offset, returns NEED_MORE. + * - data is not valid UTF-16, ie. invalid or incomplete surrogate pairs, + * returns INVALID. + */ +uint32_t HIDDEN read16le(std::string_view data, std::size_t& offset); + +} // namespace utf + +#endif // UTF_UTF16_HH diff --git a/utf/inc/utf32.hh b/utf/inc/utf32.hh new file mode 100644 index 0000000..2d3088e --- /dev/null +++ b/utf/inc/utf32.hh @@ -0,0 +1,29 @@ +#ifndef UTF_UTF32_HH +#define UTF_UTF32_HH + +#include "macros.hh" + +#include <cstdint> +#include <string_view> + +namespace utf { + +/* Read one unicode codepoint from UTF-32 BigEndian encoded data if possible. + * On success offset is advanced to the next codepoint; on failure it is unchanged. + * Will fail: + * - not enough data is left in data given offset, returns NEED_MORE. + * - data is not valid UTF-32, ie. outside valid ranges, returns INVALID. + */ +uint32_t HIDDEN read32be(std::string_view data, std::size_t& offset); + +/* Read one unicode codepoint from UTF-32 LittleEndian encoded data if possible. + * On success offset is advanced to the next codepoint; on failure it is unchanged. 
+ * Will fail: + * - not enough data is left in data given offset, returns NEED_MORE. + * - data is not valid UTF-32, ie. outside valid ranges, returns INVALID. + */ +uint32_t HIDDEN read32le(std::string_view data, std::size_t& offset); + +} // namespace utf + +#endif // UTF_UTF32_HH diff --git a/utf/inc/utf8.hh b/utf/inc/utf8.hh new file mode 100644 index 0000000..a3ea84a --- /dev/null +++ b/utf/inc/utf8.hh @@ -0,0 +1,22 @@ +#ifndef UTF_UTF8_HH +#define UTF_UTF8_HH + +#include "macros.hh" + +#include <cstdint> +#include <string_view> + +namespace utf { + +/* Read one unicode codepoint from UTF-8 encoded data if possible. + * On success offset is advanced to the next codepoint; on failure it is unchanged. + * Will fail: + * - not enough data is left in data given offset, returns NEED_MORE. + * - data is not valid UTF-8, this includes overlong encodings and + * invalid unicode code points, returns INVALID. + */ +uint32_t HIDDEN read8(std::string_view data, std::size_t& offset); + +} // namespace utf + +#endif // UTF_UTF8_HH diff --git a/utf/inc/utf_error.hh b/utf/inc/utf_error.hh new file mode 100644 index 0000000..079fa43 --- /dev/null +++ b/utf/inc/utf_error.hh @@ -0,0 +1,13 @@ +#ifndef UTF_ERROR_HH +#define UTF_ERROR_HH + +#include <cstdint> + +namespace utf { + +constexpr uint32_t NEED_MORE = 0xfffffffe; /* returned when the data ends before a complete codepoint */ +constexpr uint32_t INVALID = 0xffffffff; /* returned when the data is not a valid encoding */ + +} // namespace utf + +#endif // UTF_ERROR_HH diff --git a/utf/meson.build b/utf/meson.build new file mode 100644 index 0000000..64db6ff --- /dev/null +++ b/utf/meson.build @@ -0,0 +1,38 @@ +deps = [ + base_dep, +] + +inc = include_directories('inc') +lib = static_library( + 'utf', + 'src/utf8.cc', + 'src/utf16.cc', + 'src/utf32.cc', + dependencies: deps, + include_directories: inc, + install: false, +) + +utf_dep = declare_dependency( + dependencies: deps, + include_directories: inc, + link_with: lib, +) + +test('utf8', + executable( + 'test_utf8', + sources: ['tst/test_utf8.cc'], + dependencies: [utf_dep, gtest_dep])) + 
+test('utf16', + executable( + 'test_utf16', + sources: ['tst/test_utf16.cc'], + dependencies: [utf_dep, gtest_dep])) + +test('utf32', + executable( + 'test_utf32', + sources: ['tst/test_utf32.cc'], + dependencies: [utf_dep, gtest_dep])) diff --git a/utf/src/utf16.cc b/utf/src/utf16.cc new file mode 100644 index 0000000..43595bf --- /dev/null +++ b/utf/src/utf16.cc @@ -0,0 +1,67 @@ +#include "utf16.hh" + +#include "utf_error.hh" + +namespace utf { + +namespace { + +inline bool is_high_surrogate(uint16_t c) { + return c >= 0xd800 && c <= 0xdbff; +} + +inline bool is_low_surrogate(uint16_t c) { + return c >= 0xdc00 && c <= 0xdfff; +} + +} // namespace + +uint32_t read16be(std::string_view data, std::size_t& offset) { + if (offset > data.size() || data.size() - offset < 2) + return NEED_MORE; + uint16_t c = static_cast<uint16_t>(data[offset]) << 8 + | static_cast<uint16_t>(data[offset + 1] & 0xff); + if (is_high_surrogate(c)) { + if (data.size() - offset < 4) + return NEED_MORE; + uint16_t d = static_cast<uint16_t>(data[offset + 2]) << 8 + | static_cast<uint16_t>(data[offset + 3] & 0xff); + if (is_low_surrogate(d)) { + offset += 4; + return 0x10000 + + (static_cast<uint32_t>(c & 0x3ff) << 10 + | (d & 0x3ff)); + } + return INVALID; + } else if (is_low_surrogate(c)) { + return INVALID; + } + offset += 2; + return c; +} + +uint32_t read16le(std::string_view data, std::size_t& offset) { + if (offset > data.size() || data.size() - offset < 2) + return NEED_MORE; + uint16_t c = static_cast<uint16_t>(data[offset + 1]) << 8 + | static_cast<uint16_t>(data[offset] & 0xff); + if (is_high_surrogate(c)) { + if (data.size() - offset < 4) + return NEED_MORE; + uint16_t d = static_cast<uint16_t>(data[offset + 3]) << 8 + | static_cast<uint16_t>(data[offset + 2] & 0xff); + if (is_low_surrogate(d)) { + offset += 4; + return 0x10000 + + (static_cast<uint32_t>(c & 0x3ff) << 10 + | (d & 0x3ff)); + } + return INVALID; + } else if (is_low_surrogate(c)) { + return INVALID; + } + offset += 2; 
+ return c; +} + +} // namespace utf diff --git a/utf/src/utf32.cc b/utf/src/utf32.cc new file mode 100644 index 0000000..cfa29b6 --- /dev/null +++ b/utf/src/utf32.cc @@ -0,0 +1,43 @@ +#include "utf32.hh" + +#include "utf_error.hh" + +namespace utf { + +namespace { + +inline bool valid_codepoint(uint32_t c) { + return (c < 0xd800) || (c > 0xdfff && c <= 0x10ffff); +} + +} // namespace + +uint32_t read32be(std::string_view data, std::size_t& offset) { + if (offset > data.size() || data.size() - offset < 4) + return NEED_MORE; + uint32_t c = static_cast<uint32_t>(data[offset]) << 24 + | static_cast<uint32_t>(data[offset + 1] & 0xff) << 16 + | static_cast<uint32_t>(data[offset + 2] & 0xff) << 8 + | static_cast<uint32_t>(data[offset + 3] & 0xff); + if (valid_codepoint(c)) { + offset += 4; + return c; + } + return INVALID; +} + +uint32_t read32le(std::string_view data, std::size_t& offset) { + if (offset > data.size() || data.size() - offset < 4) + return NEED_MORE; + uint32_t c = static_cast<uint32_t>(data[offset + 3]) << 24 + | static_cast<uint32_t>(data[offset + 2] & 0xff) << 16 + | static_cast<uint32_t>(data[offset + 1] & 0xff) << 8 + | static_cast<uint32_t>(data[offset] & 0xff); + if (valid_codepoint(c)) { + offset += 4; + return c; + } + return INVALID; +} + +} // namespace utf diff --git a/utf/src/utf8.cc b/utf/src/utf8.cc new file mode 100644 index 0000000..54b0296 --- /dev/null +++ b/utf/src/utf8.cc @@ -0,0 +1,68 @@ +#include "utf8.hh" + +#include "utf_error.hh" + +namespace utf { + +namespace { + +inline bool valid_codepoint(uint32_t c) { + return (c < 0xd800) || (c > 0xdfff && c <= 0x10ffff); +} + +} // namespace + +uint32_t read8(std::string_view data, std::size_t& offset) { + if (offset >= data.size()) + return NEED_MORE; + uint32_t ret; + uint8_t size; + switch (static_cast<uint8_t>(data[offset]) >> 4) { + case 15: + if (data[offset] & 0x08) + return INVALID; + ret = static_cast<uint32_t>(data[offset] & 0x07) << 18; + size = 4; + break; + case 14: + ret = 
static_cast<uint32_t>(data[offset] & 0x0f) << 12; + size = 3; + break; + case 13: + case 12: + ret = static_cast<uint32_t>(data[offset] & 0x1f) << 6; + size = 2; + break; + default: + if (data[offset] & 0x80) + return INVALID; + return data[offset++]; + } + if (data.size() - offset < size) + return NEED_MORE; + for (uint8_t i = 1; i < size; ++i) { + if ((data[offset + i] & 0xc0) != 0x80) + return INVALID; + ret |= static_cast<uint32_t>(data[offset + i] & 0x3f) << (size - i - 1) * 6; + } + if (!valid_codepoint(ret)) + return INVALID; + switch (size) { + case 4: + if (ret < 0x10000) + return INVALID; + break; + case 3: + if (ret < 0x800) + return INVALID; + break; + case 2: + if (ret < 0x80) + return INVALID; + break; + } + offset += size; + return ret; +} + +} // namespace utf diff --git a/utf/tst/test_utf16.cc b/utf/tst/test_utf16.cc new file mode 100644 index 0000000..c17982e --- /dev/null +++ b/utf/tst/test_utf16.cc @@ -0,0 +1,157 @@ +#include "utf16.hh" + +#include "utf_error.hh" + +#include <gtest/gtest.h> + +TEST(utf16be, sanity) { + std::string_view str("\x00\x24", 2); + size_t offset = 0; + auto ret = utf::read16be(str, offset); + EXPECT_EQ('$', ret); + EXPECT_EQ(2, offset); + + str = "\x20\xAC"; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(0x20AC, ret); + EXPECT_EQ(2, offset); + + str = "\xD8\x01\xDC\x37"; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(0x10437, ret); + EXPECT_EQ(4, offset); + + str = "\xD8\x52\xDF\x62"; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(0x24B62, ret); + EXPECT_EQ(4, offset); +} + +TEST(utf16le, sanity) { + std::string_view str("\x24\x00", 2); + size_t offset = 0; + auto ret = utf::read16le(str, offset); + EXPECT_EQ('$', ret); + EXPECT_EQ(2, offset); + + str = "\xAC\x20"; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(0x20AC, ret); + EXPECT_EQ(2, offset); + + str = "\x01\xD8\x37\xDC"; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(0x10437, ret); + 
EXPECT_EQ(4, offset); + + str = "\x52\xD8\x62\xDF"; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(0x24B62, ret); + EXPECT_EQ(4, offset); +} + +TEST(utf16be, bom) { + std::string_view str("\xFE\xFF\x20\xAC"); + size_t offset = 0; + auto ret = utf::read16be(str, offset); + EXPECT_EQ(0xFEFF, ret); + ret = utf::read16be(str, offset); + EXPECT_EQ(0x20AC, ret); + ret = utf::read16be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(str.size(), offset); +} + +TEST(utf16le, bom) { + std::string_view str("\xFF\xFE\xAC\x20"); + size_t offset = 0; + auto ret = utf::read16le(str, offset); + EXPECT_EQ(0xFEFF, ret); + ret = utf::read16le(str, offset); + EXPECT_EQ(0x20AC, ret); + ret = utf::read16le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(str.size(), offset); +} + +TEST(utf16be, invalid) { + std::string_view str("\xD8"); + size_t offset = 0; + auto ret = utf::read16be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = ""; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\xD8\x01"; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\xD8\x01\xDC"; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\xDC\x37\xD8\x01"; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\xD8\x01\xD8\x01"; + offset = 0; + ret = utf::read16be(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); +} + +TEST(utf16le, invalid) { + std::string_view str("\x01"); + size_t offset = 0; + auto ret = utf::read16le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = ""; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\x01\xD8"; + offset = 0; + ret = 
utf::read16le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\x01\xD8\x37"; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\x37\xDC\x01\xD8"; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\x01\xD8\x01\xD8"; + offset = 0; + ret = utf::read16le(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); +} diff --git a/utf/tst/test_utf32.cc b/utf/tst/test_utf32.cc new file mode 100644 index 0000000..796b4cd --- /dev/null +++ b/utf/tst/test_utf32.cc @@ -0,0 +1,145 @@ +#include "utf32.hh" + +#include "utf_error.hh" + +#include <gtest/gtest.h> + +TEST(utf32be, sanity) { + std::string_view str("\x00\x00\x00\x24", 4); + size_t offset = 0; + auto ret = utf::read32be(str, offset); + EXPECT_EQ('$', ret); + EXPECT_EQ(4, offset); + + str = std::string_view("\x00\x00\x20\xAC", 4); + offset = 0; + ret = utf::read32be(str, offset); + EXPECT_EQ(0x20AC, ret); + EXPECT_EQ(4, offset); + + str = std::string_view("\x00\x01\x04\x37", 4); + offset = 0; + ret = utf::read32be(str, offset); + EXPECT_EQ(0x10437, ret); + EXPECT_EQ(4, offset); +} + +TEST(utf32le, sanity) { + std::string_view str("\x24\x00\x00\x00", 4); + size_t offset = 0; + auto ret = utf::read32le(str, offset); + EXPECT_EQ('$', ret); + EXPECT_EQ(4, offset); + + str = std::string_view("\xAC\x20\x00\x00", 4); + offset = 0; + ret = utf::read32le(str, offset); + EXPECT_EQ(0x20AC, ret); + EXPECT_EQ(4, offset); + + str = std::string_view("\x37\x04\x01\x00", 4); + offset = 0; + ret = utf::read32le(str, offset); + EXPECT_EQ(0x10437, ret); + EXPECT_EQ(4, offset); +} + +TEST(utf32be, invalid) { + std::string_view str("\xFF\xFF\xFF\xFF"); + size_t offset = 0; + auto ret = utf::read32be(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00\x00\xD8\x00", 4); + offset = 0; + ret = utf::read32be(str, 
offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = ""; + offset = 0; + ret = utf::read32be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00", 1); + offset = 0; + ret = utf::read32be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00\x00", 2); + offset = 0; + ret = utf::read32be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00\x00\x00", 3); + offset = 0; + ret = utf::read32be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); +} + +TEST(utf32le, invalid) { + std::string_view str("\xFF\xFF\xFF\xFF"); + size_t offset = 0; + auto ret = utf::read32le(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00\xD8\x00\x00", 4); + offset = 0; + ret = utf::read32le(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = ""; + offset = 0; + ret = utf::read32le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00", 1); + offset = 0; + ret = utf::read32le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00\x00", 2); + offset = 0; + ret = utf::read32le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = std::string_view("\x00\x00\x00", 3); + offset = 0; + ret = utf::read32le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); +} + +TEST(utf32be, bom) { /* the byte order mark is U+FEFF, i.e. bytes 00 00 FE FF in UTF-32BE, followed here by U+20AC */ + std::string_view str("\x00\x00\xFE\xFF\x00\x00\x20\xAC", 8); + size_t offset = 0; + auto ret = utf::read32be(str, offset); + EXPECT_EQ(0xFEFF, ret); + ret = utf::read32be(str, offset); + EXPECT_EQ(0x20AC, ret); + ret = utf::read32be(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(str.size(), offset); +} + +TEST(utf32le, bom) { /* the byte order mark is U+FEFF, i.e. bytes FF FE 00 00 in UTF-32LE, followed here by U+20AC */ + std::string_view str("\xFF\xFE\x00\x00\xAC\x20\x00\x00", 8); + 
size_t offset = 0; + auto ret = utf::read32le(str, offset); + EXPECT_EQ(0xFEFF, ret); + ret = utf::read32le(str, offset); + EXPECT_EQ(0x20AC, ret); + ret = utf::read32le(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(str.size(), offset); +} diff --git a/utf/tst/test_utf8.cc b/utf/tst/test_utf8.cc new file mode 100644 index 0000000..10df969 --- /dev/null +++ b/utf/tst/test_utf8.cc @@ -0,0 +1,188 @@ +#include "utf8.hh" + +#include "utf_error.hh" + +#include <gtest/gtest.h> + +TEST(utf8, sanity) { + std::string_view str("$"); + size_t offset = 0; + auto ret = utf::read8(str, offset); + EXPECT_EQ('$', ret); + EXPECT_EQ(1, offset); + + str = "\xC2\xA3"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(0xa3, ret); + EXPECT_EQ(2, offset); + + str = "\xD0\x98"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(0x418, ret); + EXPECT_EQ(2, offset); + + str = "\xE0\xA4\xB9"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(0x939, ret); + EXPECT_EQ(3, offset); + + str = "\xE2\x82\xAC"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(0x20AC, ret); + EXPECT_EQ(3, offset); + + str = "\xED\x95\x9C"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(0xD55C, ret); + EXPECT_EQ(3, offset); + + str = "\xF0\x90\x8D\x88"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(0x10348, ret); + EXPECT_EQ(4, offset); +} + +TEST(utf8, overlong) { + std::string_view str("\xF0\x82\x82\xAC"); + size_t offset = 0; + auto ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\xE0\x81\x81"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\xC0\x80"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); +} + +TEST(utf8, invalid) { + std::string_view str("\xED\xB0\x80"); + size_t offset = 0; + auto ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, 
offset); + + str = "\xFB\xFF\xFF"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\xFF\xFF\xFF\xFF\xFF"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = ""; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\x80"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\xC2"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\xC2\x03"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::INVALID, ret); + EXPECT_EQ(0, offset); + + str = "\xE0\xA4"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); + + str = "\xF0\x90\x8D"; + offset = 0; + ret = utf::read8(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(0, offset); +} + +TEST(utf8, multiple1) { + std::string_view str("\x4D\xC3\xAC\x6E\x68\x20\x6E\xC3\xB3\x69\x20\x74\x69" + "\xE1\xBA\xBF\x6E\x67\x20\x56\x69\xE1\xBB\x87\x74"); + size_t offset = 0; + auto ret = utf::read8(str, offset); + EXPECT_EQ('M', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(0xEC, ret); + ret = utf::read8(str, offset); + EXPECT_EQ('n', ret); + ret = utf::read8(str, offset); + EXPECT_EQ('h', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(' ', ret); + ret = utf::read8(str, offset); + EXPECT_EQ('n', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(0xF3, ret); + ret = utf::read8(str, offset); + EXPECT_EQ('i', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(' ', ret); + ret = utf::read8(str, offset); + EXPECT_EQ('t', ret); + ret = utf::read8(str, offset); + EXPECT_EQ('i', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(0x1EBF, ret); + ret = utf::read8(str, offset); + EXPECT_EQ('n', ret); + ret = utf::read8(str, offset); + EXPECT_EQ('g', ret); 
+ ret = utf::read8(str, offset); + EXPECT_EQ(' ', ret); + ret = utf::read8(str, offset); + EXPECT_EQ('V', ret); + ret = utf::read8(str, offset); + EXPECT_EQ('i', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(0x1EC7, ret); + ret = utf::read8(str, offset); + EXPECT_EQ('t', ret); + ret = utf::read8(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(str.size(), offset); +} + +TEST(utf8, multiple2) { + std::string_view str("\xF0\xA8\x89\x9F\xE5\x91\x90\xE3\x97\x82\xE8\xB6\x8A"); + size_t offset = 0; + auto ret = utf::read8(str, offset); + EXPECT_EQ(0x2825F, ret); + ret = utf::read8(str, offset); + EXPECT_EQ(0x5450, ret); + ret = utf::read8(str, offset); + EXPECT_EQ(0x35C2, ret); + ret = utf::read8(str, offset); + EXPECT_EQ(0x8D8A, ret); + ret = utf::read8(str, offset); + EXPECT_EQ(utf::NEED_MORE, ret); + EXPECT_EQ(str.size(), offset); +} |
