From 7dd49c6293172b494c78918507242cdb55d35137 Mon Sep 17 00:00:00 2001 From: Joel Klinghed Date: Sun, 21 Jan 2024 12:31:30 +0100 Subject: WIP --- utf/inc/utf16.hh | 12 +-- utf/inc/utf32.hh | 12 +-- utf/inc/utf8.hh | 19 ++++- utf/meson.build | 6 +- utf/src/utf16.cc | 4 +- utf/src/utf32.cc | 4 +- utf/src/utf8.cc | 35 ++++++++- utf/tst/test_utf16.cc | 81 ++++++++------------ utf/tst/test_utf32.cc | 85 ++++++++++----------- utf/tst/test_utf8.cc | 204 ++++++++++++++++++++++++++++++++------------------ 10 files changed, 270 insertions(+), 192 deletions(-) (limited to 'utf') diff --git a/utf/inc/utf16.hh b/utf/inc/utf16.hh index 344b1a2..b9229bc 100644 --- a/utf/inc/utf16.hh +++ b/utf/inc/utf16.hh @@ -4,27 +4,29 @@ #include "macros.hh" #include -#include +#include namespace utf { -/* Read one unicode codepoint from UTF-16 BigEndian encoded data if possible. +/** + * Read one unicode codepoint from UTF-16 BigEndian encoded data if possible. * If successfull offset is incremented to point to next codepoint. * Will fail: * - not enough data is left in data given offset, returns NEED_MORE. * - data is not valid UTF-16, ie. invalid or incomplete surrogate pairs, * returns INVALID. */ -uint32_t HIDDEN read16be(std::string_view data, std::size_t& offset); +uint32_t HIDDEN read16be(std::span data, std::size_t& offset); -/* Read one unicode codepoint from UTF-16 LittleEndian encoded data if possible. +/** + * Read one unicode codepoint from UTF-16 LittleEndian encoded data if possible. * If successfull offset is incremented to point to next codepoint. * Will fail: * - not enough data is left in data given offset, returns NEED_MORE. * - data is not valid UTF-16, ie. invalid or incomplete surrogate pairs, * returns INVALID. */ -uint32_t HIDDEN read16le(std::string_view data, std::size_t& offset); +uint32_t HIDDEN read16le(std::span data, std::size_t& offset); } // namespace utf diff --git a/utf/inc/utf32.hh b/utf/inc/utf32.hh index 2d3088e..4ee5eac 100644 --- a/utf/inc/utf32.hh +++ b/utf/inc/utf32.hh @@ -4,25 +4,27 @@ #include "macros.hh" #include -#include +#include namespace utf { -/* Read one unicode codepoint from UTF-32 BigEndian encoded data if possible. +/** + * Read one unicode codepoint from UTF-32 BigEndian encoded data if possible. * If successfull offset is incremented to point to next codepoint. * Will fail: * - not enough data is left in data given offset, returns NEED_MORE. * - data is not valid UTF-32, ie. outside valid ranges, returns INVALID. */ -uint32_t HIDDEN read32be(std::string_view data, std::size_t& offset); +uint32_t HIDDEN read32be(std::span data, std::size_t& offset); -/* Read one unicode codepoint from UTF-32 LittleEndian encoded data if possible. +/** + * Read one unicode codepoint from UTF-32 LittleEndian encoded data if possible. * If successfull offset is incremented to point to next codepoint. * Will fail: * - not enough data is left in data given offset, returns NEED_MORE. * - data is not valid UTF-32, ie. outside valid ranges, returns INVALID. */ -uint32_t HIDDEN read32le(std::string_view data, std::size_t& offset); +uint32_t HIDDEN read32le(std::span data, std::size_t& offset); } // namespace utf diff --git a/utf/inc/utf8.hh b/utf/inc/utf8.hh index a3ea84a..7735ecd 100644 --- a/utf/inc/utf8.hh +++ b/utf/inc/utf8.hh @@ -4,18 +4,29 @@ #include "macros.hh" #include -#include +#include namespace utf { -/* Read one unicode codepoint from UTF-8 encoded data if possible. - * If successfull offset is incremented to point to next codepoint. +/** + * Read one unicode codepoint from UTF-8 encoded data if possible. + * If successful, offset is incremented to point to next codepoint. * Will fail: * - not enough data is left in data given offset, returns NEED_MORE. * - data is not valid UTF-8, this includes overlong encodings and * invalid unicode code points, returns INVALID. */ -uint32_t HIDDEN read8(std::string_view data, std::size_t& offset); +uint32_t HIDDEN read8(std::span data, std::size_t& offset); + +/** + * Write one unicode codepoint to UTF-8 encoded data if possible. + * If successful, offset is incremented to the end of the written data + * and true is returned. + * If not successful, offset is not incremented and false is returned. + * data is not modified. + */ +bool HIDDEN write8(uint32_t codepoint, std::span data, + std::size_t& offset); } // namespace utf diff --git a/utf/meson.build b/utf/meson.build index 64db6ff..051ddd1 100644 --- a/utf/meson.build +++ b/utf/meson.build @@ -23,16 +23,16 @@ test('utf8', executable( 'test_utf8', sources: ['tst/test_utf8.cc'], - dependencies: [utf_dep, gtest_dep])) + dependencies: [utf_dep, gmock_dep, gtest_dep])) test('utf16', executable( 'test_utf16', sources: ['tst/test_utf16.cc'], - dependencies: [utf_dep, gtest_dep])) + dependencies: [utf_dep, gmock_dep, gtest_dep])) test('utf32', executable( 'test_utf32', sources: ['tst/test_utf32.cc'], - dependencies: [utf_dep, gtest_dep])) + dependencies: [utf_dep, gmock_dep, gtest_dep])) diff --git a/utf/src/utf16.cc b/utf/src/utf16.cc index 43595bf..623c1be 100644 --- a/utf/src/utf16.cc +++ b/utf/src/utf16.cc @@ -16,7 +16,7 @@ inline bool is_low_surrogate(uint16_t c) { } // namespace -uint32_t read16be(std::string_view data, std::size_t& offset) { +uint32_t read16be(std::span data, std::size_t& offset) { if (offset > data.size() || data.size() - offset < 2) return NEED_MORE; uint16_t c = static_cast(data[offset]) << 8 @@ -40,7 +40,7 @@ uint32_t read16be(std::string_view data, std::size_t& offset) { return c; } -uint32_t read16le(std::string_view data, std::size_t& offset) { +uint32_t read16le(std::span data, std::size_t& offset) { if (offset > data.size() || data.size() - offset < 2) return NEED_MORE; uint16_t c = static_cast(data[offset + 1]) << 8 diff --git a/utf/src/utf32.cc b/utf/src/utf32.cc index cfa29b6..e33b0b4 100644 --- a/utf/src/utf32.cc +++ b/utf/src/utf32.cc @@ -12,7 +12,7 @@ inline bool valid_codepoint(uint32_t c) { } // namespace -uint32_t read32be(std::string_view data, std::size_t& offset) { +uint32_t read32be(std::span data, std::size_t& offset) { if (offset > data.size() || data.size() - offset < 4) return NEED_MORE; uint32_t c = static_cast(data[offset]) << 24 @@ -26,7 +26,7 @@ uint32_t read32be(std::string_view data, std::size_t& offset) { return INVALID; } -uint32_t read32le(std::string_view data, std::size_t& offset) { +uint32_t read32le(std::span data, std::size_t& offset) { if (offset > data.size() || data.size() - offset < 4) return NEED_MORE; uint32_t c = static_cast(data[offset + 3]) << 24 diff --git a/utf/src/utf8.cc b/utf/src/utf8.cc index 54b0296..0e444ae 100644 --- a/utf/src/utf8.cc +++ b/utf/src/utf8.cc @@ -12,12 +12,12 @@ inline bool valid_codepoint(uint32_t c) { } // namespace -uint32_t read8(std::string_view data, std::size_t& offset) { +uint32_t read8(std::span data, std::size_t& offset) { if (offset >= data.size()) return NEED_MORE; uint32_t ret; uint8_t size; - switch (static_cast(data[offset]) >> 4) { + switch (data[offset] >> 4) { case 15: if (data[offset] & 0x08) return INVALID; @@ -65,4 +65,35 @@ uint32_t read8(std::string_view data, std::size_t& offset) { return ret; } +bool write8(uint32_t codepoint, std::span data, std::size_t& offset) { + if (offset >= data.size()) UNLIKELY { + return false; + } + if (codepoint < 0x80) { + data[offset++] = codepoint; + } else if (codepoint < 0x800) { + if (data.size() - offset < 2) UNLIKELY { + return false; + } + data[offset++] = 0xc0 | (codepoint >> 6); + data[offset++] = 0x80 | (codepoint & 0x3f); + } else if (codepoint < 0x10000) { + if (data.size() - offset < 3) UNLIKELY { + return false; + } + data[offset++] = 0xe0 | (codepoint >> 12); + data[offset++] = 0x80 | ((codepoint >> 6) & 0x3f); + data[offset++] = 0x80 | (codepoint & 0x3f); + } else { + if (data.size() - offset < 4) UNLIKELY { + return false; + } + data[offset++] = 0xf0 | (codepoint >> 18); + data[offset++] = 0x80 | ((codepoint >> 12) & 0x3f); + data[offset++] = 0x80 | ((codepoint >> 6) & 0x3f); + data[offset++] = 0x80 | (codepoint & 0x3f); + } + return true; +} + } // namespace utf diff --git a/utf/tst/test_utf16.cc b/utf/tst/test_utf16.cc index c17982e..3b3c03c 100644 --- a/utf/tst/test_utf16.cc +++ b/utf/tst/test_utf16.cc @@ -2,156 +2,137 @@ #include "utf_error.hh" +#include #include TEST(utf16be, sanity) { - std::string_view str("\x00\x24", 2); size_t offset = 0; - auto ret = utf::read16be(str, offset); + auto ret = utf::read16be(std::array({0x00, 0x24}), offset); EXPECT_EQ('$', ret); EXPECT_EQ(2, offset); - str = "\x20\xAC"; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array({0x20, 0xAC}), offset); EXPECT_EQ(0x20AC, ret); EXPECT_EQ(2, offset); - str = "\xD8\x01\xDC\x37"; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array({0xD8, 0x01, 0xDC, 0x37}), offset); EXPECT_EQ(0x10437, ret); EXPECT_EQ(4, offset); - str = "\xD8\x52\xDF\x62"; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array({0xD8, 0x52, 0xDF, 0x62}), offset); EXPECT_EQ(0x24B62, ret); EXPECT_EQ(4, offset); } TEST(utf16le, sanity) { - std::string_view str("\x24\x00", 2); size_t offset = 0; - auto ret = utf::read16le(str, offset); + auto ret = utf::read16le(std::array({0x24, 0x00}), offset); EXPECT_EQ('$', ret); EXPECT_EQ(2, offset); - str = "\xAC\x20"; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array({0xAC, 0x20}), offset); EXPECT_EQ(0x20AC, ret); EXPECT_EQ(2, offset); - str = "\x01\xD8\x37\xDC"; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array({0x01, 0xD8, 0x37, 0xDC}), offset); EXPECT_EQ(0x10437, ret); EXPECT_EQ(4, offset); - str = "\x52\xD8\x62\xDF"; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array({0x52, 0xD8, 0x62, 0xDF}), offset); EXPECT_EQ(0x24B62, ret); EXPECT_EQ(4, offset); } TEST(utf16be, bom) { - std::string_view str("\xFE\xFF\x20\xAC"); + std::array data({0xFE, 0xFF, 0x20, 0xAC}); size_t offset = 0; - auto ret = utf::read16be(str, offset); + auto ret = utf::read16be(data, offset); EXPECT_EQ(0xFEFF, ret); - ret = utf::read16be(str, offset); + ret = utf::read16be(data, offset); EXPECT_EQ(0x20AC, ret); - ret = utf::read16be(str, offset); + ret = utf::read16be(data, offset); EXPECT_EQ(utf::NEED_MORE, ret); - EXPECT_EQ(str.size(), offset); + EXPECT_EQ(data.size(), offset); } TEST(utf16le, bom) { - std::string_view str("\xFF\xFE\xAC\x20"); + std::array data({0xFF, 0xFE, 0xAC, 0x20}); size_t offset = 0; - auto ret = utf::read16le(str, offset); + auto ret = utf::read16le(data, offset); EXPECT_EQ(0xFEFF, ret); - ret = utf::read16le(str, offset); + ret = utf::read16le(data, offset); EXPECT_EQ(0x20AC, ret); - ret = utf::read16le(str, offset); + ret = utf::read16le(data, offset); EXPECT_EQ(utf::NEED_MORE, ret); - EXPECT_EQ(str.size(), offset); + EXPECT_EQ(data.size(), offset); } TEST(utf16be, invalid) { - std::string_view str("\xD8"); size_t offset = 0; - auto ret = utf::read16be(str, offset); + auto ret = utf::read16be(std::array({0xD8}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = ""; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array(), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\xD8\x01"; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array({0xD8, 0x01}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\xD8\x01\xDC"; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array({0xD8, 0x01, 0xDC}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\xDC\x37\xD8\x01"; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array({0xDC, 0x37, 0xD8, 0x01}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\xD8\x01\xD8\x01"; offset = 0; - ret = utf::read16be(str, offset); + ret = utf::read16be(std::array({0xD8, 0x01, 0xD8, 0x01}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); } TEST(utf16le, invalid) { - std::string_view str("\x01"); size_t offset = 0; - auto ret = utf::read16le(str, offset); + auto ret = utf::read16le(std::array({0x01}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = ""; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array(), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\x01\xD8"; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array({0x01, 0xD8}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\x01\xD8\x37"; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array({0x01, 0xD8, 0x37}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\x37\xDC\x01\xD8"; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array({0x37, 0xDC, 0x01, 0xD8}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\x01\xD8\x01\xD8"; offset = 0; - ret = utf::read16le(str, offset); + ret = utf::read16le(std::array({0x01, 0xD8, 0x01, 0xD8}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); } diff --git a/utf/tst/test_utf32.cc b/utf/tst/test_utf32.cc index 796b4cd..447b541 100644 --- a/utf/tst/test_utf32.cc +++ b/utf/tst/test_utf32.cc @@ -2,144 +2,137 @@ #include "utf_error.hh" +#include #include TEST(utf32be, sanity) { - std::string_view str("\x00\x00\x00\x24", 4); size_t offset = 0; - auto ret = utf::read32be(str, offset); + auto ret = utf::read32be( + std::array({0x00, 0x00, 0x00, 0x24}), offset); EXPECT_EQ('$', ret); EXPECT_EQ(4, offset); - str = std::string_view("\x00\x00\x20\xAC", 4); offset = 0; - ret = utf::read32be(str, offset); + ret = utf::read32be( + std::array({0x00, 0x00, 0x20, 0xAC}), offset); EXPECT_EQ(0x20AC, ret); EXPECT_EQ(4, offset); - str = std::string_view("\x00\x01\x04\x37", 4); offset = 0; - ret = utf::read32be(str, offset); + ret = utf::read32be( + std::array({0x00, 0x01, 0x04, 0x37}), offset); EXPECT_EQ(0x10437, ret); EXPECT_EQ(4, offset); } TEST(utf32le, sanity) { - std::string_view str("\x24\x00\x00\x00", 4); size_t offset = 0; - auto ret = utf::read32le(str, offset); + auto ret = utf::read32le( + std::array({0x24, 0x00, 0x00, 0x00}), offset); EXPECT_EQ('$', ret); EXPECT_EQ(4, offset); - str = std::string_view("\xAC\x20\x00\x00", 4); offset = 0; - ret = utf::read32le(str, offset); + ret = utf::read32le( + std::array({0xAC, 0x20, 0x00, 0x00}), offset); EXPECT_EQ(0x20AC, ret); EXPECT_EQ(4, offset); - str = std::string_view("\x37\x04\x01\x00", 4); offset = 0; - ret = utf::read32le(str, offset); + ret = utf::read32le( + std::array({0x37, 0x04, 0x01, 0x00}), offset); EXPECT_EQ(0x10437, ret); EXPECT_EQ(4, offset); } TEST(utf32be, invalid) { - std::string_view str("\xFF\xFF\xFF\xFF"); size_t offset = 0; - auto ret = utf::read32be(str, offset); + auto ret = utf::read32be( + std::array({0xFF, 0xFF, 0xFF, 0xFF}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00\x00\xD8\x00", 4); offset = 0; - ret = utf::read32be(str, offset); + ret = utf::read32be( + std::array({0x00, 0x00, 0xD8, 0x00}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = ""; offset = 0; - ret = utf::read32be(str, offset); + ret = utf::read32be(std::array({}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00", 1); offset = 0; - ret = utf::read32be(str, offset); + ret = utf::read32be(std::array({0x00}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00\x00", 2); offset = 0; - ret = utf::read32be(str, offset); + ret = utf::read32be(std::array({0x00, 0x00}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00\x00\x00", 3); offset = 0; - ret = utf::read32be(str, offset); + ret = utf::read32be(std::array({0x00, 0x00, 0x00}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); } TEST(utf32le, invalid) { - std::string_view str("\xFF\xFF\xFF\xFF"); size_t offset = 0; - auto ret = utf::read32le(str, offset); + auto ret = utf::read32le( + std::array({0xFF, 0xFF, 0xFF, 0xFF}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00\xD8\x00\x00", 4); offset = 0; - ret = utf::read32le(str, offset); + ret = utf::read32le( + std::array({0x00, 0xD8, 0x00, 0x00}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = ""; offset = 0; - ret = utf::read32le(str, offset); + ret = utf::read32le(std::array(), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00", 1); offset = 0; - ret = utf::read32le(str, offset); + ret = utf::read32le(std::array({0x00}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00\x00", 2); offset = 0; - ret = utf::read32le(str, offset); + ret = utf::read32le(std::array({0x00, 0x00}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = std::string_view("\x00\x00\x00", 3); offset = 0; - ret = utf::read32le(str, offset); + ret = utf::read32le(std::array({0x00, 0x00, 0x00}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); } TEST(utf32be, bom) { - std::string_view str("\x00\x00\xFF\xFE\x00\x00\x20\xAC", 8); + std::array data({0x00, 0x00, 0xFF, 0xFE, 0x00, 0x00, 0x20, 0xAC}); size_t offset = 0; - auto ret = utf::read32be(str, offset); + auto ret = utf::read32be(data, offset); EXPECT_EQ(0xFFFE, ret); - ret = utf::read32be(str, offset); + ret = utf::read32be(data, offset); EXPECT_EQ(0x20AC, ret); - ret = utf::read32be(str, offset); + ret = utf::read32be(data, offset); EXPECT_EQ(utf::NEED_MORE, ret); - EXPECT_EQ(str.size(), offset); + EXPECT_EQ(data.size(), offset); } TEST(utf32le, bom) { - std::string_view str("\xFE\xFF\x00\x00\xAC\x20\x00\x00", 8); + std::array data({0xFE, 0xFF, 0x00, 0x00, 0xAC, 0x20, 0x00, 0x00}); size_t offset = 0; - auto ret = utf::read32le(str, offset); + auto ret = utf::read32le(data, offset); EXPECT_EQ(0xFFFE, ret); - ret = utf::read32le(str, offset); + ret = utf::read32le(data, offset); EXPECT_EQ(0x20AC, ret); - ret = utf::read32le(str, offset); + ret = utf::read32le(data, offset); EXPECT_EQ(utf::NEED_MORE, ret); - EXPECT_EQ(str.size(), offset); + EXPECT_EQ(data.size(), offset); } diff --git a/utf/tst/test_utf8.cc b/utf/tst/test_utf8.cc index 10df969..8bdeba4 100644 --- a/utf/tst/test_utf8.cc +++ b/utf/tst/test_utf8.cc @@ -2,187 +2,245 @@ #include "utf_error.hh" +#include +#include #include +#include -TEST(utf8, sanity) { - std::string_view str("$"); +TEST(utf8, read_sanity) { size_t offset = 0; - auto ret = utf::read8(str, offset); + auto ret = utf::read8(std::array({'$'}), offset); EXPECT_EQ('$', ret); EXPECT_EQ(1, offset); - str = "\xC2\xA3"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array({0xC2, 0xA3}), offset); EXPECT_EQ(0xa3, ret); EXPECT_EQ(2, offset); - str = "\xD0\x98"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array({0xD0, 0x98}), offset); EXPECT_EQ(0x418, ret); EXPECT_EQ(2, offset); - str = "\xE0\xA4\xB9"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array({0xE0, 0xA4, 0xB9}), offset); EXPECT_EQ(0x939, ret); EXPECT_EQ(3, offset); - str = "\xE2\x82\xAC"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array({0xE2, 0x82, 0xAC}), offset); EXPECT_EQ(0x20AC, ret); EXPECT_EQ(3, offset); - str = "\xED\x95\x9C"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array({0xED, 0x95, 0x9C}), offset); EXPECT_EQ(0xD55C, ret); EXPECT_EQ(3, offset); - str = "\xF0\x90\x8D\x88"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array({0xF0, 0x90, 0x8D, 0x88}), offset); EXPECT_EQ(0x10348, ret); EXPECT_EQ(4, offset); } -TEST(utf8, overlong) { - std::string_view str("\xF0\x82\x82\xAC"); +TEST(utf8, write_sanity) { + std::array out; size_t offset = 0; - auto ret = utf::read8(str, offset); + EXPECT_TRUE(utf::write8('$', out, offset)); + EXPECT_THAT(std::span(out).subspan(0, offset), + testing::ElementsAre('$')); + EXPECT_EQ(1, offset); + + offset = 0; + EXPECT_TRUE(utf::write8(0xa3, out, offset)); + EXPECT_THAT(std::span(out).subspan(0, offset), + testing::ElementsAre(0xC2, 0xA3)); + EXPECT_EQ(2, offset); + + offset = 0; + EXPECT_TRUE(utf::write8(0x418, out, offset)); + EXPECT_THAT(std::span(out).subspan(0, offset), + testing::ElementsAre(0xD0, 0x98)); + EXPECT_EQ(2, offset); + + offset = 0; + EXPECT_TRUE(utf::write8(0x939, out, offset)); + EXPECT_THAT(std::span(out).subspan(0, offset), + testing::ElementsAre(0xE0, 0xA4, 0xB9)); + EXPECT_EQ(3, offset); + + offset = 0; + EXPECT_TRUE(utf::write8(0x20AC, out, offset)); + EXPECT_THAT(std::span(out).subspan(0, offset), + testing::ElementsAre(0xE2, 0x82, 0xAC)); + EXPECT_EQ(3, offset); + + offset = 0; + EXPECT_TRUE(utf::write8(0xD55C, out, offset)); + EXPECT_THAT(std::span(out).subspan(0, offset), + testing::ElementsAre(0xED, 0x95, 0x9C)); + EXPECT_EQ(3, offset); + + offset = 0; + EXPECT_TRUE(utf::write8(0x10348, out, offset)); + EXPECT_THAT(std::span(out).subspan(0, offset), + testing::ElementsAre(0xF0, 0x90, 0x8D, 0x88)); + EXPECT_EQ(4, offset); +} + +TEST(utf8, read_overlong) { + size_t offset = 0; + auto ret = utf::read8( + std::array({0xF0, 0x82, 0x82, 0xAC}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\xE0\x81\x81"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array({0xE0, 0x81, 0x81}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\xC0\x80"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array({0xC0, 0x80}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); } -TEST(utf8, invalid) { - std::string_view str("\xED\xB0\x80"); +TEST(utf8, read_invalid) { size_t offset = 0; - auto ret = utf::read8(str, offset); + auto ret = utf::read8(std::array({0xED, 0xB0, 0x80}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\xFB\xFF\xFF"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array({0xFB, 0xFF, 0xFF}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\xFF\xFF\xFF\xFF\xFF"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8( + std::array({0xFF, 0xFF, 0xFF, 0xFF, 0xFF}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = ""; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array(), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\x80"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array({0x80}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\xC2"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array({0xC2}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\xC2\x03"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array({0xC2, 0x03}), offset); EXPECT_EQ(utf::INVALID, ret); EXPECT_EQ(0, offset); - str = "\xE0\xA4"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array({0xE0, 0xA4}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); - str = "\xF0\x90\x8D"; offset = 0; - ret = utf::read8(str, offset); + ret = utf::read8(std::array({0xF0, 0x90, 0x8D}), offset); EXPECT_EQ(utf::NEED_MORE, ret); EXPECT_EQ(0, offset); } -TEST(utf8, multiple1) { - std::string_view str("\x4D\xC3\xAC\x6E\x68\x20\x6E\xC3\xB3\x69\x20\x74\x69" - "\xE1\xBA\xBF\x6E\x67\x20\x56\x69\xE1\xBB\x87\x74"); +TEST(utf8, read_multiple1) { + std::array data({ + 0x4D, 0xC3, 0xAC, 0x6E, 0x68, 0x20, 0x6E, 0xC3, 0xB3, 0x69, + 0x20, 0x74, 0x69, 0xE1, 0xBA, 0xBF, 0x6E, 0x67, 0x20, 0x56, + 0x69, 0xE1, 0xBB, 0x87, 0x74 + }); size_t offset = 0; - auto ret = utf::read8(str, offset); + auto ret = utf::read8(data, offset); EXPECT_EQ('M', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(0xEC, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('n', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('h', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(' ', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('n', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(0xF3, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('i', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(' ', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('t', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('i', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(0x1EBF, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('n', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('g', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(' ', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('V', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('i', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(0x1EC7, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ('t', ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(utf::NEED_MORE, ret); - EXPECT_EQ(str.size(), offset); + EXPECT_EQ(data.size(), offset); } -TEST(utf8, multiple2) { - std::string_view str("\xF0\xA8\x89\x9F\xE5\x91\x90\xE3\x97\x82\xE8\xB6\x8A"); +TEST(utf8, read_multiple2) { + std::array data({ + 0xF0, 0xA8, 0x89, 0x9F, 0xE5, 0x91, 0x90, 0xE3, 0x97, 0x82, + 0xE8, 0xB6, 0x8A, + }); size_t offset = 0; - auto ret = utf::read8(str, offset); + auto ret = utf::read8(data, offset); EXPECT_EQ(0x2825F, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(0x5450, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(0x35C2, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(0x8D8A, ret); - ret = utf::read8(str, offset); + ret = utf::read8(data, offset); EXPECT_EQ(utf::NEED_MORE, ret); - EXPECT_EQ(str.size(), offset); + EXPECT_EQ(data.size(), offset); +} + +TEST(utf8, write_no_space) { + std::array data; + std::span out(data); + size_t offset = 0; + EXPECT_FALSE(utf::write8('$', out.subspan(0, 0), offset)); + EXPECT_EQ(0u, offset); + + EXPECT_FALSE(utf::write8(0xa3, out.subspan(0, 1), offset)); + EXPECT_EQ(0u, offset); + EXPECT_FALSE(utf::write8(0x418, out.subspan(0, 0), offset)); + EXPECT_EQ(0u, offset); + + EXPECT_FALSE(utf::write8(0x939, out.subspan(0, 2), offset)); + EXPECT_EQ(0u, offset); + EXPECT_FALSE(utf::write8(0x20AC, out.subspan(0, 0), offset)); + EXPECT_EQ(0u, offset); + + EXPECT_FALSE(utf::write8(0x10348, out.subspan(0, 3), offset)); + EXPECT_EQ(0u, offset); } -- cgit v1.2.3-70-g09d2