#include "utf8.hh"
#include "utf_error.hh"

#include <cstddef>
#include <string>
#include <string_view>

#include <gtest/gtest.h>

// Unit tests for utf::read8, which is called here as read8(input, offset):
// it returns either a decoded code point or one of the status values
// utf::INVALID / utf::NEED_MORE, and it advances `offset` past the bytes it
// consumed. The tests below pin that `offset` is left untouched whenever a
// status value is returned instead of a code point.

namespace {

/// Builds a failure annotation that shows the input bytes in escaped form, so
/// a failing expectation inside a shared helper still identifies its case.
std::string describe(std::string_view input) {
    return "input: " + ::testing::PrintToString(std::string(input));
}

/// Decodes once from the start of `input` and checks both the value returned
/// by utf::read8 and the offset it leaves behind.
///
/// @param input            bytes handed to utf::read8
/// @param expected         value the call must return (a code point or a
///                         utf:: status value); kept in the caller's own type
///                         so the comparison is the same one a direct
///                         EXPECT_EQ would perform
/// @param expected_offset  offset expected after the call; typed as
///                         std::size_t so it is never compared against a
///                         signed literal
template <typename Expected>
void expect_read(std::string_view input, Expected expected,
                 std::size_t expected_offset) {
    SCOPED_TRACE(describe(input));
    std::size_t offset = 0;
    const auto ret = utf::read8(input, offset);
    EXPECT_EQ(expected, ret);
    EXPECT_EQ(expected_offset, offset);
}

/// Decodes `input` front to back with a single running offset and checks that
/// it yields exactly `expected...` in order, that one further read reports
/// utf::NEED_MORE, and that the offset then equals the input size.
template <typename... Expected>
void expect_sequence(std::string_view input, Expected... expected) {
    SCOPED_TRACE(describe(input));
    std::size_t offset = 0;
    std::size_t index = 0;

    const auto check_next = [&](auto want) {
        SCOPED_TRACE("code point index " + std::to_string(index));
        ++index;
        const auto ret = utf::read8(input, offset);
        EXPECT_EQ(want, ret);
    };
    // A comma fold evaluates strictly left to right, so the reads happen in
    // the order the expected values are listed.
    (check_next(expected), ...);

    const auto past_end = utf::read8(input, offset);
    EXPECT_EQ(utf::NEED_MORE, past_end);
    EXPECT_EQ(input.size(), offset);
}

}  // namespace

// Well-formed sequences of every length (1 to 4 bytes).
TEST(utf8, sanity) {
    expect_read("$", '$', 1);                     // U+0024
    expect_read("\xC2\xA3", 0xa3, 2);             // U+00A3
    expect_read("\xD0\x98", 0x418, 2);            // U+0418
    expect_read("\xE0\xA4\xB9", 0x939, 3);        // U+0939
    expect_read("\xE2\x82\xAC", 0x20AC, 3);       // U+20AC
    expect_read("\xED\x95\x9C", 0xD55C, 3);       // U+D55C
    expect_read("\xF0\x90\x8D\x88", 0x10348, 4);  // U+10348
}

// Overlong encodings: the payload bits would fit in a shorter sequence.
TEST(utf8, overlong) {
    expect_read("\xF0\x82\x82\xAC", utf::INVALID, 0);  // U+20AC in 4 bytes
    expect_read("\xE0\x81\x81", utf::INVALID, 0);      // U+0041 in 3 bytes
    expect_read("\xC0\x80", utf::INVALID, 0);          // U+0000 in 2 bytes
}

// Malformed and truncated input.
TEST(utf8, invalid) {
    expect_read("\xED\xB0\x80", utf::INVALID, 0);          // encodes surrogate U+DC00
    expect_read("\xFB\xFF\xFF", utf::INVALID, 0);          // 0xFB is not a valid lead byte
    expect_read("\xFF\xFF\xFF\xFF\xFF", utf::INVALID, 0);  // 0xFF never appears in UTF-8
    expect_read("", utf::NEED_MORE, 0);                    // nothing to read
    expect_read("\x80", utf::INVALID, 0);                  // continuation byte first
    expect_read("\xC2", utf::NEED_MORE, 0);                // 2-byte sequence cut short
    expect_read("\xC2\x03", utf::INVALID, 0);              // second byte is not a continuation
    expect_read("\xE0\xA4", utf::NEED_MORE, 0);            // 3-byte sequence cut short
    expect_read("\xF0\x90\x8D", utf::NEED_MORE, 0);        // 4-byte sequence cut short
}

// Mixed 1-, 2- and 3-byte sequences read back to back.
TEST(utf8, multiple1) {
    expect_sequence(
        "\x4D\xC3\xAC\x6E\x68\x20\x6E\xC3\xB3\x69\x20\x74\x69"
        "\xE1\xBA\xBF\x6E\x67\x20\x56\x69\xE1\xBB\x87\x74",
        'M', 0xEC, 'n', 'h', ' ',
        'n', 0xF3, 'i', ' ',
        't', 'i', 0x1EBF, 'n', 'g', ' ',
        'V', 'i', 0x1EC7, 't');
}

// One 4-byte sequence followed by three 3-byte sequences.
TEST(utf8, multiple2) {
    expect_sequence("\xF0\xA8\x89\x9F\xE5\x91\x90\xE3\x97\x82\xE8\xB6\x8A",
                    0x2825F, 0x5450, 0x35C2, 0x8D8A);
}