From 18a622f378b403788c67fc785d30f4609caa3fc7 Mon Sep 17 00:00:00 2001 From: Joel Klinghed Date: Mon, 15 Sep 2025 20:52:51 +0200 Subject: uio: Unicode reader Reads UTF-8 and UTF-16 into UTF-8 or UTF-16 strings. If strict is true, fails at first invalid character. If strict is false, invalid characters are replaced with U+FFFD. For the replacement, I changed the behavior of uN::read_replace to only jump one byte. Otherwise a common invalid case when ISO-8859-1 or WIN-1252 are read as UTF-8 would skip many characters. If skip_bom is true, any BOM at the start of the stream is ignored. If skip_bom is false, any BOM will be included. Input format can be forced; if not, detect is used, which will try to guess and then fall back to UTF-8. --- test/u.cc | 87 +++++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 38 deletions(-) (limited to 'test/u.cc') diff --git a/test/u.cc b/test/u.cc index 53455f2..dc77e7d 100644 --- a/test/u.cc +++ b/test/u.cc @@ -4,6 +4,7 @@ #include "umod8.hh" #include "u16.hh" +#include #include namespace { @@ -20,7 +21,7 @@ TEST(u8, empty) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::End, ret.error()); - auto ret_replace = u8::read_replace(it, empty.end()); + auto ret_replace = u8::read_replace(it, empty.end(), false); ASSERT_FALSE(ret_replace.has_value()); EXPECT_EQ(u::ReadErrorReplace::End, ret_replace.error()); @@ -75,7 +76,7 @@ TEST(u8, examples) { EXPECT_EQ(it, literal.end()); it = literal.begin(); - auto ret_replace = u8::read_replace(it, literal.end()); + auto ret_replace = u8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0x10345, *ret_replace); EXPECT_EQ(it, literal.end()); @@ -153,9 +154,14 @@ TEST(u8, incomplete) { EXPECT_EQ(u::ReadError::Incomplete, ret.error()); it = literal.begin(); - auto ret_replace = u8::read_replace(it, literal.end()); + auto ret_replace = u8::read_replace(it, literal.end(), false); ASSERT_FALSE(ret_replace.has_value()); 
EXPECT_EQ(u::ReadErrorReplace::Incomplete, ret_replace.error()); + + it = literal.begin(); + ret_replace = u8::read_replace(it, literal.end(), true); + ASSERT_TRUE(ret_replace.has_value()); + EXPECT_EQ(0xfffd, ret_replace.value()); } { std::vector literal{0xf0}; @@ -188,10 +194,10 @@ TEST(u8, invalid) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = literal.begin(); - auto ret_replace = u8::read_replace(it, literal.end()); + auto ret_replace = u8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); - EXPECT_EQ(it, literal.end()); + EXPECT_EQ(it, std::next(literal.begin())); } { std::vector literal{0xa0}; @@ -200,10 +206,10 @@ TEST(u8, invalid) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = literal.begin(); - auto ret_replace = u8::read_replace(it, literal.end()); + auto ret_replace = u8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); - EXPECT_EQ(it, literal.end()); + EXPECT_EQ(it, std::next(literal.begin())); } { std::vector literal{0xce, 0xff}; @@ -212,10 +218,10 @@ TEST(u8, invalid) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = literal.begin(); - auto ret_replace = u8::read_replace(it, literal.end()); + auto ret_replace = u8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); - EXPECT_EQ(it, literal.end()); + EXPECT_EQ(it, std::next(literal.begin())); } { std::vector literal{0xec, 0xff, 0x84}; @@ -224,10 +230,10 @@ TEST(u8, invalid) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = literal.begin(); - auto ret_replace = u8::read_replace(it, literal.end()); + auto ret_replace = u8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); - EXPECT_EQ(it, literal.end()); + EXPECT_EQ(it, 
std::next(literal.begin())); } { std::vector literal{0xec, 0x9c, 0xff}; @@ -236,10 +242,10 @@ TEST(u8, invalid) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = literal.begin(); - auto ret_replace = u8::read_replace(it, literal.end()); + auto ret_replace = u8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); - EXPECT_EQ(it, literal.end()); + EXPECT_EQ(it, std::next(literal.begin())); } { std::vector literal{0xf0, 0xff, 0x8d, 0x85}; @@ -248,10 +254,10 @@ TEST(u8, invalid) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = literal.begin(); - auto ret_replace = u8::read_replace(it, literal.end()); + auto ret_replace = u8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); - EXPECT_EQ(it, literal.end()); + EXPECT_EQ(it, std::next(literal.begin())); } { std::vector literal{0xf0, 0x90, 0xff, 0x85}; @@ -260,10 +266,10 @@ TEST(u8, invalid) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = literal.begin(); - auto ret_replace = u8::read_replace(it, literal.end()); + auto ret_replace = u8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); - EXPECT_EQ(it, literal.end()); + EXPECT_EQ(it, std::next(literal.begin())); } { std::vector literal{0xf0, 0x90, 0x8d, 0xff}; @@ -272,10 +278,10 @@ TEST(u8, invalid) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = literal.begin(); - auto ret_replace = u8::read_replace(it, literal.end()); + auto ret_replace = u8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); - EXPECT_EQ(it, literal.end()); + EXPECT_EQ(it, std::next(literal.begin())); } } @@ -286,7 +292,7 @@ TEST(umod8, empty) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::End, ret.error()); - auto 
ret_replace = umod8::read_replace(it, empty.end()); + auto ret_replace = umod8::read_replace(it, empty.end(), false); ASSERT_FALSE(ret_replace.has_value()); EXPECT_EQ(u::ReadErrorReplace::End, ret_replace.error()); @@ -329,7 +335,7 @@ TEST(umod8, examples) { EXPECT_EQ(it, literal.end()); it = literal.begin(); - auto ret_replace = umod8::read_replace(it, literal.end()); + auto ret_replace = umod8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0x10400, *ret_replace); EXPECT_EQ(it, literal.end()); @@ -416,9 +422,14 @@ TEST(umod8, incomplete) { EXPECT_EQ(u::ReadError::Incomplete, ret.error()); it = literal.begin(); - auto ret_replace = umod8::read_replace(it, literal.end()); + auto ret_replace = umod8::read_replace(it, literal.end(), false); ASSERT_FALSE(ret_replace.has_value()); EXPECT_EQ(u::ReadErrorReplace::Incomplete, ret_replace.error()); + + it = literal.begin(); + ret_replace = umod8::read_replace(it, literal.end(), true); + ASSERT_TRUE(ret_replace.has_value()); + EXPECT_EQ(0xfffd, ret_replace.value()); } { std::vector literal{0xed, 0xa0, 0x81, 0xed, 0xb0}; @@ -437,7 +448,7 @@ TEST(umod8, invalid) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = literal.begin(); - auto ret_replace = umod8::read_replace(it, literal.end()); + auto ret_replace = umod8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); EXPECT_EQ(3, literal.end() - it); @@ -449,7 +460,7 @@ TEST(umod8, invalid) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = literal.begin(); - auto ret_replace = umod8::read_replace(it, literal.end()); + auto ret_replace = umod8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); EXPECT_EQ(it, literal.end()); @@ -461,10 +472,10 @@ TEST(umod8, invalid) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = 
literal.begin(); - auto ret_replace = umod8::read_replace(it, literal.end()); + auto ret_replace = umod8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); - EXPECT_EQ(it, literal.end()); + EXPECT_EQ(it, std::next(literal.begin())); } { std::vector literal{0xec, 0xff, 0x84}; @@ -473,10 +484,10 @@ TEST(umod8, invalid) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = literal.begin(); - auto ret_replace = umod8::read_replace(it, literal.end()); + auto ret_replace = umod8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); - EXPECT_EQ(it, literal.end()); + EXPECT_EQ(it, std::next(literal.begin())); } { std::vector literal{0xec, 0x9c, 0xff}; @@ -485,10 +496,10 @@ TEST(umod8, invalid) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = literal.begin(); - auto ret_replace = umod8::read_replace(it, literal.end()); + auto ret_replace = umod8::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); - EXPECT_EQ(it, literal.end()); + EXPECT_EQ(it, std::next(literal.begin())); } { std::vector literal{0xed, 0xb0, 0x80, 0xed, 0xa0, 0x81}; @@ -531,7 +542,7 @@ TEST(u16, empty) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::End, ret.error()); - auto ret_replace = u16::read_replace(it, empty.end()); + auto ret_replace = u16::read_replace(it, empty.end(), false); ASSERT_FALSE(ret_replace.has_value()); EXPECT_EQ(u::ReadErrorReplace::End, ret_replace.error()); @@ -586,7 +597,7 @@ TEST(u16, examples) { EXPECT_EQ(it, literal.end()); it = literal.begin(); - auto ret_replace = u16::read_replace(it, literal.end()); + auto ret_replace = u16::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0x24b62, *ret_replace); EXPECT_EQ(it, literal.end()); @@ -652,11 +663,11 @@ TEST(u16, invalid) { 
ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = literal.begin(); - auto ret_replace = u16::read_replace(it, literal.end()); + auto ret_replace = u16::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); EXPECT_NE(it, literal.end()); - ret_replace = u16::read_replace(it, literal.end()); + ret_replace = u16::read_replace(it, literal.end(), false); ASSERT_FALSE(ret_replace.has_value()); EXPECT_EQ(u::ReadErrorReplace::Incomplete, ret_replace.error()); } @@ -667,22 +678,22 @@ TEST(u16, invalid) { ASSERT_FALSE(ret.has_value()); EXPECT_EQ(u::ReadError::Invalid, ret.error()); it = literal.begin(); - auto ret_replace = u16::read_replace(it, literal.end()); + auto ret_replace = u16::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); EXPECT_NE(it, literal.end()); - ret_replace = u16::read_replace(it, literal.end()); + ret_replace = u16::read_replace(it, literal.end(), false); ASSERT_FALSE(ret_replace.has_value()); EXPECT_EQ(u::ReadErrorReplace::Incomplete, ret_replace.error()); } { std::vector literal{0xdc37, 0xdf62}; auto it = literal.begin(); - auto ret_replace = u16::read_replace(it, literal.end()); + auto ret_replace = u16::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); EXPECT_NE(it, literal.end()); - ret_replace = u16::read_replace(it, literal.end()); + ret_replace = u16::read_replace(it, literal.end(), false); ASSERT_TRUE(ret_replace.has_value()); EXPECT_EQ(0xfffd, *ret_replace); EXPECT_EQ(it, literal.end()); -- cgit v1.3