summaryrefslogtreecommitdiff
path: root/src/uio.hh
diff options
context:
space:
mode:
authorJoel Klinghed <the_jk@spawned.biz>2025-09-15 20:52:51 +0200
committerJoel Klinghed <the_jk@spawned.biz>2025-09-15 20:52:51 +0200
commit18a622f378b403788c67fc785d30f4609caa3fc7 (patch)
tree9d13f4ef49a06c9e4837487f61bc90b734ad9b9a /src/uio.hh
parent28c6425e4ed1cd2eab538e7cba08c18aa83d8af5 (diff)
uio: Unicode reader
Reads UTF-8 and UTF-16 into UTF-8 or UTF-16 strings. If strict is true, reading fails at the first invalid character. If strict is false, invalid characters are replaced with U+FFFD. For the replacement, I changed the behavior of uN::read_replace to only jump one byte. Otherwise, a common invalid case, where ISO-8859-1 or WIN-1252 is read as UTF-8, would skip many characters. If skip_bom is true, any BOM at the start of the stream is ignored. If skip_bom is false, any BOM will be included. The input format can be forced; if it is not, detect is used, which will try to guess and then fall back to UTF-8.
Diffstat (limited to 'src/uio.hh')
-rw-r--r--src/uio.hh78
1 files changed, 78 insertions, 0 deletions
diff --git a/src/uio.hh b/src/uio.hh
new file mode 100644
index 0000000..a0911a1
--- /dev/null
+++ b/src/uio.hh
@@ -0,0 +1,78 @@
+#ifndef UIO_HH
+#define UIO_HH
+
+#include "io.hh" // IWYU pragma: export
+
+#include <cstddef>
+#include <expected>
+#include <string>
+
+namespace u {
+
+enum class ReaderInputFormat { // encoding to assume for the input stream
+ UTF8,
+ UTF16_BE,
+ UTF16_LE,
+ DETECT, // try to guess the encoding, falling back to UTF-8
+};
+
+struct ReaderConfig {
+ // If false (default), invalid data is replaced with U+FFFD; if true, reading fails at the first invalid character
+ bool strict{false};
+ // Input format; DETECT (default) guesses the format, any other value forces it
+ ReaderInputFormat input{ReaderInputFormat::DETECT};
+ // If true (default), any BOM found at start of stream will be skipped; if false, the BOM is included in the output
+ bool skip_bom{true};
+};
+
+} // namespace u
+
+namespace u8 {
+
+class Reader : public io::Reader { // adds std::string overloads of read/repeat_read to io::Reader
+ public:
+ using io::Reader::read; // keep the inherited overloads visible next to the overload declared below
+ using io::Reader::repeat_read; // likewise for repeat_read
+ // NOTE(review): presumably stores decoded UTF-8 in data, limited by max, and returns the amount read -- confirm in the implementation
+ [[nodiscard]] std::expected<size_t, io::ReadError> read(
+ std::string& data, size_t max);
+ // NOTE(review): presumably the std::string counterpart of io::Reader::repeat_read -- confirm in the implementation
+ [[nodiscard]] std::expected<size_t, io::ReadError> repeat_read(
+ std::string& data, size_t max);
+};
+ // Creates a Reader from an existing io::Reader (ownership is transferred) using the given config.
+[[nodiscard]] std::unique_ptr<Reader> open(
+ std::unique_ptr<io::Reader> reader, u::ReaderConfig config = {});
+ // Path-based variants reporting failure as io::OpenError; NOTE(review): dirfd presumably follows POSIX openat(2) semantics -- confirm
+[[nodiscard]] std::expected<std::unique_ptr<Reader>, io::OpenError> open(
+ const std::string& file_path, u::ReaderConfig config = {});
+[[nodiscard]] std::expected<std::unique_ptr<Reader>, io::OpenError> openat(
+ int dirfd, const std::string& file_path, u::ReaderConfig config = {});
+
+} // namespace u8
+
+namespace u16 {
+
+class Reader : public io::Reader { // adds std::u16string overloads of read/repeat_read to io::Reader
+ public:
+ using io::Reader::read; // keep the inherited overloads visible next to the overload declared below
+ using io::Reader::repeat_read; // likewise for repeat_read
+ // NOTE(review): presumably stores decoded UTF-16 in data, limited by max, and returns the amount read -- confirm in the implementation
+ [[nodiscard]] std::expected<size_t, io::ReadError> read(
+ std::u16string& data, size_t max);
+ // NOTE(review): presumably the std::u16string counterpart of io::Reader::repeat_read -- confirm in the implementation
+ [[nodiscard]] std::expected<size_t, io::ReadError> repeat_read(
+ std::u16string& data, size_t max);
+};
+ // Creates a Reader from an existing io::Reader (ownership is transferred) using the given config.
+[[nodiscard]] std::unique_ptr<Reader> open(
+ std::unique_ptr<io::Reader> reader, u::ReaderConfig config = {});
+ // Path-based variants reporting failure as io::OpenError; NOTE(review): dirfd presumably follows POSIX openat(2) semantics -- confirm
+[[nodiscard]] std::expected<std::unique_ptr<Reader>, io::OpenError> open(
+ const std::string& file_path, u::ReaderConfig config = {});
+[[nodiscard]] std::expected<std::unique_ptr<Reader>, io::OpenError> openat(
+ int dirfd, const std::string& file_path, u::ReaderConfig config = {});
+
+} // namespace u16
+
+#endif // UIO_HH