From 50348284f5d82ccfd65b0c803ba0ba895912ceff Mon Sep 17 00:00:00 2001 From: Joel Klinghed Date: Thu, 18 Sep 2025 23:57:56 +0200 Subject: java::uescape: Unicode reader that knows about Java's \uXXXX escapes --- src/java_uescape.cc | 380 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/java_uescape.hh | 24 ++++ src/u16.hh | 8 +- src/u8.hh | 8 +- src/umod8.hh | 6 +- 5 files changed, 415 insertions(+), 11 deletions(-) create mode 100644 src/java_uescape.cc create mode 100644 src/java_uescape.hh (limited to 'src') diff --git a/src/java_uescape.cc b/src/java_uescape.cc new file mode 100644 index 0000000..925b050 --- /dev/null +++ b/src/java_uescape.cc @@ -0,0 +1,380 @@ +#include "java_uescape.hh" + +#include "u16.hh" +#include "u8.hh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +const size_t kBufferSize = 1024; + +template +class UnicodeEscapeReader { + public: + UnicodeEscapeReader(std::unique_ptr reader, T backslash, T u, T zero, + T nine, T lc_a, T lc_f, T uc_a, T uc_f) + : reader_(std::move(reader)), + backslash_(backslash), + u_(u), + zero_(zero), + nine_(nine), + lc_a_(lc_a), + lc_f_(lc_f), + uc_a_(uc_a), + uc_f_(uc_f), + buffer_(std::make_unique_for_overwrite(kBufferSize)) {} + + [[nodiscard]] std::expected read(void* dst, + size_t max) { + State state; + bool eof; + + state.wstart = reinterpret_cast(dst); + state.wptr = state.wstart; + // NOLINTNEXTLINE(bugprone-sizeof-expression) + state.wend = state.wstart + max / sizeof(T); + + if (fill_ == 0) { + // Optimize for the case when there are no unicode escapes. + auto ret = reader_->read(dst, max); + if (!ret.has_value() || ret.value() == 0) { + return ret; + } + + eof = false; + state.rstart = reinterpret_cast(dst); + // NOLINTNEXTLINE(bugprone-sizeof-expression) + state.rend = state.rstart + (ret.value() / sizeof(T)); + state.shared = true; + } else { + if (error_.has_value()) { + return std::unexpected(error_.value()); + } + + auto ret = reader_->read(buffer_.get() + fill_, + (kBufferSize - fill_) * sizeof(T)); + if (!ret.has_value()) { + return ret; + } + eof = ret.value() == 0; + fill_ += ret.value() / sizeof(T); + + state.rstart = buffer_.get(); + state.rend = buffer_.get() + fill_; + state.shared = false; + } + + while (true) { + auto* ptr = std::find(state.rstart, state.rend, backslash_); + if (transfer(state, ptr) || ptr == state.rend) + return finish(state); + + auto* const first_backslash = ptr; + do { + ptr++; + } while (ptr < state.rend && *ptr == backslash_); + + if (ptr == state.rend && !eof) + return finish(state); + + if (ptr == state.rend || *ptr != u_ || (ptr - first_backslash) % 2 == 0) { + // Even number of backslashes, or no u + if (transfer(state, ptr)) + return finish(state); + continue; + } + + // auto* const first_u = ptr; + do { + ptr++; + } while (ptr < state.rend && *ptr == u_); + + if (state.rend - ptr < 4) { + if (!eof) + return finish(state); + + // If an eligible \ is followed by u, or more than one u, and the last u is not followed by four hexadecimal digits, then a compile-time error occurs. + return std::unexpected(error(io::ReadError::InvalidData)); + } + + auto maybe_code = unhex4(ptr); + if (!maybe_code.has_value()) { + // If an eligible \ is followed by u, or more than one u, and the last u is not followed by four hexadecimal digits, then a compile-time error occurs. + return std::unexpected(error(io::ReadError::InvalidData)); + } + ptr += 4; + uint32_t code = maybe_code.value(); + + if (code >= 0xdc00 && code <= 0xdfff) { + // Low surrogate before (or without?), replace + code = 0xfffd; + } + + if (code < 0xd800 || code > 0xdfff) { + // Not a pair + if (write_code(state, code, ptr)) + return finish(state); + continue; + } + + auto* const second_first_backslash = ptr; + do { + ptr++; + } while (ptr < state.rend && *ptr == backslash_); + + if (ptr == state.rend && !eof) + return finish(state); + + if (ptr == state.rend || *ptr != u_ || + (ptr - second_first_backslash) % 2 == 0) { + // High surrogate not followed by an escape, write out replacement + // and restart. + if (write_code(state, 0xfffd, second_first_backslash)) + return finish(state); + continue; + } + + // auto* const first_u = ptr; + do { + ptr++; + } while (ptr < state.rend && *ptr == u_); + + if (state.rend - ptr < 4) { + if (!eof) + return finish(state); + + // If an eligible \ is followed by u, or more than one u, and the last u is not followed by four hexadecimal digits, then a compile-time error occurs. + return std::unexpected(error(io::ReadError::InvalidData)); + } + + maybe_code = unhex4(ptr); + if (!maybe_code.has_value()) { + // If an eligible \ is followed by u, or more than one u, and the last u is not followed by four hexadecimal digits, then a compile-time error occurs. + return std::unexpected(error(io::ReadError::InvalidData)); + } + ptr += 4; + uint32_t low_code = maybe_code.value(); + + if (low_code >= 0xdc00 && low_code <= 0xdfff) { + // Pair + code = 0x10000 + (((code - 0xd800) << 10) | (low_code - 0xdc00)); + if (write_code(state, code, ptr)) + return finish(state); + } else { + // High surrogate not followed by an low surrogate, write out + // replacement and restart. + if (write_code(state, 0xfffd, second_first_backslash)) + return finish(state); + } + } + } + + [[nodiscard]] std::expected skip(size_t max) { + auto tmp = std::make_unique_for_overwrite(max); + return read(tmp.get(), max); + } + + private: + struct State { + T* rstart; + T* rend; + + bool shared; + + T* wstart; + T* wptr; + T* wend; + }; + + bool transfer(State& state, T*& ptr) { + if (state.shared) { + state.rstart = ptr; + state.wptr = ptr; + return false; + } + size_t ravail = ptr - state.rstart; + size_t wavail = state.wend - state.wptr; + size_t avail = std::min(ravail, wavail); + + memcpy(state.wptr, state.rstart, avail); + state.wptr += avail; + fill_ -= avail; + memmove(buffer_.get(), ptr, fill_ * sizeof(T)); + + if (avail != ravail) + return true; + + ptr -= ravail; + state.rstart = buffer_.get(); + state.rend = buffer_.get() + fill_; + return false; + } + + std::expected finish(State const& state) { + if (state.shared) { + size_t avail = state.rend - state.rstart; + if (fill_ + avail > kBufferSize) + abort(); // NOLINT(misc-include-cleaner) + memcpy(buffer_.get() + fill_, state.rstart, avail * sizeof(T)); + fill_ += avail; + } + + // We can't return zero, that is treated as EOF, we need to read more. + if (state.wptr == state.wstart) { + if (fill_ > 0) { + return read(state.wstart, (state.wend - state.wstart) * sizeof(T)); + } + } + + return (state.wptr - state.wstart) * sizeof(T); + } + + bool write_code(State& state, uint32_t code, T* ptr) { + auto* const wptr = state.wptr; + if (!writer_(state.wptr, state.wend, code)) + return true; + + if (state.shared) { + // Remove the extra bytes (if any) + auto* rstart = state.rstart + (state.wptr - wptr); + assert(ptr >= rstart); + memmove(rstart, ptr, (state.rend - ptr) * sizeof(T)); + state.rend -= (ptr - rstart); + state.rstart = rstart; + } else { + // Just drop escape from buffer + fill_ = state.rend - ptr; + memmove(buffer_.get(), ptr, fill_ * sizeof(T)); + state.rstart = buffer_.get(); + state.rend = state.rstart + fill_; + } + return false; + } + + io::ReadError error(io::ReadError err) { + // If read() returns an error it should continue to do so. + error_ = err; + fill_ = 1; + return err; + } + + [[nodiscard]] std::optional unhex4(T* ptr) const { + auto a = unhex1(ptr[0]); + auto b = unhex1(ptr[1]); + auto c = unhex1(ptr[2]); + auto d = unhex1(ptr[3]); + if (a.has_value() && b.has_value() && c.has_value() && d.has_value()) + return (*a << 12) | (*b << 8) | (*c << 4) | *d; + return std::nullopt; + } + + [[nodiscard]] std::optional unhex1(T c) const { + if (c >= zero_ && c <= nine_) + return c - zero_; + if (c >= lc_a_ && c <= lc_f_) + return 10 + (c - lc_a_); + if (c >= uc_a_ && c <= uc_f_) + return 10 + (c - uc_a_); + return std::nullopt; + } + + std::unique_ptr reader_; + Writer writer_; + T const backslash_; + T const u_; + T const zero_; + T const nine_; + T const lc_a_; + T const lc_f_; + T const uc_a_; + T const uc_f_; + std::unique_ptr buffer_; + size_t fill_{0}; + std::optional error_; +}; + +} // namespace + +namespace u8::java { + +namespace { + +struct Writer { + bool operator()(uint8_t*& start, uint8_t* const& end, uint32_t code) const { + return u8::write(start, end, code); + } +}; + +class ReaderImpl : public u8::Reader, + UnicodeEscapeReader { + public: + explicit ReaderImpl(std::unique_ptr reader) + : UnicodeEscapeReader( + std::move(reader), '\\', 'u', '0', '9', 'a', 'f', 'A', 'F') {} + + [[nodiscard]] std::expected read(void* dst, + size_t max) override { + return UnicodeEscapeReader::read(dst, max); + } + + [[nodiscard]] std::expected skip(size_t max) override { + return UnicodeEscapeReader::skip(max); + } +}; + +} // namespace + +[[nodiscard]] +std::unique_ptr open(std::unique_ptr reader, + u::ReaderConfig config) { + return std::make_unique(u8::open(std::move(reader), config)); +} + +} // namespace u8::java + +namespace u16::java { + +namespace { + +struct Writer { + bool operator()(uint16_t*& start, uint16_t* const& end, uint32_t code) const { + return u16::write(start, end, code); + } +}; + +class ReaderImpl : public u16::Reader, + UnicodeEscapeReader { + public: + explicit ReaderImpl(std::unique_ptr reader) + : UnicodeEscapeReader( + std::move(reader), u'\\', u'u', u'0', u'9', u'a', u'f', u'A', + u'F') {} + + [[nodiscard]] std::expected read(void* dst, + size_t max) override { + return UnicodeEscapeReader::read(dst, max); + } + + [[nodiscard]] std::expected skip(size_t max) override { + return UnicodeEscapeReader::skip(max); + } +}; + +} // namespace + +[[nodiscard]] +std::unique_ptr open(std::unique_ptr reader, + u::ReaderConfig config) { + return std::make_unique(u16::open(std::move(reader), config)); +} + +} // namespace u16::java diff --git a/src/java_uescape.hh b/src/java_uescape.hh new file mode 100644 index 0000000..8c845ad --- /dev/null +++ b/src/java_uescape.hh @@ -0,0 +1,24 @@ +#ifndef JAVA_UESCAPE_HH +#define JAVA_UESCAPE_HH + +#include "uio.hh" // IWYU pragma: export + +#include + +namespace u8::java { + +[[nodiscard]] +std::unique_ptr open(std::unique_ptr reader, + u::ReaderConfig config = {}); + +} // namespace u8::java + +namespace u16::java { + +[[nodiscard]] +std::unique_ptr open(std::unique_ptr reader, + u::ReaderConfig config = {}); + +} // namespace u16::java + +#endif // JAVA_UESCAPE_HH diff --git a/src/u16.hh b/src/u16.hh index 70ba157..17a30f4 100644 --- a/src/u16.hh +++ b/src/u16.hh @@ -13,7 +13,7 @@ namespace u16 { template requires std::is_same_v, uint16_t> -std::expected read(T& start, const T& end) { +std::expected read(T& start, T const& end) { if (start == end) return std::unexpected(u::ReadError::End); uint16_t u = *start; @@ -39,7 +39,7 @@ std::expected read(T& start, const T& end) { template requires std::is_same_v, uint16_t> std::expected read_replace(T& start, - const T& end, + T const& end, bool eof) { auto const tmp = start; auto ret = read(start, end); @@ -61,7 +61,7 @@ std::expected read_replace(T& start, template requires std::is_same_v, uint16_t> -bool write(T& start, const T& end, uint32_t code) { +bool write(T& start, T const& end, uint32_t code) { if (code < 0x10000) { if (start == end) return false; @@ -80,7 +80,7 @@ bool write(T& start, const T& end, uint32_t code) { template requires std::is_same_v, uint16_t> -bool skip(T& start, const T& end) { +bool skip(T& start, T const& end) { if (start == end) return false; if (*start >= 0xd800 && *start <= 0xdbff) { diff --git a/src/u8.hh b/src/u8.hh index 5292602..d673caa 100644 --- a/src/u8.hh +++ b/src/u8.hh @@ -13,7 +13,7 @@ namespace u8 { template requires std::is_same_v, uint8_t> -std::expected read(T& start, const T& end) { +std::expected read(T& start, T const& end) { if (start == end) return std::unexpected(u::ReadError::End); uint32_t u; @@ -106,7 +106,7 @@ std::expected read(T& start, const T& end) { template requires std::is_same_v, uint8_t> std::expected read_replace(T& start, - const T& end, + T const& end, bool eof) { auto const tmp = start; auto ret = read(start, end); @@ -128,7 +128,7 @@ std::expected read_replace(T& start, template requires std::is_same_v, uint8_t> -bool write(T& start, const T& end, uint32_t code) { +bool write(T& start, T const& end, uint32_t code) { if (code < 0x80) { if (start == end) return false; @@ -164,7 +164,7 @@ bool write(T& start, const T& end, uint32_t code) { template requires std::is_same_v, uint8_t> -bool skip(T& start, const T& end) { +bool skip(T& start, T const& end) { if (start == end) return false; switch (*start >> 4) { diff --git a/src/umod8.hh b/src/umod8.hh index 4731942..4423f52 100644 --- a/src/umod8.hh +++ b/src/umod8.hh @@ -13,7 +13,7 @@ namespace umod8 { template requires std::is_same_v, uint8_t> -std::expected read(T& start, const T& end) { +std::expected read(T& start, T const& end) { if (start == end) return std::unexpected(u::ReadError::End); uint32_t u; @@ -115,7 +115,7 @@ std::expected read(T& start, const T& end) { template requires std::is_same_v, uint8_t> std::expected read_replace(T& start, - const T& end, + T const& end, bool eof) { auto const tmp = start; auto ret = read(start, end); @@ -137,7 +137,7 @@ std::expected read_replace(T& start, template requires std::is_same_v, uint8_t> -bool write(T& start, const T& end, uint32_t code) { +bool write(T& start, T const& end, uint32_t code) { if (code > 0 && code < 0x80) { if (start == end) return false; -- cgit v1.3