#include "java_uescape.hh"

#include "u16.hh"
#include "u8.hh"

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <expected>
#include <memory>
#include <optional>
#include <utility>

namespace {

// Capacity, in elements of T, of the internal carry-over buffer.
const size_t kBufferSize = 1024;

// Reader adapter that decodes Java-style unicode escapes (backslash, one or
// more 'u', four hex digits) found in a stream of code units of type T.
//
// T is the code unit type (uint8_t for UTF-8, uint16_t for UTF-16) and Writer
// is a functor `bool(T*& pos, T* const& end, uint32_t code)` that encodes one
// code point at pos, advancing pos, and returns false when it does not fit.
//
// Two modes are used by read():
//  * shared:   the underlying reader writes straight into the caller's
//              buffer, which is then rewritten in place. Used whenever
//              nothing is pending in buffer_ (the common, escape-free case).
//  * buffered: data is appended to buffer_ and copied out piecewise. Used
//              when a previous call left an incomplete escape behind.
template <typename T, typename Writer>
class UnicodeEscapeReader {
 public:
  // The character arguments are the code units for '\\', 'u', '0', '9', 'a',
  // 'f', 'A' and 'F' in the encoding of T.
  UnicodeEscapeReader(std::unique_ptr<io::Reader> reader, T backslash, T u,
                      T zero, T nine, T lc_a, T lc_f, T uc_a, T uc_f)
      : reader_(std::move(reader)),
        backslash_(backslash),
        u_(u),
        zero_(zero),
        nine_(nine),
        lc_a_(lc_a),
        lc_f_(lc_f),
        uc_a_(uc_a),
        uc_f_(uc_f),
        buffer_(std::make_unique_for_overwrite<T[]>(kBufferSize)) {}

  // Reads up to `max` bytes of decoded data into `dst` and returns the number
  // of bytes written. Errors from the underlying reader are passed through;
  // a malformed escape yields io::ReadError::InvalidData, which is then
  // returned by every following call as well.
  //
  // Unpaired surrogate escapes are replaced by U+FFFD.
  [[nodiscard]] std::expected<size_t, io::ReadError> read(void* dst,
                                                          size_t max) {
    State state;
    bool eof;
    state.wstart = reinterpret_cast<T*>(dst);
    state.wptr = state.wstart;
    // NOLINTNEXTLINE(bugprone-sizeof-expression)
    state.wend = state.wstart + max / sizeof(T);

    if (fill_ == 0) {
      // Optimize for the case when there are no unicode escapes.
      auto ret = reader_->read(dst, max);
      if (!ret.has_value()) return ret;
      // End of input is only detected on the buffered path; anything that
      // looks incomplete here is stashed by finish() and retried.
      eof = false;
      state.rstart = reinterpret_cast<T*>(dst);
      // NOLINTNEXTLINE(bugprone-sizeof-expression)
      state.rend = state.rstart + (ret.value() / sizeof(T));
      state.shared = true;
    } else {
      if (error_.has_value()) {
        return std::unexpected(error_.value());
      }
      auto ret = reader_->read(buffer_.get() + fill_,
                               (kBufferSize - fill_) * sizeof(T));
      if (!ret.has_value()) {
        if (ret.error() != io::ReadError::Eof) {
          return ret;
        }
        eof = true;
      } else {
        eof = false;
        fill_ += ret.value() / sizeof(T);
      }
      state.rstart = buffer_.get();
      state.rend = buffer_.get() + fill_;
      state.shared = false;
    }

    while (true) {
      auto* ptr = std::find(state.rstart, state.rend, backslash_);
      // Hand everything in front of the backslash to the caller.
      if (transfer(state, ptr) || ptr == state.rend) return finish(state);

      // ptr is at a backslash; count the whole run of backslashes.
      auto* const first_backslash = ptr;
      do {
        ++ptr;
      } while (ptr < state.rend && *ptr == backslash_);
      // The run touches the end of the data; more input decides what it is.
      if (ptr == state.rend && !eof) return finish(state);
      if (ptr == state.rend || *ptr != u_ ||
          (ptr - first_backslash) % 2 == 0) {
        // Even number of backslashes, or no u
        if (transfer(state, ptr)) return finish(state);
        continue;
      }

      // Any number of u may precede the four hex digits.
      do {
        ++ptr;
      } while (ptr < state.rend && *ptr == u_);
      if (state.rend - ptr < 4) {
        if (!eof) return finish(state);
        // If an eligible \ is followed by u, or more than one u, and the
        // last u is not followed by four hexadecimal digits, then a
        // compile-time error occurs.
        return std::unexpected(error(io::ReadError::InvalidData));
      }
      auto maybe_code = unhex4(ptr);
      if (!maybe_code.has_value()) {
        // Same rule as above: the four units are not all hex digits.
        return std::unexpected(error(io::ReadError::InvalidData));
      }
      ptr += 4;

      uint32_t code = maybe_code.value();
      if (code >= 0xdc00 && code <= 0xdfff) {
        // Low surrogate without a preceding high surrogate, replace
        code = 0xfffd;
      }
      if (code < 0xd800 || code > 0xdfff) {
        // Not a pair
        if (write_code(state, code, ptr)) return finish(state);
        continue;
      }

      // High surrogate: a low surrogate escape must follow immediately.
      auto* const second_first_backslash = ptr;
      // Nothing after the escape yet; wait for more input before deciding.
      if (ptr == state.rend && !eof) return finish(state);
      // Only scan a backslash run if there actually is a backslash here.
      // Stepping unconditionally would read past the end of the data or
      // swallow an unrelated character.
      if (ptr < state.rend && *ptr == backslash_) {
        do {
          ++ptr;
        } while (ptr < state.rend && *ptr == backslash_);
        if (ptr == state.rend && !eof) return finish(state);
      }
      if (ptr == state.rend || *ptr != u_ ||
          (ptr - second_first_backslash) % 2 == 0) {
        // High surrogate not followed by an escape, write out replacement
        // and restart.
        if (write_code(state, 0xfffd, second_first_backslash))
          return finish(state);
        continue;
      }

      do {
        ++ptr;
      } while (ptr < state.rend && *ptr == u_);
      if (state.rend - ptr < 4) {
        if (!eof) return finish(state);
        // If an eligible \ is followed by u, or more than one u, and the
        // last u is not followed by four hexadecimal digits, then a
        // compile-time error occurs.
        return std::unexpected(error(io::ReadError::InvalidData));
      }
      maybe_code = unhex4(ptr);
      if (!maybe_code.has_value()) {
        // Same rule as above: the four units are not all hex digits.
        return std::unexpected(error(io::ReadError::InvalidData));
      }
      ptr += 4;

      uint32_t const low_code = maybe_code.value();
      if (low_code >= 0xdc00 && low_code <= 0xdfff) {
        // Pair
        code = 0x10000 + (((code - 0xd800) << 10) | (low_code - 0xdc00));
        if (write_code(state, code, ptr)) return finish(state);
      } else {
        // High surrogate not followed by a low surrogate, write out
        // replacement and restart at the second escape.
        if (write_code(state, 0xfffd, second_first_backslash))
          return finish(state);
      }
    }
  }

  // Skips decoded data by reading up to `max` bytes into a scratch buffer;
  // the result is that of read().
  [[nodiscard]] std::expected<size_t, io::ReadError> skip(size_t max) {
    auto tmp = std::make_unique_for_overwrite<char[]>(max);
    return read(tmp.get(), max);
  }

 private:
  // Cursor state for a single read() call.
  struct State {
    T* rstart;    // First unconsumed input element.
    T* rend;      // One past the last input element.
    bool shared;  // True when input and output are both the caller's buffer.
    T* wstart;    // Start of the caller's buffer.
    T* wptr;      // Next output position.
    T* wend;      // One past the end of the caller's buffer.
  };

  // Moves the plain input [state.rstart, ptr) to the output.
  //
  // Shared mode: the data is already in place, only the cursors move.
  // Buffered mode: copies as much as fits and removes it from buffer_; ptr is
  // rebased to follow the compacted data.
  //
  // Returns true if the output is full and data remains in buffer_.
  bool transfer(State& state, T*& ptr) {
    if (state.shared) {
      state.rstart = ptr;
      state.wptr = ptr;
      return false;
    }

    size_t const ravail = ptr - state.rstart;
    size_t const wavail = state.wend - state.wptr;
    size_t const avail = std::min(ravail, wavail);
    // Sizes are element counts; memcpy/memmove need bytes.
    memcpy(state.wptr, state.rstart, avail * sizeof(T));
    state.wptr += avail;
    fill_ -= avail;
    // Keep everything that was not delivered, including the tail of
    // [rstart, ptr) when the output ran out of room.
    memmove(buffer_.get(), state.rstart + avail, fill_ * sizeof(T));
    if (avail != ravail) return true;

    ptr -= ravail;
    state.rstart = buffer_.get();
    state.rend = buffer_.get() + fill_;
    return false;
  }

  // Ends a read() call: in shared mode stashes the unconsumed input in
  // buffer_, then returns the number of bytes produced. If nothing was
  // produced but input is pending, read() is retried so that zero is not
  // returned while more data may be available.
  //
  // NOTE(review): the stash aborts if more than kBufferSize elements are
  // pending, and the retry recurses for as long as no output is produced
  // (e.g. a destination too small for one encoded code point) -- confirm
  // callers cannot trigger either.
  std::expected<size_t, io::ReadError> finish(State const& state) {
    if (state.shared) {
      size_t const avail = state.rend - state.rstart;
      if (fill_ + avail > kBufferSize) abort();  // NOLINT(misc-include-cleaner)
      memcpy(buffer_.get() + fill_, state.rstart, avail * sizeof(T));
      fill_ += avail;
    }
    // We shouldn't return zero, we should read more if we can.
    if (state.wptr == state.wstart) {
      if (fill_ > 0) {
        return read(state.wstart, (state.wend - state.wstart) * sizeof(T));
      }
    }
    return (state.wptr - state.wstart) * sizeof(T);
  }

  // Encodes `code` at the output position and drops the input
  // [state.rstart, ptr), i.e. the escape that produced it.
  //
  // Returns true, leaving the state untouched, if the output has no room.
  bool write_code(State& state, uint32_t code, T* ptr) {
    auto* const wptr = state.wptr;
    if (!writer_(state.wptr, state.wend, code)) return true;

    if (state.shared) {
      // Remove the extra bytes (if any): the encoded form was written over
      // the start of the escape, close the gap up to ptr.
      auto* rstart = state.rstart + (state.wptr - wptr);
      assert(ptr >= rstart);
      memmove(rstart, ptr, (state.rend - ptr) * sizeof(T));
      state.rend -= (ptr - rstart);
      state.rstart = rstart;
    } else {
      // Just drop escape from buffer
      fill_ = state.rend - ptr;
      memmove(buffer_.get(), ptr, fill_ * sizeof(T));
      state.rstart = buffer_.get();
      state.rend = state.rstart + fill_;
    }
    return false;
  }

  // Records `err` so that later read() calls keep returning it.
  io::ReadError error(io::ReadError err) {
    // If read() returns an error it should continue to do so. A non-zero
    // fill_ forces read() onto the buffered path, where error_ is checked.
    error_ = err;
    fill_ = 1;
    return err;
  }

  // Parses four hex digits at ptr; nullopt if any of them is not a hex digit.
  // The caller guarantees that four elements are readable.
  [[nodiscard]] std::optional<uint32_t> unhex4(T* ptr) const {
    auto const a = unhex1(ptr[0]);
    auto const b = unhex1(ptr[1]);
    auto const c = unhex1(ptr[2]);
    auto const d = unhex1(ptr[3]);
    if (a.has_value() && b.has_value() && c.has_value() && d.has_value())
      return (*a << 12) | (*b << 8) | (*c << 4) | *d;
    return std::nullopt;
  }

  // Value of a single hex digit (either case), or nullopt.
  [[nodiscard]] std::optional<uint32_t> unhex1(T c) const {
    if (c >= zero_ && c <= nine_) return c - zero_;
    if (c >= lc_a_ && c <= lc_f_) return 10 + (c - lc_a_);
    if (c >= uc_a_ && c <= uc_f_) return 10 + (c - uc_a_);
    return std::nullopt;
  }

  std::unique_ptr<io::Reader> reader_;
  Writer writer_;
  T const backslash_;
  T const u_;
  T const zero_;
  T const nine_;
  T const lc_a_;
  T const lc_f_;
  T const uc_a_;
  T const uc_f_;
  // Pending input carried over between read() calls; the first fill_
  // elements are valid.
  std::unique_ptr<T[]> buffer_;
  size_t fill_{0};
  // Sticky error, set by error().
  std::optional<io::ReadError> error_;
};

}  // namespace

// NOTE(review): the signatures of open() below must match the declarations
// in java_uescape.hh -- confirm.

namespace u8::java {

namespace {

// Encodes a code point as UTF-8 using u8::write.
struct Writer {
  bool operator()(uint8_t*& start, uint8_t* const& end, uint32_t code) const {
    return u8::write(start, end, code);
  }
};

// u8::Reader that decodes unicode escapes in the wrapped reader's output.
class ReaderImpl : public u8::Reader,
                   private UnicodeEscapeReader<uint8_t, Writer> {
 public:
  explicit ReaderImpl(std::unique_ptr<io::Reader> reader)
      : UnicodeEscapeReader<uint8_t, Writer>(std::move(reader), '\\', 'u', '0',
                                             '9', 'a', 'f', 'A', 'F') {}

  [[nodiscard]] std::expected<size_t, io::ReadError> read(void* dst,
                                                          size_t max) override {
    return UnicodeEscapeReader<uint8_t, Writer>::read(dst, max);
  }

  [[nodiscard]] std::expected<size_t, io::ReadError> skip(size_t max) override {
    return UnicodeEscapeReader<uint8_t, Writer>::skip(max);
  }
};

}  // namespace

// Wraps `reader` with u8::open() and adds unicode escape decoding on top.
[[nodiscard]] std::unique_ptr<u8::Reader> open(
    std::unique_ptr<io::Reader> reader, u::ReaderConfig config) {
  return std::make_unique<ReaderImpl>(u8::open(std::move(reader), config));
}

}  // namespace u8::java

namespace u16::java {

namespace {

// Encodes a code point as UTF-16 using u16::write.
struct Writer {
  bool operator()(uint16_t*& start, uint16_t* const& end,
                  uint32_t code) const {
    return u16::write(start, end, code);
  }
};

// u16::Reader that decodes unicode escapes in the wrapped reader's output.
class ReaderImpl : public u16::Reader,
                   private UnicodeEscapeReader<uint16_t, Writer> {
 public:
  explicit ReaderImpl(std::unique_ptr<io::Reader> reader)
      : UnicodeEscapeReader<uint16_t, Writer>(std::move(reader), u'\\', u'u',
                                              u'0', u'9', u'a', u'f', u'A',
                                              u'F') {}

  [[nodiscard]] std::expected<size_t, io::ReadError> read(void* dst,
                                                          size_t max) override {
    return UnicodeEscapeReader<uint16_t, Writer>::read(dst, max);
  }

  [[nodiscard]] std::expected<size_t, io::ReadError> skip(size_t max) override {
    return UnicodeEscapeReader<uint16_t, Writer>::skip(max);
  }
};

}  // namespace

// Wraps `reader` with u16::open() and adds unicode escape decoding on top.
[[nodiscard]] std::unique_ptr<u16::Reader> open(
    std::unique_ptr<io::Reader> reader, u::ReaderConfig config) {
  return std::make_unique<ReaderImpl>(u16::open(std::move(reader), config));
}

}  // namespace u16::java