summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/java_uescape.cc380
-rw-r--r--src/java_uescape.hh24
-rw-r--r--src/u16.hh8
-rw-r--r--src/u8.hh8
-rw-r--r--src/umod8.hh6
5 files changed, 415 insertions, 11 deletions
diff --git a/src/java_uescape.cc b/src/java_uescape.cc
new file mode 100644
index 0000000..925b050
--- /dev/null
+++ b/src/java_uescape.cc
@@ -0,0 +1,380 @@
+#include "java_uescape.hh"
+
+#include "u16.hh"
+#include "u8.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <expected>
+#include <memory>
+#include <optional>
+#include <utility>
+
+namespace {
+
+const size_t kBufferSize = 1024;  // capacity of the carry-over buffer, in T elements (not bytes)
+
+template <typename T, typename Reader, typename Writer>
+class UnicodeEscapeReader {
+ public:
+ UnicodeEscapeReader(std::unique_ptr<Reader> reader, T backslash, T u, T zero,
+ T nine, T lc_a, T lc_f, T uc_a, T uc_f)
+ : reader_(std::move(reader)),
+ backslash_(backslash),
+ u_(u),
+ zero_(zero),
+ nine_(nine),
+ lc_a_(lc_a),
+ lc_f_(lc_f),
+ uc_a_(uc_a),
+ uc_f_(uc_f),
+ buffer_(std::make_unique_for_overwrite<T[]>(kBufferSize)) {}
+
+ // Reads from the wrapped reader and stores the text in `dst` with every
+ // eligible \uXXXX escape (odd number of backslashes, one or more u, four
+ // hexadecimal digits) replaced by the code point it names.
+ //
+ // dst: destination buffer, treated as an array of T.
+ // max: size of `dst` in bytes.
+ //
+ // Returns the number of bytes stored in `dst`, or the error reported by the
+ // wrapped reader. An escape that is still incomplete or malformed once the
+ // wrapped reader has signalled end of input yields
+ // io::ReadError::InvalidData; that error is remembered and returned by
+ // every later call.
+ //
+ // A high surrogate escape immediately followed by a low surrogate escape is
+ // combined into one code point. A lone surrogate is replaced by U+FFFD.
+ [[nodiscard]] std::expected<size_t, io::ReadError> read(void* dst,
+                                                         size_t max) {
+   State state{};
+   bool eof = false;
+
+   state.wstart = reinterpret_cast<T*>(dst);
+   state.wptr = state.wstart;
+   // NOLINTNEXTLINE(bugprone-sizeof-expression)
+   state.wend = state.wstart + max / sizeof(T);
+
+   if (fill_ == 0) {
+     // Optimize for the case when there are no unicode escapes: read
+     // straight into the caller's buffer and decode in place.
+     auto ret = reader_->read(dst, max);
+     if (!ret.has_value() || ret.value() == 0) {
+       return ret;
+     }
+
+     eof = false;
+     state.rstart = reinterpret_cast<T*>(dst);
+     // NOLINTNEXTLINE(bugprone-sizeof-expression)
+     state.rend = state.rstart + (ret.value() / sizeof(T));
+     state.shared = true;
+   } else {
+     // Input left over from an earlier call (or a sticky error, see error())
+     // is pending: append to the carry-over buffer and decode from there.
+     if (error_.has_value()) {
+       return std::unexpected(error_.value());
+     }
+
+     auto ret = reader_->read(buffer_.get() + fill_,
+                              (kBufferSize - fill_) * sizeof(T));
+     if (!ret.has_value()) {
+       return ret;
+     }
+     eof = ret.value() == 0;
+     fill_ += ret.value() / sizeof(T);
+
+     state.rstart = buffer_.get();
+     state.rend = buffer_.get() + fill_;
+     state.shared = false;
+   }
+
+   while (true) {
+     // Everything before the next backslash is plain text.
+     auto* ptr = std::find(state.rstart, state.rend, backslash_);
+     if (transfer(state, ptr) || ptr == state.rend)
+       return finish(state);
+
+     // std::find guarantees *ptr is a backslash here, so stepping over it
+     // unconditionally is safe.
+     auto* const first_backslash = ptr;
+     do {
+       ptr++;
+     } while (ptr < state.rend && *ptr == backslash_);
+
+     // The run of backslashes touches the end of the input: more data is
+     // needed before it can be classified.
+     if (ptr == state.rend && !eof)
+       return finish(state);
+
+     if (ptr == state.rend || *ptr != u_ || (ptr - first_backslash) % 2 == 0) {
+       // Even number of backslashes, or no u
+       if (transfer(state, ptr))
+         return finish(state);
+       continue;
+     }
+
+     // Any number of u may follow the backslash.
+     do {
+       ptr++;
+     } while (ptr < state.rend && *ptr == u_);
+
+     if (state.rend - ptr < 4) {
+       if (!eof)
+         return finish(state);
+
+       // If an eligible \ is followed by u, or more than one u, and the last
+       // u is not followed by four hexadecimal digits, then a compile-time
+       // error occurs.
+       return std::unexpected(error(io::ReadError::InvalidData));
+     }
+
+     auto maybe_code = unhex4(ptr);
+     if (!maybe_code.has_value()) {
+       // Same rule as above: the four units after the last u must all be
+       // hexadecimal digits.
+       return std::unexpected(error(io::ReadError::InvalidData));
+     }
+     ptr += 4;
+     uint32_t code = maybe_code.value();
+
+     if (code >= 0xdc00 && code <= 0xdfff) {
+       // Low surrogate with no preceding high surrogate, replace
+       code = 0xfffd;
+     }
+
+     if (code < 0xd800 || code > 0xdfff) {
+       // Not a pair
+       if (write_code(state, code, ptr))
+         return finish(state);
+       continue;
+     }
+
+     // `code` is a high surrogate; look for a second escape right after it.
+     // Unlike the first scan, nothing guarantees that a backslash (or any
+     // input at all) sits at `ptr`, so test before advancing. An
+     // unconditional first step would read past rend, or swallow an
+     // unrelated unit and mistake a following plain 'u' for an escape.
+     auto* const second_first_backslash = ptr;
+     while (ptr < state.rend && *ptr == backslash_) {
+       ptr++;
+     }
+
+     if (ptr == state.rend && !eof)
+       return finish(state);
+
+     if (ptr == state.rend || *ptr != u_ ||
+         (ptr - second_first_backslash) % 2 == 0) {
+       // High surrogate not followed by an escape, write out replacement
+       // and restart.
+       if (write_code(state, 0xfffd, second_first_backslash))
+         return finish(state);
+       continue;
+     }
+
+     do {
+       ptr++;
+     } while (ptr < state.rend && *ptr == u_);
+
+     if (state.rend - ptr < 4) {
+       if (!eof)
+         return finish(state);
+
+       // If an eligible \ is followed by u, or more than one u, and the last
+       // u is not followed by four hexadecimal digits, then a compile-time
+       // error occurs.
+       return std::unexpected(error(io::ReadError::InvalidData));
+     }
+
+     maybe_code = unhex4(ptr);
+     if (!maybe_code.has_value()) {
+       // Same rule as above: the four units after the last u must all be
+       // hexadecimal digits.
+       return std::unexpected(error(io::ReadError::InvalidData));
+     }
+     ptr += 4;
+     uint32_t low_code = maybe_code.value();
+
+     if (low_code >= 0xdc00 && low_code <= 0xdfff) {
+       // Pair
+       code = 0x10000 + (((code - 0xd800) << 10) | (low_code - 0xdc00));
+       if (write_code(state, code, ptr))
+         return finish(state);
+     } else {
+       // High surrogate not followed by a low surrogate, write out
+       // replacement and restart at the second escape so that it is decoded
+       // on its own.
+       if (write_code(state, 0xfffd, second_first_backslash))
+         return finish(state);
+     }
+   }
+ }
+
+ // Discards decoded output: runs read() with `max` as the byte budget into a
+ // throw-away scratch array and returns whatever read() returns.
+ [[nodiscard]] std::expected<size_t, io::ReadError> skip(size_t max) {
+   std::unique_ptr<T[]> const scratch =
+       std::make_unique_for_overwrite<T[]>(max);
+   return read(scratch.get(), max);
+ }
+
+ private:
+ // Cursors for a single read() call. Every member has a default so that a
+ // default-constructed State never holds indeterminate pointers.
+ struct State {
+   // Input not yet consumed: [rstart, rend).
+   T* rstart{nullptr};
+   T* rend{nullptr};
+
+   // True when the input was read directly into the caller's buffer (decoded
+   // in place); false when the input is the carry-over buffer.
+   bool shared{false};
+
+   // Output window: [wstart, wend) is the caller's buffer, wptr the next
+   // unit to be written.
+   T* wstart{nullptr};
+   T* wptr{nullptr};
+   T* wend{nullptr};
+ };
+
+ // Passes the plain units in [state.rstart, ptr) through to the output.
+ //
+ // In shared mode the units already sit in the caller's buffer, so only the
+ // cursors move. Otherwise the units are copied out of the carry-over
+ // buffer, the buffer is compacted, and `ptr`, state.rstart and state.rend
+ // are rebased onto the compacted buffer.
+ //
+ // Returns true when the output filled up before everything up to `ptr` was
+ // copied (the caller must stop), false otherwise.
+ bool transfer(State& state, T*& ptr) {
+   if (state.shared) {
+     state.rstart = ptr;
+     state.wptr = ptr;
+     return false;
+   }
+   size_t ravail = ptr - state.rstart;
+   size_t wavail = state.wend - state.wptr;
+   size_t avail = std::min(ravail, wavail);
+
+   // `avail` counts T units; memcpy wants bytes.
+   memcpy(state.wptr, state.rstart, avail * sizeof(T));
+   state.wptr += avail;
+   fill_ -= avail;
+   // Keep everything that was not copied. On a partial copy that starts
+   // before `ptr`, so compact from the first uncopied unit, not from `ptr`.
+   memmove(buffer_.get(), state.rstart + avail, fill_ * sizeof(T));
+
+   if (avail != ravail)
+     return true;
+
+   ptr -= ravail;
+   state.rstart = buffer_.get();
+   state.rend = buffer_.get() + fill_;
+   return false;
+ }
+
+ // Ends a read() call. In shared mode the unconsumed input is first saved
+ // into the carry-over buffer (aborting if it does not fit). The result is
+ // the number of bytes written to the caller's buffer; when that would be
+ // zero although input is still pending, read() is run again on the same
+ // destination instead.
+ std::expected<size_t, io::ReadError> finish(State const& state) {
+   if (state.shared) {
+     size_t const pending = state.rend - state.rstart;
+     if (fill_ + pending > kBufferSize)
+       abort();  // NOLINT(misc-include-cleaner)
+     memcpy(buffer_.get() + fill_, state.rstart, pending * sizeof(T));
+     fill_ += pending;
+   }
+
+   size_t const produced = state.wptr - state.wstart;
+
+   // We can't return zero, that is treated as EOF, we need to read more.
+   if (produced == 0 && fill_ > 0) {
+     return read(state.wstart, (state.wend - state.wstart) * sizeof(T));
+   }
+
+   return produced * sizeof(T);
+ }
+
+ // Emits `code` through writer_ and discards the input up to `ptr` (the end
+ // of the escape that produced it). Returns true when the writer reported
+ // failure, in which case no input is discarded; false on success.
+ bool write_code(State& state, uint32_t code, T* ptr) {
+   T* const before = state.wptr;
+   bool const written = writer_(state.wptr, state.wend, code);
+   if (!written)
+     return true;
+
+   if (!state.shared) {
+     // Just drop escape from buffer
+     fill_ = state.rend - ptr;
+     memmove(buffer_.get(), ptr, fill_ * sizeof(T));
+     state.rstart = buffer_.get();
+     state.rend = state.rstart + fill_;
+     return false;
+   }
+
+   // Decoding in place: close the gap between the end of the freshly
+   // written units and the end of the escape.
+   T* const resume = state.rstart + (state.wptr - before);
+   assert(ptr >= resume);
+   memmove(resume, ptr, (state.rend - ptr) * sizeof(T));
+   state.rend -= (ptr - resume);
+   state.rstart = resume;
+   return false;
+ }
+
+ // Records `err` so that read() keeps returning it, then hands it back.
+ io::ReadError error(io::ReadError err) {
+   // read() only checks error_ on its buffered branch, which it takes when
+   // fill_ is non-zero.
+   fill_ = 1;
+   error_.emplace(err);
+   return err;
+ }
+
+ // Parses the four units at ptr[0..3] as a big-endian hexadecimal number.
+ // Returns std::nullopt if any of them is not a hexadecimal digit.
+ [[nodiscard]] std::optional<uint16_t> unhex4(T* ptr) const {
+   uint16_t value = 0;
+   for (size_t i = 0; i < 4; ++i) {
+     auto const digit = unhex1(ptr[i]);
+     if (!digit.has_value())
+       return std::nullopt;
+     value = static_cast<uint16_t>((value << 4) | *digit);
+   }
+   return value;
+ }
+
+ // Maps one unit to its hexadecimal value (0-15) using the configured digit
+ // ranges, or std::nullopt when it lies in none of them.
+ [[nodiscard]] std::optional<uint16_t> unhex1(T c) const {
+   std::optional<uint16_t> digit;
+   if (c >= zero_ && c <= nine_) {
+     digit = static_cast<uint16_t>(c - zero_);
+   } else if (c >= lc_a_ && c <= lc_f_) {
+     digit = static_cast<uint16_t>(10 + (c - lc_a_));
+   } else if (c >= uc_a_ && c <= uc_f_) {
+     digit = static_cast<uint16_t>(10 + (c - uc_a_));
+   }
+   return digit;
+ }
+
+ std::unique_ptr<Reader> reader_;
+ Writer writer_;
+ T const backslash_;
+ T const u_;
+ T const zero_;
+ T const nine_;
+ T const lc_a_;
+ T const lc_f_;
+ T const uc_a_;
+ T const uc_f_;
+ std::unique_ptr<T[]> buffer_;
+ size_t fill_{0};
+ std::optional<io::ReadError> error_;
+};
+
+} // namespace
+
+namespace u8::java {
+
+namespace {
+
+// Function object plugged into UnicodeEscapeReader as its Writer: forwards
+// the code point to u8::write and returns that call's result.
+struct Writer {
+  bool operator()(uint8_t*& start, uint8_t* const& end, uint32_t code) const {
+    bool const stored = u8::write(start, end, code);
+    return stored;
+  }
+};
+
+// u8::Reader whose read()/skip() are served by a UnicodeEscapeReader
+// configured with the ASCII backslash, 'u' and hexadecimal digit ranges.
+class ReaderImpl : public u8::Reader,
+                   UnicodeEscapeReader<uint8_t, u8::Reader, Writer> {
+  using Base = UnicodeEscapeReader<uint8_t, u8::Reader, Writer>;
+
+ public:
+  explicit ReaderImpl(std::unique_ptr<u8::Reader> reader)
+      : Base(std::move(reader), '\\', 'u', '0', '9', 'a', 'f', 'A', 'F') {}
+
+  // Forwards to the escape-decoding implementation.
+  [[nodiscard]] std::expected<size_t, io::ReadError> read(void* dst,
+                                                          size_t max) override {
+    return Base::read(dst, max);
+  }
+
+  // Forwards to the escape-decoding implementation.
+  [[nodiscard]] std::expected<size_t, io::ReadError> skip(size_t max) override {
+    return Base::skip(max);
+  }
+};
+
+} // namespace
+
+// Opens `reader` with u8::open (passing `config` through) and wraps the
+// result in the escape-decoding ReaderImpl.
+[[nodiscard]]
+std::unique_ptr<Reader> open(std::unique_ptr<io::Reader> reader,
+                             u::ReaderConfig config) {
+  auto inner = u8::open(std::move(reader), config);
+  return std::make_unique<ReaderImpl>(std::move(inner));
+}
+
+} // namespace u8::java
+
+namespace u16::java {
+
+namespace {
+
+// Function object plugged into UnicodeEscapeReader as its Writer: forwards
+// the code point to u16::write and returns that call's result.
+struct Writer {
+  bool operator()(uint16_t*& start, uint16_t* const& end, uint32_t code) const {
+    bool const stored = u16::write(start, end, code);
+    return stored;
+  }
+};
+
+// u16::Reader whose read()/skip() are served by a UnicodeEscapeReader
+// configured with the UTF-16 backslash, 'u' and hexadecimal digit ranges.
+class ReaderImpl : public u16::Reader,
+                   UnicodeEscapeReader<uint16_t, u16::Reader, Writer> {
+  using Base = UnicodeEscapeReader<uint16_t, u16::Reader, Writer>;
+
+ public:
+  explicit ReaderImpl(std::unique_ptr<u16::Reader> reader)
+      : Base(std::move(reader), u'\\', u'u', u'0', u'9', u'a', u'f', u'A',
+             u'F') {}
+
+  // Forwards to the escape-decoding implementation.
+  [[nodiscard]] std::expected<size_t, io::ReadError> read(void* dst,
+                                                          size_t max) override {
+    return Base::read(dst, max);
+  }
+
+  // Forwards to the escape-decoding implementation.
+  [[nodiscard]] std::expected<size_t, io::ReadError> skip(size_t max) override {
+    return Base::skip(max);
+  }
+};
+
+} // namespace
+
+// Opens `reader` with u16::open (passing `config` through) and wraps the
+// result in the escape-decoding ReaderImpl.
+[[nodiscard]]
+std::unique_ptr<Reader> open(std::unique_ptr<io::Reader> reader,
+                             u::ReaderConfig config) {
+  auto inner = u16::open(std::move(reader), config);
+  return std::make_unique<ReaderImpl>(std::move(inner));
+}
+
+} // namespace u16::java
diff --git a/src/java_uescape.hh b/src/java_uescape.hh
new file mode 100644
index 0000000..8c845ad
--- /dev/null
+++ b/src/java_uescape.hh
@@ -0,0 +1,24 @@
+#ifndef JAVA_UESCAPE_HH
+#define JAVA_UESCAPE_HH
+
+#include "uio.hh" // IWYU pragma: export
+
+#include <memory>
+
+namespace u8::java {
+
+// Returns a u8 Reader over `reader` that additionally replaces Java unicode
+// escapes (backslash, one or more 'u', four hexadecimal digits) with the
+// code point they name. `config` is passed on to u8::open and defaults to a
+// value-initialised u::ReaderConfig.
+[[nodiscard]]
+std::unique_ptr<Reader> open(std::unique_ptr<io::Reader> reader,
+ u::ReaderConfig config = {});
+
+} // namespace u8::java
+
+namespace u16::java {
+
+// Returns a u16 Reader over `reader` that additionally replaces Java unicode
+// escapes (backslash, one or more 'u', four hexadecimal digits) with the
+// code point they name. `config` is passed on to u16::open and defaults to
+// a value-initialised u::ReaderConfig.
+[[nodiscard]]
+std::unique_ptr<Reader> open(std::unique_ptr<io::Reader> reader,
+ u::ReaderConfig config = {});
+
+} // namespace u16::java
+
+#endif // JAVA_UESCAPE_HH
diff --git a/src/u16.hh b/src/u16.hh
index 70ba157..17a30f4 100644
--- a/src/u16.hh
+++ b/src/u16.hh
@@ -13,7 +13,7 @@ namespace u16 {
template <std::forward_iterator T>
requires std::is_same_v<std::iter_value_t<T>, uint16_t>
-std::expected<uint32_t, u::ReadError> read(T& start, const T& end) {
+std::expected<uint32_t, u::ReadError> read(T& start, T const& end) {
if (start == end)
return std::unexpected(u::ReadError::End);
uint16_t u = *start;
@@ -39,7 +39,7 @@ std::expected<uint32_t, u::ReadError> read(T& start, const T& end) {
template <std::forward_iterator T>
requires std::is_same_v<std::iter_value_t<T>, uint16_t>
std::expected<uint32_t, u::ReadErrorReplace> read_replace(T& start,
- const T& end,
+ T const& end,
bool eof) {
auto const tmp = start;
auto ret = read(start, end);
@@ -61,7 +61,7 @@ std::expected<uint32_t, u::ReadErrorReplace> read_replace(T& start,
template <std::forward_iterator T>
requires std::is_same_v<std::iter_value_t<T>, uint16_t>
-bool write(T& start, const T& end, uint32_t code) {
+bool write(T& start, T const& end, uint32_t code) {
if (code < 0x10000) {
if (start == end)
return false;
@@ -80,7 +80,7 @@ bool write(T& start, const T& end, uint32_t code) {
template <std::forward_iterator T>
requires std::is_same_v<std::iter_value_t<T>, uint16_t>
-bool skip(T& start, const T& end) {
+bool skip(T& start, T const& end) {
if (start == end)
return false;
if (*start >= 0xd800 && *start <= 0xdbff) {
diff --git a/src/u8.hh b/src/u8.hh
index 5292602..d673caa 100644
--- a/src/u8.hh
+++ b/src/u8.hh
@@ -13,7 +13,7 @@ namespace u8 {
template <std::forward_iterator T>
requires std::is_same_v<std::iter_value_t<T>, uint8_t>
-std::expected<uint32_t, u::ReadError> read(T& start, const T& end) {
+std::expected<uint32_t, u::ReadError> read(T& start, T const& end) {
if (start == end)
return std::unexpected(u::ReadError::End);
uint32_t u;
@@ -106,7 +106,7 @@ std::expected<uint32_t, u::ReadError> read(T& start, const T& end) {
template <std::forward_iterator T>
requires std::is_same_v<std::iter_value_t<T>, uint8_t>
std::expected<uint32_t, u::ReadErrorReplace> read_replace(T& start,
- const T& end,
+ T const& end,
bool eof) {
auto const tmp = start;
auto ret = read(start, end);
@@ -128,7 +128,7 @@ std::expected<uint32_t, u::ReadErrorReplace> read_replace(T& start,
template <std::forward_iterator T>
requires std::is_same_v<std::iter_value_t<T>, uint8_t>
-bool write(T& start, const T& end, uint32_t code) {
+bool write(T& start, T const& end, uint32_t code) {
if (code < 0x80) {
if (start == end)
return false;
@@ -164,7 +164,7 @@ bool write(T& start, const T& end, uint32_t code) {
template <std::forward_iterator T>
requires std::is_same_v<std::iter_value_t<T>, uint8_t>
-bool skip(T& start, const T& end) {
+bool skip(T& start, T const& end) {
if (start == end)
return false;
switch (*start >> 4) {
diff --git a/src/umod8.hh b/src/umod8.hh
index 4731942..4423f52 100644
--- a/src/umod8.hh
+++ b/src/umod8.hh
@@ -13,7 +13,7 @@ namespace umod8 {
template <std::forward_iterator T>
requires std::is_same_v<std::iter_value_t<T>, uint8_t>
-std::expected<uint32_t, u::ReadError> read(T& start, const T& end) {
+std::expected<uint32_t, u::ReadError> read(T& start, T const& end) {
if (start == end)
return std::unexpected(u::ReadError::End);
uint32_t u;
@@ -115,7 +115,7 @@ std::expected<uint32_t, u::ReadError> read(T& start, const T& end) {
template <std::forward_iterator T>
requires std::is_same_v<std::iter_value_t<T>, uint8_t>
std::expected<uint32_t, u::ReadErrorReplace> read_replace(T& start,
- const T& end,
+ T const& end,
bool eof) {
auto const tmp = start;
auto ret = read(start, end);
@@ -137,7 +137,7 @@ std::expected<uint32_t, u::ReadErrorReplace> read_replace(T& start,
template <std::forward_iterator T>
requires std::is_same_v<std::iter_value_t<T>, uint8_t>
-bool write(T& start, const T& end, uint32_t code) {
+bool write(T& start, T const& end, uint32_t code) {
if (code > 0 && code < 0x80) {
if (start == end)
return false;