From 2a9e59adb5db8630ab7bdbdeedac623e3397989b Mon Sep 17 00:00:00 2001 From: Joel Klinghed Date: Wed, 17 Sep 2025 00:48:46 +0200 Subject: uline: Add unicode line reader --- src/uline.cc | 198 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 198 insertions(+) create mode 100644 src/uline.cc (limited to 'src/uline.cc') diff --git a/src/uline.cc b/src/uline.cc new file mode 100644 index 0000000..21927b3 --- /dev/null +++ b/src/uline.cc @@ -0,0 +1,198 @@ /* NOTE(review): in this copy the angle-bracketed include targets and template parameter/argument lists appear to be missing (eight bare #include lines, bare "template", "static_cast(2)", "std::unique_ptr reader"); presumably stripped during extraction - restore them from the upstream commit before applying this patch. */ +#include "uline.hh" + +#include "check.hh" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + /* UnicodeReader: buffered line splitter over code units of type T. It pulls raw data from reader_ into a buffer of max_len + 2 units and hands out one line per read() call. Lines end at either configured terminator alone, at the pair [0] followed by [1], or are cut after max_len units. */ +template +class UnicodeReader { + public: + /* Takes ownership of reader and allocates max_len + 2 units (sum computed with check::add, presumably overflow-checked - confirm). The two extra units presumably leave room for a two-unit terminator behind a full-length line. rptr_, wptr_ and search_ all start at the buffer start. line_terminators is used as {first, second}; both ReaderImpl classes in this file pass CR then LF. */ UnicodeReader(std::unique_ptr reader, size_t max_len, + std::array line_terminators) + : reader_(std::move(reader)), + max_len_(max_len), + line_terminators_(line_terminators), + buffer_(std::make_unique_for_overwrite( + check::add(max_len, static_cast(2)))), + rptr_(buffer_.get()), + wptr_(buffer_.get()), + search_(rptr_), + end_(buffer_.get() + check::add(max_len, static_cast(2))) {} + + /* Returns a view of the next line without its terminator. A line longer than max_len_ units is returned in pieces of max_len_ units. An error from fill() is forwarded wrapped in line::ReadError; when fill() yields 0 with nothing buffered, a default-constructed line::ReadError is returned (presumably meaning end of input - confirm). The view points into the internal buffer, which later calls refill and may compact. */ [[nodiscard]] + std::expected, line::ReadError> read() { + while (true) { + /* Resume the terminator scan at search_, so units scanned in an earlier iteration are not scanned again. */ search_ = std::find_first_of(search_, wptr_, line_terminators_.begin(), + line_terminators_.end()); + if (search_ < wptr_) { /* a terminator was found in the buffered data */ + if (std::cmp_greater(search_ - rptr_, max_len_)) { /* it lies more than max_len_ units ahead: emit a max_len_ piece and leave the terminator for a later call */ + return line(max_len_, 0); + } + + size_t tlen; + if (*search_ == line_terminators_[1]) { /* the second terminator (LF for both ReaderImpl classes) ends the line by itself */ + tlen = 1; + } else { /* the first terminator (CR): check whether the second one follows directly */ + if (search_ + 1 == wptr_) { /* the unit after it is not buffered yet, so try to read more first */ + make_space_if_needed(); + auto got = fill(); + if (got.has_value()) { + if (got.value() == 0) { /* fill() read nothing: treat it as a lone one-unit terminator */ + return line(search_ - rptr_, 1); + } + } else { + return std::unexpected(line::ReadError(got.error())); + } + } + if (search_[1] == line_terminators_[1]) { + tlen = 2; + } else { + tlen = 1; + } + } + return line(search_ - rptr_, tlen); + } + if (std::cmp_greater_equal(wptr_ - rptr_, max_len_)) { /* max_len_ units buffered without any terminator: emit them as one piece */ + return line(max_len_, 0); + } + + /* No terminator yet and fewer than max_len_ units buffered: read more data. */ make_space_if_needed(); + auto got = fill(); + if (got.has_value()) { + if (got.value() == 0) { + if 
(rptr_ == wptr_) { /* nothing buffered and nothing read */ + return std::unexpected(line::ReadError()); + } + /* fill() read nothing: return the buffered remainder as an unterminated line */ return line(wptr_ - rptr_, 0); + } + } else { + return std::unexpected(line::ReadError(got.error())); + } + } + } + + /* Number of lines handed out so far; number_ is incremented once per line() call. */ [[nodiscard]] uint64_t number() const { return number_; } + + private: + /* Builds the view [rptr_, rptr_ + len), then skips len + terminator_len units, restarts the terminator scan at the new rptr_ and increments number_. len must not exceed max_len_ (asserted). */ std::basic_string_view line(size_t len, size_t terminator_len) { + assert(len <= max_len_); + auto ret = std::basic_string_view(rptr_, len); + rptr_ += len + terminator_len; + search_ = rptr_; + ++number_; + return ret; + } + + /* Moves the unread units [rptr_, wptr_) to the buffer start, but only when units in front of rptr_ have been consumed and at most 1024 units remain free behind wptr_. search_ and wptr_ are shifted by the same distance. */ void make_space_if_needed() { + size_t free = rptr_ - buffer_.get(); + if (free == 0) + return; + size_t avail = end_ - wptr_; + if (avail > 1024) + return; + memmove(buffer_.get(), rptr_, (wptr_ - rptr_) * sizeof(T)); + search_ -= free; + wptr_ -= free; + rptr_ = buffer_.get(); + } + + /* Asks reader_ for up to (end_ - wptr_) * sizeof(T) (presumably a byte count) at wptr_, advances wptr_ by the whole units read and returns the reader result unchanged. NOTE(review): value() / sizeof(T) rounds down, so for sizeof(T) > 1 a read that ends in the middle of a unit leaves wptr_ short while read() still sees a non-zero count - confirm reader_ only ever returns whole units. */ std::expected fill() { + auto ret = reader_->read(wptr_, (end_ - wptr_) * sizeof(T)); + if (ret.has_value()) + wptr_ += ret.value() / sizeof(T); + return ret; + } + + std::unique_ptr reader_; + size_t const max_len_; + std::array const line_terminators_; + uint64_t number_{0}; + std::unique_ptr buffer_; + T* rptr_; /* start of the data not yet handed out */ + T* wptr_; /* end of the buffered data; fill() writes here */ + T* search_; /* where the next terminator scan resumes */ + T* const end_; /* one past the last unit of the buffer */ +}; + +} // namespace + +namespace u8 { + +namespace line { + +namespace { + /* Reader implementation for namespace u8: forwards read() and number() to a UnicodeReader base configured with CR and LF. */ +class ReaderImpl : public UnicodeReader, + public virtual Reader { + public: + ReaderImpl(std::unique_ptr reader, size_t max_len) + : UnicodeReader(std::move(reader), max_len, + {'\r', '\n'}) {} + + [[nodiscard]] + std::expected read() override { + return UnicodeReader::read(); + } + + [[nodiscard]] + uint64_t number() const override { + return UnicodeReader::number(); + } +}; + +} // namespace + /* Creates a reader over the given reader with the given max_len. NOTE(review): std::move(max_len) on a size_t is a plain copy. */ +[[nodiscard]] +std::unique_ptr open(std::unique_ptr reader, + size_t max_len) { + return std::make_unique(std::move(reader), std::move(max_len)); +} + +} // namespace line + +} // namespace u8 + +namespace u16 { + +namespace line { + +namespace { + /* Reader implementation for namespace u16: same forwarding wrapper, with the terminators given as u-prefixed CR and LF literals. */ +class ReaderImpl : public UnicodeReader, + public virtual Reader { + public: + ReaderImpl(std::unique_ptr reader, size_t max_len) + : 
UnicodeReader(std::move(reader), max_len, + {u'\r', u'\n'}) {} + + /* Forwards to UnicodeReader::read(). */ [[nodiscard]] + std::expected read() override { + return UnicodeReader::read(); + } + + /* Forwards to UnicodeReader::number(). */ [[nodiscard]] + uint64_t number() const override { + return UnicodeReader::number(); + } +}; + +} // namespace + /* Creates a reader over the given reader with the given max_len. NOTE(review): std::move(max_len) on a size_t is a plain copy. */ +[[nodiscard]] +std::unique_ptr open(std::unique_ptr reader, + size_t max_len) { + return std::make_unique(std::move(reader), std::move(max_len)); +} + +} // namespace line + +} // namespace u16 -- cgit v1.3