From 32e14551a90e85000e41b3f0445d34d58a1431e4 Mon Sep 17 00:00:00 2001 From: Joel Klinghed Date: Wed, 10 Sep 2025 22:12:22 +0200 Subject: Add unicode general category lookup Generate the lookup tables from UnicodeData.txt, do to that, add gen_ugc, which uses csv, buffers, line, io and other modules to do the job. --- src/line.cc | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 src/line.cc (limited to 'src/line.cc') diff --git a/src/line.cc b/src/line.cc new file mode 100644 index 0000000..2eeb116 --- /dev/null +++ b/src/line.cc @@ -0,0 +1,133 @@ +#include "line.hh" + +#include "check.hh" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace line { + +namespace { + +const char kLineTerminators[] = "\r\n"; + /* ReaderImpl: Reader implementation that pulls bytes from the wrapped reader_ into a single buffer of max_len + 2 chars and hands out each line as a std::string_view slice of that buffer. check::add comes from check.hh and is presumably an overflow-checked addition (confirm there). NOTE(review): in this copy of the patch the include targets and template arguments look stripped (for example the unique_ptr, static_cast and expected uses carry no angle-bracket arguments); compare with the repository before applying. */ +class ReaderImpl : public Reader { + public: + ReaderImpl(std::unique_ptr reader, size_t max_len) + : reader_(std::move(reader)), max_len_(max_len), + buffer_(std::make_unique_for_overwrite( + check::add(max_len, static_cast(2)))), + rptr_(buffer_.get()), wptr_(buffer_.get()), search_(rptr_), + end_(buffer_.get() + check::add(max_len, static_cast(2))) {} + /* read(): returns the next line without its terminator. LF, CR LF and a lone CR each end a line. Content longer than max_len is handed out in pieces of max_len chars with no terminator consumed, and every piece counts as a line in number(). When the reader reports 0 bytes and nothing is buffered the result is an unexpected default ReadError (eof); a failed underlying read is returned as an unexpected ReadError wrapping the io error. The returned view points into buffer_, so later calls may overwrite or shift the bytes it refers to. */ + [[nodiscard]] std::expected read() override { + while (true) { + /* Resume scanning for CR or LF where the previous scan stopped. */ search_ = std::find_first_of(search_, wptr_, + kLineTerminators, kLineTerminators + 2); + if (search_ < wptr_) { + if (std::cmp_greater(search_ - rptr_, max_len_)) { + /* Terminator found, but beyond max_len: emit a max_len piece and keep the rest buffered. */ return line(max_len_, 0); + } + /* Decide how many terminator bytes to consume: 1 for LF or a lone CR, 2 for CR LF. */ + size_t tlen; + if (*search_ == '\n') { + tlen = 1; + } else { + /* CR is the last buffered byte: read more to learn whether an LF follows. */ if (search_ + 1 == wptr_) { + make_space_if_needed(); + auto got = fill(); + if (got.has_value()) { + if (got.value() == 0) { + /* Reader returned 0 bytes: the lone CR ends the line. */ return line(search_ - rptr_, 1); + } + } else { + return std::unexpected(ReadError(got.error())); + } + } + if (search_[1] == '\n') { + tlen = 2; + } else { + tlen = 1; + } + } + return line(search_ - rptr_, tlen); + } + /* No terminator buffered and at least max_len chars pending: emit a max_len piece. */ if (std::cmp_greater_equal(wptr_ - rptr_, max_len_)) { + return line(max_len_, 0); + } + /* More data is needed: compact the buffer if required, then read from the underlying reader. */ + make_space_if_needed(); + auto got = 
fill(); + if (got.has_value()) { + if (got.value() == 0) { + if (rptr_ == wptr_) { + /* Nothing buffered either: report end of input. */ return std::unexpected(ReadError()); + } + /* The remaining bytes form a last line that has no terminator. */ return line(wptr_ - rptr_, 0); + } + } else { + return std::unexpected(ReadError(got.error())); + } + } + } + /* number(): how many lines (including max_len pieces) read() has returned so far; starts at 0 and is incremented in line(). */ + [[nodiscard]] uint64_t number() const override { return number_; } + + private: + /* line(): builds the view for len chars starting at rptr_, then advances rptr_ past those chars plus terminator_len bytes, restarts the terminator scan at the new rptr_ and bumps number_. */ std::string_view line(size_t len, size_t terminator_len) { + assert(len <= max_len_); + auto ret = std::string_view(rptr_, len); + rptr_ += len + terminator_len; + search_ = rptr_; + ++number_; + return ret; + } + /* make_space_if_needed(): when some bytes at the front of the buffer are already consumed and at most 1024 bytes remain free at the back, moves the unread bytes to the start of the buffer and shifts search_, wptr_ and rptr_ by the same amount. */ + void make_space_if_needed() { + /* free = number of already consumed bytes in front of rptr_. */ size_t free = rptr_ - buffer_.get(); + if (free == 0) return; + size_t avail = end_ - wptr_; + if (avail > 1024) return; + /* Source and destination may overlap, hence memmove. */ memmove(buffer_.get(), rptr_, wptr_ - rptr_); + search_ -= free; + wptr_ -= free; + rptr_ = buffer_.get(); + } + /* fill(): asks reader_ to read into the free tail of the buffer (from wptr_ up to end_) and, on success, advances wptr_ by the number of bytes obtained; the reader result is passed through unchanged. */ + std::expected fill() { + auto ret = reader_->read(wptr_, end_ - wptr_); + if (ret.has_value()) + wptr_ += ret.value(); + return ret; + } + /* State: rptr_ is the start of unread data, wptr_ is one past the last buffered byte, search_ is where the next terminator scan resumes, end_ is one past the end of the buffer. */ + std::unique_ptr reader_; + size_t const max_len_; + uint64_t number_{0}; + std::unique_ptr buffer_; + char* rptr_; + char* wptr_; + char* search_; + char* const end_; +}; + +} // namespace + /* Default-constructed ReadError marks end of input (eof is true). */ +ReadError::ReadError() + : eof(true) {} + /* ReadError built from an io error: eof is false and the error is stored in io_error. */ +ReadError::ReadError(io::ReadError error) + : eof(false), io_error(error) {} + /* open(): creates the line reader over the given reader with the given max_len. */ +std::unique_ptr open(std::unique_ptr reader, + size_t max_len) { + return std::make_unique(std::move(reader), max_len); +} + +} // namespace line -- cgit v1.3