// Origin: commit 32e14551a90e85000e41b3f0445d34d58a1431e4 by Joel Klinghed,
// Wed, 10 Sep 2025 22:12:22 +0200, "Add unicode general category lookup":
// generate the lookup tables from UnicodeData.txt; to do that, add gen_ugc,
// which uses csv, buffers, line, io and other modules to do the job.
//
// NOTE(review): this file was recovered from a rendered patch in which every
// <...> sequence had been stripped. The standard header names and all template
// arguments below were reconstructed from usage -- confirm them against io.hh
// and the original commit before merging.

#include "io.hh"

#include "unique_fd.hh"

#include <algorithm>
#include <cerrno>
#include <cstddef>
#include <cstring>
#include <expected>
#include <fcntl.h>
#include <limits>
#include <memory>
#include <optional>
#include <string>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <utility>

namespace io {

namespace {

// Reader that pulls data straight from a file descriptor with read(2) and
// implements skip with lseek(2).
class BasicReader : public Reader {
 public:
  explicit BasicReader(unique_fd fd)
      : fd_(std::move(fd)) {
  }

  // Reads up to `max` bytes into `dst`.
  // Returns the number of bytes read (0 when read(2) reports 0) or
  // ReadError::Error when read(2) fails with anything but EINTR.
  [[nodiscard]]
  std::expected<size_t, ReadError> read(void* dst, size_t max) override {
    // read(2) reports its result as ssize_t, so never ask for more than that
    // type can express.
    size_t const want = std::min(
        static_cast<size_t>(std::numeric_limits<ssize_t>::max()), max);
    ssize_t got;
    do {
      got = ::read(fd_.get(), dst, want);
    } while (got < 0 && errno == EINTR);  // Interrupted: just try again.
    if (got < 0) {
      return std::unexpected(ReadError::Error);
    }
    offset_ += got;
    return static_cast<size_t>(got);
  }

  // Moves the read position forward by up to `max` bytes without going past
  // the end of the file.
  // Returns the number of bytes actually skipped, or ReadError::Error when
  // lseek(2) fails.
  [[nodiscard]]
  std::expected<size_t, ReadError> skip(size_t max) override {
    // Clamp the request so that offset_ + delta always fits in off_t.
    // Converting an out-of-range size_t to off_t would yield a negative
    // value and make lseek move *backwards*.
    off_t const room = std::numeric_limits<off_t>::max() - offset_;
    off_t const delta =
        std::cmp_greater(max, room) ? room : static_cast<off_t>(max);

    off_t const target = lseek(fd_.get(), delta, SEEK_CUR);
    if (target < 0) {
      return std::unexpected(ReadError::Error);
    }
    // Don't want skip to go past (cached) file end.
    if (!size_.has_value() || target > size_.value()) {
      // When going past end, double check that it still is the end.
      off_t const end = lseek(fd_.get(), 0, SEEK_END);
      if (end < 0) {
        // We're screwed, but try to go back to original position and then
        // return error.
        size_.reset();
        lseek(fd_.get(), offset_, SEEK_SET);
        return std::unexpected(ReadError::Error);
      }
      size_ = end;
      if (target > end) {
        // The descriptor is now positioned at the end of the file. If the
        // file shrank below our previous offset the difference is negative;
        // report that as "nothing skipped" instead of a huge unsigned value.
        off_t const distance = std::max<off_t>(end - offset_, 0);
        offset_ = end;
        return static_cast<size_t>(distance);
      }
      // Seek back to where we should be
      if (lseek(fd_.get(), target, SEEK_SET) < 0) {
        return std::unexpected(ReadError::Error);
      }
    }
    // delta is never negative, so target >= offset_ here.
    off_t const distance = target - offset_;
    offset_ = target;
    return static_cast<size_t>(distance);
  }

 private:
  unique_fd fd_;
  // Position this reader believes the descriptor is at; advanced by read()
  // and skip().
  off_t offset_{0};
  // File size as last reported by lseek(SEEK_END); empty until skip() has
  // queried it (or after that query failed).
  std::optional<off_t> size_;
};

// Reader over a contiguous memory region of `size` bytes starting at `ptr`.
// The region itself is owned by the subclass.
class MemoryReader : public Reader {
 public:
  MemoryReader(void* ptr, size_t size)
      : ptr_(ptr), size_(size) {
  }

  // Copies up to `max` of the remaining bytes into `dst` and returns how many
  // were copied. Never fails.
  [[nodiscard]]
  std::expected<size_t, ReadError> read(void* dst, size_t max) override {
    size_t const count = std::min(max, size_ - offset_);
    // Skip the call entirely for zero bytes so that a null dst/ptr_ is never
    // handed to memcpy.
    if (count > 0) {
      memcpy(dst, static_cast<char const*>(ptr_) + offset_, count);
    }
    offset_ += count;
    return count;
  }

  // Advances past up to `max` of the remaining bytes and returns how many
  // were skipped. Never fails.
  [[nodiscard]]
  std::expected<size_t, ReadError> skip(size_t max) override {
    size_t const count = std::min(max, size_ - offset_);
    offset_ += count;
    return count;
  }

 protected:
  void* ptr_;
  size_t const size_;

 private:
  // Number of bytes already consumed from the region.
  size_t offset_{0};
};

// MemoryReader over an mmap'ed region; unmaps the region on destruction and
// holds the descriptor that was mapped.
class MmapReader : public MemoryReader {
 public:
  MmapReader(unique_fd fd, void* ptr, size_t size)
      : MemoryReader(ptr, size), fd_(std::move(fd)) {
  }

  ~MmapReader() override {
    munmap(ptr_, size_);
  }

 private:
  unique_fd fd_;
};

// MemoryReader over a string it owns.
class StringReader : public MemoryReader {
 public:
  explicit StringReader(std::string data)
      : MemoryReader(nullptr, data.size()), data_(std::move(data)) {
    // The base class is initialised before data_, so the pointer can only be
    // taken here, once the string has reached its final location.
    ptr_ = data_.data();
  }

 private:
  std::string data_;
};

}  // namespace

// Reads up to str.size() bytes into the existing storage of `str`; the string
// is not resized.
std::expected<size_t, ReadError> Reader::read(std::string& str) {
  return read(str.data(), str.size());
}

// Calls read() repeatedly until `max` bytes have been read, read() returns 0,
// or read() fails.
// An error from the very first read() is returned as-is; an error after some
// data has already been read ends the loop and the byte count so far is
// returned instead.
std::expected<size_t, ReadError> Reader::repeat_read(void* dst, size_t max) {
  auto ret = read(dst, max);
  if (!ret.has_value() || ret.value() == 0 || ret.value() == max)
    return ret;

  char* const out = static_cast<char*>(dst);
  size_t total = ret.value();
  while (total < max) {
    ret = read(out + total, max - total);
    if (!ret.has_value() || ret.value() == 0)
      break;
    total += ret.value();
  }
  return total;
}

// Same as repeat_read(void*, size_t), filling the existing storage of `str`.
std::expected<size_t, ReadError> Reader::repeat_read(std::string& str) {
  return repeat_read(str.data(), str.size());
}

// Calls skip() repeatedly until `max` bytes have been skipped, skip() returns
// 0, or skip() fails.
// An error from the very first skip() is returned as-is; an error after some
// progress ends the loop and the byte count so far is returned instead.
std::expected<size_t, ReadError> Reader::repeat_skip(size_t max) {
  auto ret = skip(max);
  if (!ret.has_value() || ret.value() == 0 || ret.value() == max)
    return ret;

  size_t total = ret.value();
  while (total < max) {
    ret = skip(max - total);
    if (!ret.has_value() || ret.value() == 0)
      break;
    total += ret.value();
  }
  return total;
}

// Opens `file_path` for reading, resolved relative to the current working
// directory.
std::expected<std::unique_ptr<Reader>, OpenError> open(
    const std::string& file_path) {
  return openat(AT_FDCWD, file_path);
}

// Opens `file_path` (resolved relative to `dirfd`) for reading.
// Returns an mmap-backed reader when the file can be mapped and a
// descriptor-backed reader otherwise. On failure EACCES maps to
// OpenError::NoAccess, ENOENT to OpenError::NoSuchFile and everything else to
// OpenError::Error.
std::expected<std::unique_ptr<Reader>, OpenError> openat(
    int dirfd, const std::string& file_path) {
  int raw_fd;
  do {
    // O_CLOEXEC: the descriptor is private to the reader, so it should not
    // leak into processes exec'ed by the caller.
    raw_fd = ::openat(dirfd, file_path.c_str(), O_RDONLY | O_CLOEXEC);
  } while (raw_fd < 0 && errno == EINTR);  // Interrupted: just try again.

  if (raw_fd < 0) {
    // errno is inspected right after the failing call, before anything else
    // gets a chance to overwrite it.
    switch (errno) {
      case EACCES:
        return std::unexpected(OpenError::NoAccess);
      case ENOENT:
        return std::unexpected(OpenError::NoSuchFile);
      default:
        return std::unexpected(OpenError::Error);
    }
  }

  unique_fd fd(raw_fd);
  struct stat buf;
  if (fstat(fd.get(), &buf) == 0) {
    // mmap rejects zero-length mappings, so don't bother trying for empty
    // files; they are served by BasicReader below.
    if (buf.st_size > 0 &&
        std::cmp_less_equal(buf.st_size,
                            std::numeric_limits<size_t>::max())) {
      auto const size = static_cast<size_t>(buf.st_size);
      void* ptr = mmap(nullptr, size, PROT_READ, MAP_PRIVATE, fd.get(), 0);
      if (ptr != MAP_FAILED) {
        return std::make_unique<MmapReader>(std::move(fd), ptr, size);
      }
    }
  }
  // Could not stat or map the file: fall back to plain read(2)/lseek(2).
  return std::make_unique<BasicReader>(std::move(fd));
}

// Creates a reader over an in-memory copy of `data`.
std::unique_ptr<Reader> memory(std::string data) {
  return std::make_unique<StringReader>(std::move(data));
}

}  // namespace io