diff options
41 files changed, 2639 insertions, 18 deletions
diff --git a/data/get_unicode.sh b/data/get_unicode.sh new file mode 100755 index 0000000..99662b3 --- /dev/null +++ b/data/get_unicode.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +declare -a versions + +# Java 8 +versions+=("6.2.0") + +# Java 9 +versions+=("8.0.0") + +# Java 11 +versions+=("10.0.0") + +# Java 12 +versions+=("11.0.0") + +# Java 13 +versions+=("12.1.0") + +# Java 15 +versions+=("13.0.0") + +# Java 19 +versions+=("14.0.0") + +# Java 20 +versions+=("15.0.0") + +# Java 22 +versions+=("15.1.0") + +# Java 24 +versions+=("16.0.0") + +basedir=$(dirname -- "${BASH_SOURCE[0]}") + +for version in "${versions[@]}"; do + target="$basedir"/unicode-"$version"/UnicodeData.txt + if [ ! -e "$target".xz ]; then + mkdir -p "$basedir"/unicode-"$version" + curl "https://www.unicode.org/Public/${version}/ucd/UnicodeData.txt" -o "$target" + xz -9 "$target" + fi +done diff --git a/data/unicode-10.0.0/UnicodeData.txt.xz b/data/unicode-10.0.0/UnicodeData.txt.xz Binary files differnew file mode 100644 index 0000000..25eb906 --- /dev/null +++ b/data/unicode-10.0.0/UnicodeData.txt.xz diff --git a/data/unicode-11.0.0/UnicodeData.txt.xz b/data/unicode-11.0.0/UnicodeData.txt.xz Binary files differnew file mode 100644 index 0000000..e586bd3 --- /dev/null +++ b/data/unicode-11.0.0/UnicodeData.txt.xz diff --git a/data/unicode-12.1.0/UnicodeData.txt.xz b/data/unicode-12.1.0/UnicodeData.txt.xz Binary files differnew file mode 100644 index 0000000..15f8880 --- /dev/null +++ b/data/unicode-12.1.0/UnicodeData.txt.xz diff --git a/data/unicode-13.0.0/UnicodeData.txt.xz b/data/unicode-13.0.0/UnicodeData.txt.xz Binary files differnew file mode 100644 index 0000000..9e723dd --- /dev/null +++ b/data/unicode-13.0.0/UnicodeData.txt.xz diff --git a/data/unicode-14.0.0/UnicodeData.txt.xz b/data/unicode-14.0.0/UnicodeData.txt.xz Binary files differnew file mode 100644 index 0000000..8ccc9cb --- /dev/null +++ b/data/unicode-14.0.0/UnicodeData.txt.xz diff --git a/data/unicode-15.0.0/UnicodeData.txt.xz 
b/data/unicode-15.0.0/UnicodeData.txt.xz Binary files differnew file mode 100644 index 0000000..dfb9976 --- /dev/null +++ b/data/unicode-15.0.0/UnicodeData.txt.xz diff --git a/data/unicode-15.1.0/UnicodeData.txt.xz b/data/unicode-15.1.0/UnicodeData.txt.xz Binary files differnew file mode 100644 index 0000000..aa89857 --- /dev/null +++ b/data/unicode-15.1.0/UnicodeData.txt.xz diff --git a/data/unicode-16.0.0/UnicodeData.txt.xz b/data/unicode-16.0.0/UnicodeData.txt.xz Binary files differnew file mode 100644 index 0000000..199d7c1 --- /dev/null +++ b/data/unicode-16.0.0/UnicodeData.txt.xz diff --git a/data/unicode-6.2.0/UnicodeData.txt.xz b/data/unicode-6.2.0/UnicodeData.txt.xz Binary files differnew file mode 100644 index 0000000..3001c3c --- /dev/null +++ b/data/unicode-6.2.0/UnicodeData.txt.xz diff --git a/data/unicode-8.0.0/UnicodeData.txt.xz b/data/unicode-8.0.0/UnicodeData.txt.xz Binary files differnew file mode 100644 index 0000000..d68fed1 --- /dev/null +++ b/data/unicode-8.0.0/UnicodeData.txt.xz diff --git a/meson.build b/meson.build index ff6a6fc..c3b6302 100644 --- a/meson.build +++ b/meson.build @@ -5,9 +5,10 @@ project( meson_version : '>= 1.3.0', default_options : [ 'warning_level=3', - 'cpp_std=c++23', + 'cpp_std=c++26', 'cpp_eh=none', 'cpp_rtti=false', + 'default_library=static', ], ) @@ -17,25 +18,149 @@ configure_file(input: 'src/config.h.in', output: 'config.h', configuration : conf_data) -dependencies = [ -] +z_dep = dependency('zlib', version: '>=1.3.0') +lzma_dep = dependency('liblzma', version: '>=5.8.0') inc = include_directories('src') -exe = executable( - 'jkc', +args_lib = library( + 'args', sources: [ 'src/args.cc', 'src/args.hh', + ], + include_directories: inc, +) +args_dep = declare_dependency(link_with: args_lib) + +buffer_lib = library( + 'buffer', + sources: [ + 'src/buffer.cc', + 'src/buffer.hh', + ], + include_directories: inc, +) +buffer_dep = declare_dependency(link_with: buffer_lib) + +io_lib = library( + 'io', + sources: [ + 
'src/line.cc', + 'src/line.hh', + 'src/io.cc', + 'src/io.hh', + 'src/unique_fd.cc', + 'src/unique_fd.hh', + ], + include_directories: inc, +) +io_dep = declare_dependency(link_with: io_lib) + +str_lib = library( + 'str', + sources: [ + 'src/str.cc', + 'src/str.hh', + ], + include_directories: inc, +) +str_dep = declare_dependency(link_with: str_lib) + +csv_lib = library( + 'csv', + sources: [ + 'src/csv.cc', + 'src/csv.hh', + ], + include_directories: inc, + dependencies: [io_dep, str_dep], +) +csv_dep = declare_dependency( + link_with: csv_lib, + dependencies: [io_dep, str_dep], +) + +decompress_lib = library( + 'decompress', + sources: [ + 'src/decompress.hh', + 'src/decompress_lzma.cc', + 'src/decompress_z.cc', + ], + include_directories: inc, + dependencies: [buffer_dep, io_dep, lzma_dep, z_dep], +) +decompress_dep = declare_dependency( + link_with: decompress_lib, + dependencies: [buffer_dep, io_dep, lzma_dep, z_dep], +) + +gen_ugc = executable( + 'gen_ugc', + sources: [ + 'src/gen_ugc.cc', + ], + include_directories: inc, + install : false, + dependencies : [ + args_dep, + csv_dep, + decompress_dep, + ], +) + +unicode_versions = [ + '6.2.0', + '8.0.0', + '10.0.0', + '11.0.0', + '12.1.0', + '13.0.0', + '14.0.0', + '15.0.0', + '15.1.0', + '16.0.0', +] + +ugc_sources = [] +foreach unicode_version : unicode_versions + ugc_sources += custom_target( + 'gen-ugc-' + unicode_version, + input: ['data/unicode-' + unicode_version + '/UnicodeData.txt.xz'], + output: ['ugc_lookup_' + unicode_version + '.cc'], + command : [gen_ugc, '--prefix', + 'u' + unicode_version.replace('.', '_') + '_', + '@INPUT@', '@OUTPUT@']) +endforeach + +unicode_lib = library( + 'unicode', + sources: [ 'src/u.hh', + 'src/u.cc', 'src/u16.hh', 'src/u8.hh', + 'src/ugc.hh', 'src/umod8.hh', + ugc_sources, + ], + include_directories: inc, +) +unicode_dep = declare_dependency(link_with: unicode_lib) + +jkc = executable( + 'jkc', + sources: [ 'src/main.cc', ], include_directories: inc, install : true, - 
dependencies : dependencies, + dependencies : [ + args_dep, + io_dep, + unicode_dep, + ], ) gtest_main_dep = dependency('gtest_main', fallback : ['gtest_main']) @@ -46,22 +171,99 @@ test_dependencies = [ test('args', executable( 'test_args', - sources: [ - 'src/args.cc', - 'src/args.hh', - 'test/args.cc', - ], + sources: ['test/args.cc'], include_directories: inc, - dependencies : test_dependencies)) + dependencies: [ + args_dep, + test_dependencies, + ], +)) test('u', executable( 'test_u', + sources: ['test/u.cc'], + include_directories: inc, + dependencies: [ + unicode_dep, + test_dependencies, + ], +)) + +test('csv', executable( + 'test_csv', + sources: ['test/csv.cc'], + include_directories: inc, + dependencies: [ + csv_dep, + test_dependencies, + ], +)) + +test('line', executable( + 'test_line', sources: [ - 'src/u.hh', - 'src/u16.hh', - 'src/u8.hh', - 'src/umod8.hh', - 'test/u.cc', + 'test/line.cc', + 'test/io_test_helper.hh', + 'test/io_test_helper.cc', + ], + include_directories: inc, + dependencies: [ + io_dep, + test_dependencies, + ], +)) + +test('str', executable( + 'test_str', + sources: ['test/str.cc'], + include_directories: inc, + dependencies: [ + str_dep, + test_dependencies, ], +)) + +test('io', executable( + 'test_io', + sources: ['test/io.cc'], + include_directories: inc, + dependencies: [ + io_dep, + test_dependencies, + ], +)) + +test('buffer', executable( + 'test_buffer', + sources: ['test/buffer.cc'], + include_directories: inc, + dependencies : [ + buffer_dep, + test_dependencies, + ], +)) + +test('decompress', executable( + 'test_decompress', + sources: ['test/decompress.cc'], include_directories: inc, - dependencies : test_dependencies)) + dependencies : [ + decompress_dep, + test_dependencies, + ], +)) + +run_clang_tidy = find_program('run-clang-tidy', required: false) + +if run_clang_tidy.found() + # The clang-tidy target generated by meson misses most of the + # source files, so create our own. 
+ run_target( + 'clang-tidy', + command: [ + run_clang_tidy, + '-quiet', + '-use-color', + ], + ) +endif diff --git a/src/buffer.cc b/src/buffer.cc new file mode 100644 index 0000000..65c6757 --- /dev/null +++ b/src/buffer.cc @@ -0,0 +1,213 @@ +#include "buffer.hh" + +#include <algorithm> +#include <cassert> +#include <cstring> +#include <memory> +#include <utility> + +namespace { + +class FixedBuffer : public Buffer { + public: + explicit FixedBuffer(size_t size) + : size_(size) {} + + void const* rptr(size_t& avail, size_t need) override { + if (rptr_ < wptr_) { + avail = wptr_ - rptr_; + } else if (rptr_ == wptr_ && !full_) { + avail = 0; + } else { + avail = (data_.get() + size_) - rptr_; + if (avail < need) { + rotate(); + return rptr(avail, need); + } + } + return rptr_; + } + + void consume(size_t size) override { + if (size == 0) return; + if (rptr_ < wptr_) { + assert(std::cmp_greater_equal(wptr_ - rptr_, size)); + rptr_ += size; + if (rptr_ == wptr_) + reset(); + } else { + assert(rptr_ != wptr_ || !full_); + assert(std::cmp_greater_equal((data_.get() + size_) - rptr_, size)); + rptr_ += size; + if (rptr_ == data_.get() + size_) { + rptr_ = data_.get(); + if (rptr_ == wptr_) + reset(); + } + } + } + + void* wptr(size_t& avail, size_t need) override { + if (wptr_ == nullptr) { + data_ = std::make_unique_for_overwrite<char[]>(size_); + rptr_ = wptr_ = data_.get(); + } + + if (wptr_ < rptr_) { + avail = rptr_ - wptr_; + } else if (rptr_ == wptr_ && full_) { + avail = 0; + } else { + avail = (data_.get() + size_) - wptr_; + if (avail < need) { + rotate(); + return wptr(avail, need); + } + } + return wptr_; + } + + void commit(size_t size) override { + if (size == 0) return; + if (wptr_ < rptr_) { + assert(std::cmp_greater_equal(rptr_ - wptr_, size)); + wptr_ += size; + if (wptr_ == rptr_) { + full_ = true; + } + } else { + assert(rptr_ != wptr_ || !full_); + assert(std::cmp_greater_equal((data_.get() + size_) - wptr_, size)); + wptr_ += size; + if (wptr_ == 
data_.get() + size_) { + wptr_ = data_.get(); + if (wptr_ == rptr_) + full_ = true; + } + } + } + + [[nodiscard]] bool full() const override { + return rptr_ == wptr_ && full_; + } + + [[nodiscard]] bool empty() const override { + return rptr_ == wptr_ && !full_; + } + + private: + void reset() { + rptr_ = wptr_ = data_.get(); + full_ = false; + } + + void rotate() { + size_t to_move = (data_.get() + size_) - rptr_; + if (wptr_ + to_move > rptr_) { + auto tmp = std::make_unique_for_overwrite<char[]>(to_move); + memcpy(tmp.get(), rptr_, to_move); + memmove(data_.get() + to_move, data_.get(), wptr_ - data_.get()); + memcpy(data_.get(), tmp.get(), to_move); + } else { + memmove(data_.get() + to_move, data_.get(), wptr_ - data_.get()); + memcpy(data_.get(), rptr_, to_move); + } + rptr_ = data_.get(); + wptr_ += to_move; + } + + size_t const size_; + std::unique_ptr<char[]> data_; + char* rptr_{nullptr}; + char* wptr_{nullptr}; + bool full_{false}; +}; + +class DynamicBuffer : public Buffer { + public: + DynamicBuffer(size_t start_size, size_t max_size) + : start_size_(start_size), max_size_(max_size) {} + + void const* rptr(size_t& avail, size_t /* need */) override { + avail = wptr_ - rptr_; + return rptr_; + } + + void consume(size_t size) override { + assert(std::cmp_greater_equal(wptr_ - rptr_, size)); + rptr_ += size; + if (rptr_ == wptr_) { + reset(); + } + } + + void* wptr(size_t& avail, size_t need) override { + avail = end_ - wptr_; + if (avail < need) { + if (end_ == nullptr) { + size_t size = std::min(max_size_, std::max(need, start_size_)); + data_ = std::make_unique_for_overwrite<char[]>(size); + end_ = data_.get() + size; + rptr_ = wptr_ = data_.get(); + avail = end_ - wptr_; + } else if (std::cmp_greater_equal(rptr_ - data_.get(), need - avail)) { + memmove(data_.get(), rptr_, wptr_ - rptr_); + wptr_ = data_.get() + (wptr_ - rptr_); + rptr_ = data_.get(); + avail = end_ - wptr_; + } else if (std::cmp_less(end_ - data_.get(), max_size_)) { + size_t 
current_size = end_ - data_.get(); + size_t new_size = std::min(max_size_, + current_size + std::max(need - avail, + current_size)); + auto tmp = std::make_unique_for_overwrite<char[]>(new_size); + memcpy(tmp.get(), rptr_, wptr_ - rptr_); + end_ = tmp.get() + new_size; + wptr_ = tmp.get() + (wptr_ - rptr_); + rptr_ = tmp.get(); + data_ = std::move(tmp); + avail = end_ - wptr_; + } + } + return wptr_; + } + + void commit(size_t size) override { + assert(std::cmp_greater_equal(end_ - wptr_, size)); + wptr_ += size; + } + + [[nodiscard]] bool full() const override { + return rptr_ == data_.get() && wptr_ == end_ && + std::cmp_equal(end_ - data_.get(), max_size_); + } + + [[nodiscard]] bool empty() const override { + return rptr_ == wptr_; + } + + private: + void reset() { + if (std::cmp_greater(end_ - data_.get(), start_size_)) { + data_ = std::make_unique_for_overwrite<char[]>(start_size_); + } + rptr_ = wptr_ = data_.get(); + } + + size_t const start_size_; + size_t const max_size_; + std::unique_ptr<char[]> data_; + char* end_{nullptr}; + char* rptr_{nullptr}; + char* wptr_{nullptr}; +}; + +} // namespace + +std::unique_ptr<Buffer> Buffer::fixed(size_t size) { + return std::make_unique<FixedBuffer>(size); +} + +std::unique_ptr<Buffer> Buffer::dynamic(size_t start_size, size_t max_size) { + return std::make_unique<DynamicBuffer>(start_size, max_size); +} diff --git a/src/buffer.hh b/src/buffer.hh new file mode 100644 index 0000000..685cd36 --- /dev/null +++ b/src/buffer.hh @@ -0,0 +1,31 @@ +#ifndef BUFFER_HH +#define BUFFER_HH + +#include <cstddef> +#include <memory> + +class Buffer { + public: + virtual ~Buffer() = default; + + virtual void const* rptr(size_t& avail, size_t need = 1) = 0; + virtual void consume(size_t size) = 0; + + virtual void* wptr(size_t& avail, size_t need = 1) = 0; + virtual void commit(size_t size) = 0; + + [[nodiscard]] virtual bool full() const = 0; + [[nodiscard]] virtual bool empty() const = 0; + + [[nodiscard]] + static 
std::unique_ptr<Buffer> fixed(size_t size); + [[nodiscard]] + static std::unique_ptr<Buffer> dynamic(size_t start_size, size_t max_size); + + protected: + Buffer() = default; + Buffer(Buffer const&) = delete; + Buffer& operator=(Buffer const&) = delete; +}; + +#endif // BUFFER_HH diff --git a/src/check.hh b/src/check.hh new file mode 100644 index 0000000..be65437 --- /dev/null +++ b/src/check.hh @@ -0,0 +1,39 @@ +#ifndef CHECK_HH +#define CHECK_HH + +#include <cstdlib> +#include <stdckdint.h> +#include <type_traits> + +namespace check { + +template<typename T> +requires std::is_arithmetic_v<T> +T add(T a, T b) { + T ret; + if (ckd_add(&ret, a, b)) + abort(); + return ret; +} + +template<typename T> +requires std::is_arithmetic_v<T> +T sub(T a, T b) { + T ret; + if (ckd_sub(&ret, a, b)) + abort(); + return ret; +} + +template<typename T> +requires std::is_arithmetic_v<T> +T mul(T a, T b) { + T ret; + if (ckd_mul(&ret, a, b)) + abort(); + return ret; +} + +} // namespace check + +#endif // CHECK_HH diff --git a/src/csv.cc b/src/csv.cc new file mode 100644 index 0000000..4135555 --- /dev/null +++ b/src/csv.cc @@ -0,0 +1,63 @@ +#include "csv.hh" + +#include "line.hh" +#include "str.hh" + +#include <cstdint> +#include <expected> +#include <memory> +#include <span> +#include <string_view> +#include <utility> +#include <vector> + +namespace csv { + +namespace { + +class ReaderImpl : public Reader { + public: + ReaderImpl(std::unique_ptr<line::Reader> reader, char separator) + : reader_(std::move(reader)), separator_(separator) { + } + + [[nodiscard]] + std::expected<std::span<std::string_view>, io::ReadError> read() override { + while (true) { + auto line = reader_->read(); + if (line.has_value()) { + str::split(line.value(), line_, separator_, /* keep_empty */ true); + if (line_.size() == 1 && line_[0].empty()) + continue; + return line_; + } + if (line.error().eof) { + return {}; + } + return std::unexpected(line.error().io_error.value()); + } + } + + [[nodiscard]] 
uint64_t number() const override { + return reader_->number(); + } + + private: + std::unique_ptr<line::Reader> reader_; + char const separator_; + std::vector<std::string_view> line_; +}; + +} // namespace + +std::unique_ptr<Reader> open(std::unique_ptr<line::Reader> reader, + char separator) { + return std::make_unique<ReaderImpl>(std::move(reader), separator); +} + +std::unique_ptr<Reader> open(std::unique_ptr<io::Reader> reader, + char separator) { + return open(line::open(std::move(reader)), separator); +} + +} // namespace csv diff --git a/src/csv.hh b/src/csv.hh new file mode 100644 index 0000000..8c47ceb --- /dev/null +++ b/src/csv.hh @@ -0,0 +1,44 @@ +#ifndef CSV_HH +#define CSV_HH + +#include "io.hh" // IWYU pragma: export +#include "line.hh" + +#include <expected> +#include <memory> +#include <span> +#include <string_view> + +namespace csv { + +// Note that this reader is very simple, no quotes or escapes. +// Empty lines are ignored. +class Reader { + public: + virtual ~Reader() = default; + + // Returned span is only valid until next call to read. + // Returns empty span at end-of-file and only then. + [[nodiscard]] + virtual std::expected<std::span<std::string_view>, io::ReadError> read() = 0; + + // Starts at zero. Returns next line. + // So, before first read it is zero, after first read it is one. 
+ [[nodiscard]] virtual uint64_t number() const = 0; + + protected: + Reader() = default; + + Reader(Reader const&) = delete; + Reader& operator=(Reader const&) = delete; +}; + +[[nodiscard]] std::unique_ptr<Reader> open(std::unique_ptr<line::Reader> reader, + char separator = ','); + +[[nodiscard]] std::unique_ptr<Reader> open(std::unique_ptr<io::Reader> reader, + char separator = ','); + +} // namespace csv + +#endif // CSV_HH diff --git a/src/decompress.hh b/src/decompress.hh new file mode 100644 index 0000000..a15efdc --- /dev/null +++ b/src/decompress.hh @@ -0,0 +1,19 @@ +#ifndef DECOMPRESS_HH +#define DECOMPRESS_HH + +#include "io.hh" // IWYU pragma: export + +namespace decompress { + +// zlib format +std::unique_ptr<io::Reader> zlib(std::unique_ptr<io::Reader> reader); + +// gzip (.gz) format +std::unique_ptr<io::Reader> gzip(std::unique_ptr<io::Reader> reader); + +// xz format +std::unique_ptr<io::Reader> xz(std::unique_ptr<io::Reader> reader); + +} // namespace decompress + +#endif // DECOMPRESS_HH diff --git a/src/decompress_lzma.cc b/src/decompress_lzma.cc new file mode 100644 index 0000000..6baea18 --- /dev/null +++ b/src/decompress_lzma.cc @@ -0,0 +1,110 @@ +#include "decompress.hh" + +#include "buffer.hh" + +#include <lzma.h> + +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <expected> +#include <memory> +#include <optional> +#include <utility> + +namespace decompress { + +namespace { + +const size_t kBufferSizeXz = static_cast<size_t>(1024) * 1024; + +class XzReader : public io::Reader { + public: + explicit XzReader(std::unique_ptr<io::Reader> reader) + : reader_(std::move(reader)) {} + + ~XzReader() override { + if (initialized_) + lzma_end(&stream_); + } + + std::expected<size_t, io::ReadError> read(void* dst, size_t max) override { + auto err = fill(); + if (err.has_value()) + return std::unexpected(err.value()); + + stream_.next_out = reinterpret_cast<unsigned char*>(dst); + stream_.avail_out = max; + 
+ if (!initialized_) { + if (in_eof_ && buffer_->empty()) + return 0; + + lzma_mt options; + memset(&options, 0, sizeof(options)); + options.threads = std::max(static_cast<uint32_t>(1), lzma_cputhreads()); + options.memlimit_threading = lzma_physmem() / 4; + options.memlimit_stop = lzma_physmem() / 4; + auto ret = lzma_stream_decoder_mt(&stream_, &options); + if (ret != LZMA_OK) + return std::unexpected(io::ReadError::Error); + initialized_ = true; + } + + auto* const rptr = stream_.next_in; + auto ret = lzma_code(&stream_, in_eof_ ? LZMA_FINISH : LZMA_RUN); + auto got = max - stream_.avail_out; + if (ret == LZMA_STREAM_END) { + lzma_end(&stream_); + initialized_ = false; + buffer_->consume(stream_.next_in - rptr); + } else if (ret == LZMA_OK) { + if (!in_eof_) + buffer_->consume(stream_.next_in - rptr); + } else { + return std::unexpected( + ret == LZMA_DATA_ERROR + ? io::ReadError::InvalidData : io::ReadError::Error); + } + return got; + } + + std::expected<size_t, io::ReadError> skip(size_t max) override { + auto tmp = std::make_unique_for_overwrite<char[]>(max); + return read(tmp.get(), max); + } + + private: + std::optional<io::ReadError> fill() { + auto* rptr = buffer_->rptr(stream_.avail_in); + if (!in_eof_ && stream_.avail_in < kBufferSizeXz / 2) { + auto* wptr = buffer_->wptr(stream_.avail_in); + auto got = reader_->read(wptr, stream_.avail_in); + if (got.has_value()) { + buffer_->commit(got.value()); + if (got.value() == 0) + in_eof_ = true; + } else { + return got.error(); + } + rptr = buffer_->rptr(stream_.avail_in); + } + stream_.next_in = reinterpret_cast<const unsigned char*>(rptr); + return std::nullopt; + } + + std::unique_ptr<io::Reader> reader_; + bool in_eof_{false}; + std::unique_ptr<Buffer> buffer_{Buffer::fixed(kBufferSizeXz)}; + bool initialized_{false}; + lzma_stream stream_ = LZMA_STREAM_INIT; +}; + +} // namespace + +std::unique_ptr<io::Reader> xz(std::unique_ptr<io::Reader> reader) { + return 
std::make_unique<XzReader>(std::move(reader)); +} + +} // namespace decompress diff --git a/src/decompress_z.cc b/src/decompress_z.cc new file mode 100644 index 0000000..f9f87ae --- /dev/null +++ b/src/decompress_z.cc @@ -0,0 +1,120 @@ +#include "decompress.hh" + +#include "buffer.hh" + +#define ZLIB_CONST +#include <zlib.h> + +#include <algorithm> +#include <cstddef> +#include <expected> +#include <limits> +#include <memory> +#include <optional> +#include <utility> + +namespace decompress { + +namespace { + +const size_t kBufferSizeZ = static_cast<size_t>(1024) * 1024; + +class DecompressReader : public io::Reader { + public: + DecompressReader(std::unique_ptr<io::Reader> reader, bool gzip) + : reader_(std::move(reader)), gzip_(gzip) {} + + ~DecompressReader() override { + if (initialized_) + inflateEnd(&stream_); + } + + std::expected<size_t, io::ReadError> read(void* dst, size_t max) override { + auto err = fill(); + if (err.has_value()) + return std::unexpected(err.value()); + + // NOLINTNEXTLINE(misc-include-cleaner) + stream_.next_out = reinterpret_cast<Bytef*>(dst); + stream_.avail_out = max; + + if (!initialized_) { + if (in_eof_ && buffer_->empty()) + return 0; + + stream_.zalloc = Z_NULL; + stream_.zfree = Z_NULL; + stream_.opaque = Z_NULL; + if (inflateInit2(&stream_, gzip_ ? 16 : 0) != Z_OK) { + return std::unexpected(io::ReadError::Error); + } + initialized_ = true; + } + + auto* const rptr = stream_.next_in; + auto ret = inflate(&stream_, in_eof_ ? Z_FINISH : Z_NO_FLUSH); + auto got = max - stream_.avail_out; + if (ret == Z_STREAM_END) { + inflateEnd(&stream_); + initialized_ = false; + buffer_->consume(stream_.next_in - rptr); + } else if (ret == Z_OK) { + if (!in_eof_) + buffer_->consume(stream_.next_in - rptr); + } else { + return std::unexpected( + ret == Z_DATA_ERROR + ? 
io::ReadError::InvalidData : io::ReadError::Error); + } + return got; + } + + std::expected<size_t, io::ReadError> skip(size_t max) override { + auto tmp = std::make_unique_for_overwrite<char[]>(max); + return read(tmp.get(), max); + } + + private: + std::optional<io::ReadError> fill() { + size_t avail; + auto* rptr = buffer_->rptr(avail); + if (!in_eof_ && avail < kBufferSizeZ / 2) { + auto* wptr = buffer_->wptr(avail); + auto got = reader_->read(wptr, avail); + if (got.has_value()) { + buffer_->commit(got.value()); + if (got.value() == 0) + in_eof_ = true; + } else { + return got.error(); + } + rptr = buffer_->rptr(avail); + } + // NOLINTNEXTLINE(misc-include-cleaner) + stream_.next_in = reinterpret_cast<z_const Bytef*>(rptr); + stream_.avail_in = std::min( + // NOLINTNEXTLINE(misc-include-cleaner) + static_cast<size_t>(std::numeric_limits<uInt>::max()), avail); + return std::nullopt; + } + + std::unique_ptr<io::Reader> reader_; + bool const gzip_; + bool in_eof_{false}; + std::unique_ptr<Buffer> buffer_{Buffer::fixed(kBufferSizeZ)}; + bool initialized_{false}; + z_stream stream_; +}; + +} // namespace + +std::unique_ptr<io::Reader> zlib(std::unique_ptr<io::Reader> reader) { + return std::make_unique<DecompressReader>(std::move(reader), /* gzip = */ false); +} + +std::unique_ptr<io::Reader> gzip(std::unique_ptr<io::Reader> reader) { + return std::make_unique<DecompressReader>(std::move(reader), /* gzip = */ true); +} + + +} // namespace decompress diff --git a/src/gen_ugc.cc b/src/gen_ugc.cc new file mode 100644 index 0000000..e9bce11 --- /dev/null +++ b/src/gen_ugc.cc @@ -0,0 +1,317 @@ +#include "args.hh" +#include "csv.hh" +#include "decompress.hh" +#include "ugc.hh" + +#include <charconv> +#include <cstdint> +#include <expected> +#include <format> +#include <fstream> +#include <functional> +#include <iostream> +#include <map> +#include <span> +#include <string> +#include <string_view> +#include <system_error> +#include <utility> +#include <vector> + +namespace 
{ + +std::map<std::string, u::GeneralCategory, std::less<>> str2gc{ + {"Lu", u::GeneralCategory::LETTER_UPPERCASE}, + {"Ll", u::GeneralCategory::LETTER_LOWERCASE}, + {"Lt", u::GeneralCategory::LETTER_TITLECASE}, + {"Lm", u::GeneralCategory::LETTER_MODIFIER}, + {"Lo", u::GeneralCategory::LETTER_OTHER}, + + {"Mn", u::GeneralCategory::MARK_NONSPACING}, + {"Mc", u::GeneralCategory::MARK_SPACING_COMBINDING}, + {"Me", u::GeneralCategory::MARK_SPACING_ENCLOSING}, + + {"Nd", u::GeneralCategory::NUMBER_DIGIT}, + {"Nl", u::GeneralCategory::NUMBER_LETTER}, + {"No", u::GeneralCategory::NUMBER_OTHER}, + + {"Pc", u::GeneralCategory::PUNCTUATION_CONNECTOR}, + {"Pd", u::GeneralCategory::PUNCTUATION_DASH}, + {"Ps", u::GeneralCategory::PUNCTUATION_OPEN}, + {"Pe", u::GeneralCategory::PUNCTUATION_CLOSE}, + {"Pi", u::GeneralCategory::PUNCTUATION_INITIAL_QUOTE}, + {"Pf", u::GeneralCategory::PUNCTUATION_FINAL_QUOTE}, + {"Po", u::GeneralCategory::PUNCTUATION_OTHER}, + + {"Sm", u::GeneralCategory::SYMBOL_MATH}, + {"Sc", u::GeneralCategory::SYMBOL_CURRENCY}, + {"Sk", u::GeneralCategory::SYMBOL_MODIFIER}, + {"So", u::GeneralCategory::SYMBOL_OTHER}, + + {"Zs", u::GeneralCategory::SEPARATOR_SPACE}, + {"Zl", u::GeneralCategory::SEPARATOR_LINE}, + {"Zp", u::GeneralCategory::SEPARATOR_PARAGRAPH}, + + {"Cc", u::GeneralCategory::OTHER_CONTROL}, + {"Cf", u::GeneralCategory::OTHER_FORMAT}, + {"Cs", u::GeneralCategory::OTHER_SURROGATE}, + {"Co", u::GeneralCategory::OTHER_PRIVATE_USE}, + {"Cn", u::GeneralCategory::OTHER_UNASSIGNED}, +}; + +void print_header(std::ostream& out, std::string_view prefix) { + out << "#include \"ugc.hh\"\n" + << "\n" + << "#include <array>\n" + << "#include <cstddef>\n" + << "#include <cstdint>\n" + << "\n" + << "namespace u {\n" + << "\n" + << "extern GeneralCategory " << prefix << "lookup_gc(uint32_t code) {\n"; +} + +void print_body(std::ostream& out, + std::map<uint32_t, u::GeneralCategory> const& data) { + std::vector<uint32_t> codes; + std::vector<u::GeneralCategory> 
categories; + + auto it = data.begin(); + codes.emplace_back(it->first); + categories.emplace_back(it->second); + + uint32_t next = it->first + 1; + + for (++it; it != data.end(); ++it) { + if (it->first == next && categories.back() == it->second) { + ++next; + } else { + codes.emplace_back(next - 1); + codes.emplace_back(it->first); + categories.emplace_back(it->second); + next = it->first + 1; + } + } + + codes.emplace_back(next - 1); + + out << " static std::array<uint32_t, " << codes.size() << "> codes{"; + for (auto code : codes) { + out << code << ","; + } + out << " };\n"; + out << " static std::array<uint8_t, " << categories.size() + << "> categories{"; + for (auto category : categories) { + out << static_cast<uint16_t>(category) << ","; + } + out << "};\n"; + + out << " size_t low = 0;\n" + << " size_t high = " << (codes.size() / 2) << ";\n" + << " while (low < high) {\n" + << " size_t m = (low + high) / 2;\n" + << " uint32_t start = codes[m * 2];\n" + << " if (code < start) {\n" + << " high = m;\n" + << " } else {\n" + << " uint32_t end = codes[(m * 2) + 1];\n" + << " if (code <= end) {\n" + << " return static_cast<u::GeneralCategory>(categories[m]);\n" + << " }\n" + << " low = m + 1;\n" + << " }\n" + << " }\n" + << " return u::GeneralCategory::OTHER_UNASSIGNED;\n"; +} + +void print_footer(std::ostream& out, std::string_view /* prefix */) { + out << "}\n" + << "\n" + << "} // namespace u\n"; +} + +std::string_view ioerr2str(io::OpenError error) { + switch (error) { + case io::OpenError::NoSuchFile: + return "No such file"; + case io::OpenError::NoAccess: + return "No access"; + case io::OpenError::Error: + return "Fatal error"; + } + std::unreachable(); +} + +std::string_view ioerr2str(io::ReadError error) { + switch (error) { + case io::ReadError::InvalidData: + return "Invalid (compressed) data"; + case io::ReadError::Error: + return "Fatal error"; + } + std::unreachable(); +} + +std::expected<std::pair<uint32_t, u::GeneralCategory>, std::string> 
parse_row( + std::span<std::string_view> row) { + // [code];[name];[gc];[cc];[bc];[decomposition];[nv-dec];[nv-dig];[nv-num];[bm];[alias];;[upper case];[lower case];[title case] + if (row.size() != 15) { + return std::unexpected(std::format("Invalid row ({} columns)", row.size())); + } + auto code_col = row[0]; + auto category_col = row[2]; + + uint32_t code; + auto [ptr, ec] = std::from_chars(code_col.data(), + code_col.data() + code_col.size(), code, + /* base */ 16); + if (ec != std::errc() || ptr != code_col.data() + code_col.size()) { + return std::unexpected(std::format("Invalid code value {}", code_col)); + } + u::GeneralCategory category; + auto it = str2gc.find(category_col); + if (it == str2gc.end()) { + return std::unexpected(std::format("Invalid general category {}", + category_col)); + } + category = it->second; + + return std::make_pair(code, category); +} + +std::expected<std::map<uint32_t, u::GeneralCategory>, std::string> read( + std::string_view filename) { + auto maybe_reader = io::open(std::string(filename)); + if (!maybe_reader.has_value()) { + return std::unexpected(std::format( + "Unable to open {} for reading: {}", + filename, ioerr2str(maybe_reader.error()))); + } + auto reader = std::move(maybe_reader.value()); + if (filename.ends_with(".gz")) { + reader = decompress::gzip(std::move(reader)); + } else if (filename.ends_with(".xz")) { + reader = decompress::xz(std::move(reader)); + } + + std::map<uint32_t, u::GeneralCategory> ret; + auto csv_reader = csv::open(std::move(reader), ';'); + while (true) { + auto row = csv_reader->read(); + if (!row.has_value()) { + return std::unexpected(std::format( + "{}:{}: Error reading file: {}", + filename, csv_reader->number(), ioerr2str(row.error()))); + } + if (row->empty()) + break; + + auto pair = parse_row(row.value()); + if (!pair.has_value()) { + return std::unexpected(std::format( + "{}:{}: {}", filename, csv_reader->number(), pair.error())); + } + auto name_col = (*row)[1]; + + if 
(name_col.ends_with(", First>")) { + std::string prefix(name_col.substr(0, name_col.size() - 8)); + row = csv_reader->read(); + if (!row.has_value()) { + return std::unexpected(std::format( + "{}:{}: Error reading file: {}", + filename, csv_reader->number(), ioerr2str(row.error()))); + } + + auto second_pair = parse_row(row.value()); + if (!second_pair.has_value()) { /* validate the ", Last>" row itself; an empty row returned at end-of-file is rejected by parse_row and also ends up here */ + return std::unexpected(std::format( + "{}:{}: {}", filename, csv_reader->number(), second_pair.error())); + } + + name_col = (*row)[1]; + if (name_col.ends_with(", Last>") && + name_col.substr(0, name_col.size() - 7) == prefix) { + if (pair->second != second_pair->second) { + return std::unexpected(std::format( + "{}:{}: Invalid range, general category doesn't match", + filename, csv_reader->number())); + } + + for (uint32_t c = pair->first; c <= second_pair->first; ++c) { + auto emplace_ret = ret.emplace(c, pair->second); + if (!emplace_ret.second) { + return std::unexpected(std::format( + "{}:{}: Duplicate value for {:#08x}", + filename, csv_reader->number(), c)); + } + } + } else { + return std::unexpected(std::format( + "{}:{}: Invalid range, {} doesn't match {}", + filename, csv_reader->number(), prefix, name_col)); + } + } else { + auto emplace_ret = ret.emplace(std::move(pair.value())); + if (!emplace_ret.second) { + return std::unexpected(std::format( + "{}:{}: Duplicate value for {:#08x}", + filename, csv_reader->number(), emplace_ret.first->first)); + } + } + } + + return ret; +} + +} // namespace + +int main(int argc, char** argv) { + auto args = Args::create(); + auto opt_help = args->option('h', "help", "display this text and exit"); + auto opt_prefix = + args->option_argument('p', "prefix", "ARG", "Prefix for exported method"); + std::vector<std::string_view> arguments; + if (!args->run(argc, argv, &arguments)) { + std::cerr << "Try `gen_u --help` for usage\n"; + return 1; + } + if (opt_help->is_set()) { + std::cout << "Usage: `gen_u [OPTIONS...] 
UnicodeData [OUTPUT]`\n" + << "Generates a method for getting the general category for a " + << "code point.\n" + << "\n"; + args->print_help(std::cout); + return 0; + } + if (!opt_prefix->is_set()) { + std::cerr << "No prefix given.\n" + << "Try `gen_u --help` for usage\n"; + return 1; + } + auto prefix = opt_prefix->argument(); + if (arguments.empty() || arguments.size() > 2) { + std::cerr << "Expecting one or two argument. No more, no less.\n" + << "Try `gen_u --help` for usage\n"; + return 1; + } + + auto general_categories = read(arguments[0]); + if (!general_categories.has_value()) { + std::cerr << general_categories.error() << '\n'; + return 1; + } + + if (arguments.size() < 2 || arguments[1] == "-") { + print_header(std::cout, prefix); + print_body(std::cout, general_categories.value()); + print_footer(std::cout, prefix); + } else { + std::fstream out{std::string(arguments[1]), + std::fstream::trunc | std::fstream::out}; + print_header(out, prefix); + print_body(out, general_categories.value()); + print_footer(out, prefix); + } + return 0; +} diff --git a/src/io.cc b/src/io.cc new file mode 100644 index 0000000..baf162a --- /dev/null +++ b/src/io.cc @@ -0,0 +1,238 @@ +#include "io.hh" + +#include "unique_fd.hh" + +#include <algorithm> +#include <cerrno> +#include <cstdio> +#include <cstring> +#include <expected> +#include <fcntl.h> +#include <limits> +#include <memory> +#include <optional> +#include <string> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <utility> + +namespace io { + +namespace { + +class BasicReader : public Reader { + public: + explicit BasicReader(unique_fd fd) + : fd_(std::move(fd)) { + } + + [[nodiscard]] + std::expected<size_t, ReadError> read(void* dst, size_t max) override { + ssize_t ret = ::read( + fd_.get(), dst, std::min(static_cast<size_t>( + std::numeric_limits<ssize_t>::max()), max)); + if (ret < 0) { + switch (errno) { + case EINTR: + return read(dst, max); + default: + 
return std::unexpected(ReadError::Error); + } + } + offset_ += ret; + return ret; + } + + [[nodiscard]] + std::expected<size_t, ReadError> skip(size_t max) override { + off_t ret; + if (sizeof(size_t) > sizeof(off_t)) { + // NOLINTNEXTLINE(bugprone-narrowing-conversions) + ret = lseek(fd_.get(), std::min(static_cast<size_t>( + std::numeric_limits<off_t>::max()), max), SEEK_CUR); + } else { + ret = lseek(fd_.get(), static_cast<off_t>(max), SEEK_CUR); + } + if (ret < 0) { + return std::unexpected(ReadError::Error); + } + // Don't want skip to go past (cached) file end. + if (!size_.has_value() || ret > size_.value()) { + // When going past end, double check that it still is the end. + off_t ret2 = lseek(fd_.get(), 0, SEEK_END); + if (ret2 < 0) { + // We're screwed, but try to go back to original position and then + // return error. + size_.reset(); + lseek(fd_.get(), offset_, SEEK_SET); + return std::unexpected(ReadError::Error); + } + size_ = ret2; + if (ret > ret2) { + auto distance = ret2 - offset_; + offset_ = ret2; + return distance; + } + // Seek back to where we should be + if (lseek(fd_.get(), ret, SEEK_SET) < 0) { + return std::unexpected(ReadError::Error); + } + } + auto distance = ret - offset_; + offset_ = ret; + return distance; + } + + private: + unique_fd fd_; + off_t offset_{0}; + std::optional<off_t> size_; +}; + +class MemoryReader : public Reader { + public: + MemoryReader(void* ptr, size_t size) + : ptr_(ptr), size_(size) { + } + + [[nodiscard]] + std::expected<size_t, ReadError> read(void* dst, size_t max) override { + size_t avail = size_ - offset_; + size_t ret = std::min(max, avail); + memcpy(dst, reinterpret_cast<char*>(ptr_) + offset_, ret); + offset_ += ret; + return ret; + } + + [[nodiscard]] + std::expected<size_t, ReadError> skip(size_t max) override { + size_t avail = size_ - offset_; + size_t ret = std::min(max, avail); + offset_ += ret; + return ret; + } + + protected: + void* ptr_; + size_t const size_; + + private: + size_t 
offset_{0}; +}; + +class MmapReader : public MemoryReader { + public: + MmapReader(unique_fd fd, void* ptr, size_t size) + : MemoryReader(ptr, size), fd_(std::move(fd)) { + } + + ~MmapReader() override { + munmap(ptr_, size_); + } + + private: + unique_fd fd_; +}; + +class StringReader : public MemoryReader { + public: + explicit StringReader(std::string data) + : MemoryReader(nullptr, data.size()), data_(std::move(data)) { + ptr_ = data_.data(); + } + + private: + std::string data_; +}; + +} // namespace + +std::expected<size_t, ReadError> Reader::read(std::string& str) { + return read(str.data(), str.size()); +} + +std::expected<size_t, ReadError> Reader::repeat_read(void* dst, size_t max) { + auto ret = read(dst, max); + if (!ret.has_value() || ret.value() == 0 || ret.value() == max) + return ret; + + char* d = reinterpret_cast<char*>(dst); + size_t offset = ret.value(); + while (true) { + ret = read(d + offset, max - offset); + if (!ret.has_value() || ret.value() == 0) + break; + offset += ret.value(); + if (offset == max) + break; + } + return offset; +} + +std::expected<size_t, ReadError> Reader::repeat_read(std::string& str) { + return repeat_read(str.data(), str.size()); +} + +std::expected<size_t, ReadError> Reader::repeat_skip(size_t max) { + auto ret = skip(max); + if (!ret.has_value() || ret.value() == 0 || ret.value() == max) + return ret; + + size_t offset = ret.value(); + while (true) { + ret = skip(max - offset); + if (!ret.has_value() || ret.value() == 0) + break; + offset += ret.value(); + if (offset == max) + break; + } + return offset; +} + +std::expected<std::unique_ptr<Reader>, OpenError> open( + const std::string& file_path) { + return openat(AT_FDCWD, file_path); +} + +std::expected<std::unique_ptr<Reader>, OpenError> openat( + int dirfd, const std::string& file_path) { + unique_fd fd(::openat(dirfd, file_path.c_str(), O_RDONLY)); + if (fd) { + struct stat buf; + if (fstat(fd.get(), &buf) == 0) { + if (std::cmp_less_equal(buf.st_size, + 
std::numeric_limits<size_t>::max())) { + auto size = static_cast<size_t>(buf.st_size); + void* ptr = mmap(nullptr, size, PROT_READ, MAP_PRIVATE, fd.get(), 0); + if (ptr != MAP_FAILED) { + return std::make_unique<MmapReader>(std::move(fd), ptr, size); + } + } + } + return std::make_unique<BasicReader>(std::move(fd)); + } + OpenError err; + switch (errno) { + case EINTR: + return openat(dirfd, file_path); + case EACCES: + err = OpenError::NoAccess; + break; + case ENOENT: + err = OpenError::NoSuchFile; + break; + default: + err = OpenError::Error; + break; + } + return std::unexpected(err); +} + +std::unique_ptr<Reader> memory(std::string data) { + return std::make_unique<StringReader>(std::move(data)); +} + +} // namespace io diff --git a/src/io.hh b/src/io.hh new file mode 100644 index 0000000..315d0bb --- /dev/null +++ b/src/io.hh @@ -0,0 +1,52 @@ +#ifndef IO_HH +#define IO_HH + +#include <cstddef> +#include <expected> +#include <memory> +#include <string> + +namespace io { + +enum class ReadError { + Error, + InvalidData, // Used by decompress and such +}; + +enum class OpenError { + NoSuchFile, + NoAccess, + Error, +}; + +class Reader { + public: + virtual ~Reader() = default; + + [[nodiscard]] virtual std::expected<size_t, ReadError> read(void* dst, + size_t max) = 0; + [[nodiscard]] virtual std::expected<size_t, ReadError> skip(size_t max) = 0; + + [[nodiscard]] std::expected<size_t, ReadError> read(std::string& str); + + [[nodiscard]] std::expected<size_t, ReadError> repeat_read(void* dst, + size_t max); + [[nodiscard]] std::expected<size_t, ReadError> repeat_read(std::string& str); + [[nodiscard]] std::expected<size_t, ReadError> repeat_skip(size_t max); + + protected: + Reader() = default; + + Reader(Reader const&) = delete; + Reader& operator=(Reader const&) = delete; +}; + +[[nodiscard]] std::expected<std::unique_ptr<Reader>, OpenError> open( + const std::string& file_path); +[[nodiscard]] std::expected<std::unique_ptr<Reader>, OpenError> openat( + int 
dirfd, const std::string& file_path); +[[nodiscard]] std::unique_ptr<Reader> memory(std::string data); + +} // namespace io + +#endif // IO_HH diff --git a/src/line.cc b/src/line.cc new file mode 100644 index 0000000..2eeb116 --- /dev/null +++ b/src/line.cc @@ -0,0 +1,133 @@ +#include "line.hh" + +#include "check.hh" + +#include <algorithm> +#include <cassert> +#include <cstdint> +#include <cstring> +#include <expected> +#include <memory> +#include <string_view> +#include <utility> + +namespace line { + +namespace { + +const char kLineTerminators[] = "\r\n"; + +class ReaderImpl : public Reader { + public: + ReaderImpl(std::unique_ptr<io::Reader> reader, size_t max_len) + : reader_(std::move(reader)), max_len_(max_len), + buffer_(std::make_unique_for_overwrite<char[]>( + check::add(max_len, static_cast<size_t>(2)))), + rptr_(buffer_.get()), wptr_(buffer_.get()), search_(rptr_), + end_(buffer_.get() + check::add(max_len, static_cast<size_t>(2))) {} + + [[nodiscard]] std::expected<std::string_view, ReadError> read() override { + while (true) { + search_ = std::find_first_of(search_, wptr_, + kLineTerminators, kLineTerminators + 2); + if (search_ < wptr_) { + if (std::cmp_greater(search_ - rptr_, max_len_)) { + return line(max_len_, 0); + } + + size_t tlen; + if (*search_ == '\n') { + tlen = 1; + } else { + if (search_ + 1 == wptr_) { + make_space_if_needed(); + auto got = fill(); + if (got.has_value()) { + if (got.value() == 0) { + return line(search_ - rptr_, 1); + } + } else { + return std::unexpected(ReadError(got.error())); + } + } + if (search_[1] == '\n') { + tlen = 2; + } else { + tlen = 1; + } + } + return line(search_ - rptr_, tlen); + } + if (std::cmp_greater_equal(wptr_ - rptr_, max_len_)) { + return line(max_len_, 0); + } + + make_space_if_needed(); + auto got = fill(); + if (got.has_value()) { + if (got.value() == 0) { + if (rptr_ == wptr_) { + return std::unexpected(ReadError()); + } + return line(wptr_ - rptr_, 0); + } + } else { + return 
std::unexpected(ReadError(got.error())); + } + } + } + + [[nodiscard]] uint64_t number() const override { return number_; } + + private: + std::string_view line(size_t len, size_t terminator_len) { + assert(len <= max_len_); + auto ret = std::string_view(rptr_, len); + rptr_ += len + terminator_len; + search_ = rptr_; + ++number_; + return ret; + } + + void make_space_if_needed() { + size_t free = rptr_ - buffer_.get(); + if (free == 0) return; + size_t avail = end_ - wptr_; + if (avail > 1024) return; + memmove(buffer_.get(), rptr_, wptr_ - rptr_); + search_ -= free; + wptr_ -= free; + rptr_ = buffer_.get(); + } + + std::expected<size_t, io::ReadError> fill() { + auto ret = reader_->read(wptr_, end_ - wptr_); + if (ret.has_value()) + wptr_ += ret.value(); + return ret; + } + + std::unique_ptr<io::Reader> reader_; + size_t const max_len_; + uint64_t number_{0}; + std::unique_ptr<char[]> buffer_; + char* rptr_; + char* wptr_; + char* search_; + char* const end_; +}; + +} // namespace + +ReadError::ReadError() + : eof(true) {} + +ReadError::ReadError(io::ReadError error) + : eof(false), io_error(error) {} + +std::unique_ptr<Reader> open(std::unique_ptr<io::Reader> reader, + size_t max_len) { + return std::make_unique<ReaderImpl>(std::move(reader), max_len); +} + +} // namespace line diff --git a/src/line.hh b/src/line.hh new file mode 100644 index 0000000..94e3646 --- /dev/null +++ b/src/line.hh @@ -0,0 +1,44 @@ +#ifndef LINE_HH +#define LINE_HH + +#include "io.hh" // IWYU pragma: export + +#include <cstddef> +#include <expected> +#include <memory> +#include <optional> +#include <string_view> + +namespace line { + +struct ReadError { + bool eof; + std::optional<io::ReadError> io_error; + + ReadError(); + explicit ReadError(io::ReadError error); +}; + +class Reader { + public: + virtual ~Reader() = default; + + // Returned view is only valid until next call to read. + [[nodiscard]] virtual std::expected<std::string_view, ReadError> read() = 0; + // Starts at zero. 
Returns next line. + // So, before first read it is zero, after first read it is one. + [[nodiscard]] virtual uint64_t number() const = 0; + + protected: + Reader() = default; + + Reader(Reader const&) = delete; + Reader& operator=(Reader const&) = delete; +}; + +[[nodiscard]] std::unique_ptr<Reader> open(std::unique_ptr<io::Reader> reader, + size_t max_len = 8192); + +} // namespace line + +#endif // LINE_HH diff --git a/src/str.cc b/src/str.cc new file mode 100644 index 0000000..f81617d --- /dev/null +++ b/src/str.cc @@ -0,0 +1,34 @@ +#include "str.hh" + +#include <cstddef> +#include <string_view> +#include <vector> + +namespace str { + +void split(std::string_view str, std::vector<std::string_view>& out, + char separator, bool keep_empty) { + out.clear(); + + size_t offset = 0; + while (true) { + auto next = str.find(separator, offset); + if (next == std::string_view::npos) { + if (keep_empty || offset < str.size()) + out.push_back(str.substr(offset)); + break; + } + if (keep_empty || offset < next) + out.push_back(str.substr(offset, next - offset)); + offset = next + 1; + } +} + +std::vector<std::string_view> split(std::string_view str, + char separator, bool keep_empty) { + std::vector<std::string_view> vec; + split(str, vec, separator, keep_empty); + return vec; +} + +} // namespace str diff --git a/src/str.hh b/src/str.hh new file mode 100644 index 0000000..58d5d32 --- /dev/null +++ b/src/str.hh @@ -0,0 +1,18 @@ +#ifndef STR_HH +#define STR_HH + +#include <string_view> +#include <vector> + +namespace str { + +void split(std::string_view str, std::vector<std::string_view>& out, + char separator = ' ', bool keep_empty = false); + +[[nodiscard]] std::vector<std::string_view> split(std::string_view str, + char separator = ' ', + bool keep_empty = false); + +} // namespace str + +#endif // STR_HH diff --git a/src/u.cc b/src/u.cc new file mode 100644 index 0000000..3c06ba8 --- /dev/null +++ b/src/u.cc @@ -0,0 +1,46 @@ +#include "u.hh" + +#include <cstdint> 
+#include <utility> + +namespace u { + +// These are generated by gen_ugc +GeneralCategory u6_2_0_lookup_gc(uint32_t code); +GeneralCategory u8_0_0_lookup_gc(uint32_t code); +GeneralCategory u10_0_0_lookup_gc(uint32_t code); +GeneralCategory u11_0_0_lookup_gc(uint32_t code); +GeneralCategory u12_1_0_lookup_gc(uint32_t code); +GeneralCategory u13_0_0_lookup_gc(uint32_t code); +GeneralCategory u14_0_0_lookup_gc(uint32_t code); +GeneralCategory u15_0_0_lookup_gc(uint32_t code); +GeneralCategory u15_1_0_lookup_gc(uint32_t code); +GeneralCategory u16_0_0_lookup_gc(uint32_t code); + +GeneralCategory lookup_gc(uint32_t code, Version version) { + switch (version) { + case Version::u6_2_0: + return u6_2_0_lookup_gc(code); + case Version::u8_0_0: + return u8_0_0_lookup_gc(code); + case Version::u10_0_0: + return u10_0_0_lookup_gc(code); + case Version::u11_0_0: + return u11_0_0_lookup_gc(code); + case Version::u12_1_0: + return u12_1_0_lookup_gc(code); + case Version::u13_0_0: + return u13_0_0_lookup_gc(code); + case Version::u14_0_0: + return u14_0_0_lookup_gc(code); + case Version::u15_0_0: + return u15_0_0_lookup_gc(code); + case Version::u15_1_0: + return u15_1_0_lookup_gc(code); + case Version::u16_0_0: + return u16_0_0_lookup_gc(code); + } + std::unreachable(); +} + +} // namespace u @@ -1,6 +1,8 @@ #ifndef U_HH #define U_HH +#include "ugc.hh" // IWYU pragma: export + namespace u { enum class ReadError : uint8_t { @@ -14,6 +16,22 @@ enum class ReadErrorReplace : uint8_t { Incomplete, // Too few bytes }; +enum class Version : uint8_t { + u6_2_0, + u8_0_0, + u10_0_0, + u11_0_0, + u12_1_0, + u13_0_0, + u14_0_0, + u15_0_0, + u15_1_0, + u16_0_0, + LATEST = u16_0_0, +}; + +GeneralCategory lookup_gc(uint32_t code, Version version = Version::LATEST); + } // namespace u #endif // U_HH diff --git a/src/ugc.hh b/src/ugc.hh new file mode 100644 index 0000000..c49d50f --- /dev/null +++ b/src/ugc.hh @@ -0,0 +1,49 @@ +#ifndef UGC_HH +#define UGC_HH + +#include <cstdint> + +namespace 
u { + +enum class GeneralCategory : uint8_t { + LETTER_UPPERCASE, + LETTER_LOWERCASE, + LETTER_TITLECASE, + LETTER_MODIFIER, + LETTER_OTHER, + + MARK_NONSPACING, + MARK_SPACING_COMBINDING, + MARK_SPACING_ENCLOSING, + + NUMBER_DIGIT, + NUMBER_LETTER, + NUMBER_OTHER, + + PUNCTUATION_CONNECTOR, + PUNCTUATION_DASH, + PUNCTUATION_OPEN, + PUNCTUATION_CLOSE, + PUNCTUATION_INITIAL_QUOTE, + PUNCTUATION_FINAL_QUOTE, + PUNCTUATION_OTHER, + + SYMBOL_MATH, + SYMBOL_CURRENCY, + SYMBOL_MODIFIER, + SYMBOL_OTHER, + + SEPARATOR_SPACE, + SEPARATOR_LINE, + SEPARATOR_PARAGRAPH, + + OTHER_CONTROL, + OTHER_FORMAT, + OTHER_SURROGATE, + OTHER_PRIVATE_USE, + OTHER_UNASSIGNED, +}; + +} // namespace u + +#endif // UGC_HH diff --git a/src/unique_fd.cc b/src/unique_fd.cc new file mode 100644 index 0000000..135a449 --- /dev/null +++ b/src/unique_fd.cc @@ -0,0 +1,9 @@ +#include "unique_fd.hh" + +#include <unistd.h> + +void unique_fd::reset(int fd) { + if (fd_ != -1) + close(fd_); + fd_ = fd; +} diff --git a/src/unique_fd.hh b/src/unique_fd.hh new file mode 100644 index 0000000..189d513 --- /dev/null +++ b/src/unique_fd.hh @@ -0,0 +1,45 @@ +#ifndef UNIQUE_FD_HH +#define UNIQUE_FD_HH + +class unique_fd { + public: + constexpr unique_fd() + : fd_(-1) {} + explicit constexpr unique_fd(int fd) + : fd_(fd) {} + unique_fd(unique_fd& fd) = delete; + unique_fd& operator=(unique_fd& fd) = delete; + unique_fd(unique_fd&& fd) + : fd_(fd.release()) {} + unique_fd& operator=(unique_fd&& fd) { + reset(fd.release()); + return *this; + } + ~unique_fd() { + reset(); + } + + bool operator==(unique_fd const& fd) const { + return get() == fd.get(); + } + bool operator!=(unique_fd const& fd) const { + return get() != fd.get(); + } + + int get() const { return fd_; } + explicit operator bool() const { return fd_ != -1; } + int operator*() const { return fd_; } + + int release() { + int ret = fd_; + fd_ = -1; + return ret; + } + + void reset(int fd = -1); + + private: + int fd_; +}; + +#endif // UNIQUE_FD_HH diff --git 
a/test/buffer.cc b/test/buffer.cc new file mode 100644 index 0000000..869e781 --- /dev/null +++ b/test/buffer.cc @@ -0,0 +1,65 @@ +#include <gtest/gtest.h> + +#include "buffer.hh" + +#include <cstring> + +TEST(buffer_fixed, empty) { + auto buffer = Buffer::fixed(10); + EXPECT_TRUE(buffer->empty()); + EXPECT_FALSE(buffer->full()); + size_t avail; + buffer->rptr(avail); + EXPECT_EQ(0, avail); + buffer->wptr(avail); + EXPECT_EQ(10, avail); +} + +TEST(buffer_dynamic, empty) { + auto buffer = Buffer::dynamic(10, 100); + EXPECT_TRUE(buffer->empty()); + EXPECT_FALSE(buffer->full()); + size_t avail; + buffer->rptr(avail); + EXPECT_EQ(0, avail); + buffer->wptr(avail); + EXPECT_EQ(10, avail); +} + +TEST(buffer_fixed, write_read) { + auto buffer = Buffer::fixed(10); + size_t avail; + auto* wptr = buffer->wptr(avail); + EXPECT_EQ(10, avail); + memcpy(wptr, "Hello", 6); + buffer->commit(6); + EXPECT_FALSE(buffer->empty()); + auto* rptr = buffer->rptr(avail); + EXPECT_EQ(6, avail); + EXPECT_STREQ("Hello", reinterpret_cast<const char*>(rptr)); + buffer->consume(3); + rptr = buffer->rptr(avail); + EXPECT_EQ(3, avail); + EXPECT_STREQ("lo", reinterpret_cast<const char*>(rptr)); + buffer->consume(3); + EXPECT_TRUE(buffer->empty()); +} + +TEST(buffer_dynamic, write_read) { + auto buffer = Buffer::dynamic(10, 100); + size_t avail; + auto* wptr = buffer->wptr(avail); + EXPECT_EQ(10, avail); + memcpy(wptr, "Hello", 6); + buffer->commit(6); + EXPECT_FALSE(buffer->empty()); + auto* rptr = buffer->rptr(avail); + EXPECT_EQ(6, avail); + EXPECT_STREQ("Hello", reinterpret_cast<const char*>(rptr)); + buffer->consume(3); + rptr = buffer->rptr(avail); + EXPECT_EQ(3, avail); + EXPECT_STREQ("lo", reinterpret_cast<const char*>(rptr)); + buffer->consume(3); + EXPECT_TRUE(buffer->empty()); +} diff --git a/test/csv.cc b/test/csv.cc new file mode 100644 index 0000000..49fe540 --- /dev/null +++ b/test/csv.cc @@ -0,0 +1,90 @@ +#include <gtest/gtest.h> + +#include "csv.hh" + +TEST(csv, empty) { + auto csv = 
csv::open(io::memory("")); + auto line = csv->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ(0, line.value().size()); +} + +TEST(csv, one_value) { + auto csv = csv::open(io::memory("foo")); + auto line = csv->read(); + ASSERT_TRUE(line.has_value()); + ASSERT_EQ(1, line.value().size()); + EXPECT_EQ("foo", line.value()[0]); + line = csv->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ(0, line.value().size()); +} + +TEST(csv, two_value) { + auto csv = csv::open(io::memory("foo,bar")); + auto line = csv->read(); + ASSERT_TRUE(line.has_value()); + ASSERT_EQ(2, line.value().size()); + EXPECT_EQ("foo", line.value()[0]); + EXPECT_EQ("bar", line.value()[1]); + line = csv->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ(0, line.value().size()); +} + +TEST(csv, empty_value) { + auto csv = csv::open(io::memory("foo,,bar,")); + auto line = csv->read(); + ASSERT_TRUE(line.has_value()); + ASSERT_EQ(4, line.value().size()); + EXPECT_EQ("foo", line.value()[0]); + EXPECT_EQ("", line.value()[1]); + EXPECT_EQ("bar", line.value()[2]); + EXPECT_EQ("", line.value()[3]); + line = csv->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ(0, line.value().size()); +} + +TEST(csv, many_lines) { + auto csv = csv::open(io::memory("foo,bar\nfoobar\nf,o,o,")); + auto line = csv->read(); + ASSERT_TRUE(line.has_value()); + ASSERT_EQ(2, line.value().size()); + EXPECT_EQ("foo", line.value()[0]); + EXPECT_EQ("bar", line.value()[1]); + line = csv->read(); + ASSERT_TRUE(line.has_value()); + ASSERT_EQ(1, line.value().size()); + EXPECT_EQ("foobar", line.value()[0]); + line = csv->read(); + ASSERT_TRUE(line.has_value()); + ASSERT_EQ(4, line.value().size()); + EXPECT_EQ("f", line.value()[0]); + EXPECT_EQ("o", line.value()[1]); + EXPECT_EQ("o", line.value()[2]); + EXPECT_EQ("", line.value()[3]); + line = csv->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ(0, line.value().size()); +} + +TEST(csv, blank_lines) { + auto csv = csv::open(io::memory("foo,bar\n\nbar,foo\n\n")); + auto line = 
csv->read(); + ASSERT_TRUE(line.has_value()); + ASSERT_EQ(2, line.value().size()); + EXPECT_EQ("foo", line.value()[0]); + EXPECT_EQ("bar", line.value()[1]); + EXPECT_EQ(1, csv->number()); + line = csv->read(); + ASSERT_TRUE(line.has_value()); + ASSERT_EQ(2, line.value().size()); + EXPECT_EQ("bar", line.value()[0]); + EXPECT_EQ("foo", line.value()[1]); + EXPECT_EQ(3, csv->number()); + line = csv->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ(0, line.value().size()); + EXPECT_EQ(4, csv->number()); +} diff --git a/test/decompress.cc b/test/decompress.cc new file mode 100644 index 0000000..35c4477 --- /dev/null +++ b/test/decompress.cc @@ -0,0 +1,72 @@ +#include <gtest/gtest.h> + +#include "decompress.hh" + +TEST(z_decompress, empty) { + static const unsigned char data[] = { + 0x1f, 0x8b, 0x08, 0x08, 0x33, 0xd4, 0xbd, 0x68, + 0x02, 0x03, 0x65, 0x6d, 0x70, 0x74, 0x79, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00 + }; + auto reader = decompress::gzip(io::memory(std::string( + reinterpret_cast<const char*>(data), sizeof(data)))); + char buf[10]; + auto got = reader->read(buf, sizeof(buf)); + ASSERT_TRUE(got.has_value()); + EXPECT_EQ(0, got.value()); +} + +TEST(z_decompress, hello) { + static const unsigned char data[] = { + 0x1f, 0x8b, 0x08, 0x08, 0xf7, 0xd5, 0xbd, 0x68, + 0x02, 0x03, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x00, + 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x07, 0x00, 0x82, + 0x89, 0xd1, 0xf7, 0x05, 0x00, 0x00, 0x00, + }; + auto reader = decompress::gzip(io::memory(std::string( + reinterpret_cast<const char*>(data), sizeof(data)))); + char buf[10]; + auto got = reader->read(buf, sizeof(buf)); + ASSERT_TRUE(got.has_value()); + EXPECT_EQ(5, got.value()); + buf[5] = '\0'; + EXPECT_STREQ("Hello", buf); +} + +TEST(xz_decompress, empty) { + static const unsigned char data[] = { + 0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00, 0x00, 0x04, + 0xe6, 0xd6, 0xb4, 0x46, 0x00, 0x00, 0x00, 0x00, + 0x1c, 0xdf, 0x44, 0x21, 0x1f, 0xb6, 0xf3, 0x7d, + 0x01, 0x00, 0x00, 0x00, 
0x00, 0x04, 0x59, 0x5a + }; + auto reader = decompress::xz(io::memory(std::string( + reinterpret_cast<const char*>(data), sizeof(data)))); + char buf[10]; + auto got = reader->read(buf, sizeof(buf)); + ASSERT_TRUE(got.has_value()); + EXPECT_EQ(0, got.value()); +} + +TEST(xz_decompress, hello) { + static const unsigned char data[] = { + 0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00, 0x00, 0x04, + 0xe6, 0xd6, 0xb4, 0x46, 0x04, 0xc0, 0x09, 0x05, + 0x21, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x76, 0xe9, 0x07, 0x70, + 0x01, 0x00, 0x04, 0x48, 0x65, 0x6c, 0x6c, 0x6f, + 0x00, 0x00, 0x00, 0x00, 0xc8, 0xac, 0x7b, 0xc8, + 0x3b, 0x5c, 0xcf, 0x51, 0x00, 0x01, 0x25, 0x05, + 0x43, 0x91, 0x1f, 0xb8, 0x1f, 0xb6, 0xf3, 0x7d, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x04, 0x59, 0x5a, + }; + auto reader = decompress::xz(io::memory(std::string( + reinterpret_cast<const char*>(data), sizeof(data)))); + char buf[10]; + auto got = reader->read(buf, sizeof(buf)); + ASSERT_TRUE(got.has_value()); + EXPECT_EQ(5, got.value()); + buf[5] = '\0'; + EXPECT_STREQ("Hello", buf); +} diff --git a/test/io.cc b/test/io.cc new file mode 100644 index 0000000..ad192ed --- /dev/null +++ b/test/io.cc @@ -0,0 +1,142 @@ +#include <gtest/gtest.h> + +#include "io.hh" + +#include <cstdlib> +#include <cerrno> +#include <dirent.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <unistd.h> + +namespace { + +bool remove_recursive(int fd) { + auto* dir = fdopendir(fd); + if (!dir) return false; + while (auto* ent = readdir(dir)) { + if (ent->d_name[0] == '.') { + if (ent->d_name[1] == '\0') continue; + if (ent->d_name[1] == '.' 
&& ent->d_name[2] == '\0') continue; + } + bool is_dir; + if (ent->d_type == DT_DIR) { + is_dir = true; + } else if (ent->d_type == DT_UNKNOWN) { + struct stat buf; + if (fstatat(dirfd(dir), ent->d_name, &buf, AT_SYMLINK_NOFOLLOW) == 0) { + is_dir = S_ISDIR(buf.st_mode); + } else { + if (errno != ENOENT) { + closedir(dir); + return false; + } + is_dir = false; + } + } else { + is_dir = false; + } + + if (is_dir) { + int fd2 = openat(dirfd(dir), ent->d_name, O_RDONLY | O_DIRECTORY); + if (fd2 == -1) { + if (errno != ENOENT) { + closedir(dir); + return false; + } + } else { + if (!remove_recursive(fd2)) { + closedir(dir); + return false; + } + } + } + if (unlinkat(dirfd(dir), ent->d_name, is_dir ? AT_REMOVEDIR : 0)) { + if (errno != ENOENT) { + closedir(dir); + return false; + } + } + } + closedir(dir); + return true; +} + +class IoTest : public testing::Test { + protected: + void SetUp() override { + // NOLINTNEXTLINE(misc-include-cleaner) + tmpdir_ = P_tmpdir "/jkc-test-io-XXXXXX"; + // NOLINTNEXTLINE(misc-include-cleaner) + auto* ret = mkdtemp(tmpdir_.data()); + ASSERT_EQ(ret, tmpdir_.data()); + dirfd_ = open(tmpdir_.c_str(), O_PATH | O_DIRECTORY); + ASSERT_NE(-1, dirfd_); + } + + void TearDown() override { + int fd = openat(dirfd_, ".", O_RDONLY | O_DIRECTORY); + EXPECT_NE(-1, fd); + if (fd != -1) { + EXPECT_TRUE(remove_recursive(fd)); + } + close(dirfd_); + rmdir(tmpdir_.c_str()); + } + + [[nodiscard]] int dirfd() const { + return dirfd_; + } + + void touch(const std::string& name, const std::string& value = "") { + auto fd = openat(dirfd(), name.c_str(), O_CREAT | O_WRONLY | O_TRUNC, 0700); + EXPECT_NE(-1, fd); + if (fd == -1) return; + size_t offset = 0; + while (offset < value.size()) { + auto ret = write(fd, value.data() + offset, value.size() - offset); + EXPECT_LT(0, ret); + if (ret <= 0) { + break; + } + offset += ret; + } + close(fd); + } + + private: + int dirfd_; + std::string tmpdir_; +}; + +} // namespace + +TEST_F(IoTest, no_such_file) { + auto ret 
= io::openat(dirfd(), "no-such-file"); + ASSERT_FALSE(ret.has_value()); + EXPECT_EQ(io::OpenError::NoSuchFile, ret.error()); +} + +TEST_F(IoTest, read_empty) { + touch("test"); + + auto ret = io::openat(dirfd(), "test"); + ASSERT_TRUE(ret.has_value()); + std::string tmp(10, ' '); + auto ret2 = ret.value()->read(tmp); + ASSERT_TRUE(ret2.has_value()); + EXPECT_EQ(0, ret2.value()); +} + +TEST_F(IoTest, read) { + touch("test", "hello world"); + + auto ret = io::openat(dirfd(), "test"); + ASSERT_TRUE(ret.has_value()); + std::string tmp(12, ' '); + auto ret2 = ret.value()->repeat_read(tmp); + ASSERT_TRUE(ret2.has_value()); + EXPECT_EQ(11, ret2.value()); + tmp.resize(ret2.value()); + EXPECT_EQ("hello world", tmp); +} diff --git a/test/io_test_helper.cc b/test/io_test_helper.cc new file mode 100644 index 0000000..514e888 --- /dev/null +++ b/test/io_test_helper.cc @@ -0,0 +1,82 @@ +#include "io_test_helper.hh" + +#include "io.hh" + +#include <algorithm> +#include <cstddef> +#include <expected> +#include <memory> +#include <utility> + +namespace { + +class BreakingReader : public io::Reader { + public: + BreakingReader(std::unique_ptr<io::Reader> reader, size_t offset, + io::ReadError error) + : reader_(std::move(reader)), offset_(offset), error_(error) {} + + [[nodiscard]] + std::expected<size_t, io::ReadError> read(void* dst, size_t max) override { + if (offset_ == 0) + return std::unexpected(error_); + size_t avail = std::min(offset_, max); + auto ret = reader_->read(dst, avail); + if (ret.has_value()) { + offset_ -= ret.value(); + } + return ret; + } + + [[nodiscard]] + std::expected<size_t, io::ReadError> skip(size_t max) override { + if (offset_ == 0) + return std::unexpected(error_); + size_t avail = std::min(offset_, max); + auto ret = reader_->skip(avail); + if (ret.has_value()) { + offset_ -= ret.value(); + } + return ret; + } + + private: + std::unique_ptr<io::Reader> reader_; + size_t offset_; + io::ReadError const error_; +}; + +class MaxBlockReader : public 
io::Reader { + public: + MaxBlockReader(std::unique_ptr<io::Reader> reader, size_t max_block_size) + : reader_(std::move(reader)), max_block_size_(max_block_size) {} + + [[nodiscard]] + std::expected<size_t, io::ReadError> read(void* dst, size_t max) override { + size_t avail = std::min(max_block_size_, max); + return reader_->read(dst, avail); + } + + [[nodiscard]] + std::expected<size_t, io::ReadError> skip(size_t max) override { + size_t avail = std::min(max_block_size_, max); + return reader_->skip(avail); + } + + private: + std::unique_ptr<io::Reader> reader_; + size_t const max_block_size_; +}; + +} // namespace + +std::unique_ptr<io::Reader> io_make_breaking( + std::unique_ptr<io::Reader> reader, size_t offset, + io::ReadError error) { + return std::make_unique<BreakingReader>(std::move(reader), offset, error); +} + +std::unique_ptr<io::Reader> io_make_max_block( + std::unique_ptr<io::Reader> reader, size_t max_block_size) { + return std::make_unique<MaxBlockReader>(std::move(reader), max_block_size); +} diff --git a/test/io_test_helper.hh b/test/io_test_helper.hh new file mode 100644 index 0000000..ce191cf --- /dev/null +++ b/test/io_test_helper.hh @@ -0,0 +1,18 @@ +#ifndef IO_TEST_HELPER_HH +#define IO_TEST_HELPER_HH + +#include "io.hh" // IWYU pragma: export + +#include <cstddef> +#include <memory> + +[[nodiscard]] +std::unique_ptr<io::Reader> io_make_breaking( + std::unique_ptr<io::Reader> reader, size_t offset = 0, + io::ReadError error = io::ReadError::Error); + +[[nodiscard]] +std::unique_ptr<io::Reader> io_make_max_block( + std::unique_ptr<io::Reader> reader, size_t max_block_size); + +#endif // IO_TEST_HELPER_HH diff --git a/test/line.cc b/test/line.cc new file mode 100644 index 0000000..0f90723 --- /dev/null +++ b/test/line.cc @@ -0,0 +1,184 @@ +#include <gtest/gtest.h> + +#include "io_test_helper.hh" +#include "line.hh" + +#include <cstddef> +#include <limits> +#include <utility> + +TEST(line, empty) { + auto reader = line::open(io::memory("")); + 
EXPECT_EQ(0, reader->number()); + auto line = reader->read(); + ASSERT_FALSE(line.has_value()); + EXPECT_TRUE(line.error().eof); + EXPECT_EQ(0, reader->number()); +} + +TEST(line, one_line) { + auto reader = line::open(io::memory("foo")); + EXPECT_EQ(0, reader->number()); + auto line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("foo", line.value()); + EXPECT_EQ(1, reader->number()); + line = reader->read(); + ASSERT_FALSE(line.has_value()); + EXPECT_TRUE(line.error().eof); + EXPECT_EQ(1, reader->number()); +} + +TEST(line, many_lines) { + auto reader = line::open(io::memory("foo\nbar\nfoobar\n")); + EXPECT_EQ(0, reader->number()); + auto line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("foo", line.value()); + EXPECT_EQ(1, reader->number()); + line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("bar", line.value()); + EXPECT_EQ(2, reader->number()); + line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("foobar", line.value()); + EXPECT_EQ(3, reader->number()); + line = reader->read(); + ASSERT_FALSE(line.has_value()); + EXPECT_TRUE(line.error().eof); + EXPECT_EQ(3, reader->number()); +} + +TEST(line, many_lines_mixed) { + auto reader = line::open(io::memory("foo\r\nbar\rfoobar\n")); + EXPECT_EQ(0, reader->number()); + auto line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("foo", line.value()); + EXPECT_EQ(1, reader->number()); + line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("bar", line.value()); + EXPECT_EQ(2, reader->number()); + line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("foobar", line.value()); + EXPECT_EQ(3, reader->number()); + line = reader->read(); + ASSERT_FALSE(line.has_value()); + EXPECT_TRUE(line.error().eof); + EXPECT_EQ(3, reader->number()); +} + +TEST(line, empty_line) { + auto reader = line::open(io::memory("\n")); + EXPECT_EQ(0, reader->number()); + auto line = reader->read(); + ASSERT_TRUE(line.has_value()); + 
EXPECT_EQ("", line.value()); + EXPECT_EQ(1, reader->number()); + line = reader->read(); + ASSERT_FALSE(line.has_value()); + EXPECT_TRUE(line.error().eof); + EXPECT_EQ(1, reader->number()); +} + +TEST(line, max_line) { + auto reader = line::open(io::memory("012345678901234567890123456789"), 10); + EXPECT_EQ(0, reader->number()); + auto line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("0123456789", line.value()); + EXPECT_EQ(1, reader->number()); + line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("0123456789", line.value()); + EXPECT_EQ(2, reader->number()); + line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("0123456789", line.value()); + EXPECT_EQ(3, reader->number()); + line = reader->read(); + ASSERT_FALSE(line.has_value()); + EXPECT_TRUE(line.error().eof); + EXPECT_EQ(3, reader->number()); +} + +TEST(line, read_error) { + auto reader = line::open( + io_make_breaking(io::memory("foo bar fum\nfim zam"), /* offset */ 5)); + auto line = reader->read(); + ASSERT_FALSE(line.has_value()); + EXPECT_FALSE(line.error().eof); + EXPECT_EQ(io::ReadError::Error, line.error().io_error.value()); +} + +TEST(line, read_error_newline) { + auto reader = line::open( + io_make_breaking(io::memory("foo bar\r\nfim zam"), /* offset */ 8)); + auto line = reader->read(); + ASSERT_FALSE(line.has_value()); + EXPECT_FALSE(line.error().eof); + EXPECT_EQ(io::ReadError::Error, line.error().io_error.value()); +} + +TEST(line, blocky) { + auto reader = line::open( + io_make_max_block(io::memory("foo bar\r\nfim zam"), + /* max_block_size */ 1)); + auto line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("foo bar", line.value()); + line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("fim zam", line.value()); + line = reader->read(); + ASSERT_FALSE(line.has_value()); + EXPECT_TRUE(line.error().eof); +} + +TEST(line, blocky_newline) { + auto reader = line::open( + io_make_max_block(io::memory("foo bar\r\nfim 
zam"), + /* max_block_size */ 8)); + auto line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("foo bar", line.value()); + line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("fim zam", line.value()); + line = reader->read(); + ASSERT_FALSE(line.has_value()); + EXPECT_TRUE(line.error().eof); +} + +TEST(line, eof_newline) { + auto reader = line::open(io::memory("foo bar\r")); + auto line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("foo bar", line.value()); + line = reader->read(); + ASSERT_FALSE(line.has_value()); + EXPECT_TRUE(line.error().eof); +} + +TEST(line, max_newline) { + auto reader = line::open(io::memory("foo bar\r"), 6); + auto line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("foo ba", line.value()); + line = reader->read(); + ASSERT_TRUE(line.has_value()); + EXPECT_EQ("r", line.value()); + line = reader->read(); + ASSERT_FALSE(line.has_value()); + EXPECT_TRUE(line.error().eof); +} + +TEST(line, max_line_overflow) { + EXPECT_DEATH_IF_SUPPORTED({ + std::ignore = line::open(io::memory(""), + std::numeric_limits<size_t>::max()); + }, ""); +} diff --git a/test/str.cc b/test/str.cc new file mode 100644 index 0000000..35d70d7 --- /dev/null +++ b/test/str.cc @@ -0,0 +1,38 @@ +#include <gtest/gtest.h> + +#include "str.hh" + +TEST(str, split) { + auto ret = str::split(""); + EXPECT_EQ(0, ret.size()); + + ret = str::split("", ' ', true); + ASSERT_EQ(1, ret.size()); + EXPECT_EQ("", ret[0]); + + ret = str::split(" "); + EXPECT_EQ(0, ret.size()); + + ret = str::split(" ", ' ', true); + ASSERT_EQ(2, ret.size()); + EXPECT_EQ("", ret[0]); + EXPECT_EQ("", ret[1]); + + ret = str::split(" a b "); + ASSERT_EQ(2, ret.size()); + EXPECT_EQ("a", ret[0]); + EXPECT_EQ("b", ret[1]); + + ret = str::split(" a b ", ' ', true); + ASSERT_EQ(4, ret.size()); + EXPECT_EQ("", ret[0]); + EXPECT_EQ("a", ret[1]); + EXPECT_EQ("b", ret[2]); + EXPECT_EQ("", ret[3]); + + ret = str::split(" a b", ' ', true); + ASSERT_EQ(3, 
ret.size()); + EXPECT_EQ("", ret[0]); + EXPECT_EQ("a", ret[1]); + EXPECT_EQ("b", ret[2]); +} @@ -681,3 +681,45 @@ TEST(u16, invalid) { EXPECT_EQ(it, literal.end()); } } + +TEST(u, lookup_gc) { + EXPECT_EQ(u::lookup_gc(0x41), u::GeneralCategory::LETTER_UPPERCASE); + EXPECT_EQ(u::lookup_gc(0x61), u::GeneralCategory::LETTER_LOWERCASE); + EXPECT_EQ(u::lookup_gc(0x1c5), u::GeneralCategory::LETTER_TITLECASE); + EXPECT_EQ(u::lookup_gc(0x374), u::GeneralCategory::LETTER_MODIFIER); + EXPECT_EQ(u::lookup_gc(0x34ff), u::GeneralCategory::LETTER_OTHER); + + EXPECT_EQ(u::lookup_gc(0x483), u::GeneralCategory::MARK_NONSPACING); + EXPECT_EQ(u::lookup_gc(0x93b), u::GeneralCategory::MARK_SPACING_COMBINDING); + EXPECT_EQ(u::lookup_gc(0x20de), u::GeneralCategory::MARK_SPACING_ENCLOSING); + + EXPECT_EQ(u::lookup_gc(0xa620), u::GeneralCategory::NUMBER_DIGIT); + EXPECT_EQ(u::lookup_gc(0xa6e6), u::GeneralCategory::NUMBER_LETTER); + EXPECT_EQ(u::lookup_gc(0xa830), u::GeneralCategory::NUMBER_OTHER); + + EXPECT_EQ(u::lookup_gc(0xfe33), u::GeneralCategory::PUNCTUATION_CONNECTOR); + EXPECT_EQ(u::lookup_gc(0xfe58), u::GeneralCategory::PUNCTUATION_DASH); + EXPECT_EQ(u::lookup_gc(0xff08), u::GeneralCategory::PUNCTUATION_OPEN); + EXPECT_EQ(u::lookup_gc(0xff09), u::GeneralCategory::PUNCTUATION_CLOSE); + EXPECT_EQ(u::lookup_gc(0xab), u::GeneralCategory::PUNCTUATION_INITIAL_QUOTE); + EXPECT_EQ(u::lookup_gc(0xbb), u::GeneralCategory::PUNCTUATION_FINAL_QUOTE); + EXPECT_EQ(u::lookup_gc(0xff1a), u::GeneralCategory::PUNCTUATION_OTHER); + + EXPECT_EQ(u::lookup_gc(0xd7), u::GeneralCategory::SYMBOL_MATH); + EXPECT_EQ(u::lookup_gc(0x58f), u::GeneralCategory::SYMBOL_CURRENCY); + EXPECT_EQ(u::lookup_gc(0x5e), u::GeneralCategory::SYMBOL_MODIFIER); + EXPECT_EQ(u::lookup_gc(0xf03), u::GeneralCategory::SYMBOL_OTHER); + + EXPECT_EQ(u::lookup_gc(0x20), u::GeneralCategory::SEPARATOR_SPACE); + EXPECT_EQ(u::lookup_gc(0x2028), u::GeneralCategory::SEPARATOR_LINE); + EXPECT_EQ(u::lookup_gc(0x2029), 
u::GeneralCategory::SEPARATOR_PARAGRAPH); + + EXPECT_EQ(u::lookup_gc(0xa), u::GeneralCategory::OTHER_CONTROL); + EXPECT_EQ(u::lookup_gc(0x202d), u::GeneralCategory::OTHER_FORMAT); + EXPECT_EQ(u::lookup_gc(0xd800), u::GeneralCategory::OTHER_SURROGATE); + EXPECT_EQ(u::lookup_gc(0xdbff), u::GeneralCategory::OTHER_SURROGATE); + EXPECT_EQ(u::lookup_gc(0xdfff), u::GeneralCategory::OTHER_SURROGATE); + EXPECT_EQ(u::lookup_gc(0xe000), u::GeneralCategory::OTHER_PRIVATE_USE); + + EXPECT_EQ(u::lookup_gc(0xffffffff), u::GeneralCategory::OTHER_UNASSIGNED); +} |
