summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xdata/get_unicode.sh44
-rw-r--r--data/unicode-10.0.0/UnicodeData.txt.xzbin0 -> 158464 bytes
-rw-r--r--data/unicode-11.0.0/UnicodeData.txt.xzbin0 -> 161832 bytes
-rw-r--r--data/unicode-12.1.0/UnicodeData.txt.xzbin0 -> 164764 bytes
-rw-r--r--data/unicode-13.0.0/UnicodeData.txt.xzbin0 -> 167840 bytes
-rw-r--r--data/unicode-14.0.0/UnicodeData.txt.xzbin0 -> 172072 bytes
-rw-r--r--data/unicode-15.0.0/UnicodeData.txt.xzbin0 -> 173628 bytes
-rw-r--r--data/unicode-15.1.0/UnicodeData.txt.xzbin0 -> 173736 bytes
-rw-r--r--data/unicode-16.0.0/UnicodeData.txt.xzbin0 -> 181704 bytes
-rw-r--r--data/unicode-6.2.0/UnicodeData.txt.xzbin0 -> 129932 bytes
-rw-r--r--data/unicode-8.0.0/UnicodeData.txt.xzbin0 -> 151784 bytes
-rw-r--r--meson.build238
-rw-r--r--src/buffer.cc213
-rw-r--r--src/buffer.hh31
-rw-r--r--src/check.hh39
-rw-r--r--src/csv.cc63
-rw-r--r--src/csv.hh44
-rw-r--r--src/decompress.hh19
-rw-r--r--src/decompress_lzma.cc110
-rw-r--r--src/decompress_z.cc120
-rw-r--r--src/gen_ugc.cc317
-rw-r--r--src/io.cc238
-rw-r--r--src/io.hh52
-rw-r--r--src/line.cc133
-rw-r--r--src/line.hh44
-rw-r--r--src/str.cc34
-rw-r--r--src/str.hh18
-rw-r--r--src/u.cc46
-rw-r--r--src/u.hh18
-rw-r--r--src/ugc.hh49
-rw-r--r--src/unique_fd.cc9
-rw-r--r--src/unique_fd.hh45
-rw-r--r--test/buffer.cc65
-rw-r--r--test/csv.cc90
-rw-r--r--test/decompress.cc72
-rw-r--r--test/io.cc142
-rw-r--r--test/io_test_helper.cc82
-rw-r--r--test/io_test_helper.hh18
-rw-r--r--test/line.cc184
-rw-r--r--test/str.cc38
-rw-r--r--test/u.cc42
41 files changed, 2639 insertions, 18 deletions
diff --git a/data/get_unicode.sh b/data/get_unicode.sh
new file mode 100755
index 0000000..99662b3
--- /dev/null
+++ b/data/get_unicode.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# Downloads UnicodeData.txt for each Unicode version listed below and stores
+# it xz-compressed as unicode-<version>/UnicodeData.txt.xz next to this script.
+# Versions whose .xz file already exists are left untouched.
+
+# Stop at the first failing command instead of compressing a broken download.
+set -euo pipefail
+
+declare -a versions
+
+# Java 8
+versions+=("6.2.0")
+
+# Java 9
+versions+=("8.0.0")
+
+# Java 11
+versions+=("10.0.0")
+
+# Java 12
+versions+=("11.0.0")
+
+# Java 13
+versions+=("12.1.0")
+
+# Java 15
+versions+=("13.0.0")
+
+# Java 19
+versions+=("14.0.0")
+
+# Java 20
+versions+=("15.0.0")
+
+# Java 22
+versions+=("15.1.0")
+
+# Java 24
+versions+=("16.0.0")
+
+basedir=$(dirname -- "${BASH_SOURCE[0]}")
+
+for version in "${versions[@]}"; do
+  dir="$basedir/unicode-$version"
+  target="$dir/UnicodeData.txt"
+  if [ ! -e "$target.xz" ]; then
+    mkdir -p "$dir"
+    # --fail makes curl exit non-zero on an HTTP error instead of saving the
+    # server's error page as if it were the data file.
+    if ! curl --fail --location \
+        "https://www.unicode.org/Public/${version}/ucd/UnicodeData.txt" \
+        -o "$target"; then
+      # Do not leave a partial download behind.
+      rm -f -- "$target"
+      echo "Failed to download UnicodeData.txt for Unicode ${version}" >&2
+      exit 1
+    fi
+    xz -9 "$target"
+  fi
+done
diff --git a/data/unicode-10.0.0/UnicodeData.txt.xz b/data/unicode-10.0.0/UnicodeData.txt.xz
new file mode 100644
index 0000000..25eb906
--- /dev/null
+++ b/data/unicode-10.0.0/UnicodeData.txt.xz
Binary files differ
diff --git a/data/unicode-11.0.0/UnicodeData.txt.xz b/data/unicode-11.0.0/UnicodeData.txt.xz
new file mode 100644
index 0000000..e586bd3
--- /dev/null
+++ b/data/unicode-11.0.0/UnicodeData.txt.xz
Binary files differ
diff --git a/data/unicode-12.1.0/UnicodeData.txt.xz b/data/unicode-12.1.0/UnicodeData.txt.xz
new file mode 100644
index 0000000..15f8880
--- /dev/null
+++ b/data/unicode-12.1.0/UnicodeData.txt.xz
Binary files differ
diff --git a/data/unicode-13.0.0/UnicodeData.txt.xz b/data/unicode-13.0.0/UnicodeData.txt.xz
new file mode 100644
index 0000000..9e723dd
--- /dev/null
+++ b/data/unicode-13.0.0/UnicodeData.txt.xz
Binary files differ
diff --git a/data/unicode-14.0.0/UnicodeData.txt.xz b/data/unicode-14.0.0/UnicodeData.txt.xz
new file mode 100644
index 0000000..8ccc9cb
--- /dev/null
+++ b/data/unicode-14.0.0/UnicodeData.txt.xz
Binary files differ
diff --git a/data/unicode-15.0.0/UnicodeData.txt.xz b/data/unicode-15.0.0/UnicodeData.txt.xz
new file mode 100644
index 0000000..dfb9976
--- /dev/null
+++ b/data/unicode-15.0.0/UnicodeData.txt.xz
Binary files differ
diff --git a/data/unicode-15.1.0/UnicodeData.txt.xz b/data/unicode-15.1.0/UnicodeData.txt.xz
new file mode 100644
index 0000000..aa89857
--- /dev/null
+++ b/data/unicode-15.1.0/UnicodeData.txt.xz
Binary files differ
diff --git a/data/unicode-16.0.0/UnicodeData.txt.xz b/data/unicode-16.0.0/UnicodeData.txt.xz
new file mode 100644
index 0000000..199d7c1
--- /dev/null
+++ b/data/unicode-16.0.0/UnicodeData.txt.xz
Binary files differ
diff --git a/data/unicode-6.2.0/UnicodeData.txt.xz b/data/unicode-6.2.0/UnicodeData.txt.xz
new file mode 100644
index 0000000..3001c3c
--- /dev/null
+++ b/data/unicode-6.2.0/UnicodeData.txt.xz
Binary files differ
diff --git a/data/unicode-8.0.0/UnicodeData.txt.xz b/data/unicode-8.0.0/UnicodeData.txt.xz
new file mode 100644
index 0000000..d68fed1
--- /dev/null
+++ b/data/unicode-8.0.0/UnicodeData.txt.xz
Binary files differ
diff --git a/meson.build b/meson.build
index ff6a6fc..c3b6302 100644
--- a/meson.build
+++ b/meson.build
@@ -5,9 +5,10 @@ project(
meson_version : '>= 1.3.0',
default_options : [
'warning_level=3',
- 'cpp_std=c++23',
+ 'cpp_std=c++26',
'cpp_eh=none',
'cpp_rtti=false',
+ 'default_library=static',
],
)
@@ -17,25 +18,149 @@ configure_file(input: 'src/config.h.in',
output: 'config.h',
configuration : conf_data)
-dependencies = [
-]
+z_dep = dependency('zlib', version: '>=1.3.0')
+lzma_dep = dependency('liblzma', version: '>=5.8.0')
inc = include_directories('src')
-exe = executable(
- 'jkc',
+args_lib = library(
+ 'args',
sources: [
'src/args.cc',
'src/args.hh',
+ ],
+ include_directories: inc,
+)
+args_dep = declare_dependency(link_with: args_lib)
+
+# Byte buffers (fixed ring buffer and growable buffer).
+buffer_lib = library(
+ 'buffer',
+ sources: [
+ 'src/buffer.cc',
+ 'src/buffer.hh',
+ ],
+ include_directories: inc,
+)
+buffer_dep = declare_dependency(link_with: buffer_lib)
+
+# File readers, the line reader and the fd wrapper.
+io_lib = library(
+ 'io',
+ sources: [
+ 'src/line.cc',
+ 'src/line.hh',
+ 'src/io.cc',
+ 'src/io.hh',
+ 'src/unique_fd.cc',
+ 'src/unique_fd.hh',
+ ],
+ include_directories: inc,
+)
+io_dep = declare_dependency(link_with: io_lib)
+
+# String helpers.
+str_lib = library(
+ 'str',
+ sources: [
+ 'src/str.cc',
+ 'src/str.hh',
+ ],
+ include_directories: inc,
+)
+str_dep = declare_dependency(link_with: str_lib)
+
+# Simple separator-based CSV reader; built on top of io and str.
+csv_lib = library(
+ 'csv',
+ sources: [
+ 'src/csv.cc',
+ 'src/csv.hh',
+ ],
+ include_directories: inc,
+ dependencies: [io_dep, str_dep],
+)
+# The dependencies are repeated here so that users of csv_dep also link
+# against io and str.
+csv_dep = declare_dependency(
+ link_with: csv_lib,
+ dependencies: [io_dep, str_dep],
+)
+
+# zlib/gzip and xz decompressing readers.
+decompress_lib = library(
+ 'decompress',
+ sources: [
+ 'src/decompress.hh',
+ 'src/decompress_lzma.cc',
+ 'src/decompress_z.cc',
+ ],
+ include_directories: inc,
+ dependencies: [buffer_dep, io_dep, lzma_dep, z_dep],
+)
+# The dependencies are repeated here so that users of decompress_dep also
+# link against buffer, io, liblzma and zlib.
+decompress_dep = declare_dependency(
+ link_with: decompress_lib,
+ dependencies: [buffer_dep, io_dep, lzma_dep, z_dep],
+)
+
+# Build-time tool that turns a UnicodeData.txt file into a C++ source file
+# with a general category lookup function. It is not installed.
+gen_ugc = executable(
+ 'gen_ugc',
+ sources: [
+ 'src/gen_ugc.cc',
+ ],
+ include_directories: inc,
+ install : false,
+ dependencies : [
+ args_dep,
+ csv_dep,
+ decompress_dep,
+ ],
+)
+
+# Unicode versions for which a lookup function is generated. Each entry needs
+# a matching data/unicode-<version>/UnicodeData.txt.xz file.
+unicode_versions = [
+ '6.2.0',
+ '8.0.0',
+ '10.0.0',
+ '11.0.0',
+ '12.1.0',
+ '13.0.0',
+ '14.0.0',
+ '15.0.0',
+ '15.1.0',
+ '16.0.0',
+]
+
+# One generated source per version. The prefix passed to gen_ugc is the
+# version with dots replaced by underscores, e.g. 'u6_2_0_'.
+ugc_sources = []
+foreach unicode_version : unicode_versions
+ ugc_sources += custom_target(
+ 'gen-ugc-' + unicode_version,
+ input: ['data/unicode-' + unicode_version + '/UnicodeData.txt.xz'],
+ output: ['ugc_lookup_' + unicode_version + '.cc'],
+ command : [gen_ugc, '--prefix',
+ 'u' + unicode_version.replace('.', '_') + '_',
+ '@INPUT@', '@OUTPUT@'])
+endforeach
+
+unicode_lib = library(
+ 'unicode',
+ sources: [
'src/u.hh',
+ 'src/u.cc',
'src/u16.hh',
'src/u8.hh',
+ 'src/ugc.hh',
'src/umod8.hh',
+ ugc_sources,
+ ],
+ include_directories: inc,
+)
+unicode_dep = declare_dependency(link_with: unicode_lib)
+
+jkc = executable(
+ 'jkc',
+ sources: [
'src/main.cc',
],
include_directories: inc,
install : true,
- dependencies : dependencies,
+ dependencies : [
+ args_dep,
+ io_dep,
+ unicode_dep,
+ ],
)
gtest_main_dep = dependency('gtest_main', fallback : ['gtest_main'])
@@ -46,22 +171,99 @@ test_dependencies = [
test('args', executable(
'test_args',
- sources: [
- 'src/args.cc',
- 'src/args.hh',
- 'test/args.cc',
- ],
+ sources: ['test/args.cc'],
include_directories: inc,
- dependencies : test_dependencies))
+ dependencies: [
+ args_dep,
+ test_dependencies,
+ ],
+))
test('u', executable(
'test_u',
+ sources: ['test/u.cc'],
+ include_directories: inc,
+ dependencies: [
+ unicode_dep,
+ test_dependencies,
+ ],
+))
+
+test('csv', executable(
+ 'test_csv',
+ sources: ['test/csv.cc'],
+ include_directories: inc,
+ dependencies: [
+ csv_dep,
+ test_dependencies,
+ ],
+))
+
+test('line', executable(
+ 'test_line',
sources: [
- 'src/u.hh',
- 'src/u16.hh',
- 'src/u8.hh',
- 'src/umod8.hh',
- 'test/u.cc',
+ 'test/line.cc',
+ 'test/io_test_helper.hh',
+ 'test/io_test_helper.cc',
+ ],
+ include_directories: inc,
+ dependencies: [
+ io_dep,
+ test_dependencies,
+ ],
+))
+
+test('str', executable(
+ 'test_str',
+ sources: ['test/str.cc'],
+ include_directories: inc,
+ dependencies: [
+ str_dep,
+ test_dependencies,
],
+))
+
+test('io', executable(
+ 'test_io',
+ sources: ['test/io.cc'],
+ include_directories: inc,
+ dependencies: [
+ io_dep,
+ test_dependencies,
+ ],
+))
+
+test('buffer', executable(
+ 'test_buffer',
+ sources: ['test/buffer.cc'],
+ include_directories: inc,
+ dependencies : [
+ buffer_dep,
+ test_dependencies,
+ ],
+))
+
+test('decompress', executable(
+ 'test_decompress',
+ sources: ['test/decompress.cc'],
include_directories: inc,
- dependencies : test_dependencies))
+ dependencies : [
+ decompress_dep,
+ test_dependencies,
+ ],
+))
+
+# run-clang-tidy is optional; without it the 'clang-tidy' target below is
+# simply not defined.
+run_clang_tidy = find_program('run-clang-tidy', required: false)
+
+if run_clang_tidy.found()
+ # The clang-tidy target generated by meson misses most of the
+ # source files, so create our own.
+ run_target(
+ 'clang-tidy',
+ command: [
+ run_clang_tidy,
+ '-quiet',
+ '-use-color',
+ ],
+ )
+endif
diff --git a/src/buffer.cc b/src/buffer.cc
new file mode 100644
index 0000000..65c6757
--- /dev/null
+++ b/src/buffer.cc
@@ -0,0 +1,213 @@
+#include "buffer.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <memory>
+#include <utility>
+
+namespace {
+
+// Fixed-capacity ring buffer. The storage is allocated lazily on the first
+// call to wptr().
+//
+// Invariants:
+//  * rptr_ <  wptr_: readable data is [rptr_, wptr_) ("linear").
+//  * rptr_ >  wptr_: readable data is [rptr_, end) followed by
+//                    [data, wptr_) ("wrapped").
+//  * rptr_ == wptr_: empty if !full_, completely full if full_.
+//  * Whenever the buffer becomes empty both pointers are reset to the start.
+class FixedBuffer : public Buffer {
+ public:
+  explicit FixedBuffer(size_t size)
+    : size_(size) {}
+
+  // Returns a pointer to the oldest unread byte and stores the number of
+  // contiguous readable bytes in avail. If fewer than need bytes are
+  // contiguous because the data wraps around, the data is first rotated to
+  // the start of the buffer so that all of it becomes contiguous.
+  void const* rptr(size_t& avail, size_t need) override {
+    if (rptr_ < wptr_) {
+      avail = wptr_ - rptr_;
+    } else if (rptr_ == wptr_ && !full_) {
+      avail = 0;
+    } else {
+      avail = end() - rptr_;
+      // When rptr_ is already at the start everything is contiguous and
+      // rotating cannot help.
+      if (avail < need && rptr_ != data_.get()) {
+        rotate();
+        // After rotate() equal pointers can only mean "full".
+        avail = rptr_ == wptr_ ? size_ : static_cast<size_t>(wptr_ - rptr_);
+      }
+    }
+    return rptr_;
+  }
+
+  // Marks size bytes, previously returned by rptr(), as read.
+  void consume(size_t size) override {
+    if (size == 0) return;
+    if (rptr_ < wptr_) {
+      assert(std::cmp_greater_equal(wptr_ - rptr_, size));
+      rptr_ += size;
+      if (rptr_ == wptr_)
+        reset();
+    } else {
+      // Equal pointers are only consumable when the buffer is full; an empty
+      // buffer has nothing to consume.
+      assert(rptr_ != wptr_ || full_);
+      assert(std::cmp_greater_equal(end() - rptr_, size));
+      rptr_ += size;
+      full_ = false;
+      if (rptr_ == end()) {
+        rptr_ = data_.get();
+        if (rptr_ == wptr_)
+          reset();
+      }
+    }
+  }
+
+  // Returns a pointer to the next writable byte and stores the number of
+  // contiguous writable bytes in avail. If fewer than need bytes are free at
+  // the tail while there is free space in front of the unread data, the data
+  // is moved to the start of the buffer to merge both free regions.
+  // avail may still be smaller than need if the buffer is too full.
+  void* wptr(size_t& avail, size_t need) override {
+    if (wptr_ == nullptr) {
+      data_ = std::make_unique_for_overwrite<char[]>(size_);
+      rptr_ = wptr_ = data_.get();
+    }
+
+    if (wptr_ < rptr_) {
+      avail = rptr_ - wptr_;
+    } else if (rptr_ == wptr_ && full_) {
+      avail = 0;
+    } else {
+      avail = end() - wptr_;
+      if (avail < need && rptr_ != data_.get()) {
+        compact();
+        avail = end() - wptr_;
+      }
+    }
+    return wptr_;
+  }
+
+  // Marks size bytes, written at the pointer returned by wptr(), as readable.
+  void commit(size_t size) override {
+    if (size == 0) return;
+    if (wptr_ < rptr_) {
+      assert(std::cmp_greater_equal(rptr_ - wptr_, size));
+      wptr_ += size;
+      if (wptr_ == rptr_) {
+        full_ = true;
+      }
+    } else {
+      // Equal pointers are only writable when the buffer is empty.
+      assert(rptr_ != wptr_ || !full_);
+      assert(std::cmp_greater_equal(end() - wptr_, size));
+      wptr_ += size;
+      if (wptr_ == end()) {
+        wptr_ = data_.get();
+        if (wptr_ == rptr_)
+          full_ = true;
+      }
+    }
+  }
+
+  [[nodiscard]] bool full() const override {
+    return rptr_ == wptr_ && full_;
+  }
+
+  [[nodiscard]] bool empty() const override {
+    return rptr_ == wptr_ && !full_;
+  }
+
+ private:
+  // One past the last byte of the storage.
+  [[nodiscard]] char* end() const {
+    return data_.get() + size_;
+  }
+
+  void reset() {
+    rptr_ = wptr_ = data_.get();
+    full_ = false;
+  }
+
+  // Linear state only: move [rptr_, wptr_) to the start of the buffer so
+  // that all free space ends up after wptr_.
+  void compact() {
+    size_t const used = wptr_ - rptr_;
+    memmove(data_.get(), rptr_, used);
+    rptr_ = data_.get();
+    wptr_ = data_.get() + used;
+  }
+
+  // Wrapped (or full) state only: make the readable data contiguous,
+  // starting at the beginning of the buffer.
+  void rotate() {
+    size_t const tail = end() - rptr_;         // bytes in [rptr_, end)
+    size_t const head = wptr_ - data_.get();   // bytes in [data, wptr_)
+    if (wptr_ + tail > rptr_) {
+      // Moving the head up would overwrite the tail; save the tail first.
+      auto tmp = std::make_unique_for_overwrite<char[]>(tail);
+      memcpy(tmp.get(), rptr_, tail);
+      memmove(data_.get() + tail, data_.get(), head);
+      memcpy(data_.get(), tmp.get(), tail);
+    } else {
+      memmove(data_.get() + tail, data_.get(), head);
+      memcpy(data_.get(), rptr_, tail);
+    }
+    rptr_ = data_.get();
+    wptr_ += tail;
+    if (wptr_ == end()) {
+      // The buffer was completely full: keep the "equal pointers + full_"
+      // representation instead of leaving wptr_ one past the end.
+      wptr_ = data_.get();
+      full_ = true;
+    }
+  }
+
+  size_t const size_;
+  std::unique_ptr<char[]> data_;
+  char* rptr_{nullptr};
+  char* wptr_{nullptr};
+  bool full_{false};
+};
+
+// Linear buffer that starts out unallocated, grows on demand up to max_size
+// and shrinks back to start_size when it is emptied.
+// Readable data is always the contiguous range [rptr_, wptr_).
+class DynamicBuffer : public Buffer {
+ public:
+  DynamicBuffer(size_t start_size, size_t max_size)
+    : start_size_(start_size), max_size_(max_size) {}
+
+  // All readable data is contiguous, so need is not used.
+  void const* rptr(size_t& avail, size_t /* need */) override {
+    avail = wptr_ - rptr_;
+    return rptr_;
+  }
+
+  void consume(size_t size) override {
+    assert(std::cmp_greater_equal(wptr_ - rptr_, size));
+    rptr_ += size;
+    if (rptr_ == wptr_) {
+      reset();
+    }
+  }
+
+  // Returns the write position and stores the writable size in avail.
+  // If fewer than need bytes are free the buffer is, in order of preference,
+  // allocated, compacted or grown. avail may still be smaller than need once
+  // max_size is reached.
+  void* wptr(size_t& avail, size_t need) override {
+    avail = end_ - wptr_;
+    if (avail < need) {
+      if (end_ == nullptr) {
+        // First use: allocate.
+        size_t size = std::min(max_size_, std::max(need, start_size_));
+        data_ = std::make_unique_for_overwrite<char[]>(size);
+        end_ = data_.get() + size;
+        rptr_ = wptr_ = data_.get();
+        avail = end_ - wptr_;
+      } else if (std::cmp_greater_equal(rptr_ - data_.get(), need - avail)) {
+        // Enough already-consumed space in front of the data: compact.
+        memmove(data_.get(), rptr_, wptr_ - rptr_);
+        wptr_ = data_.get() + (wptr_ - rptr_);
+        rptr_ = data_.get();
+        avail = end_ - wptr_;
+      } else if (std::cmp_less(end_ - data_.get(), max_size_)) {
+        // Grow: at least double, never beyond max_size_.
+        size_t current_size = end_ - data_.get();
+        size_t new_size = std::min(max_size_,
+                                   current_size + std::max(need - avail,
+                                                           current_size));
+        auto tmp = std::make_unique_for_overwrite<char[]>(new_size);
+        memcpy(tmp.get(), rptr_, wptr_ - rptr_);
+        end_ = tmp.get() + new_size;
+        wptr_ = tmp.get() + (wptr_ - rptr_);
+        rptr_ = tmp.get();
+        data_ = std::move(tmp);
+        avail = end_ - wptr_;
+      }
+    }
+    return wptr_;
+  }
+
+  void commit(size_t size) override {
+    assert(std::cmp_greater_equal(end_ - wptr_, size));
+    wptr_ += size;
+  }
+
+  [[nodiscard]] bool full() const override {
+    return rptr_ == data_.get() && wptr_ == end_ &&
+      std::cmp_equal(end_ - data_.get(), max_size_);
+  }
+
+  [[nodiscard]] bool empty() const override {
+    return rptr_ == wptr_;
+  }
+
+ private:
+  // Called when the buffer becomes empty. Shrinks back to start_size_ if the
+  // buffer has grown beyond it.
+  void reset() {
+    if (std::cmp_greater(end_ - data_.get(), start_size_)) {
+      data_ = std::make_unique_for_overwrite<char[]>(start_size_);
+      // end_ must follow the new allocation, otherwise wptr() computes the
+      // free space against the freed block.
+      end_ = data_.get() + start_size_;
+    }
+    rptr_ = wptr_ = data_.get();
+  }
+
+  size_t const start_size_;
+  size_t const max_size_;
+  std::unique_ptr<char[]> data_;
+  char* end_{nullptr};
+  char* rptr_{nullptr};
+  char* wptr_{nullptr};
+};
+
+} // namespace
+
+// Creates a buffer with a fixed capacity of size bytes.
+std::unique_ptr<Buffer> Buffer::fixed(size_t size) {
+ return std::make_unique<FixedBuffer>(size);
+}
+
+// Creates a buffer that starts at start_size bytes and may grow up to
+// max_size bytes.
+std::unique_ptr<Buffer> Buffer::dynamic(size_t start_size, size_t max_size) {
+ return std::make_unique<DynamicBuffer>(start_size, max_size);
+}
diff --git a/src/buffer.hh b/src/buffer.hh
new file mode 100644
index 0000000..685cd36
--- /dev/null
+++ b/src/buffer.hh
@@ -0,0 +1,31 @@
+#ifndef BUFFER_HH
+#define BUFFER_HH
+
+#include <cstddef>
+#include <memory>
+
+// Byte buffer with separate read and write cursors.
+// Writers call wptr() to get a writable region and commit() what they wrote;
+// readers call rptr() to get a readable region and consume() what they used.
+class Buffer {
+ public:
+ virtual ~Buffer() = default;
+
+ // Returns a pointer to readable data and stores the number of contiguous
+ // readable bytes in avail. need is the number of contiguous bytes the
+ // caller would like; implementations may rearrange their data to provide
+ // it, but avail can still be smaller than need.
+ virtual void const* rptr(size_t& avail, size_t need = 1) = 0;
+ // Marks size bytes of the region returned by rptr() as read.
+ virtual void consume(size_t size) = 0;
+
+ // Returns a pointer to writable space and stores the number of contiguous
+ // writable bytes in avail. need is the number of contiguous bytes the
+ // caller would like; avail can still be smaller than need.
+ virtual void* wptr(size_t& avail, size_t need = 1) = 0;
+ // Marks size bytes of the region returned by wptr() as written.
+ virtual void commit(size_t size) = 0;
+
+ [[nodiscard]] virtual bool full() const = 0;
+ [[nodiscard]] virtual bool empty() const = 0;
+
+ // Buffer with a fixed capacity of size bytes.
+ [[nodiscard]]
+ static std::unique_ptr<Buffer> fixed(size_t size);
+ // Buffer that starts at start_size bytes and may grow up to max_size bytes.
+ [[nodiscard]]
+ static std::unique_ptr<Buffer> dynamic(size_t start_size, size_t max_size);
+
+ protected:
+ Buffer() = default;
+ Buffer(Buffer const&) = delete;
+ Buffer& operator=(Buffer const&) = delete;
+};
+
+#endif // BUFFER_HH
diff --git a/src/check.hh b/src/check.hh
new file mode 100644
index 0000000..be65437
--- /dev/null
+++ b/src/check.hh
@@ -0,0 +1,39 @@
+#ifndef CHECK_HH
+#define CHECK_HH
+
+#include <cstdlib>
+#include <stdckdint.h>
+#include <type_traits>
+
+// Overflow-checked integer arithmetic. Every function aborts the process
+// instead of returning a wrapped or otherwise incorrect result.
+namespace check {
+
+// Returns a + b; aborts if the result does not fit in T.
+// Only integer types are accepted: the ckd_* operations are not defined for
+// floating point types or bool.
+template<typename T>
+requires (std::is_integral_v<T> && !std::is_same_v<T, bool>)
+T add(T a, T b) {
+  T ret;
+  if (ckd_add(&ret, a, b))
+    abort();
+  return ret;
+}
+
+// Returns a - b; aborts if the result does not fit in T.
+template<typename T>
+requires (std::is_integral_v<T> && !std::is_same_v<T, bool>)
+T sub(T a, T b) {
+  T ret;
+  if (ckd_sub(&ret, a, b))
+    abort();
+  return ret;
+}
+
+// Returns a * b; aborts if the result does not fit in T.
+template<typename T>
+requires (std::is_integral_v<T> && !std::is_same_v<T, bool>)
+T mul(T a, T b) {
+  T ret;
+  if (ckd_mul(&ret, a, b))
+    abort();
+  return ret;
+}
+
+}  // namespace check
+
+#endif // CHECK_HH
diff --git a/src/csv.cc b/src/csv.cc
new file mode 100644
index 0000000..4135555
--- /dev/null
+++ b/src/csv.cc
@@ -0,0 +1,63 @@
+#include "csv.hh"
+
+#include "line.hh"
+#include "str.hh"
+
+#include <cstdint>
+#include <expected>
+#include <memory>
+#include <span>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace csv {
+
+namespace {
+
+// Reader that splits every line from a line::Reader on a single separator
+// character. Lines that are completely empty are skipped.
+class ReaderImpl : public Reader {
+ public:
+  ReaderImpl(std::unique_ptr<line::Reader> reader, char separator)
+    : reader_(std::move(reader)), separator_(separator) {
+  }
+
+  // Returns the columns of the next non-empty line, an empty span at
+  // end-of-file, or the I/O error reported by the line reader.
+  [[nodiscard]]
+  std::expected<std::span<std::string_view>, io::ReadError> read() override {
+    for (;;) {
+      auto next = reader_->read();
+      if (!next.has_value()) {
+        auto const& failure = next.error();
+        if (failure.eof) {
+          return {};
+        }
+        return std::unexpected(failure.io_error.value());
+      }
+
+      str::split(next.value(), line_, separator_, /* keep_empty */ true);
+      // An empty line splits into exactly one empty column.
+      bool const blank = line_.size() == 1 && line_[0].empty();
+      if (!blank) {
+        return line_;
+      }
+    }
+  }
+
+  [[nodiscard]] uint64_t number() const override {
+    return reader_->number();
+  }
+
+ private:
+  std::unique_ptr<line::Reader> reader_;
+  char const separator_;
+  // Columns of the most recently returned line; reused between calls.
+  std::vector<std::string_view> line_;
+};
+
+}  // namespace
+
+std::unique_ptr<Reader> open(std::unique_ptr<line::Reader> reader,
+                             char separator) {
+  return std::make_unique<ReaderImpl>(std::move(reader), separator);
+}
+
+std::unique_ptr<Reader> open(std::unique_ptr<io::Reader> reader,
+                             char separator) {
+  auto lines = line::open(std::move(reader));
+  return open(std::move(lines), separator);
+}
+
+}  // namespace csv
diff --git a/src/csv.hh b/src/csv.hh
new file mode 100644
index 0000000..8c47ceb
--- /dev/null
+++ b/src/csv.hh
@@ -0,0 +1,44 @@
+#ifndef CSV_HH
+#define CSV_HH
+
+#include "io.hh" // IWYU pragma: export
+#include "line.hh"
+
+#include <expected>
+#include <memory>
+#include <span>
+#include <string_view>
+
+namespace csv {
+
+// Note that this reader is very simple, no quotes or escapes.
+// Empty lines are ignored.
+class Reader {
+ public:
+ virtual ~Reader() = default;
+
+ // Reads the next non-empty line and returns its columns.
+ // Returned span is only valid until next call to read.
+ // Returns empty span at end-of-file and only then.
+ [[nodiscard]]
+ virtual std::expected<std::span<std::string_view>, io::ReadError> read() = 0;
+
+ // Starts at zero. Returns next line.
+ // So, before first read it is zero, after first read it is one.
+ [[nodiscard]] virtual uint64_t number() const = 0;
+
+ protected:
+ Reader() = default;
+
+ Reader(Reader const&) = delete;
+ Reader& operator=(Reader const&) = delete;
+};
+
+// Creates a CSV reader on top of an existing line reader. separator is the
+// single character that separates columns.
+[[nodiscard]] std::unique_ptr<Reader> open(std::unique_ptr<line::Reader> reader,
+ char separator = ',');
+
+// Convenience overload that first wraps reader in a line reader.
+[[nodiscard]] std::unique_ptr<Reader> open(std::unique_ptr<io::Reader> reader,
+ char separator = ',');
+
+} // namespace csv
+
+#endif // CSV_HH
diff --git a/src/decompress.hh b/src/decompress.hh
new file mode 100644
index 0000000..a15efdc
--- /dev/null
+++ b/src/decompress.hh
@@ -0,0 +1,19 @@
+#ifndef DECOMPRESS_HH
+#define DECOMPRESS_HH
+
+#include "io.hh" // IWYU pragma: export
+
+// Each function takes ownership of reader, which supplies the compressed
+// data, and returns a reader that yields the decompressed data.
+namespace decompress {
+
+// zlib format
+std::unique_ptr<io::Reader> zlib(std::unique_ptr<io::Reader> reader);
+
+// gzip (.gz) format
+std::unique_ptr<io::Reader> gzip(std::unique_ptr<io::Reader> reader);
+
+// xz format
+std::unique_ptr<io::Reader> xz(std::unique_ptr<io::Reader> reader);
+
+} // namespace decompress
+
+#endif // DECOMPRESS_HH
diff --git a/src/decompress_lzma.cc b/src/decompress_lzma.cc
new file mode 100644
index 0000000..6baea18
--- /dev/null
+++ b/src/decompress_lzma.cc
@@ -0,0 +1,110 @@
+#include "decompress.hh"
+
+#include "buffer.hh"
+
+#include <lzma.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <expected>
+#include <memory>
+#include <optional>
+#include <utility>
+
+namespace decompress {
+
+namespace {
+
+// Size of the buffer holding compressed input.
+const size_t kBufferSizeXz = static_cast<size_t>(1024) * 1024;
+
+// Reader that decompresses xz data read from another reader.
+class XzReader : public io::Reader {
+ public:
+  explicit XzReader(std::unique_ptr<io::Reader> reader)
+    : reader_(std::move(reader)) {}
+
+  ~XzReader() override {
+    if (initialized_)
+      lzma_end(&stream_);
+  }
+
+  // Decompresses up to max bytes into dst and returns the number of bytes
+  // written. The decoder is created lazily and torn down at the end of each
+  // stream, so more input after a stream end starts a new decoder.
+  std::expected<size_t, io::ReadError> read(void* dst, size_t max) override {
+    auto err = fill();
+    if (err.has_value())
+      return std::unexpected(err.value());
+
+    stream_.next_out = reinterpret_cast<unsigned char*>(dst);
+    stream_.avail_out = max;
+
+    if (!initialized_) {
+      if (in_eof_ && buffer_->empty())
+        return 0;
+
+      lzma_mt options;
+      memset(&options, 0, sizeof(options));
+      // lzma_cputhreads() returns 0 when the count is unknown.
+      options.threads = std::max(static_cast<uint32_t>(1), lzma_cputhreads());
+      options.memlimit_threading = lzma_physmem() / 4;
+      options.memlimit_stop = lzma_physmem() / 4;
+      auto ret = lzma_stream_decoder_mt(&stream_, &options);
+      if (ret != LZMA_OK)
+        return std::unexpected(io::ReadError::Error);
+      initialized_ = true;
+    }
+
+    auto* const rptr = stream_.next_in;
+    auto ret = lzma_code(&stream_, in_eof_ ? LZMA_FINISH : LZMA_RUN);
+    auto got = max - stream_.avail_out;
+    if (ret == LZMA_STREAM_END) {
+      lzma_end(&stream_);
+      initialized_ = false;
+    } else if (ret != LZMA_OK) {
+      return std::unexpected(
+          ret == LZMA_DATA_ERROR
+          ? io::ReadError::InvalidData : io::ReadError::Error);
+    }
+    // Always drop what the decoder consumed, also after end of input.
+    // Otherwise the next fill() hands the same bytes to the decoder again and
+    // changes avail_in, which liblzma rejects once LZMA_FINISH has been used.
+    buffer_->consume(stream_.next_in - rptr);
+    return got;
+  }
+
+  // Skips by decompressing into a scratch buffer. The scratch buffer is
+  // capped so that a huge max does not turn into a huge allocation; like
+  // read(), fewer than max bytes may be skipped per call.
+  std::expected<size_t, io::ReadError> skip(size_t max) override {
+    size_t const chunk = std::min(max, kBufferSizeXz);
+    auto tmp = std::make_unique_for_overwrite<char[]>(chunk);
+    return read(tmp.get(), chunk);
+  }
+
+ private:
+  // Points the decoder at the buffered input, reading more from the
+  // underlying reader first when less than half a buffer is available.
+  // NOTE(review): only the contiguous part of the buffer is handed to the
+  // decoder. If the buffered data wraps around when end of input is reached,
+  // LZMA_FINISH is used before the decoder has seen everything - verify with
+  // inputs larger than the buffer.
+  std::optional<io::ReadError> fill() {
+    auto* rptr = buffer_->rptr(stream_.avail_in);
+    if (!in_eof_ && stream_.avail_in < kBufferSizeXz / 2) {
+      auto* wptr = buffer_->wptr(stream_.avail_in);
+      auto got = reader_->read(wptr, stream_.avail_in);
+      if (got.has_value()) {
+        buffer_->commit(got.value());
+        if (got.value() == 0)
+          in_eof_ = true;
+      } else {
+        return got.error();
+      }
+      rptr = buffer_->rptr(stream_.avail_in);
+    }
+    stream_.next_in = reinterpret_cast<const unsigned char*>(rptr);
+    return std::nullopt;
+  }
+
+  std::unique_ptr<io::Reader> reader_;
+  bool in_eof_{false};
+  std::unique_ptr<Buffer> buffer_{Buffer::fixed(kBufferSizeXz)};
+  bool initialized_{false};
+  lzma_stream stream_ = LZMA_STREAM_INIT;
+};
+
+}  // namespace
+
+std::unique_ptr<io::Reader> xz(std::unique_ptr<io::Reader> reader) {
+  return std::make_unique<XzReader>(std::move(reader));
+}
+
+}  // namespace decompress
diff --git a/src/decompress_z.cc b/src/decompress_z.cc
new file mode 100644
index 0000000..f9f87ae
--- /dev/null
+++ b/src/decompress_z.cc
@@ -0,0 +1,120 @@
+#include "decompress.hh"
+
+#include "buffer.hh"
+
+#define ZLIB_CONST
+#include <zlib.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <expected>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <utility>
+
+namespace decompress {
+
+namespace {
+
+// Size of the buffer holding compressed input.
+const size_t kBufferSizeZ = static_cast<size_t>(1024) * 1024;
+
+// Reader that inflates zlib or gzip data read from another reader.
+class DecompressReader : public io::Reader {
+ public:
+  DecompressReader(std::unique_ptr<io::Reader> reader, bool gzip)
+    : reader_(std::move(reader)), gzip_(gzip) {}
+
+  ~DecompressReader() override {
+    if (initialized_)
+      inflateEnd(&stream_);
+  }
+
+  // Inflates up to max bytes into dst and returns the number of bytes
+  // written. The inflate state is created lazily and torn down at the end of
+  // each stream, so more input after a stream end starts a new one.
+  std::expected<size_t, io::ReadError> read(void* dst, size_t max) override {
+    auto err = fill();
+    if (err.has_value())
+      return std::unexpected(err.value());
+
+    // avail_out is a uInt; clamp instead of silently truncating max.
+    size_t const out_max = std::min(
+        // NOLINTNEXTLINE(misc-include-cleaner)
+        static_cast<size_t>(std::numeric_limits<uInt>::max()), max);
+    // NOLINTNEXTLINE(misc-include-cleaner)
+    stream_.next_out = reinterpret_cast<Bytef*>(dst);
+    // NOLINTNEXTLINE(misc-include-cleaner)
+    stream_.avail_out = static_cast<uInt>(out_max);
+
+    if (!initialized_) {
+      if (in_eof_ && buffer_->empty())
+        return 0;
+
+      stream_.zalloc = Z_NULL;
+      stream_.zfree = Z_NULL;
+      stream_.opaque = Z_NULL;
+      // 16 selects gzip decoding; 0 takes the window size from the header.
+      if (inflateInit2(&stream_, gzip_ ? 16 : 0) != Z_OK) {
+        return std::unexpected(io::ReadError::Error);
+      }
+      initialized_ = true;
+    }
+
+    auto* const rptr = stream_.next_in;
+    auto ret = inflate(&stream_, in_eof_ ? Z_FINISH : Z_NO_FLUSH);
+    auto got = out_max - stream_.avail_out;
+    bool const progressed = got > 0 || stream_.next_in != rptr;
+    if (ret == Z_STREAM_END) {
+      inflateEnd(&stream_);
+      initialized_ = false;
+    } else if (ret == Z_BUF_ERROR && progressed) {
+      // Not fatal: with Z_FINISH inflate reports an output buffer that is
+      // too small for the rest of the stream this way. The caller simply
+      // reads again.
+    } else if (ret != Z_OK) {
+      return std::unexpected(
+          ret == Z_DATA_ERROR
+          ? io::ReadError::InvalidData : io::ReadError::Error);
+    }
+    buffer_->consume(stream_.next_in - rptr);
+    return got;
+  }
+
+  // Skips by inflating into a scratch buffer. The scratch buffer is capped
+  // so that a huge max does not turn into a huge allocation; like read(),
+  // fewer than max bytes may be skipped per call.
+  std::expected<size_t, io::ReadError> skip(size_t max) override {
+    size_t const chunk = std::min(max, kBufferSizeZ);
+    auto tmp = std::make_unique_for_overwrite<char[]>(chunk);
+    return read(tmp.get(), chunk);
+  }
+
+ private:
+  // Points the inflater at the buffered input, reading more from the
+  // underlying reader first when less than half a buffer is available.
+  std::optional<io::ReadError> fill() {
+    size_t avail;
+    auto* rptr = buffer_->rptr(avail);
+    if (!in_eof_ && avail < kBufferSizeZ / 2) {
+      auto* wptr = buffer_->wptr(avail);
+      auto got = reader_->read(wptr, avail);
+      if (got.has_value()) {
+        buffer_->commit(got.value());
+        if (got.value() == 0)
+          in_eof_ = true;
+      } else {
+        return got.error();
+      }
+      rptr = buffer_->rptr(avail);
+    }
+    // NOLINTNEXTLINE(misc-include-cleaner)
+    stream_.next_in = reinterpret_cast<z_const Bytef*>(rptr);
+    stream_.avail_in = std::min(
+        // NOLINTNEXTLINE(misc-include-cleaner)
+        static_cast<size_t>(std::numeric_limits<uInt>::max()), avail);
+    return std::nullopt;
+  }
+
+  std::unique_ptr<io::Reader> reader_;
+  bool const gzip_;
+  bool in_eof_{false};
+  std::unique_ptr<Buffer> buffer_{Buffer::fixed(kBufferSizeZ)};
+  bool initialized_{false};
+  z_stream stream_;
+};
+
+}  // namespace
+
+std::unique_ptr<io::Reader> zlib(std::unique_ptr<io::Reader> reader) {
+  return std::make_unique<DecompressReader>(std::move(reader), /* gzip = */ false);
+}
+
+std::unique_ptr<io::Reader> gzip(std::unique_ptr<io::Reader> reader) {
+  return std::make_unique<DecompressReader>(std::move(reader), /* gzip = */ true);
+}
+
+
+}  // namespace decompress
diff --git a/src/gen_ugc.cc b/src/gen_ugc.cc
new file mode 100644
index 0000000..e9bce11
--- /dev/null
+++ b/src/gen_ugc.cc
@@ -0,0 +1,317 @@
+#include "args.hh"
+#include "csv.hh"
+#include "decompress.hh"
+#include "ugc.hh"
+
+#include <charconv>
+#include <cstdint>
+#include <expected>
+#include <format>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <span>
+#include <string>
+#include <string_view>
+#include <system_error>
+#include <utility>
+#include <vector>
+
+namespace {
+
+// Maps the two-letter general category abbreviation found in the third
+// column of UnicodeData.txt to its enumerator.
+// std::less<> makes lookups with a std::string_view key possible.
+std::map<std::string, u::GeneralCategory, std::less<>> str2gc{
+ {"Lu", u::GeneralCategory::LETTER_UPPERCASE},
+ {"Ll", u::GeneralCategory::LETTER_LOWERCASE},
+ {"Lt", u::GeneralCategory::LETTER_TITLECASE},
+ {"Lm", u::GeneralCategory::LETTER_MODIFIER},
+ {"Lo", u::GeneralCategory::LETTER_OTHER},
+
+ {"Mn", u::GeneralCategory::MARK_NONSPACING},
+ {"Mc", u::GeneralCategory::MARK_SPACING_COMBINDING},
+ {"Me", u::GeneralCategory::MARK_SPACING_ENCLOSING},
+
+ {"Nd", u::GeneralCategory::NUMBER_DIGIT},
+ {"Nl", u::GeneralCategory::NUMBER_LETTER},
+ {"No", u::GeneralCategory::NUMBER_OTHER},
+
+ {"Pc", u::GeneralCategory::PUNCTUATION_CONNECTOR},
+ {"Pd", u::GeneralCategory::PUNCTUATION_DASH},
+ {"Ps", u::GeneralCategory::PUNCTUATION_OPEN},
+ {"Pe", u::GeneralCategory::PUNCTUATION_CLOSE},
+ {"Pi", u::GeneralCategory::PUNCTUATION_INITIAL_QUOTE},
+ {"Pf", u::GeneralCategory::PUNCTUATION_FINAL_QUOTE},
+ {"Po", u::GeneralCategory::PUNCTUATION_OTHER},
+
+ {"Sm", u::GeneralCategory::SYMBOL_MATH},
+ {"Sc", u::GeneralCategory::SYMBOL_CURRENCY},
+ {"Sk", u::GeneralCategory::SYMBOL_MODIFIER},
+ {"So", u::GeneralCategory::SYMBOL_OTHER},
+
+ {"Zs", u::GeneralCategory::SEPARATOR_SPACE},
+ {"Zl", u::GeneralCategory::SEPARATOR_LINE},
+ {"Zp", u::GeneralCategory::SEPARATOR_PARAGRAPH},
+
+ {"Cc", u::GeneralCategory::OTHER_CONTROL},
+ {"Cf", u::GeneralCategory::OTHER_FORMAT},
+ {"Cs", u::GeneralCategory::OTHER_SURROGATE},
+ {"Co", u::GeneralCategory::OTHER_PRIVATE_USE},
+ {"Cn", u::GeneralCategory::OTHER_UNASSIGNED},
+};
+
+// Writes the start of the generated source file: the includes, the opening
+// of namespace u and the signature of the lookup function, whose name is
+// prefix followed by "lookup_gc".
+void print_header(std::ostream& out, std::string_view prefix) {
+ out << "#include \"ugc.hh\"\n"
+ << "\n"
+ << "#include <array>\n"
+ << "#include <cstddef>\n"
+ << "#include <cstdint>\n"
+ << "\n"
+ << "namespace u {\n"
+ << "\n"
+ << "extern GeneralCategory " << prefix << "lookup_gc(uint32_t code) {\n";
+}
+
+// Writes the body of the generated lookup function.
+//
+// data maps every assigned code point to its general category. Runs of
+// consecutive code points with the same category are merged into ranges:
+// codes holds the first and last code point of each range (two entries per
+// range) and categories holds one entry per range. The generated code does a
+// binary search over the ranges and returns OTHER_UNASSIGNED for code points
+// outside all of them.
+void print_body(std::ostream& out,
+                std::map<uint32_t, u::GeneralCategory> const& data) {
+  if (data.empty()) {
+    // No ranges at all (e.g. an empty input file): nothing to search, and
+    // data.begin() below must not be dereferenced.
+    out << " static_cast<void>(code);\n"
+        << " return u::GeneralCategory::OTHER_UNASSIGNED;\n";
+    return;
+  }
+
+  std::vector<uint32_t> codes;
+  std::vector<u::GeneralCategory> categories;
+
+  auto it = data.begin();
+  codes.emplace_back(it->first);
+  categories.emplace_back(it->second);
+
+  // Code point that would extend the current range.
+  uint32_t next = it->first + 1;
+
+  for (++it; it != data.end(); ++it) {
+    if (it->first == next && categories.back() == it->second) {
+      ++next;
+    } else {
+      // Close the current range and start a new one.
+      codes.emplace_back(next - 1);
+      codes.emplace_back(it->first);
+      categories.emplace_back(it->second);
+      next = it->first + 1;
+    }
+  }
+
+  // Close the last range.
+  codes.emplace_back(next - 1);
+
+  out << " static std::array<uint32_t, " << codes.size() << "> codes{";
+  for (auto code : codes) {
+    out << code << ",";
+  }
+  out << " };\n";
+  out << " static std::array<uint8_t, " << categories.size()
+      << "> categories{";
+  for (auto category : categories) {
+    // Print as a number, not as a character.
+    out << static_cast<uint16_t>(category) << ",";
+  }
+  out << "};\n";
+
+  out << " size_t low = 0;\n"
+      << " size_t high = " << (codes.size() / 2) << ";\n"
+      << " while (low < high) {\n"
+      << " size_t m = (low + high) / 2;\n"
+      << " uint32_t start = codes[m * 2];\n"
+      << " if (code < start) {\n"
+      << " high = m;\n"
+      << " } else {\n"
+      << " uint32_t end = codes[(m * 2) + 1];\n"
+      << " if (code <= end) {\n"
+      << " return static_cast<u::GeneralCategory>(categories[m]);\n"
+      << " }\n"
+      << " low = m + 1;\n"
+      << " }\n"
+      << " }\n"
+      << " return u::GeneralCategory::OTHER_UNASSIGNED;\n";
+}
+
+// Writes the end of the generated source file: closes the lookup function
+// and namespace u. The prefix is not used.
+void print_footer(std::ostream& out, std::string_view /* prefix */) {
+ out << "}\n"
+ << "\n"
+ << "} // namespace u\n";
+}
+
+// Returns a human-readable description of a failure to open a file.
+std::string_view ioerr2str(io::OpenError error) {
+ switch (error) {
+ case io::OpenError::NoSuchFile:
+ return "No such file";
+ case io::OpenError::NoAccess:
+ return "No access";
+ case io::OpenError::Error:
+ return "Fatal error";
+ }
+ // Presumably every enumerator is handled above - verify against io.hh.
+ std::unreachable();
+}
+
+// Returns a human-readable description of a failure to read from a file.
+std::string_view ioerr2str(io::ReadError error) {
+ switch (error) {
+ case io::ReadError::InvalidData:
+ return "Invalid (compressed) data";
+ case io::ReadError::Error:
+ return "Fatal error";
+ }
+ // Presumably every enumerator is handled above - verify against io.hh.
+ std::unreachable();
+}
+
+// Extracts the code point (first column, hexadecimal) and the general
+// category (third column) from one row of UnicodeData.txt.
+// Returns a description of the problem if the row is malformed.
+std::expected<std::pair<uint32_t, u::GeneralCategory>, std::string> parse_row(
+    std::span<std::string_view> row) {
+  // [code];[name];[gc];[cc];[bc];[decomposition];[nv-dec];[nv-dig];[nv-num];[bm];[alias];;[upper case];[lower case];[title case]
+  if (row.size() != 15) {
+    return std::unexpected(std::format("Invalid row ({} columns)", row.size()));
+  }
+
+  std::string_view const code_text = row[0];
+  char const* const first = code_text.data();
+  char const* const last = first + code_text.size();
+
+  uint32_t code;
+  auto const parsed = std::from_chars(first, last, code, /* base */ 16);
+  // The whole column has to be a valid hexadecimal number.
+  bool const code_ok = parsed.ec == std::errc() && parsed.ptr == last;
+  if (!code_ok) {
+    return std::unexpected(std::format("Invalid code value {}", code_text));
+  }
+
+  std::string_view const gc_text = row[2];
+  auto const found = str2gc.find(gc_text);
+  if (found == str2gc.end()) {
+    return std::unexpected(std::format("Invalid general category {}",
+                                       gc_text));
+  }
+
+  return std::make_pair(code, found->second);
+}
+
+// Reads a UnicodeData.txt file, optionally gzip (.gz) or xz (.xz) compressed
+// as decided by the file name suffix, and returns the general category of
+// every code point listed in it.
+//
+// Ranges are given in the file as a pair of rows whose names end in
+// ", First>" and ", Last>"; every code point in such a range gets an entry.
+// Returns an error message, prefixed with file name and line number where
+// available, if the file cannot be read or is malformed.
+std::expected<std::map<uint32_t, u::GeneralCategory>, std::string> read(
+    std::string_view filename) {
+  auto maybe_reader = io::open(std::string(filename));
+  if (!maybe_reader.has_value()) {
+    return std::unexpected(std::format(
+        "Unable to open {} for reading: {}",
+        filename, ioerr2str(maybe_reader.error())));
+  }
+  auto reader = std::move(maybe_reader.value());
+  if (filename.ends_with(".gz")) {
+    reader = decompress::gzip(std::move(reader));
+  } else if (filename.ends_with(".xz")) {
+    reader = decompress::xz(std::move(reader));
+  }
+
+  std::map<uint32_t, u::GeneralCategory> ret;
+  auto csv_reader = csv::open(std::move(reader), ';');
+  while (true) {
+    auto row = csv_reader->read();
+    if (!row.has_value()) {
+      return std::unexpected(std::format(
+          "{}:{}: Error reading file: {}",
+          filename, csv_reader->number(), ioerr2str(row.error())));
+    }
+    if (row->empty())
+      break;
+
+    auto pair = parse_row(row.value());
+    if (!pair.has_value()) {
+      return std::unexpected(std::format(
+          "{}:{}: {}", filename, csv_reader->number(), pair.error()));
+    }
+    auto name_col = (*row)[1];
+
+    if (name_col.ends_with(", First>")) {
+      // name_col points into the reader's line buffer, which the next read
+      // invalidates, so keep a copy of the range name.
+      std::string prefix(name_col.substr(0, name_col.size() - 8));
+      row = csv_reader->read();
+      if (!row.has_value()) {
+        return std::unexpected(std::format(
+            "{}:{}: Error reading file: {}",
+            filename, csv_reader->number(), ioerr2str(row.error())));
+      }
+
+      // Check the result of parsing the second row, not the first one again.
+      // This also catches end-of-file in the middle of a range, which shows
+      // up here as a row with zero columns.
+      auto second_pair = parse_row(row.value());
+      if (!second_pair.has_value()) {
+        return std::unexpected(std::format(
+            "{}:{}: {}", filename, csv_reader->number(),
+            second_pair.error()));
+      }
+
+      name_col = (*row)[1];
+      if (name_col.ends_with(", Last>") &&
+          name_col.substr(0, name_col.size() - 7) == prefix) {
+        if (pair->second != second_pair->second) {
+          return std::unexpected(std::format(
+              "{}:{}: Invalid range, general category doesn't match",
+              filename, csv_reader->number()));
+        }
+
+        for (uint32_t c = pair->first; c <= second_pair->first; ++c) {
+          auto emplace_ret = ret.emplace(c, pair->second);
+          if (!emplace_ret.second) {
+            return std::unexpected(std::format(
+                "{}:{}: Duplicate value for {:#08x}",
+                filename, csv_reader->number(), c));
+          }
+        }
+      } else {
+        return std::unexpected(std::format(
+            "{}:{}: Invalid range, {} doesn't match {}",
+            filename, csv_reader->number(), prefix, name_col));
+      }
+    } else {
+      auto emplace_ret = ret.emplace(std::move(pair.value()));
+      if (!emplace_ret.second) {
+        return std::unexpected(std::format(
+            "{}:{}: Duplicate value for {:#08x}",
+            filename, csv_reader->number(), emplace_ret.first->first));
+      }
+    }
+  }
+
+  return ret;
+}
+
+} // namespace
+
+// gen_ugc --prefix PREFIX UnicodeData [OUTPUT]
+//
+// Reads UnicodeData and writes a C++ source file defining
+// u::<PREFIX>lookup_gc() to OUTPUT, or to stdout when OUTPUT is missing
+// or "-". Returns 0 on success and 1 on any error.
+int main(int argc, char** argv) {
+  auto args = Args::create();
+  auto opt_help = args->option('h', "help", "display this text and exit");
+  auto opt_prefix =
+    args->option_argument('p', "prefix", "ARG", "Prefix for exported method");
+  std::vector<std::string_view> arguments;
+  if (!args->run(argc, argv, &arguments)) {
+    std::cerr << "Try `gen_ugc --help` for usage\n";
+    return 1;
+  }
+  if (opt_help->is_set()) {
+    std::cout << "Usage: `gen_ugc [OPTIONS...] UnicodeData [OUTPUT]`\n"
+              << "Generates a method for getting the general category for a "
+              << "code point.\n"
+              << "\n";
+    args->print_help(std::cout);
+    return 0;
+  }
+  if (!opt_prefix->is_set()) {
+    std::cerr << "No prefix given.\n"
+              << "Try `gen_ugc --help` for usage\n";
+    return 1;
+  }
+  auto prefix = opt_prefix->argument();
+  if (arguments.empty() || arguments.size() > 2) {
+    std::cerr << "Expecting one or two argument. No more, no less.\n"
+              << "Try `gen_ugc --help` for usage\n";
+    return 1;
+  }
+
+  auto general_categories = read(arguments[0]);
+  if (!general_categories.has_value()) {
+    std::cerr << general_categories.error() << '\n';
+    return 1;
+  }
+
+  if (arguments.size() < 2 || arguments[1] == "-") {
+    print_header(std::cout, prefix);
+    print_body(std::cout, general_categories.value());
+    print_footer(std::cout, prefix);
+  } else {
+    std::fstream out{std::string(arguments[1]),
+                     std::fstream::trunc | std::fstream::out};
+    if (!out) {
+      std::cerr << "Unable to open " << arguments[1] << " for writing\n";
+      return 1;
+    }
+    print_header(out, prefix);
+    print_body(out, general_categories.value());
+    print_footer(out, prefix);
+    // Flush explicitly so that a failed write is noticed here and the build
+    // does not continue with a truncated source file.
+    out.flush();
+    if (!out) {
+      std::cerr << "Error writing " << arguments[1] << '\n';
+      return 1;
+    }
+  }
+  return 0;
+}
diff --git a/src/io.cc b/src/io.cc
new file mode 100644
index 0000000..baf162a
--- /dev/null
+++ b/src/io.cc
@@ -0,0 +1,238 @@
+#include "io.hh"
+
+#include "unique_fd.hh"
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdio>
+#include <cstring>
+#include <expected>
+#include <fcntl.h>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <utility>
+
+namespace io {
+
+namespace {
+
+// Reader working directly on a file descriptor using read(2) and lseek(2).
+// Keeps track of the current offset and caches the file size seen by skip().
+class BasicReader : public Reader {
+ public:
+  explicit BasicReader(unique_fd fd)
+      : fd_(std::move(fd)) {
+  }
+
+  // Reads up to max bytes into dst. Returns the number of bytes read, zero
+  // at end of file. Retries when interrupted by a signal.
+  [[nodiscard]]
+  std::expected<size_t, ReadError> read(void* dst, size_t max) override {
+    // read(2) can't report more than what fits in ssize_t.
+    ssize_t ret = ::read(
+        fd_.get(), dst, std::min(static_cast<size_t>(
+            std::numeric_limits<ssize_t>::max()), max));
+    if (ret < 0) {
+      switch (errno) {
+        case EINTR:
+          return read(dst, max);
+        default:
+          return std::unexpected(ReadError::Error);
+      }
+    }
+    offset_ += ret;
+    return ret;
+  }
+
+  // Moves the read position forward by up to max bytes, never past the end
+  // of the file. Returns the number of bytes skipped.
+  [[nodiscard]]
+  std::expected<size_t, ReadError> skip(size_t max) override {
+    // Clamp the step so that neither the conversion to off_t nor the
+    // resulting file offset can overflow. A max larger than off_t can hold
+    // would otherwise turn into a negative step and seek backwards.
+    off_t const limit = std::numeric_limits<off_t>::max() - offset_;
+    off_t const step =
+        std::cmp_greater(max, limit) ? limit : static_cast<off_t>(max);
+    off_t ret = lseek(fd_.get(), step, SEEK_CUR);
+    if (ret < 0) {
+      return std::unexpected(ReadError::Error);
+    }
+    // Don't want skip to go past (cached) file end.
+    if (!size_.has_value() || ret > size_.value()) {
+      // When going past end, double check that it still is the end.
+      off_t ret2 = lseek(fd_.get(), 0, SEEK_END);
+      if (ret2 < 0) {
+        // We're screwed, but try to go back to original position and then
+        // return error.
+        size_.reset();
+        lseek(fd_.get(), offset_, SEEK_SET);
+        return std::unexpected(ReadError::Error);
+      }
+      size_ = ret2;
+      if (ret > ret2) {
+        // The descriptor is now positioned at the end of the file. If the
+        // file shrank below the old offset nothing was skipped.
+        off_t const distance = ret2 > offset_ ? ret2 - offset_ : 0;
+        offset_ = ret2;
+        return static_cast<size_t>(distance);
+      }
+      // Seek back to where we should be
+      if (lseek(fd_.get(), ret, SEEK_SET) < 0) {
+        return std::unexpected(ReadError::Error);
+      }
+    }
+    auto distance = ret - offset_;
+    offset_ = ret;
+    return distance;
+  }
+
+ private:
+  unique_fd fd_;
+  // Current position in the file, as tracked by read() and skip().
+  off_t offset_{0};
+  // File size as of the last time skip() asked for it.
+  std::optional<off_t> size_;
+};
+
+// Reader over a block of memory it does not own. Subclasses are responsible
+// for keeping the memory alive and may set ptr_ after construction.
+class MemoryReader : public Reader {
+ public:
+  MemoryReader(void* ptr, size_t size)
+      : ptr_(ptr), size_(size) {}
+
+  // Copies up to max of the remaining bytes into dst and advances past them.
+  [[nodiscard]]
+  std::expected<size_t, ReadError> read(void* dst, size_t max) override {
+    size_t const count = std::min(max, size_ - offset_);
+    char const* const src = static_cast<char*>(ptr_) + offset_;
+    memcpy(dst, src, count);
+    offset_ += count;
+    return count;
+  }
+
+  // Advances past up to max of the remaining bytes without copying them.
+  [[nodiscard]]
+  std::expected<size_t, ReadError> skip(size_t max) override {
+    size_t const count = std::min(max, size_ - offset_);
+    offset_ += count;
+    return count;
+  }
+
+ protected:
+  void* ptr_;
+  size_t const size_;
+
+ private:
+  // Number of bytes consumed so far, never larger than size_.
+  size_t offset_{0};
+};
+
+// MemoryReader over a region created with mmap(2). Takes ownership of both
+// the mapping (unmapped in the destructor) and the file descriptor.
+class MmapReader : public MemoryReader {
+ public:
+ MmapReader(unique_fd fd, void* ptr, size_t size)
+ : MemoryReader(ptr, size), fd_(std::move(fd)) {
+ }
+
+ ~MmapReader() override {
+ munmap(ptr_, size_);
+ }
+
+ private:
+ // Held only so that the descriptor is closed together with the reader.
+ unique_fd fd_;
+};
+
+// MemoryReader that owns its data in the form of a std::string.
+class StringReader : public MemoryReader {
+ public:
+ // The base class is initialized before data_, so it gets the size up front
+ // (data has not been moved from yet) and the pointer is filled in once
+ // data_ holds the string.
+ explicit StringReader(std::string data)
+ : MemoryReader(nullptr, data.size()), data_(std::move(data)) {
+ ptr_ = data_.data();
+ }
+
+ private:
+ std::string data_;
+};
+
+} // namespace
+
+// Convenience overload: reads up to str.size() bytes into the existing
+// storage of str. The string is not resized; the caller uses the returned
+// count to know how much of it was filled.
+std::expected<size_t, ReadError> Reader::read(std::string& str) {
+ return read(str.data(), str.size());
+}
+
+// Calls read() until max bytes have been read, read() returns zero or read()
+// fails. An error from the very first read() is returned as is; an error
+// after some data has been read ends the loop and the byte count so far is
+// returned instead.
+std::expected<size_t, ReadError> Reader::repeat_read(void* dst, size_t max) {
+  char* const out = reinterpret_cast<char*>(dst);
+  size_t total = 0;
+  // do/while: read() is always called at least once, even for max == 0.
+  do {
+    auto got = read(out + total, max - total);
+    if (!got.has_value()) {
+      if (total == 0)
+        return got;
+      break;
+    }
+    if (got.value() == 0)
+      break;
+    total += got.value();
+  } while (total != max);
+  return total;
+}
+
+// Convenience overload: repeat_read() into the existing storage of str.
+// The string is not resized.
+std::expected<size_t, ReadError> Reader::repeat_read(std::string& str) {
+ return repeat_read(str.data(), str.size());
+}
+
+// Calls skip() until max bytes have been skipped, skip() returns zero or
+// skip() fails. An error from the very first skip() is returned as is; an
+// error after some bytes have been skipped ends the loop and the count so far
+// is returned instead.
+std::expected<size_t, ReadError> Reader::repeat_skip(size_t max) {
+  size_t total = 0;
+  // do/while: skip() is always called at least once, even for max == 0.
+  do {
+    auto got = skip(max - total);
+    if (!got.has_value()) {
+      if (total == 0)
+        return got;
+      break;
+    }
+    if (got.value() == 0)
+      break;
+    total += got.value();
+  } while (total != max);
+  return total;
+}
+
+// Opens file_path for reading; relative paths are resolved against the
+// current working directory (AT_FDCWD).
+std::expected<std::unique_ptr<Reader>, OpenError> open(
+ const std::string& file_path) {
+ return openat(AT_FDCWD, file_path);
+}
+
+// Opens file_path (relative to dirfd) for reading. The file is memory mapped
+// when possible, otherwise it is read through the file descriptor. Retries
+// when the open is interrupted by a signal.
+std::expected<std::unique_ptr<Reader>, OpenError> openat(
+    int dirfd, const std::string& file_path) {
+  unique_fd fd(::openat(dirfd, file_path.c_str(), O_RDONLY));
+  if (!fd) {
+    switch (errno) {
+      case EINTR:
+        return openat(dirfd, file_path);
+      case EACCES:
+        return std::unexpected(OpenError::NoAccess);
+      case ENOENT:
+        return std::unexpected(OpenError::NoSuchFile);
+      default:
+        return std::unexpected(OpenError::Error);
+    }
+  }
+
+  // Prefer a memory mapping; any failure on the way falls back to plain
+  // reads from the descriptor.
+  struct stat info;
+  if (fstat(fd.get(), &info) == 0 &&
+      std::cmp_less_equal(info.st_size, std::numeric_limits<size_t>::max())) {
+    auto const length = static_cast<size_t>(info.st_size);
+    void* mapped =
+        mmap(nullptr, length, PROT_READ, MAP_PRIVATE, fd.get(), 0);
+    if (mapped != MAP_FAILED) {
+      return std::make_unique<MmapReader>(std::move(fd), mapped, length);
+    }
+  }
+  return std::make_unique<BasicReader>(std::move(fd));
+}
+
+// Creates a reader over the bytes of data; the reader takes ownership of the
+// string.
+std::unique_ptr<Reader> memory(std::string data) {
+ return std::make_unique<StringReader>(std::move(data));
+}
+
+} // namespace io
diff --git a/src/io.hh b/src/io.hh
new file mode 100644
index 0000000..315d0bb
--- /dev/null
+++ b/src/io.hh
@@ -0,0 +1,52 @@
+#ifndef IO_HH
+#define IO_HH
+
+#include <cstddef>
+#include <expected>
+#include <memory>
+#include <string>
+
+namespace io {
+
+// Reasons a Reader::read or Reader::skip call can fail.
+enum class ReadError {
+ Error,
+ InvalidData, // Used by decompress and such
+};
+
+// Reasons io::open and io::openat can fail.
+enum class OpenError {
+ NoSuchFile,
+ NoAccess,
+ Error,
+};
+
+// Abstract source of bytes. Not copyable.
+class Reader {
+ public:
+ virtual ~Reader() = default;
+
+ // Reads up to max bytes into dst. Returns the number of bytes read, which
+ // may be less than max; zero means no more data.
+ [[nodiscard]] virtual std::expected<size_t, ReadError> read(void* dst,
+ size_t max) = 0;
+ // Discards up to max bytes. Returns the number of bytes skipped.
+ [[nodiscard]] virtual std::expected<size_t, ReadError> skip(size_t max) = 0;
+
+ // Reads up to str.size() bytes into str without resizing it.
+ [[nodiscard]] std::expected<size_t, ReadError> read(std::string& str);
+
+ // Like read/skip but repeats the call until max bytes are handled, the
+ // source returns zero or an error occurs.
+ [[nodiscard]] std::expected<size_t, ReadError> repeat_read(void* dst,
+ size_t max);
+ [[nodiscard]] std::expected<size_t, ReadError> repeat_read(std::string& str);
+ [[nodiscard]] std::expected<size_t, ReadError> repeat_skip(size_t max);
+
+ protected:
+ Reader() = default;
+
+ Reader(Reader const&) = delete;
+ Reader& operator=(Reader const&) = delete;
+};
+
+// Opens a file for reading, relative to the current working directory.
+[[nodiscard]] std::expected<std::unique_ptr<Reader>, OpenError> open(
+ const std::string& file_path);
+// Opens a file for reading, relative to the directory referred to by dirfd.
+[[nodiscard]] std::expected<std::unique_ptr<Reader>, OpenError> openat(
+ int dirfd, const std::string& file_path);
+// Creates a reader over the contents of data.
+[[nodiscard]] std::unique_ptr<Reader> memory(std::string data);
+
+} // namespace io
+
+#endif // IO_HH
diff --git a/src/line.cc b/src/line.cc
new file mode 100644
index 0000000..2eeb116
--- /dev/null
+++ b/src/line.cc
@@ -0,0 +1,133 @@
+#include "line.hh"
+
+#include "check.hh"
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <expected>
+#include <memory>
+#include <string_view>
+#include <utility>
+
+namespace line {
+
+namespace {
+
+const char kLineTerminators[] = "\r\n";
+
+// Line reader on top of an io::Reader. Lines end with "\n", "\r" or "\r\n";
+// the terminator is not part of the returned line. Lines longer than max_len
+// are returned in pieces of max_len characters.
+class ReaderImpl : public Reader {
+ public:
+  // The buffer has room for a line of max_len characters plus a two
+  // character terminator.
+  ReaderImpl(std::unique_ptr<io::Reader> reader, size_t max_len)
+      : reader_(std::move(reader)), max_len_(max_len),
+        buffer_(std::make_unique_for_overwrite<char[]>(
+            check::add(max_len, static_cast<size_t>(2)))),
+        rptr_(buffer_.get()), wptr_(buffer_.get()), search_(rptr_),
+        end_(buffer_.get() + check::add(max_len, static_cast<size_t>(2))) {}
+
+  // Returns the next line. At end of input the error has eof set, otherwise
+  // the error carries the io error of the underlying reader.
+  [[nodiscard]] std::expected<std::string_view, ReadError> read() override {
+    while (true) {
+      search_ = std::find_first_of(search_, wptr_,
+                                   kLineTerminators, kLineTerminators + 2);
+      if (search_ < wptr_) {
+        if (std::cmp_greater(search_ - rptr_, max_len_)) {
+          return line(max_len_, 0);
+        }
+
+        size_t tlen;
+        if (*search_ == '\n') {
+          tlen = 1;
+        } else {
+          // Found '\r', need to look at the next character to tell "\r"
+          // from "\r\n". Read more if it isn't buffered yet.
+          if (search_ + 1 == wptr_) {
+            make_space_if_needed();
+            auto got = fill();
+            if (got.has_value()) {
+              if (got.value() == 0) {
+                return line(search_ - rptr_, 1);
+              }
+            } else {
+              return std::unexpected(ReadError(got.error()));
+            }
+          }
+          if (search_[1] == '\n') {
+            tlen = 2;
+          } else {
+            tlen = 1;
+          }
+        }
+        return line(search_ - rptr_, tlen);
+      }
+      // No terminator buffered. With exactly max_len characters buffered the
+      // terminator may still be the next thing to arrive, and the buffer has
+      // room for it, so only split the line once more than max_len
+      // characters are buffered. Splitting already at max_len made the
+      // result depend on how the underlying reader chunks its data: a line
+      // of exactly max_len characters was followed by a bogus empty line
+      // whenever its terminator arrived in a later read.
+      if (std::cmp_greater(wptr_ - rptr_, max_len_)) {
+        return line(max_len_, 0);
+      }
+
+      make_space_if_needed();
+      auto got = fill();
+      if (got.has_value()) {
+        if (got.value() == 0) {
+          if (rptr_ == wptr_) {
+            return std::unexpected(ReadError());
+          }
+          // Last line, without terminator.
+          return line(wptr_ - rptr_, 0);
+        }
+      } else {
+        return std::unexpected(ReadError(got.error()));
+      }
+    }
+  }
+
+  [[nodiscard]] uint64_t number() const override { return number_; }
+
+ private:
+  // Consumes len characters plus terminator_len terminator characters from
+  // the buffer and returns a view of the len characters.
+  std::string_view line(size_t len, size_t terminator_len) {
+    assert(len <= max_len_);
+    auto ret = std::string_view(rptr_, len);
+    rptr_ += len + terminator_len;
+    search_ = rptr_;
+    ++number_;
+    return ret;
+  }
+
+  // Moves unconsumed data to the start of the buffer when the space left at
+  // the end is getting small (1024 bytes or less).
+  void make_space_if_needed() {
+    size_t free = rptr_ - buffer_.get();
+    if (free == 0) return;
+    size_t avail = end_ - wptr_;
+    if (avail > 1024) return;
+    memmove(buffer_.get(), rptr_, wptr_ - rptr_);
+    search_ -= free;
+    wptr_ -= free;
+    rptr_ = buffer_.get();
+  }
+
+  // Reads as much as fits at the end of the buffer.
+  std::expected<size_t, io::ReadError> fill() {
+    auto ret = reader_->read(wptr_, end_ - wptr_);
+    if (ret.has_value())
+      wptr_ += ret.value();
+    return ret;
+  }
+
+  std::unique_ptr<io::Reader> reader_;
+  size_t const max_len_;
+  // Number of lines returned so far.
+  uint64_t number_{0};
+  std::unique_ptr<char[]> buffer_;
+  // Start of unconsumed data.
+  char* rptr_;
+  // End of buffered data, where new data is written.
+  char* wptr_;
+  // Where to continue looking for a terminator, in [rptr_, wptr_].
+  char* search_;
+  // End of the buffer.
+  char* const end_;
+};
+
+} // namespace
+
+// End of input: eof is set and io_error is left empty.
+ReadError::ReadError()
+ : eof(true) {}
+
+// Failure in the underlying io::Reader: eof is cleared and the io error kept.
+ReadError::ReadError(io::ReadError error)
+ : eof(false), io_error(error) {}
+
+// Creates a line reader that takes ownership of reader and returns lines of
+// at most max_len characters.
+std::unique_ptr<Reader> open(std::unique_ptr<io::Reader> reader,
+ size_t max_len) {
+ return std::make_unique<ReaderImpl>(std::move(reader), max_len);
+}
+
+} // namespace line
diff --git a/src/line.hh b/src/line.hh
new file mode 100644
index 0000000..94e3646
--- /dev/null
+++ b/src/line.hh
@@ -0,0 +1,44 @@
+#ifndef LINE_HH
+#define LINE_HH
+
+#include "io.hh" // IWYU pragma: export
+
+#include <cstddef>
+#include <expected>
+#include <memory>
+#include <optional>
+#include <string_view>
+
+namespace line {
+
+// Error from line::Reader::read. Either end of input (eof is true, io_error
+// is empty) or an error from the underlying io::Reader (eof is false,
+// io_error is set).
+struct ReadError {
+ bool eof;
+ std::optional<io::ReadError> io_error;
+
+ // Creates an end of input error.
+ ReadError();
+ // Creates an error wrapping an io error.
+ explicit ReadError(io::ReadError error);
+};
+
+// Reads input one line at a time. Not copyable.
+class Reader {
+ public:
+ virtual ~Reader() = default;
+
+ // Returned view is only valid until next call to read.
+ [[nodiscard]] virtual std::expected<std::string_view, ReadError> read() = 0;
+ // Starts at zero. Returns next line.
+ // So, before first read it is zero, after first read it is one.
+ [[nodiscard]] virtual uint64_t number() const = 0;
+
+ protected:
+ Reader() = default;
+
+ Reader(Reader const&) = delete;
+ Reader& operator=(Reader const&) = delete;
+};
+
+// Creates a line reader on top of reader. max_len is the maximum length of a
+// returned line.
+[[nodiscard]] std::unique_ptr<Reader> open(std::unique_ptr<io::Reader> reader,
+ size_t max_len = 8192);
+
+} // namespace line
+
+#endif // LINE_HH
diff --git a/src/str.cc b/src/str.cc
new file mode 100644
index 0000000..f81617d
--- /dev/null
+++ b/src/str.cc
@@ -0,0 +1,34 @@
+#include "str.hh"
+
+#include <cstddef>
+#include <string_view>
+#include <vector>
+
+namespace str {
+
+// Splits str at every separator and stores the pieces in out, replacing its
+// previous content. The pieces are views into str. Empty pieces are only
+// included when keep_empty is set.
+void split(std::string_view str, std::vector<std::string_view>& out,
+           char separator, bool keep_empty) {
+  out.clear();
+
+  size_t start = 0;
+  for (auto sep = str.find(separator, start); sep != std::string_view::npos;
+       sep = str.find(separator, start)) {
+    if (keep_empty || sep > start)
+      out.push_back(str.substr(start, sep - start));
+    start = sep + 1;
+  }
+  // Whatever is left after the last separator.
+  if (keep_empty || start < str.size())
+    out.push_back(str.substr(start));
+}
+
+// Same as the overload with an out parameter, but returns the pieces in a
+// new vector.
+std::vector<std::string_view> split(std::string_view str,
+ char separator, bool keep_empty) {
+ std::vector<std::string_view> vec;
+ split(str, vec, separator, keep_empty);
+ return vec;
+}
+
+} // namespace str
diff --git a/src/str.hh b/src/str.hh
new file mode 100644
index 0000000..58d5d32
--- /dev/null
+++ b/src/str.hh
@@ -0,0 +1,18 @@
+#ifndef STR_HH
+#define STR_HH
+
+#include <string_view>
+#include <vector>
+
+namespace str {
+
+// Splits str at every separator, replacing the content of out with the
+// pieces. The pieces are views into str. Empty pieces are dropped unless
+// keep_empty is set.
+void split(std::string_view str, std::vector<std::string_view>& out,
+ char separator = ' ', bool keep_empty = false);
+
+// As above, but returns the pieces.
+[[nodiscard]] std::vector<std::string_view> split(std::string_view str,
+ char separator = ' ',
+ bool keep_empty = false);
+
+} // namespace str
+
+#endif // STR_HH
diff --git a/src/u.cc b/src/u.cc
new file mode 100644
index 0000000..3c06ba8
--- /dev/null
+++ b/src/u.cc
@@ -0,0 +1,46 @@
+#include "u.hh"
+
+#include <cstdint>
+#include <utility>
+
+namespace u {
+
+// These are generated by gen_ugc
+GeneralCategory u6_2_0_lookup_gc(uint32_t code);
+GeneralCategory u8_0_0_lookup_gc(uint32_t code);
+GeneralCategory u10_0_0_lookup_gc(uint32_t code);
+GeneralCategory u11_0_0_lookup_gc(uint32_t code);
+GeneralCategory u12_1_0_lookup_gc(uint32_t code);
+GeneralCategory u13_0_0_lookup_gc(uint32_t code);
+GeneralCategory u14_0_0_lookup_gc(uint32_t code);
+GeneralCategory u15_0_0_lookup_gc(uint32_t code);
+GeneralCategory u15_1_0_lookup_gc(uint32_t code);
+GeneralCategory u16_0_0_lookup_gc(uint32_t code);
+
+// Returns the general category of code by forwarding to the generated lookup
+// function for the given Unicode version. Every Version enumerator has a
+// case; reaching std::unreachable() (a value outside the enum) is undefined
+// behavior.
+GeneralCategory lookup_gc(uint32_t code, Version version) {
+ switch (version) {
+ case Version::u6_2_0:
+ return u6_2_0_lookup_gc(code);
+ case Version::u8_0_0:
+ return u8_0_0_lookup_gc(code);
+ case Version::u10_0_0:
+ return u10_0_0_lookup_gc(code);
+ case Version::u11_0_0:
+ return u11_0_0_lookup_gc(code);
+ case Version::u12_1_0:
+ return u12_1_0_lookup_gc(code);
+ case Version::u13_0_0:
+ return u13_0_0_lookup_gc(code);
+ case Version::u14_0_0:
+ return u14_0_0_lookup_gc(code);
+ case Version::u15_0_0:
+ return u15_0_0_lookup_gc(code);
+ case Version::u15_1_0:
+ return u15_1_0_lookup_gc(code);
+ case Version::u16_0_0:
+ return u16_0_0_lookup_gc(code);
+ }
+ std::unreachable();
+}
+
+} // namespace u
diff --git a/src/u.hh b/src/u.hh
index 101dec8..7cf835b 100644
--- a/src/u.hh
+++ b/src/u.hh
@@ -1,6 +1,8 @@
#ifndef U_HH
#define U_HH
+#include "ugc.hh" // IWYU pragma: export
+
namespace u {
enum class ReadError : uint8_t {
@@ -14,6 +16,22 @@ enum class ReadErrorReplace : uint8_t {
Incomplete, // Too few bytes
};
+// Unicode versions that lookup_gc has data for. LATEST is an alias for the
+// newest one.
+enum class Version : uint8_t {
+ u6_2_0,
+ u8_0_0,
+ u10_0_0,
+ u11_0_0,
+ u12_1_0,
+ u13_0_0,
+ u14_0_0,
+ u15_0_0,
+ u15_1_0,
+ u16_0_0,
+ LATEST = u16_0_0,
+};
+
+// Returns the general category of the code point code according to the given
+// Unicode version.
+GeneralCategory lookup_gc(uint32_t code, Version version = Version::LATEST);
+
} // namespace u
#endif // U_HH
diff --git a/src/ugc.hh b/src/ugc.hh
new file mode 100644
index 0000000..c49d50f
--- /dev/null
+++ b/src/ugc.hh
@@ -0,0 +1,49 @@
+#ifndef UGC_HH
+#define UGC_HH
+
+#include <cstdint>
+
+namespace u {
+
+// General category of a code point, grouped by major class.
+// NOTE(review): the enumerators look like the Unicode General_Category
+// values (Lu, Ll, Lt, ... Cn) in their usual order - confirm against the
+// mapping in gen_ugc.
+enum class GeneralCategory : uint8_t {
+ // Letters
+ LETTER_UPPERCASE,
+ LETTER_LOWERCASE,
+ LETTER_TITLECASE,
+ LETTER_MODIFIER,
+ LETTER_OTHER,
+
+ // Marks
+ MARK_NONSPACING,
+ // NOTE(review): "COMBINDING" looks like a typo for "COMBINING"; kept as is
+ // since renaming changes the public interface.
+ MARK_SPACING_COMBINDING,
+ // NOTE(review): presumably the enclosing mark category - confirm.
+ MARK_SPACING_ENCLOSING,
+
+ // Numbers
+ NUMBER_DIGIT,
+ NUMBER_LETTER,
+ NUMBER_OTHER,
+
+ // Punctuation
+ PUNCTUATION_CONNECTOR,
+ PUNCTUATION_DASH,
+ PUNCTUATION_OPEN,
+ PUNCTUATION_CLOSE,
+ PUNCTUATION_INITIAL_QUOTE,
+ PUNCTUATION_FINAL_QUOTE,
+ PUNCTUATION_OTHER,
+
+ // Symbols
+ SYMBOL_MATH,
+ SYMBOL_CURRENCY,
+ SYMBOL_MODIFIER,
+ SYMBOL_OTHER,
+
+ // Separators
+ SEPARATOR_SPACE,
+ SEPARATOR_LINE,
+ SEPARATOR_PARAGRAPH,
+
+ // Other
+ OTHER_CONTROL,
+ OTHER_FORMAT,
+ OTHER_SURROGATE,
+ OTHER_PRIVATE_USE,
+ OTHER_UNASSIGNED,
+};
+
+} // namespace u
+
+#endif // UGC_HH
diff --git a/src/unique_fd.cc b/src/unique_fd.cc
new file mode 100644
index 0000000..135a449
--- /dev/null
+++ b/src/unique_fd.cc
@@ -0,0 +1,9 @@
+#include "unique_fd.hh"
+
+#include <unistd.h>
+
+// Closes the currently owned descriptor, if any, and takes ownership of fd
+// (-1 for none). The result of close() is ignored.
+void unique_fd::reset(int fd) {
+ if (fd_ != -1)
+ close(fd_);
+ fd_ = fd;
+}
diff --git a/src/unique_fd.hh b/src/unique_fd.hh
new file mode 100644
index 0000000..189d513
--- /dev/null
+++ b/src/unique_fd.hh
@@ -0,0 +1,45 @@
+#ifndef UNIQUE_FD_HH
+#define UNIQUE_FD_HH
+
+// Owner of a file descriptor, closing it on destruction. -1 means that no
+// descriptor is owned. Movable but not copyable.
+class unique_fd {
+ public:
+  constexpr unique_fd() noexcept
+      : fd_(-1) {}
+  // Takes ownership of fd.
+  explicit constexpr unique_fd(int fd) noexcept
+      : fd_(fd) {}
+  // Canonical (const reference) signatures, so that const objects hit the
+  // deleted functions as well.
+  unique_fd(unique_fd const& fd) = delete;
+  unique_fd& operator=(unique_fd const& fd) = delete;
+  // Moving transfers ownership and leaves the source empty.
+  unique_fd(unique_fd&& fd) noexcept
+      : fd_(fd.release()) {}
+  unique_fd& operator=(unique_fd&& fd) noexcept {
+    // release() before reset() makes self move assignment a no-op.
+    reset(fd.release());
+    return *this;
+  }
+  ~unique_fd() {
+    reset();
+  }
+
+  bool operator==(unique_fd const& fd) const noexcept {
+    return get() == fd.get();
+  }
+  bool operator!=(unique_fd const& fd) const noexcept {
+    return get() != fd.get();
+  }
+
+  // Returns the descriptor without giving up ownership.
+  int get() const noexcept { return fd_; }
+  // True when a descriptor is owned.
+  explicit operator bool() const noexcept { return fd_ != -1; }
+  int operator*() const noexcept { return fd_; }
+
+  // Gives up ownership; the caller is responsible for closing the returned
+  // descriptor.
+  int release() noexcept {
+    int ret = fd_;
+    fd_ = -1;
+    return ret;
+  }
+
+  // Closes the owned descriptor, if any, and takes ownership of fd.
+  void reset(int fd = -1);
+
+ private:
+  int fd_;
+};
+
+#endif // UNIQUE_FD_HH
diff --git a/test/buffer.cc b/test/buffer.cc
new file mode 100644
index 0000000..869e781
--- /dev/null
+++ b/test/buffer.cc
@@ -0,0 +1,65 @@
+#include <gtest/gtest.h>
+
+#include "buffer.hh"
+
+#include <cstring>
+
+// A new Buffer::fixed(10) is empty, not full, has nothing to read and 10
+// bytes to write.
+TEST(buffer_fixed, empty) {
+ auto buffer = Buffer::fixed(10);
+ EXPECT_TRUE(buffer->empty());
+ EXPECT_FALSE(buffer->full());
+ size_t avail;
+ buffer->rptr(avail);
+ EXPECT_EQ(0, avail);
+ buffer->wptr(avail);
+ EXPECT_EQ(10, avail);
+}
+
+// A new Buffer::dynamic(10, 100) is empty, not full, has nothing to read and
+// 10 bytes to write.
+TEST(buffer_dynamic, empty) {
+ auto buffer = Buffer::dynamic(10, 100);
+ EXPECT_TRUE(buffer->empty());
+ EXPECT_FALSE(buffer->full());
+ size_t avail;
+ buffer->rptr(avail);
+ EXPECT_EQ(0, avail);
+ buffer->wptr(avail);
+ EXPECT_EQ(10, avail);
+}
+
+// Commits "Hello" including its terminating NUL (6 bytes) to a fixed buffer
+// and consumes it in two steps of 3 bytes.
+TEST(buffer_fixed, write_read) {
+ auto buffer = Buffer::fixed(10);
+ size_t avail;
+ auto* wptr = buffer->wptr(avail);
+ EXPECT_EQ(10, avail);
+ memcpy(wptr, "Hello", 6);
+ buffer->commit(6);
+ EXPECT_FALSE(buffer->empty());
+ auto* rptr = buffer->rptr(avail);
+ EXPECT_EQ(6, avail);
+ EXPECT_STREQ("Hello", reinterpret_cast<const char*>(rptr));
+ buffer->consume(3);
+ rptr = buffer->rptr(avail);
+ EXPECT_EQ(3, avail);
+ EXPECT_STREQ("lo", reinterpret_cast<const char*>(rptr));
+ buffer->consume(3);
+ EXPECT_TRUE(buffer->empty());
+}
+
+// Commits "Hello" including its terminating NUL (6 bytes) to a dynamic
+// buffer and consumes it in two steps of 3 bytes.
+TEST(buffer_dynamic, write_read) {
+ auto buffer = Buffer::dynamic(10, 100);
+ size_t avail;
+ auto* wptr = buffer->wptr(avail);
+ EXPECT_EQ(10, avail);
+ memcpy(wptr, "Hello", 6);
+ buffer->commit(6);
+ EXPECT_FALSE(buffer->empty());
+ auto* rptr = buffer->rptr(avail);
+ EXPECT_EQ(6, avail);
+ EXPECT_STREQ("Hello", reinterpret_cast<const char*>(rptr));
+ buffer->consume(3);
+ rptr = buffer->rptr(avail);
+ EXPECT_EQ(3, avail);
+ EXPECT_STREQ("lo", reinterpret_cast<const char*>(rptr));
+ buffer->consume(3);
+ EXPECT_TRUE(buffer->empty());
+}
diff --git a/test/csv.cc b/test/csv.cc
new file mode 100644
index 0000000..49fe540
--- /dev/null
+++ b/test/csv.cc
@@ -0,0 +1,90 @@
+#include <gtest/gtest.h>
+
+#include "csv.hh"
+
+// Reading from empty input succeeds and gives a row without columns.
+TEST(csv, empty) {
+ auto csv = csv::open(io::memory(""));
+ auto line = csv->read();
+ ASSERT_TRUE(line.has_value());
+ EXPECT_EQ(0, line.value().size());
+}
+
+// A single value without separator is one row with one column; the read
+// after that gives a row without columns.
+TEST(csv, one_value) {
+ auto csv = csv::open(io::memory("foo"));
+ auto line = csv->read();
+ ASSERT_TRUE(line.has_value());
+ ASSERT_EQ(1, line.value().size());
+ EXPECT_EQ("foo", line.value()[0]);
+ line = csv->read();
+ ASSERT_TRUE(line.has_value());
+ EXPECT_EQ(0, line.value().size());
+}
+
+// Two comma separated values are one row with two columns.
+TEST(csv, two_value) {
+ auto csv = csv::open(io::memory("foo,bar"));
+ auto line = csv->read();
+ ASSERT_TRUE(line.has_value());
+ ASSERT_EQ(2, line.value().size());
+ EXPECT_EQ("foo", line.value()[0]);
+ EXPECT_EQ("bar", line.value()[1]);
+ line = csv->read();
+ ASSERT_TRUE(line.has_value());
+ EXPECT_EQ(0, line.value().size());
+}
+
+// Empty values, both between separators and after a trailing separator, are
+// kept as empty columns.
+TEST(csv, empty_value) {
+ auto csv = csv::open(io::memory("foo,,bar,"));
+ auto line = csv->read();
+ ASSERT_TRUE(line.has_value());
+ ASSERT_EQ(4, line.value().size());
+ EXPECT_EQ("foo", line.value()[0]);
+ EXPECT_EQ("", line.value()[1]);
+ EXPECT_EQ("bar", line.value()[2]);
+ EXPECT_EQ("", line.value()[3]);
+ line = csv->read();
+ ASSERT_TRUE(line.has_value());
+ EXPECT_EQ(0, line.value().size());
+}
+
+// Each line of the input is its own row, with its own number of columns.
+TEST(csv, many_lines) {
+ auto csv = csv::open(io::memory("foo,bar\nfoobar\nf,o,o,"));
+ auto line = csv->read();
+ ASSERT_TRUE(line.has_value());
+ ASSERT_EQ(2, line.value().size());
+ EXPECT_EQ("foo", line.value()[0]);
+ EXPECT_EQ("bar", line.value()[1]);
+ line = csv->read();
+ ASSERT_TRUE(line.has_value());
+ ASSERT_EQ(1, line.value().size());
+ EXPECT_EQ("foobar", line.value()[0]);
+ line = csv->read();
+ ASSERT_TRUE(line.has_value());
+ ASSERT_EQ(4, line.value().size());
+ EXPECT_EQ("f", line.value()[0]);
+ EXPECT_EQ("o", line.value()[1]);
+ EXPECT_EQ("o", line.value()[2]);
+ EXPECT_EQ("", line.value()[3]);
+ line = csv->read();
+ ASSERT_TRUE(line.has_value());
+ EXPECT_EQ(0, line.value().size());
+}
+
+// Blank lines are not returned as rows but are still counted by number().
+TEST(csv, blank_lines) {
+ auto csv = csv::open(io::memory("foo,bar\n\nbar,foo\n\n"));
+ auto line = csv->read();
+ ASSERT_TRUE(line.has_value());
+ ASSERT_EQ(2, line.value().size());
+ EXPECT_EQ("foo", line.value()[0]);
+ EXPECT_EQ("bar", line.value()[1]);
+ EXPECT_EQ(1, csv->number());
+ line = csv->read();
+ ASSERT_TRUE(line.has_value());
+ ASSERT_EQ(2, line.value().size());
+ EXPECT_EQ("bar", line.value()[0]);
+ EXPECT_EQ("foo", line.value()[1]);
+ EXPECT_EQ(3, csv->number());
+ line = csv->read();
+ ASSERT_TRUE(line.has_value());
+ EXPECT_EQ(0, line.value().size());
+ EXPECT_EQ(4, csv->number());
+}
diff --git a/test/decompress.cc b/test/decompress.cc
new file mode 100644
index 0000000..35c4477
--- /dev/null
+++ b/test/decompress.cc
@@ -0,0 +1,72 @@
+#include <gtest/gtest.h>
+
+#include "decompress.hh"
+
+// Decompressing embedded gzip data that holds no payload reads zero bytes.
+TEST(z_decompress, empty) {
+ static const unsigned char data[] = {
+ 0x1f, 0x8b, 0x08, 0x08, 0x33, 0xd4, 0xbd, 0x68,
+ 0x02, 0x03, 0x65, 0x6d, 0x70, 0x74, 0x79, 0x00,
+ 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00
+ };
+ auto reader = decompress::gzip(io::memory(std::string(
+ reinterpret_cast<const char*>(data), sizeof(data))));
+ char buf[10];
+ auto got = reader->read(buf, sizeof(buf));
+ ASSERT_TRUE(got.has_value());
+ EXPECT_EQ(0, got.value());
+}
+
+// Decompressing embedded gzip data gives the five bytes "Hello".
+TEST(z_decompress, hello) {
+ static const unsigned char data[] = {
+ 0x1f, 0x8b, 0x08, 0x08, 0xf7, 0xd5, 0xbd, 0x68,
+ 0x02, 0x03, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x00,
+ 0xf3, 0x48, 0xcd, 0xc9, 0xc9, 0x07, 0x00, 0x82,
+ 0x89, 0xd1, 0xf7, 0x05, 0x00, 0x00, 0x00,
+ };
+ auto reader = decompress::gzip(io::memory(std::string(
+ reinterpret_cast<const char*>(data), sizeof(data))));
+ char buf[10];
+ auto got = reader->read(buf, sizeof(buf));
+ ASSERT_TRUE(got.has_value());
+ EXPECT_EQ(5, got.value());
+ // The reader doesn't terminate the data, do it here for EXPECT_STREQ.
+ buf[5] = '\0';
+ EXPECT_STREQ("Hello", buf);
+}
+
+// Decompressing embedded xz data that holds no payload reads zero bytes.
+TEST(xz_decompress, empty) {
+ static const unsigned char data[] = {
+ 0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00, 0x00, 0x04,
+ 0xe6, 0xd6, 0xb4, 0x46, 0x00, 0x00, 0x00, 0x00,
+ 0x1c, 0xdf, 0x44, 0x21, 0x1f, 0xb6, 0xf3, 0x7d,
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x04, 0x59, 0x5a
+ };
+ auto reader = decompress::xz(io::memory(std::string(
+ reinterpret_cast<const char*>(data), sizeof(data))));
+ char buf[10];
+ auto got = reader->read(buf, sizeof(buf));
+ ASSERT_TRUE(got.has_value());
+ EXPECT_EQ(0, got.value());
+}
+
+// Decompressing embedded xz data gives the five bytes "Hello".
+TEST(xz_decompress, hello) {
+ static const unsigned char data[] = {
+ 0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00, 0x00, 0x04,
+ 0xe6, 0xd6, 0xb4, 0x46, 0x04, 0xc0, 0x09, 0x05,
+ 0x21, 0x01, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x76, 0xe9, 0x07, 0x70,
+ 0x01, 0x00, 0x04, 0x48, 0x65, 0x6c, 0x6c, 0x6f,
+ 0x00, 0x00, 0x00, 0x00, 0xc8, 0xac, 0x7b, 0xc8,
+ 0x3b, 0x5c, 0xcf, 0x51, 0x00, 0x01, 0x25, 0x05,
+ 0x43, 0x91, 0x1f, 0xb8, 0x1f, 0xb6, 0xf3, 0x7d,
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x04, 0x59, 0x5a,
+ };
+ auto reader = decompress::xz(io::memory(std::string(
+ reinterpret_cast<const char*>(data), sizeof(data))));
+ char buf[10];
+ auto got = reader->read(buf, sizeof(buf));
+ ASSERT_TRUE(got.has_value());
+ EXPECT_EQ(5, got.value());
+ // The reader doesn't terminate the data, do it here for EXPECT_STREQ.
+ buf[5] = '\0';
+ EXPECT_STREQ("Hello", buf);
+}
diff --git a/test/io.cc b/test/io.cc
new file mode 100644
index 0000000..ad192ed
--- /dev/null
+++ b/test/io.cc
@@ -0,0 +1,142 @@
+#include <gtest/gtest.h>
+
+#include "io.hh"
+
+#include <cstdlib>
+#include <cerrno>
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+namespace {
+
+// Removes everything inside the directory open as fd, recursing into
+// subdirectories. The directory itself is left for the caller to remove.
+// Takes ownership of fd, it is always closed before returning. Entries that
+// disappear while working (ENOENT) are not treated as errors.
+// Returns false on the first other error.
+bool remove_recursive(int fd) {
+  auto* dir = fdopendir(fd);
+  if (!dir) {
+    // fdopendir only takes over the descriptor on success, so close it here
+    // to not leak it.
+    close(fd);
+    return false;
+  }
+  while (auto* ent = readdir(dir)) {
+    // Skip "." and "..", but not other names starting with a dot.
+    if (ent->d_name[0] == '.') {
+      if (ent->d_name[1] == '\0') continue;
+      if (ent->d_name[1] == '.' && ent->d_name[2] == '\0') continue;
+    }
+    bool is_dir;
+    if (ent->d_type == DT_DIR) {
+      is_dir = true;
+    } else if (ent->d_type == DT_UNKNOWN) {
+      // The filesystem didn't report the type, ask for it.
+      struct stat buf;
+      if (fstatat(dirfd(dir), ent->d_name, &buf, AT_SYMLINK_NOFOLLOW) == 0) {
+        is_dir = S_ISDIR(buf.st_mode);
+      } else {
+        if (errno != ENOENT) {
+          closedir(dir);
+          return false;
+        }
+        is_dir = false;
+      }
+    } else {
+      is_dir = false;
+    }
+
+    if (is_dir) {
+      // A directory must be empty before it can be removed.
+      int fd2 = openat(dirfd(dir), ent->d_name, O_RDONLY | O_DIRECTORY);
+      if (fd2 == -1) {
+        if (errno != ENOENT) {
+          closedir(dir);
+          return false;
+        }
+      } else {
+        // The recursive call owns (and closes) fd2.
+        if (!remove_recursive(fd2)) {
+          closedir(dir);
+          return false;
+        }
+      }
+    }
+    if (unlinkat(dirfd(dir), ent->d_name, is_dir ? AT_REMOVEDIR : 0)) {
+      if (errno != ENOENT) {
+        closedir(dir);
+        return false;
+      }
+    }
+  }
+  // closedir also closes fd.
+  closedir(dir);
+  return true;
+}
+
+// Fixture giving each test a fresh temporary directory, removed together
+// with its content when the test is done.
+class IoTest : public testing::Test {
+ protected:
+  // Creates the temporary directory and opens it as dirfd().
+  void SetUp() override {
+    // NOLINTNEXTLINE(misc-include-cleaner)
+    tmpdir_ = P_tmpdir "/jkc-test-io-XXXXXX";
+    // NOLINTNEXTLINE(misc-include-cleaner)
+    auto* ret = mkdtemp(tmpdir_.data());
+    ASSERT_EQ(ret, tmpdir_.data());
+    dirfd_ = open(tmpdir_.c_str(), O_PATH | O_DIRECTORY);
+    ASSERT_NE(-1, dirfd_);
+  }
+
+  // Removes the content of the temporary directory and then the directory.
+  void TearDown() override {
+    // TearDown runs even when SetUp failed, in which case there might not be
+    // a directory descriptor to work with.
+    if (dirfd_ != -1) {
+      // dirfd_ is O_PATH, open the directory again to be able to list it.
+      int fd = openat(dirfd_, ".", O_RDONLY | O_DIRECTORY);
+      EXPECT_NE(-1, fd);
+      if (fd != -1) {
+        EXPECT_TRUE(remove_recursive(fd));
+      }
+      close(dirfd_);
+      dirfd_ = -1;
+    }
+    rmdir(tmpdir_.c_str());
+  }
+
+  // Descriptor for the temporary directory, for use with *at() functions.
+  [[nodiscard]] int dirfd() const {
+    return dirfd_;
+  }
+
+  // Creates (or truncates) the file name in the temporary directory and
+  // writes value to it.
+  void touch(const std::string& name, const std::string& value = "") {
+    auto fd = openat(dirfd(), name.c_str(), O_CREAT | O_WRONLY | O_TRUNC, 0700);
+    EXPECT_NE(-1, fd);
+    if (fd == -1) return;
+    size_t offset = 0;
+    // write() may write less than asked for, repeat until all is written.
+    while (offset < value.size()) {
+      auto ret = write(fd, value.data() + offset, value.size() - offset);
+      EXPECT_LT(0, ret);
+      if (ret <= 0) {
+        break;
+      }
+      offset += ret;
+    }
+    close(fd);
+  }
+
+ private:
+  // -1 until SetUp has opened the directory.
+  int dirfd_{-1};
+  std::string tmpdir_;
+};
+
+} // namespace
+
+// Opening a file that doesn't exist fails with OpenError::NoSuchFile.
+TEST_F(IoTest, no_such_file) {
+ auto ret = io::openat(dirfd(), "no-such-file");
+ ASSERT_FALSE(ret.has_value());
+ EXPECT_EQ(io::OpenError::NoSuchFile, ret.error());
+}
+
+// Reading from an empty file succeeds and reads zero bytes.
+TEST_F(IoTest, read_empty) {
+ touch("test");
+
+ auto ret = io::openat(dirfd(), "test");
+ ASSERT_TRUE(ret.has_value());
+ std::string tmp(10, ' ');
+ auto ret2 = ret.value()->read(tmp);
+ ASSERT_TRUE(ret2.has_value());
+ EXPECT_EQ(0, ret2.value());
+}
+
+// repeat_read into a string larger than the file reads the whole file (11
+// bytes) and stops there.
+TEST_F(IoTest, read) {
+ touch("test", "hello world");
+
+ auto ret = io::openat(dirfd(), "test");
+ ASSERT_TRUE(ret.has_value());
+ std::string tmp(12, ' ');
+ auto ret2 = ret.value()->repeat_read(tmp);
+ ASSERT_TRUE(ret2.has_value());
+ EXPECT_EQ(11, ret2.value());
+ tmp.resize(ret2.value());
+ EXPECT_EQ("hello world", tmp);
+}
diff --git a/test/io_test_helper.cc b/test/io_test_helper.cc
new file mode 100644
index 0000000..514e888
--- /dev/null
+++ b/test/io_test_helper.cc
@@ -0,0 +1,82 @@
+#include "io_test_helper.hh"
+
+#include "io.hh"
+
+#include <algorithm>
+#include <cstddef>
+#include <expected>
+#include <memory>
+#include <utility>
+
+namespace {
+
+// Reader passing through the first offset bytes of the wrapped reader and
+// then failing every call with the given error.
+class BreakingReader : public io::Reader {
+ public:
+  BreakingReader(std::unique_ptr<io::Reader> reader, size_t offset,
+                 io::ReadError error)
+      : reader_(std::move(reader)), offset_(offset), error_(error) {}
+
+  [[nodiscard]]
+  std::expected<size_t, io::ReadError> read(void* dst, size_t max) override {
+    if (offset_ == 0)
+      return std::unexpected(error_);
+    // Never hand out more than what is left before the breaking point.
+    auto got = reader_->read(dst, std::min(offset_, max));
+    if (got.has_value())
+      offset_ -= got.value();
+    return got;
+  }
+
+  [[nodiscard]]
+  std::expected<size_t, io::ReadError> skip(size_t max) override {
+    if (offset_ == 0)
+      return std::unexpected(error_);
+    // Never skip past the breaking point.
+    auto got = reader_->skip(std::min(offset_, max));
+    if (got.has_value())
+      offset_ -= got.value();
+    return got;
+  }
+
+ private:
+  std::unique_ptr<io::Reader> reader_;
+  // Number of bytes left before calls start to fail.
+  size_t offset_;
+  io::ReadError const error_;
+};
+
+// Reader limiting every read and skip on the wrapped reader to at most
+// max_block_size bytes.
+class MaxBlockReader : public io::Reader {
+ public:
+  MaxBlockReader(std::unique_ptr<io::Reader> reader, size_t max_block_size)
+      : reader_(std::move(reader)), max_block_size_(max_block_size) {}
+
+  [[nodiscard]]
+  std::expected<size_t, io::ReadError> read(void* dst, size_t max) override {
+    return reader_->read(dst, std::min(max_block_size_, max));
+  }
+
+  [[nodiscard]]
+  std::expected<size_t, io::ReadError> skip(size_t max) override {
+    return reader_->skip(std::min(max_block_size_, max));
+  }
+
+ private:
+  std::unique_ptr<io::Reader> reader_;
+  size_t const max_block_size_;
+};
+
+} // namespace
+
+// Wraps reader so that calls start failing with error once offset bytes have
+// been read or skipped.
+std::unique_ptr<io::Reader> io_make_breaking(
+ std::unique_ptr<io::Reader> reader, size_t offset,
+ io::ReadError error) {
+ return std::make_unique<BreakingReader>(std::move(reader), offset, error);
+}
+
+// Wraps reader so that no single read or skip handles more than
+// max_block_size bytes.
+std::unique_ptr<io::Reader> io_make_max_block(
+ std::unique_ptr<io::Reader> reader, size_t max_block_size) {
+ return std::make_unique<MaxBlockReader>(std::move(reader), max_block_size);
+}
diff --git a/test/io_test_helper.hh b/test/io_test_helper.hh
new file mode 100644
index 0000000..ce191cf
--- /dev/null
+++ b/test/io_test_helper.hh
@@ -0,0 +1,18 @@
+#ifndef IO_TEST_HELPER_HH
+#define IO_TEST_HELPER_HH
+
+#include "io.hh" // IWYU pragma: export
+
+#include <cstddef>
+#include <memory>
+
+// Returns a reader that passes through the first offset bytes of reader and
+// after that fails every read and skip with error.
+[[nodiscard]]
+std::unique_ptr<io::Reader> io_make_breaking(
+ std::unique_ptr<io::Reader> reader, size_t offset = 0,
+ io::ReadError error = io::ReadError::Error);
+
+// Returns a reader that limits every read and skip on reader to at most
+// max_block_size bytes.
+[[nodiscard]]
+std::unique_ptr<io::Reader> io_make_max_block(
+ std::unique_ptr<io::Reader> reader, size_t max_block_size);
+
+#endif // IO_TEST_HELPER_HH
diff --git a/test/line.cc b/test/line.cc
new file mode 100644
index 0000000..0f90723
--- /dev/null
+++ b/test/line.cc
@@ -0,0 +1,184 @@
+#include <gtest/gtest.h>
+
+#include "io_test_helper.hh"
+#include "line.hh"
+
+#include <cstddef>
+#include <limits>
+#include <utility>
+
+TEST(line, empty) {  // Empty input: first read is EOF and number() stays 0.
+  auto lines = line::open(io::memory(""));
+  EXPECT_EQ(lines->number(), 0);
+  auto next = lines->read();
+  ASSERT_FALSE(next.has_value());
+  EXPECT_TRUE(next.error().eof);
+  EXPECT_EQ(lines->number(), 0);
+}
+
+TEST(line, one_line) {  // A final line without a terminator is still returned.
+  auto lines = line::open(io::memory("foo"));
+  EXPECT_EQ(lines->number(), 0);
+  auto next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "foo");
+  EXPECT_EQ(lines->number(), 1);
+  next = lines->read();
+  ASSERT_FALSE(next.has_value());
+  EXPECT_TRUE(next.error().eof);
+  EXPECT_EQ(lines->number(), 1);
+}
+
+TEST(line, many_lines) {  // "\n"-terminated lines; number() grows per line.
+  auto lines = line::open(io::memory("foo\nbar\nfoobar\n"));
+  EXPECT_EQ(lines->number(), 0);
+  auto next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "foo");
+  EXPECT_EQ(lines->number(), 1);
+  next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "bar");
+  EXPECT_EQ(lines->number(), 2);
+  next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "foobar");
+  EXPECT_EQ(lines->number(), 3);
+  next = lines->read();
+  ASSERT_FALSE(next.has_value());
+  EXPECT_TRUE(next.error().eof);
+  EXPECT_EQ(lines->number(), 3);
+}
+
+TEST(line, many_lines_mixed) {  // "\r\n", "\r" and "\n" each end a line.
+  auto lines = line::open(io::memory("foo\r\nbar\rfoobar\n"));
+  EXPECT_EQ(lines->number(), 0);
+  auto next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "foo");
+  EXPECT_EQ(lines->number(), 1);
+  next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "bar");
+  EXPECT_EQ(lines->number(), 2);
+  next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "foobar");
+  EXPECT_EQ(lines->number(), 3);
+  next = lines->read();
+  ASSERT_FALSE(next.has_value());
+  EXPECT_TRUE(next.error().eof);
+  EXPECT_EQ(lines->number(), 3);
+}
+
+TEST(line, empty_line) {  // A lone "\n" is one empty line, then EOF.
+  auto lines = line::open(io::memory("\n"));
+  EXPECT_EQ(lines->number(), 0);
+  auto next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "");
+  EXPECT_EQ(lines->number(), 1);
+  next = lines->read();
+  ASSERT_FALSE(next.has_value());
+  EXPECT_TRUE(next.error().eof);
+  EXPECT_EQ(lines->number(), 1);
+}
+
+TEST(line, max_line) {  // Limit 10 turns 30 characters into three lines.
+  auto lines = line::open(io::memory("012345678901234567890123456789"), 10);
+  EXPECT_EQ(lines->number(), 0);
+  auto next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "0123456789");
+  EXPECT_EQ(lines->number(), 1);
+  next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "0123456789");
+  EXPECT_EQ(lines->number(), 2);
+  next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "0123456789");
+  EXPECT_EQ(lines->number(), 3);
+  next = lines->read();
+  ASSERT_FALSE(next.has_value());
+  EXPECT_TRUE(next.error().eof);
+  EXPECT_EQ(lines->number(), 3);
+}
+
+TEST(line, read_error) {  // Failing source: read() yields io error, not EOF.
+  auto src = io_make_breaking(io::memory("foo bar fum\nfim zam"), /*offset=*/5);
+  auto lines = line::open(std::move(src));
+  auto next = lines->read();
+  ASSERT_FALSE(next.has_value());
+  EXPECT_FALSE(next.error().eof);
+  EXPECT_EQ(next.error().io_error.value(), io::ReadError::Error);
+}
+
+TEST(line, read_error_newline) {  // Offset 8 lies right after the "\r".
+  auto src = io_make_breaking(io::memory("foo bar\r\nfim zam"), /*offset=*/8);
+  auto lines = line::open(std::move(src));
+  auto next = lines->read();
+  ASSERT_FALSE(next.has_value());
+  EXPECT_FALSE(next.error().eof);
+  EXPECT_EQ(next.error().io_error.value(), io::ReadError::Error);
+}
+
+TEST(line, blocky) {  // Requests capped at 1: both lines are still assembled.
+  auto src = io_make_max_block(io::memory("foo bar\r\nfim zam"),
+                               /*max_block_size=*/1);
+  auto lines = line::open(std::move(src));
+  auto next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "foo bar");
+  next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "fim zam");
+  next = lines->read();
+  ASSERT_FALSE(next.has_value());
+  EXPECT_TRUE(next.error().eof);
+}
+
+TEST(line, blocky_newline) {  // Cap 8 = length of "foo bar\r"; "\n" comes next.
+  auto src = io_make_max_block(io::memory("foo bar\r\nfim zam"),
+                               /*max_block_size=*/8);
+  auto lines = line::open(std::move(src));
+  auto next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "foo bar");
+  next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "fim zam");
+  next = lines->read();
+  ASSERT_FALSE(next.has_value());
+  EXPECT_TRUE(next.error().eof);
+}
+
+TEST(line, eof_newline) {  // "\r" right before EOF still ends the line.
+  auto lines = line::open(io::memory("foo bar\r"));
+  auto next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "foo bar");
+  next = lines->read();
+  ASSERT_FALSE(next.has_value());
+  EXPECT_TRUE(next.error().eof);
+}
+
+TEST(line, max_newline) {  // Limit 6: "foo ba", then "r" ended by the "\r".
+  auto lines = line::open(io::memory("foo bar\r"), 6);
+  auto next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "foo ba");
+  next = lines->read();
+  ASSERT_TRUE(next.has_value());
+  EXPECT_EQ(next.value(), "r");
+  next = lines->read();
+  ASSERT_FALSE(next.has_value());
+  EXPECT_TRUE(next.error().eof);
+}
+
+TEST(line, max_line_overflow) {  // Largest size_t as limit: open() must die.
+  EXPECT_DEATH_IF_SUPPORTED({  // void cast: std::ignore would need <tuple>
+    static_cast<void>(line::open(io::memory(""),
+                                 std::numeric_limits<size_t>::max()));
+  }, "");
+}
diff --git a/test/str.cc b/test/str.cc
new file mode 100644
index 0000000..35d70d7
--- /dev/null
+++ b/test/str.cc
@@ -0,0 +1,38 @@
+#include <gtest/gtest.h>
+
+#include "str.hh"
+
+TEST(str, split) {  // str::split with and without the trailing `true` flag
+  auto parts = str::split("");
+  EXPECT_EQ(parts.size(), 0);
+
+  parts = str::split("", ' ', true);  // flag set: "" gives one empty field
+  ASSERT_EQ(parts.size(), 1);
+  EXPECT_EQ(parts[0], "");
+
+  parts = str::split(" ");
+  EXPECT_EQ(parts.size(), 0);
+
+  parts = str::split(" ", ' ', true);  // empty field on each side is kept
+  ASSERT_EQ(parts.size(), 2);
+  EXPECT_EQ(parts[0], "");
+  EXPECT_EQ(parts[1], "");
+
+  parts = str::split(" a b ");  // without the flag only "a" and "b" remain
+  ASSERT_EQ(parts.size(), 2);
+  EXPECT_EQ(parts[0], "a");
+  EXPECT_EQ(parts[1], "b");
+
+  parts = str::split(" a b ", ' ', true);  // leading and trailing "" kept
+  ASSERT_EQ(parts.size(), 4);
+  EXPECT_EQ(parts[0], "");
+  EXPECT_EQ(parts[1], "a");
+  EXPECT_EQ(parts[2], "b");
+  EXPECT_EQ(parts[3], "");
+
+  parts = str::split(" a b", ' ', true);  // no trailing separator, no last ""
+  ASSERT_EQ(parts.size(), 3);
+  EXPECT_EQ(parts[0], "");
+  EXPECT_EQ(parts[1], "a");
+  EXPECT_EQ(parts[2], "b");
+}
diff --git a/test/u.cc b/test/u.cc
index 933a4f2..de04e39 100644
--- a/test/u.cc
+++ b/test/u.cc
@@ -681,3 +681,45 @@ TEST(u16, invalid) {
EXPECT_EQ(it, literal.end());
}
}
+
+TEST(u, lookup_gc) {  // spot-checks u::lookup_gc across the categories below
+  EXPECT_EQ(u::lookup_gc(0x41), u::GeneralCategory::LETTER_UPPERCASE);
+  EXPECT_EQ(u::lookup_gc(0x61), u::GeneralCategory::LETTER_LOWERCASE);
+  EXPECT_EQ(u::lookup_gc(0x1C5), u::GeneralCategory::LETTER_TITLECASE);
+  EXPECT_EQ(u::lookup_gc(0x374), u::GeneralCategory::LETTER_MODIFIER);
+  EXPECT_EQ(u::lookup_gc(0x34FF), u::GeneralCategory::LETTER_OTHER);
+  // NOTE(review): "COMBINDING" looks like a typo of COMBINING; confirm in enum.
+  EXPECT_EQ(u::lookup_gc(0x483), u::GeneralCategory::MARK_NONSPACING);
+  EXPECT_EQ(u::lookup_gc(0x93B), u::GeneralCategory::MARK_SPACING_COMBINDING);
+  EXPECT_EQ(u::lookup_gc(0x20DE), u::GeneralCategory::MARK_SPACING_ENCLOSING);
+
+  EXPECT_EQ(u::lookup_gc(0xA620), u::GeneralCategory::NUMBER_DIGIT);
+  EXPECT_EQ(u::lookup_gc(0xA6E6), u::GeneralCategory::NUMBER_LETTER);
+  EXPECT_EQ(u::lookup_gc(0xA830), u::GeneralCategory::NUMBER_OTHER);
+
+  EXPECT_EQ(u::lookup_gc(0xFE33), u::GeneralCategory::PUNCTUATION_CONNECTOR);
+  EXPECT_EQ(u::lookup_gc(0xFE58), u::GeneralCategory::PUNCTUATION_DASH);
+  EXPECT_EQ(u::lookup_gc(0xFF08), u::GeneralCategory::PUNCTUATION_OPEN);
+  EXPECT_EQ(u::lookup_gc(0xFF09), u::GeneralCategory::PUNCTUATION_CLOSE);
+  EXPECT_EQ(u::lookup_gc(0xAB), u::GeneralCategory::PUNCTUATION_INITIAL_QUOTE);
+  EXPECT_EQ(u::lookup_gc(0xBB), u::GeneralCategory::PUNCTUATION_FINAL_QUOTE);
+  EXPECT_EQ(u::lookup_gc(0xFF1A), u::GeneralCategory::PUNCTUATION_OTHER);
+
+  EXPECT_EQ(u::lookup_gc(0xD7), u::GeneralCategory::SYMBOL_MATH);
+  EXPECT_EQ(u::lookup_gc(0x58F), u::GeneralCategory::SYMBOL_CURRENCY);
+  EXPECT_EQ(u::lookup_gc(0x5E), u::GeneralCategory::SYMBOL_MODIFIER);
+  EXPECT_EQ(u::lookup_gc(0xF03), u::GeneralCategory::SYMBOL_OTHER);
+
+  EXPECT_EQ(u::lookup_gc(0x20), u::GeneralCategory::SEPARATOR_SPACE);
+  EXPECT_EQ(u::lookup_gc(0x2028), u::GeneralCategory::SEPARATOR_LINE);
+  EXPECT_EQ(u::lookup_gc(0x2029), u::GeneralCategory::SEPARATOR_PARAGRAPH);
+
+  EXPECT_EQ(u::lookup_gc(0xA), u::GeneralCategory::OTHER_CONTROL);
+  EXPECT_EQ(u::lookup_gc(0x202D), u::GeneralCategory::OTHER_FORMAT);
+  EXPECT_EQ(u::lookup_gc(0xD800), u::GeneralCategory::OTHER_SURROGATE);
+  EXPECT_EQ(u::lookup_gc(0xDBFF), u::GeneralCategory::OTHER_SURROGATE);
+  EXPECT_EQ(u::lookup_gc(0xDFFF), u::GeneralCategory::OTHER_SURROGATE);
+  EXPECT_EQ(u::lookup_gc(0xE000), u::GeneralCategory::OTHER_PRIVATE_USE);
+
+  EXPECT_EQ(u::lookup_gc(0xFFFFFFFF), u::GeneralCategory::OTHER_UNASSIGNED);
+}