summary | refs | log | tree | commit | diff
path: root/src/gen_ugc.cc
diff options
context:
space:
mode:
Diffstat (limited to 'src/gen_ugc.cc')
-rw-r--r-- src/gen_ugc.cc 317
1 files changed, 317 insertions, 0 deletions
diff --git a/src/gen_ugc.cc b/src/gen_ugc.cc
new file mode 100644
index 0000000..e9bce11
--- /dev/null
+++ b/src/gen_ugc.cc
@@ -0,0 +1,317 @@
+#include "args.hh"
+#include "csv.hh"
+#include "decompress.hh"
+#include "ugc.hh"
+
+#include <charconv>
+#include <cstdint>
+#include <expected>
+#include <format>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <span>
+#include <string>
+#include <string_view>
+#include <system_error>
+#include <utility>
+#include <vector>
+
+namespace {
+
+// Maps a two-letter general category abbreviation (as found in column index 2
+// of an input row, see parse_row) to the matching u::GeneralCategory value.
+//
+// The transparent comparator (std::less<>) lets callers look entries up with a
+// std::string_view without building a temporary std::string. The table is
+// declared const because it is only ever read.
+const std::map<std::string, u::GeneralCategory, std::less<>> str2gc{
+  // Letters
+  {"Lu", u::GeneralCategory::LETTER_UPPERCASE},
+  {"Ll", u::GeneralCategory::LETTER_LOWERCASE},
+  {"Lt", u::GeneralCategory::LETTER_TITLECASE},
+  {"Lm", u::GeneralCategory::LETTER_MODIFIER},
+  {"Lo", u::GeneralCategory::LETTER_OTHER},
+
+  // Marks
+  {"Mn", u::GeneralCategory::MARK_NONSPACING},
+  {"Mc", u::GeneralCategory::MARK_SPACING_COMBINDING},
+  {"Me", u::GeneralCategory::MARK_SPACING_ENCLOSING},
+
+  // Numbers
+  {"Nd", u::GeneralCategory::NUMBER_DIGIT},
+  {"Nl", u::GeneralCategory::NUMBER_LETTER},
+  {"No", u::GeneralCategory::NUMBER_OTHER},
+
+  // Punctuation
+  {"Pc", u::GeneralCategory::PUNCTUATION_CONNECTOR},
+  {"Pd", u::GeneralCategory::PUNCTUATION_DASH},
+  {"Ps", u::GeneralCategory::PUNCTUATION_OPEN},
+  {"Pe", u::GeneralCategory::PUNCTUATION_CLOSE},
+  {"Pi", u::GeneralCategory::PUNCTUATION_INITIAL_QUOTE},
+  {"Pf", u::GeneralCategory::PUNCTUATION_FINAL_QUOTE},
+  {"Po", u::GeneralCategory::PUNCTUATION_OTHER},
+
+  // Symbols
+  {"Sm", u::GeneralCategory::SYMBOL_MATH},
+  {"Sc", u::GeneralCategory::SYMBOL_CURRENCY},
+  {"Sk", u::GeneralCategory::SYMBOL_MODIFIER},
+  {"So", u::GeneralCategory::SYMBOL_OTHER},
+
+  // Separators
+  {"Zs", u::GeneralCategory::SEPARATOR_SPACE},
+  {"Zl", u::GeneralCategory::SEPARATOR_LINE},
+  {"Zp", u::GeneralCategory::SEPARATOR_PARAGRAPH},
+
+  // Other
+  {"Cc", u::GeneralCategory::OTHER_CONTROL},
+  {"Cf", u::GeneralCategory::OTHER_FORMAT},
+  {"Cs", u::GeneralCategory::OTHER_SURROGATE},
+  {"Co", u::GeneralCategory::OTHER_PRIVATE_USE},
+  {"Cn", u::GeneralCategory::OTHER_UNASSIGNED},
+};
+
+// Writes the opening part of the generated source file to `out`: the include
+// lines, the opening of namespace `u`, and the first line of the lookup
+// function, whose name is `prefix` immediately followed by "lookup_gc".
+void print_header(std::ostream& out, std::string_view prefix) {
+  // Fixed text that precedes the function signature, one entry per line.
+  static constexpr std::string_view kPreamble[] = {
+      "#include \"ugc.hh\"\n",
+      "\n",
+      "#include <array>\n",
+      "#include <cstddef>\n",
+      "#include <cstdint>\n",
+      "\n",
+      "namespace u {\n",
+      "\n",
+  };
+  for (std::string_view const line : kPreamble) {
+    out << line;
+  }
+
+  // The body and the closing brace are emitted by print_body / print_footer.
+  out << "extern GeneralCategory ";
+  out << prefix;
+  out << "lookup_gc(uint32_t code) {\n";
+}
+
+// Writes the body of the generated lookup function to `out`.
+//
+// `data` maps each known code point to its general category. Consecutive code
+// points that share a category are collapsed into inclusive [start, end]
+// ranges. The generated code contains:
+//   * `codes`      - the ranges flattened as start0, end0, start1, end1, ...
+//   * `categories` - one entry per range, the category as an integer
+//   * a binary search over the ranges that returns the category of `code`, or
+//     OTHER_UNASSIGNED when `code` falls in no range.
+//
+// An empty `data` is valid: both arrays are emitted with size 0 and the
+// generated function always returns OTHER_UNASSIGNED.
+void print_body(std::ostream& out,
+                std::map<uint32_t, u::GeneralCategory> const& data) {
+  std::vector<uint32_t> codes;
+  std::vector<u::GeneralCategory> categories;
+
+  // std::map iterates in ascending key order, so a new entry can only extend
+  // the most recently opened range (whose end is codes.back()).
+  for (auto const& [code, category] : data) {
+    bool const extends_last_range = !categories.empty() &&
+                                    codes.back() + 1 == code &&
+                                    categories.back() == category;
+    if (extends_last_range) {
+      codes.back() = code;
+    } else {
+      codes.emplace_back(code);  // range start
+      codes.emplace_back(code);  // range end, grown while the run continues
+      categories.emplace_back(category);
+    }
+  }
+
+  out << "  static std::array<uint32_t, " << codes.size() << "> codes{";
+  for (auto code : codes) {
+    out << code << ",";
+  }
+  out << " };\n";
+  out << "  static std::array<uint8_t, " << categories.size()
+      << "> categories{";
+  for (auto category : categories) {
+    // Cast to a 16-bit integer so the stream prints a number, not a character.
+    out << static_cast<uint16_t>(category) << ",";
+  }
+  out << "};\n";
+
+  // `high` is the number of ranges; index m addresses codes[2m], codes[2m+1]
+  // and categories[m].
+  out << "  size_t low = 0;\n"
+      << "  size_t high = " << (codes.size() / 2) << ";\n"
+      << "  while (low < high) {\n"
+      << "    size_t m = (low + high) / 2;\n"
+      << "    uint32_t start = codes[m * 2];\n"
+      << "    if (code < start) {\n"
+      << "      high = m;\n"
+      << "    } else {\n"
+      << "      uint32_t end = codes[(m * 2) + 1];\n"
+      << "      if (code <= end) {\n"
+      << "        return static_cast<u::GeneralCategory>(categories[m]);\n"
+      << "      }\n"
+      << "      low = m + 1;\n"
+      << "    }\n"
+      << "  }\n"
+      << "  return u::GeneralCategory::OTHER_UNASSIGNED;\n";
+}
+
+// Writes the closing part of the generated source file to `out`: the end of
+// the lookup function followed by the end of namespace `u`. The prefix
+// parameter is accepted for symmetry with print_header but is not used.
+void print_footer(std::ostream& out, std::string_view /* prefix */) {
+  static constexpr std::string_view kClosing[] = {
+      "}\n",
+      "\n",
+      "}  // namespace u\n",
+  };
+  for (std::string_view const line : kClosing) {
+    out << line;
+  }
+}
+
+// Returns a short human readable description of an io::OpenError value.
+// Every enumerator handled below has a message; any other value is treated as
+// unreachable.
+std::string_view ioerr2str(io::OpenError error) {
+  char const* message = nullptr;
+  switch (error) {
+    case io::OpenError::NoSuchFile:
+      message = "No such file";
+      break;
+    case io::OpenError::NoAccess:
+      message = "No access";
+      break;
+    case io::OpenError::Error:
+      message = "Fatal error";
+      break;
+  }
+  if (message == nullptr) {
+    std::unreachable();
+  }
+  return message;
+}
+
+// Returns a short human readable description of an io::ReadError value.
+// Every enumerator handled below has a message; any other value is treated as
+// unreachable.
+std::string_view ioerr2str(io::ReadError error) {
+  char const* message = nullptr;
+  switch (error) {
+    case io::ReadError::InvalidData:
+      message = "Invalid (compressed) data";
+      break;
+    case io::ReadError::Error:
+      message = "Fatal error";
+      break;
+  }
+  if (message == nullptr) {
+    std::unreachable();
+  }
+  return message;
+}
+
+// Extracts the code point and general category from one input row.
+//
+// Expected column layout (15 columns):
+// [code];[name];[gc];[cc];[bc];[decomposition];[nv-dec];[nv-dig];[nv-num];[bm];[alias];;[upper case];[lower case];[title case]
+//
+// Returns the (code point, category) pair, or an error message when the row
+// has the wrong number of columns, the code column is not entirely a
+// hexadecimal number, or the category abbreviation is not in str2gc.
+std::expected<std::pair<uint32_t, u::GeneralCategory>, std::string> parse_row(
+    std::span<std::string_view> row) {
+  constexpr std::size_t kExpectedColumns = 15;
+  constexpr std::size_t kCodeColumn = 0;
+  constexpr std::size_t kCategoryColumn = 2;
+
+  if (row.size() != kExpectedColumns) {
+    return std::unexpected(std::format("Invalid row ({} columns)", row.size()));
+  }
+
+  std::string_view const code_text = row[kCodeColumn];
+  char const* const first = code_text.data();
+  char const* const last = first + code_text.size();
+
+  uint32_t code;
+  auto const parsed = std::from_chars(first, last, code, /* base */ 16);
+  // Reject both conversion errors and trailing characters after the number.
+  bool const whole_column_parsed =
+      parsed.ec == std::errc() && parsed.ptr == last;
+  if (!whole_column_parsed) {
+    return std::unexpected(std::format("Invalid code value {}", code_text));
+  }
+
+  std::string_view const category_text = row[kCategoryColumn];
+  auto const entry = str2gc.find(category_text);
+  if (entry == str2gc.end()) {
+    return std::unexpected(std::format("Invalid general category {}",
+                                       category_text));
+  }
+
+  return std::make_pair(code, entry->second);
+}
+
+// Reads a ';' separated data file and returns the general category for every
+// code point listed in it.
+//
+// The file is opened with io::open; names ending in ".gz" or ".xz" are passed
+// through the matching decompressor first. Reading stops at the first empty
+// row returned by the csv reader.
+//
+// A row whose name column ends in ", First>" opens a range. It must be
+// followed directly by a row with the same name prefix ending in ", Last>"
+// and the same general category; every code point in the inclusive range is
+// then added with that category.
+//
+// Returns the code point -> category map, or an error message (including the
+// file name and, for row-level problems, csv_reader->number()) when the file
+// cannot be opened or read, a row cannot be parsed, a range is malformed, or a
+// code point occurs more than once.
+std::expected<std::map<uint32_t, u::GeneralCategory>, std::string> read(
+    std::string_view filename) {
+  auto maybe_reader = io::open(std::string(filename));
+  if (!maybe_reader.has_value()) {
+    return std::unexpected(std::format(
+        "Unable to open {} for reading: {}",
+        filename, ioerr2str(maybe_reader.error())));
+  }
+  auto reader = std::move(maybe_reader.value());
+  if (filename.ends_with(".gz")) {
+    reader = decompress::gzip(std::move(reader));
+  } else if (filename.ends_with(".xz")) {
+    reader = decompress::xz(std::move(reader));
+  }
+
+  std::map<uint32_t, u::GeneralCategory> ret;
+  auto csv_reader = csv::open(std::move(reader), ';');
+  while (true) {
+    auto row = csv_reader->read();
+    if (!row.has_value()) {
+      return std::unexpected(std::format(
+          "{}:{}: Error reading file: {}",
+          filename, csv_reader->number(), ioerr2str(row.error())));
+    }
+    if (row->empty())
+      break;
+
+    auto pair = parse_row(row.value());
+    if (!pair.has_value()) {
+      return std::unexpected(std::format(
+          "{}:{}: {}", filename, csv_reader->number(), pair.error()));
+    }
+    auto name_col = (*row)[1];
+
+    if (!name_col.ends_with(", First>")) {
+      // Ordinary row describing a single code point.
+      auto emplace_ret = ret.emplace(std::move(pair.value()));
+      if (!emplace_ret.second) {
+        return std::unexpected(std::format(
+            "{}:{}: Duplicate value for {:#08x}",
+            filename, csv_reader->number(), emplace_ret.first->first));
+      }
+      continue;
+    }
+
+    // Start of a range. The prefix is copied into an owning string because
+    // `row` is overwritten by the next read below.
+    std::string prefix(name_col.substr(0, name_col.size() - 8));
+    row = csv_reader->read();
+    if (!row.has_value()) {
+      return std::unexpected(std::format(
+          "{}:{}: Error reading file: {}",
+          filename, csv_reader->number(), ioerr2str(row.error())));
+    }
+    // The file must not end between the first and last row of a range;
+    // without this check the name column of an empty row would be indexed.
+    if (row->empty()) {
+      return std::unexpected(std::format(
+          "{}:{}: Invalid range, missing last entry for {}",
+          filename, csv_reader->number(), prefix));
+    }
+
+    auto second_pair = parse_row(row.value());
+    if (!second_pair.has_value()) {
+      return std::unexpected(std::format(
+          "{}:{}: {}", filename, csv_reader->number(), second_pair.error()));
+    }
+
+    name_col = (*row)[1];
+    if (!name_col.ends_with(", Last>") ||
+        name_col.substr(0, name_col.size() - 7) != prefix) {
+      return std::unexpected(std::format(
+          "{}:{}: Invalid range, {} doesn't match {}",
+          filename, csv_reader->number(), prefix, name_col));
+    }
+    if (pair->second != second_pair->second) {
+      return std::unexpected(std::format(
+          "{}:{}: Invalid range, general category doesn't match",
+          filename, csv_reader->number()));
+    }
+    if (second_pair->first < pair->first) {
+      return std::unexpected(std::format(
+          "{}:{}: Invalid range, last code point is before first",
+          filename, csv_reader->number()));
+    }
+
+    // Inclusive loop with the exit test at the bottom so that it also
+    // terminates when the last code point is the largest uint32_t value.
+    for (uint32_t c = pair->first;; ++c) {
+      auto emplace_ret = ret.emplace(c, pair->second);
+      if (!emplace_ret.second) {
+        return std::unexpected(std::format(
+            "{}:{}: Duplicate value for {:#08x}",
+            filename, csv_reader->number(), c));
+      }
+      if (c == second_pair->first)
+        break;
+    }
+  }
+
+  return ret;
+}
+
+} // namespace
+
+// Entry point: parses the command line, reads the data file given as the
+// first positional argument and writes the generated lookup function either
+// to standard output (no second argument, or "-") or to the file named by the
+// second argument.
+//
+// Returns 0 on success and 1 on any usage, read, open or write error.
+int main(int argc, char** argv) {
+  auto args = Args::create();
+  auto opt_help = args->option('h', "help", "display this text and exit");
+  auto opt_prefix =
+      args->option_argument('p', "prefix", "ARG", "Prefix for exported method");
+  std::vector<std::string_view> arguments;
+  if (!args->run(argc, argv, &arguments)) {
+    std::cerr << "Try `gen_u --help` for usage\n";
+    return 1;
+  }
+  if (opt_help->is_set()) {
+    std::cout << "Usage: `gen_u [OPTIONS...] UnicodeData [OUTPUT]`\n"
+              << "Generates a method for getting the general category for a "
+              << "code point.\n"
+              << "\n";
+    args->print_help(std::cout);
+    return 0;
+  }
+  if (!opt_prefix->is_set()) {
+    std::cerr << "No prefix given.\n"
+              << "Try `gen_u --help` for usage\n";
+    return 1;
+  }
+  auto prefix = opt_prefix->argument();
+  if (arguments.empty() || arguments.size() > 2) {
+    std::cerr << "Expecting one or two arguments. No more, no less.\n"
+              << "Try `gen_u --help` for usage\n";
+    return 1;
+  }
+
+  auto general_categories = read(arguments[0]);
+  if (!general_categories.has_value()) {
+    std::cerr << general_categories.error() << '\n';
+    return 1;
+  }
+
+  // Emits the complete generated file and reports whether the stream is
+  // still in a good state after flushing, so write errors are not lost.
+  auto const generate = [&](std::ostream& out) {
+    print_header(out, prefix);
+    print_body(out, general_categories.value());
+    print_footer(out, prefix);
+    out.flush();
+    return !out.fail();
+  };
+
+  if (arguments.size() < 2 || arguments[1] == "-") {
+    if (!generate(std::cout)) {
+      std::cerr << "Error writing to standard output\n";
+      return 1;
+    }
+    return 0;
+  }
+
+  std::string const output(arguments[1]);
+  std::fstream out{output, std::fstream::trunc | std::fstream::out};
+  if (!out.is_open()) {
+    std::cerr << "Unable to open " << output << " for writing\n";
+    return 1;
+  }
+  if (!generate(out)) {
+    std::cerr << "Error writing to " << output << '\n';
+    return 1;
+  }
+  return 0;
+}