diff options
Diffstat (limited to 'src/gen_ugc.cc')
| -rw-r--r-- | src/gen_ugc.cc | 317 |
1 files changed, 317 insertions, 0 deletions
// src/gen_ugc.cc
//
// Command line generator: reads a ';'-separated UnicodeData-style file
// (optionally .gz or .xz compressed) and writes C++ source defining
// `u::<prefix>lookup_gc(uint32_t code)`, a binary search over inclusive code
// point ranges that returns the general category of a code point.

#include "args.hh"
#include "csv.hh"
#include "decompress.hh"
#include "ugc.hh"

#include <charconv>
#include <cstdint>
#include <expected>
#include <format>
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <span>
#include <string>
#include <string_view>
#include <system_error>
#include <utility>
#include <vector>

namespace {

// Maps the two-letter general category abbreviation read from the third
// column of a data row to the matching u::GeneralCategory value.
// std::less<> makes the comparator transparent so lookups with a
// std::string_view do not need a temporary std::string.
const std::map<std::string, u::GeneralCategory, std::less<>> str2gc{
    {"Lu", u::GeneralCategory::LETTER_UPPERCASE},
    {"Ll", u::GeneralCategory::LETTER_LOWERCASE},
    {"Lt", u::GeneralCategory::LETTER_TITLECASE},
    {"Lm", u::GeneralCategory::LETTER_MODIFIER},
    {"Lo", u::GeneralCategory::LETTER_OTHER},

    {"Mn", u::GeneralCategory::MARK_NONSPACING},
    {"Mc", u::GeneralCategory::MARK_SPACING_COMBINDING},
    {"Me", u::GeneralCategory::MARK_SPACING_ENCLOSING},

    {"Nd", u::GeneralCategory::NUMBER_DIGIT},
    {"Nl", u::GeneralCategory::NUMBER_LETTER},
    {"No", u::GeneralCategory::NUMBER_OTHER},

    {"Pc", u::GeneralCategory::PUNCTUATION_CONNECTOR},
    {"Pd", u::GeneralCategory::PUNCTUATION_DASH},
    {"Ps", u::GeneralCategory::PUNCTUATION_OPEN},
    {"Pe", u::GeneralCategory::PUNCTUATION_CLOSE},
    {"Pi", u::GeneralCategory::PUNCTUATION_INITIAL_QUOTE},
    {"Pf", u::GeneralCategory::PUNCTUATION_FINAL_QUOTE},
    {"Po", u::GeneralCategory::PUNCTUATION_OTHER},

    {"Sm", u::GeneralCategory::SYMBOL_MATH},
    {"Sc", u::GeneralCategory::SYMBOL_CURRENCY},
    {"Sk", u::GeneralCategory::SYMBOL_MODIFIER},
    {"So", u::GeneralCategory::SYMBOL_OTHER},

    {"Zs", u::GeneralCategory::SEPARATOR_SPACE},
    {"Zl", u::GeneralCategory::SEPARATOR_LINE},
    {"Zp", u::GeneralCategory::SEPARATOR_PARAGRAPH},

    {"Cc", u::GeneralCategory::OTHER_CONTROL},
    {"Cf", u::GeneralCategory::OTHER_FORMAT},
    {"Cs", u::GeneralCategory::OTHER_SURROGATE},
    {"Co", u::GeneralCategory::OTHER_PRIVATE_USE},
    {"Cn", u::GeneralCategory::OTHER_UNASSIGNED},
};

// Writes the includes, the opening of namespace u and the signature of the
// generated lookup function. `prefix` is inserted directly in front of
// "lookup_gc" in the function name.
void print_header(std::ostream& out, std::string_view prefix) {
  out << "#include \"ugc.hh\"\n"
      << "\n"
      << "#include <array>\n"
      << "#include <cstddef>\n"
      << "#include <cstdint>\n"
      << "\n"
      << "namespace u {\n"
      << "\n"
      << "extern GeneralCategory " << prefix << "lookup_gc(uint32_t code) {\n";
}

// Writes the body of the generated lookup function.
//
// Consecutive code points sharing a category are merged into inclusive
// ranges. `codes` stores each range as two entries (start, end) and
// `categories` stores one entry per range; the generated code binary
// searches the ranges and falls back to OTHER_UNASSIGNED.
//
// An empty `data` map is valid: it produces two empty arrays and a function
// that always returns OTHER_UNASSIGNED.
void print_body(std::ostream& out,
                std::map<uint32_t, u::GeneralCategory> const& data) {
  std::vector<uint32_t> codes;
  std::vector<u::GeneralCategory> categories;

  // std::map iterates in ascending key order, so a range can only ever be
  // extended by the entry that directly follows its current end.
  for (auto const& [code, category] : data) {
    if (!categories.empty() && codes.back() + 1 == code &&
        categories.back() == category) {
      codes.back() = code;  // Extend the end of the current range.
    } else {
      codes.emplace_back(code);  // Range start.
      codes.emplace_back(code);  // Range end (moved forward as it grows).
      categories.emplace_back(category);
    }
  }

  out << " static std::array<uint32_t, " << codes.size() << "> codes{";
  for (auto code : codes) {
    out << code << ",";
  }
  out << " };\n";
  out << " static std::array<uint8_t, " << categories.size()
      << "> categories{";
  for (auto category : categories) {
    // Cast to a 16 bit integer so the value is printed as a number and not
    // as a character.
    out << static_cast<uint16_t>(category) << ",";
  }
  out << "};\n";

  out << " size_t low = 0;\n"
      << " size_t high = " << (codes.size() / 2) << ";\n"
      << " while (low < high) {\n"
      << " size_t m = (low + high) / 2;\n"
      << " uint32_t start = codes[m * 2];\n"
      << " if (code < start) {\n"
      << " high = m;\n"
      << " } else {\n"
      << " uint32_t end = codes[(m * 2) + 1];\n"
      << " if (code <= end) {\n"
      << " return static_cast<u::GeneralCategory>(categories[m]);\n"
      << " }\n"
      << " low = m + 1;\n"
      << " }\n"
      << " }\n"
      << " return u::GeneralCategory::OTHER_UNASSIGNED;\n";
}

// Closes the generated function and namespace u.
void print_footer(std::ostream& out, std::string_view /* prefix */) {
  out << "}\n"
      << "\n"
      << "} // namespace u\n";
}

// Returns a human readable description of an io::OpenError.
std::string_view ioerr2str(io::OpenError error) {
  switch (error) {
    case io::OpenError::NoSuchFile:
      return "No such file";
    case io::OpenError::NoAccess:
      return "No access";
    case io::OpenError::Error:
      return "Fatal error";
  }
  std::unreachable();
}

// Returns a human readable description of an io::ReadError.
std::string_view ioerr2str(io::ReadError error) {
  switch (error) {
    case io::ReadError::InvalidData:
      return "Invalid (compressed) data";
    case io::ReadError::Error:
      return "Fatal error";
  }
  std::unreachable();
}

// Parses one data row into (code point, general category).
//
// The row must have exactly 15 columns. The first column is the code point
// as hexadecimal digits (the whole column must be consumed) and the third
// column is a general category abbreviation known to str2gc.
// Returns an error message (without file/line information) on failure.
std::expected<std::pair<uint32_t, u::GeneralCategory>, std::string> parse_row(
    std::span<std::string_view> row) {
  // [code];[name];[gc];[cc];[bc];[decomposition];[nv-dec];[nv-dig];[nv-num];[bm];[alias];;[upper case];[lower case];[title case]
  if (row.size() != 15) {
    return std::unexpected(std::format("Invalid row ({} columns)", row.size()));
  }
  auto const code_col = row[0];
  auto const category_col = row[2];

  uint32_t code{};
  auto const* const code_end = code_col.data() + code_col.size();
  auto [ptr, ec] = std::from_chars(code_col.data(), code_end, code,
                                   /* base */ 16);
  if (ec != std::errc() || ptr != code_end) {
    return std::unexpected(std::format("Invalid code value {}", code_col));
  }

  auto it = str2gc.find(category_col);
  if (it == str2gc.end()) {
    return std::unexpected(std::format("Invalid general category {}",
                                       category_col));
  }

  return std::make_pair(code, it->second);
}

// Reads `filename` and returns a map from code point to general category.
//
// Files ending in ".gz" or ".xz" are decompressed while reading. A row whose
// name column ends in ", First>" must be directly followed by a row whose
// name ends in ", Last>" with the same name prefix and the same category;
// every code point in that inclusive range is added to the map.
// Returns an error message prefixed with "filename:line:" on failure.
std::expected<std::map<uint32_t, u::GeneralCategory>, std::string> read(
    std::string_view filename) {
  auto maybe_reader = io::open(std::string(filename));
  if (!maybe_reader.has_value()) {
    return std::unexpected(std::format(
        "Unable to open {} for reading: {}",
        filename, ioerr2str(maybe_reader.error())));
  }
  auto reader = std::move(maybe_reader.value());
  if (filename.ends_with(".gz")) {
    reader = decompress::gzip(std::move(reader));
  } else if (filename.ends_with(".xz")) {
    reader = decompress::xz(std::move(reader));
  }

  std::map<uint32_t, u::GeneralCategory> ret;
  auto csv_reader = csv::open(std::move(reader), ';');
  while (true) {
    auto row = csv_reader->read();
    if (!row.has_value()) {
      return std::unexpected(std::format(
          "{}:{}: Error reading file: {}",
          filename, csv_reader->number(), ioerr2str(row.error())));
    }
    // An empty row ends the loop (presumably how the csv reader reports end
    // of input).
    if (row->empty())
      break;

    auto pair = parse_row(row.value());
    if (!pair.has_value()) {
      return std::unexpected(std::format(
          "{}:{}: {}", filename, csv_reader->number(), pair.error()));
    }
    auto name_col = (*row)[1];

    if (name_col.ends_with(", First>")) {
      // Copy the prefix: `row` is overwritten by the next read below.
      std::string prefix(name_col.substr(0, name_col.size() - 8));
      row = csv_reader->read();
      if (!row.has_value()) {
        return std::unexpected(std::format(
            "{}:{}: Error reading file: {}",
            filename, csv_reader->number(), ioerr2str(row.error())));
      }

      // Validate the second row itself. This also rejects a missing second
      // row, as an empty row fails the column count check in parse_row.
      auto second_pair = parse_row(row.value());
      if (!second_pair.has_value()) {
        return std::unexpected(std::format(
            "{}:{}: {}", filename, csv_reader->number(), second_pair.error()));
      }

      name_col = (*row)[1];
      if (name_col.ends_with(", Last>") &&
          name_col.substr(0, name_col.size() - 7) == prefix) {
        if (pair->second != second_pair->second) {
          return std::unexpected(std::format(
              "{}:{}: Invalid range, general category doesn't match",
              filename, csv_reader->number()));
        }
        if (pair->first > second_pair->first) {
          return std::unexpected(std::format(
              "{}:{}: Invalid range, first code {:#08x} is after last code "
              "{:#08x}",
              filename, csv_reader->number(), pair->first,
              second_pair->first));
        }

        // Stop on equality rather than looping while `c <= last`, which
        // would never end if `last` were the largest uint32_t value.
        for (uint32_t c = pair->first;; ++c) {
          auto emplace_ret = ret.emplace(c, pair->second);
          if (!emplace_ret.second) {
            return std::unexpected(std::format(
                "{}:{}: Duplicate value for {:#08x}",
                filename, csv_reader->number(), c));
          }
          if (c == second_pair->first)
            break;
        }
      } else {
        return std::unexpected(std::format(
            "{}:{}: Invalid range, {} doesn't match {}",
            filename, csv_reader->number(), prefix, name_col));
      }
    } else {
      auto emplace_ret = ret.emplace(std::move(pair.value()));
      if (!emplace_ret.second) {
        return std::unexpected(std::format(
            "{}:{}: Duplicate value for {:#08x}",
            filename, csv_reader->number(), emplace_ret.first->first));
      }
    }
  }

  return ret;
}

}  // namespace

// Usage: gen_u [OPTIONS...] UnicodeData [OUTPUT]
//
// Requires a prefix option. Writes the generated source to OUTPUT, or to
// stdout when OUTPUT is missing or "-". Returns 0 on success and 1 on any
// usage, read or write error.
int main(int argc, char** argv) {
  auto args = Args::create();
  auto opt_help = args->option('h', "help", "display this text and exit");
  auto opt_prefix =
      args->option_argument('p', "prefix", "ARG", "Prefix for exported method");
  std::vector<std::string_view> arguments;
  if (!args->run(argc, argv, &arguments)) {
    std::cerr << "Try `gen_u --help` for usage\n";
    return 1;
  }
  if (opt_help->is_set()) {
    std::cout << "Usage: `gen_u [OPTIONS...] UnicodeData [OUTPUT]`\n"
              << "Generates a method for getting the general category for a "
              << "code point.\n"
              << "\n";
    args->print_help(std::cout);
    return 0;
  }
  if (!opt_prefix->is_set()) {
    std::cerr << "No prefix given.\n"
              << "Try `gen_u --help` for usage\n";
    return 1;
  }
  auto prefix = opt_prefix->argument();
  if (arguments.empty() || arguments.size() > 2) {
    std::cerr << "Expecting one or two argument. No more, no less.\n"
              << "Try `gen_u --help` for usage\n";
    return 1;
  }

  auto general_categories = read(arguments[0]);
  if (!general_categories.has_value()) {
    std::cerr << general_categories.error() << '\n';
    return 1;
  }

  // Writes the complete generated source to `out`.
  auto generate = [&](std::ostream& out) {
    print_header(out, prefix);
    print_body(out, general_categories.value());
    print_footer(out, prefix);
  };

  if (arguments.size() < 2 || arguments[1] == "-") {
    generate(std::cout);
    return 0;
  }

  std::fstream out{std::string(arguments[1]),
                   std::fstream::trunc | std::fstream::out};
  if (!out.is_open()) {
    std::cerr << "Unable to open " << arguments[1] << " for writing\n";
    return 1;
  }
  generate(out);
  // Flush explicitly so a failed write is reported instead of being lost
  // when the stream is destroyed.
  out.flush();
  if (!out) {
    std::cerr << "Error writing to " << arguments[1] << '\n';
    return 1;
  }
  return 0;
}
