diff options
| author | Joel Klinghed <the_jk@spawned.biz> | 2025-09-28 22:49:11 +0200 |
|---|---|---|
| committer | Joel Klinghed <the_jk@spawned.biz> | 2025-09-28 22:49:11 +0200 |
| commit | 17394f1e34b15f1d2bada7a5f37fbf08c239ca5c (patch) | |
| tree | 76d5e0b18b26f6a66a18376db2a4b218b82b05fc /src/prefix_tree.cc | |
| parent | 2f13baa843bd1fb5db6630a2823681ffaff9fb11 (diff) | |
prefix_tree: Make a byte optimized version for really small trees
Currently supports 8-bit distances and 16-bit distances.
Diffstat (limited to 'src/prefix_tree.cc')
| -rw-r--r-- | src/prefix_tree.cc | 112 |
1 files changed, 97 insertions, 15 deletions
diff --git a/src/prefix_tree.cc b/src/prefix_tree.cc index f16df22..56466e8 100644 --- a/src/prefix_tree.cc +++ b/src/prefix_tree.cc @@ -42,6 +42,17 @@ class BuilderImpl : public Builder { [[nodiscard]] std::optional<std::string> build() const override { + auto ret = build(1); + if (!ret.has_value()) + ret = build(2); + return ret; + } + + private: + [[nodiscard]] + std::optional<std::string> build(uint8_t size) const { + assert(size > 0); + std::string tree; std::string strings; @@ -49,20 +60,19 @@ class BuilderImpl : public Builder { for (auto const& str : strings_) tmp.emplace(str); - if (!write_tree(tmp, tree, strings)) + if (!write_tree(tmp, tree, strings, size)) return std::nullopt; std::string header; - if (tree.size() > 0xffff) + write_u8(header, size); + if (!write_size(header, size, tree.size())) return std::nullopt; - write_u16(header, tree.size()); return header + tree + strings; } - private: bool write_tree(std::set<std::string_view> const& input, std::string& tree, - std::string& strings) const { + std::string& strings, uint8_t size) const { std::map<char, std::set<std::string_view>> buckets; bool match = false; for (auto& str : input) { @@ -76,8 +86,8 @@ class BuilderImpl : public Builder { write_u8(tree, buckets.size() + (match ? 1 : 0)); if (match) { write_u8(tree, 0); - write_u16(tree, 0); - write_u16(tree, 0); + write_size(tree, size, 0); + write_size(tree, size, 0); } std::string extra; for (auto& pair : buckets) { @@ -95,23 +105,23 @@ class BuilderImpl : public Builder { } write_u8(tree, 1 + str.size()); - if (strings.size() > 0xffff) + if (!write_size(tree, size, strings.size())) return false; - write_u16(tree, strings.size()); strings.push_back(pair.first); strings.append(str); if (extra.size() > 0xffff) return false; - write_u16(tree, extra.size()); + if (!write_size(tree, size, extra.size())) + return false; if (str.empty()) { - if (!write_tree(pair.second, extra, strings)) + if (!write_tree(pair.second, extra, strings, size)) return false; } else { std::set<std::string_view> tmp; for (auto& str2 : pair.second) { tmp.emplace(str2.substr(str.size())); } - if (!write_tree(tmp, extra, strings)) + if (!write_tree(tmp, extra, strings, size)) return false; } } @@ -120,12 +130,27 @@ class BuilderImpl : public Builder { return true; } + static bool write_size(std::string& str, uint8_t size, size_t value) { + if (size == 1) { + if (value > 0xff) + return false; + write_u8(str, value); + return true; + } + if (size == 2) { + if (value > 0xffff) + return false; + write_u16(str, value); + return true; + } + assert(false); + return false; + } + std::set<std::string> strings_; }; -} // namespace - -std::optional<size_t> lookup(std::string_view tree, std::string_view str) { +std::optional<size_t> lookup16(std::string_view tree, std::string_view str) { size_t base_str = 2 + get_u16(tree, 0); size_t node = 2; std::optional<size_t> match; @@ -168,6 +193,63 @@ std::optional<size_t> lookup(std::string_view tree, std::string_view str) { return match; } +std::optional<size_t> lookup8(std::string_view tree, std::string_view str) { + size_t base_str = 1 + get_u8(tree, 0); + size_t node = 1; + std::optional<size_t> match; + std::optional<size_t> earlier_match; + + while (node < base_str && !str.empty()) { + auto children = get_u8(tree, node); + + if (children == 0) { + // Leaf + return match; + } + + size_t child_node = node + 1; + size_t child_end = child_node + (static_cast<size_t>(children) * 3); + for (; child_node < child_end; child_node += 3) { + uint8_t len = get_u8(tree, child_node); + uint8_t offset = get_u8(tree, child_node + 1); + + if (str.starts_with(tree.substr(base_str + offset, len))) { + // Match but not a leaf, always first in the list of children + if (len == 0) { + earlier_match = match; + continue; + } + match = match.value_or(0) + len; + str = str.substr(len); + auto jump = get_u8(tree, child_node + 2); + node = child_end + jump; + break; + } + } + + if (child_node == child_end) + return earlier_match; + } + + if (node == base_str) + return earlier_match; + return match; +} + +} // namespace + +std::optional<size_t> lookup(std::string_view tree, std::string_view str) { + auto size = get_u8(tree, 0); + if (size == 1) { + return lookup8(tree.substr(1), str); + } + if (size == 2) { + return lookup16(tree.substr(1), str); + } + assert(false); + return std::nullopt; +} + std::unique_ptr<Builder> builder() { return std::make_unique<BuilderImpl>(); } } // namespace prefix_tree |
