diff options
| -rw-r--r-- | meson.build | 20 | ||||
| -rw-r--r-- | src/prefix_tree.cc | 173 | ||||
| -rw-r--r-- | src/prefix_tree.hh | 36 | ||||
| -rw-r--r-- | test/prefix_tree.cc | 47 |
4 files changed, 276 insertions, 0 deletions
diff --git a/meson.build b/meson.build index 99c7ec9..1607088 100644 --- a/meson.build +++ b/meson.build @@ -191,6 +191,16 @@ java_uescape_dep = declare_dependency( dependencies: [buffer_dep, uio_dep], ) +prefix_tree_lib = library( + 'prefix_tree', + sources: [ + 'src/prefix_tree.hh', + 'src/prefix_tree.cc', + ], + include_directories: inc, +) +prefix_tree_dep = declare_dependency(link_with: prefix_tree_lib) + jkc = executable( 'jkc', sources: [ @@ -341,6 +351,16 @@ test('java_uescape', executable( ], )) +test('prefix_tree', executable( + 'test_prefix_tree', + sources: ['test/prefix_tree.cc'], + include_directories: inc, + dependencies: [ + prefix_tree_dep, + test_dependencies, + ], +)) + run_clang_tidy = find_program('run-clang-tidy', required: false) if run_clang_tidy.found() diff --git a/src/prefix_tree.cc b/src/prefix_tree.cc new file mode 100644 index 0000000..f16df22 --- /dev/null +++ b/src/prefix_tree.cc @@ -0,0 +1,173 @@ +#include "prefix_tree.hh" + +#include <cassert> +#include <cstddef> +#include <cstdint> +#include <map> +#include <memory> +#include <optional> +#include <set> +#include <string> +#include <string_view> + +namespace prefix_tree { + +namespace { + +[[nodiscard]] +inline uint8_t get_u8(std::string_view str, size_t index) { + return static_cast<uint8_t>(str[index] & 0xff); +} + +[[nodiscard]] +inline uint16_t get_u16(std::string_view str, size_t index) { + return (static_cast<uint16_t>(get_u8(str, index)) << 8) | + get_u8(str, index + 1); +} + +inline void write_u8(std::string& str, uint8_t value) { + str.push_back(static_cast<char>(value)); +} + +inline void write_u16(std::string& str, uint16_t value) { + write_u8(str, value >> 8); + write_u8(str, value & 0xff); +} + +class BuilderImpl : public Builder { + public: + BuilderImpl() = default; + + void add(std::string_view str) override { strings_.emplace(str); } + + [[nodiscard]] + std::optional<std::string> build() const override { + std::string tree; + std::string strings; + + std::set<std::string_view> tmp; + for (auto const& str : strings_) + tmp.emplace(str); + + if (!write_tree(tmp, tree, strings)) + return std::nullopt; + + std::string header; + if (tree.size() > 0xffff) + return std::nullopt; + write_u16(header, tree.size()); + + return header + tree + strings; + } + + private: + bool write_tree(std::set<std::string_view> const& input, std::string& tree, + std::string& strings) const { + std::map<char, std::set<std::string_view>> buckets; + bool match = false; + for (auto& str : input) { + if (str.empty()) { + match = true; + continue; + } + buckets[str.front()].emplace(str.substr(1)); + } + + write_u8(tree, buckets.size() + (match ? 1 : 0)); + if (match) { + write_u8(tree, 0); + write_u16(tree, 0); + write_u16(tree, 0); + } + std::string extra; + for (auto& pair : buckets) { + auto it = pair.second.begin(); + auto str = *it; + for (++it; it != pair.second.end(); ++it) { + size_t i = 0; + while (i < str.size() && i < it->size() && str[i] == it->at(i)) + ++i; + if (i == 0) { + str = ""; + break; + } + str = str.substr(0, i); + } + + write_u8(tree, 1 + str.size()); + if (strings.size() > 0xffff) + return false; + write_u16(tree, strings.size()); + strings.push_back(pair.first); + strings.append(str); + if (extra.size() > 0xffff) + return false; + write_u16(tree, extra.size()); + if (str.empty()) { + if (!write_tree(pair.second, extra, strings)) + return false; + } else { + std::set<std::string_view> tmp; + for (auto& str2 : pair.second) { + tmp.emplace(str2.substr(str.size())); + } + if (!write_tree(tmp, extra, strings)) + return false; + } + } + + tree.append(extra); + return true; + } + + std::set<std::string> strings_; +}; + +} // namespace + +std::optional<size_t> lookup(std::string_view tree, std::string_view str) { + size_t base_str = 2 + get_u16(tree, 0); + size_t node = 2; + std::optional<size_t> match; + std::optional<size_t> earlier_match; + + while (node < base_str && !str.empty()) { + auto children = get_u8(tree, node); + + if (children == 0) { + // Leaf + return match; + } + + size_t child_node = node + 1; + size_t child_end = child_node + (static_cast<size_t>(children) * 5); + for (; child_node < child_end; child_node += 5) { + uint8_t len = get_u8(tree, child_node); + uint16_t offset = get_u16(tree, child_node + 1); + + if (str.starts_with(tree.substr(base_str + offset, len))) { + // Match but not a leaf, always first in the list of children + if (len == 0) { + earlier_match = match; + continue; + } + match = match.value_or(0) + len; + str = str.substr(len); + auto jump = get_u16(tree, child_node + 3); + node = child_end + jump; + break; + } + } + + if (child_node == child_end) + return earlier_match; + } + + if (node == base_str) + return earlier_match; + return match; +} + +std::unique_ptr<Builder> builder() { return std::make_unique<BuilderImpl>(); } + +} // namespace prefix_tree diff --git a/src/prefix_tree.hh b/src/prefix_tree.hh new file mode 100644 index 0000000..6e4c792 --- /dev/null +++ b/src/prefix_tree.hh @@ -0,0 +1,36 @@ +#ifndef PREFIX_TREE_HH +#define PREFIX_TREE_HH + +#include <memory> +#include <optional> +#include <string> +#include <string_view> + +namespace prefix_tree { + +class Builder { + public: + virtual ~Builder() = default; + + virtual void add(std::string_view str) = 0; + [[nodiscard]] + virtual std::optional<std::string> build() const = 0; + + protected: + Builder() = default; + + Builder(Builder const&) = delete; + Builder& operator=(Builder const&) = delete; +}; + +[[nodiscard]] +std::unique_ptr<Builder> builder(); + +// Returns the length of the prefix of str that exists in tree. +// Returns nullopt if prefix of str doesn't match any string in tree. +[[nodiscard]] +std::optional<size_t> lookup(std::string_view tree, std::string_view str); + +} // namespace prefix_tree + +#endif // PREFIX_TREE_HH diff --git a/test/prefix_tree.cc b/test/prefix_tree.cc new file mode 100644 index 0000000..6c00adb --- /dev/null +++ b/test/prefix_tree.cc @@ -0,0 +1,47 @@ +#include "prefix_tree.hh" + +#include <gtest/gtest.h> + +TEST(prefix_tree, empty) { + auto builder = prefix_tree::builder(); + auto tree = builder->build(); + ASSERT_TRUE(tree.has_value()); + auto ret = prefix_tree::lookup(tree.value(), ""); + EXPECT_FALSE(ret.has_value()); + ret = prefix_tree::lookup(tree.value(), "foo"); + EXPECT_FALSE(ret.has_value()); +} + +TEST(prefix_tree, sanity) { + auto builder = prefix_tree::builder(); + builder->add("foo"); + builder->add("bar"); + builder->add("foobar"); + auto tree = builder->build(); + ASSERT_TRUE(tree.has_value()); + auto ret = prefix_tree::lookup(tree.value(), ""); + EXPECT_FALSE(ret.has_value()); + ret = prefix_tree::lookup(tree.value(), "foo"); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(3, ret.value()); + ret = prefix_tree::lookup(tree.value(), "foobar"); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(6, ret.value()); + ret = prefix_tree::lookup(tree.value(), "bar"); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(3, ret.value()); + ret = prefix_tree::lookup(tree.value(), "ba"); + EXPECT_FALSE(ret.has_value()); + ret = prefix_tree::lookup(tree.value(), "barf"); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(3, ret.value()); + ret = prefix_tree::lookup(tree.value(), "foob"); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(3, ret.value()); + ret = prefix_tree::lookup(tree.value(), "foozum"); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(3, ret.value()); + ret = prefix_tree::lookup(tree.value(), "barfoo"); + ASSERT_TRUE(ret.has_value()); + EXPECT_EQ(3, ret.value()); +} |
