diff --git a/src/core/util/CMakeLists.txt b/src/core/util/CMakeLists.txt
index 06052098..b8cd483d 100644
--- a/src/core/util/CMakeLists.txt
+++ b/src/core/util/CMakeLists.txt
@@ -89,3 +89,6 @@ endif (BUILD_LLVM_DISASSEMBLER)
 add_executable(memorymap-test testing/memorymap-test.cc)
 target_link_libraries(memorymap-test fail-util)
 add_test(NAME memorymap-test WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testing COMMAND memorymap-test)
+
+add_executable(sumtree-test testing/SumTreeTest.cc)
+add_test(NAME sumtree-test WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testing COMMAND sumtree-test)
diff --git a/src/core/util/SumTree.hpp b/src/core/util/SumTree.hpp
new file mode 100644
index 00000000..998e3e05
--- /dev/null
+++ b/src/core/util/SumTree.hpp
@@ -0,0 +1,203 @@
+#ifndef __SUM_TREE_HPP__
+#define __SUM_TREE_HPP__
+
+#include <cassert>
+#include <stdint.h>
+#include <vector>
+
+// The SumTree implements an efficient tree data structure for
+// "roulette-wheel" sampling, or "sampling with fault expansion", i.e.,
+// sampling of trace entries / pilots without replacement and with a
+// picking probability proportional to the entries' sizes.
+//
+// For every sample, the naive approach picks a random number between 0
+// and the sum of all entry sizes minus one.  It then iterates over all
+// entries and sums their sizes until the sum exceeds the random number.
+// The current entry gets picked.  The main disadvantage is the linear
+// complexity, which gets unpleasant for millions of entries.
+//
+// The core idea behind the SumTree implementation is to maintain the
+// size sum of groups of entries, kept in "buckets".  Thereby, a bucket
+// can be quickly jumped over.  To keep bucket sizes (and thereby linear
+// search times) bounded, more bucket hierarchy levels are introduced
+// when a defined bucket size limit is reached.
+//
+// Note that the current implementation is built for a pure growth phase
+// (when the tree gets filled with pilots from the database), followed by
+// a sampling phase when the tree gets emptied.  It does not handle a
+// mixed add/remove case very smartly, although it should remain
+// functional.
+
+namespace fail {
+
+// T must be copyable and default-constructible, and has to provide a
+// size_type typedef and a const size() method yielding the element's
+// weight.  BUCKETSIZE limits the number of children / elements per bucket.
+// NOTE(review): the template parameter list was lost when this patch was
+// extracted; the default of 1000 is an assumption, confirm with callers.
+template <typename T, unsigned BUCKETSIZE = 1000>
+class SumTree {
+    // Growing a full tree puts the old root below a new root and adds a
+    // sibling next to it, so every bucket needs room for two children.
+    typedef char bucketsize_must_be_at_least_two[(BUCKETSIZE >= 2) ? 1 : -1];
+
+    //! Bucket data structure for tree nodes
+    struct Bucket {
+        Bucket() : size(0) {}
+        ~Bucket();
+        //! Sum of all children / elements
+        typename T::size_type size;
+        //! Sub-buckets, empty for leaf nodes
+        std::vector<Bucket *> children;
+        //! Contained elements, empty for inner nodes
+        std::vector<T> elements;
+    private:
+        // buckets own their children, a copy would delete them twice
+        Bucket(const Bucket&);
+        Bucket& operator=(const Bucket&);
+    };
+
+    //! Root node
+    Bucket *m_root;
+    //! Tree depth: nodes at level m_depth are leaf nodes, others are inner nodes
+    unsigned m_depth;
+public:
+    SumTree() : m_root(new Bucket), m_depth(0) {}
+    ~SumTree() { delete m_root; }
+    //! Adds a new element to the tree; elements of size 0 are ignored.
+    void add(const T& element);
+    //! Retrieves (and removes) element at random number position.
+    //! pos must be smaller than get_size().
+    T get(typename T::size_type pos) { return get(pos, m_root, 0); }
+    //! Yields the sum over all elements in the tree.
+    typename T::size_type get_size() const { return m_root->size; }
+private:
+    // the tree owns m_root, a copy would delete it twice
+    SumTree(const SumTree&);
+    SumTree& operator=(const SumTree&);
+    //! Internal, recursive version of add().
+    bool add(Bucket *node, const T& element, unsigned depth_remaining);
+    //! Internal, recursive version of get().
+    T get(typename T::size_type pos, Bucket *node, typename T::size_type sum);
+};
+
+// template implementation
+
+template <typename T, unsigned BUCKETSIZE>
+SumTree<T, BUCKETSIZE>::Bucket::~Bucket()
+{
+    for (typename std::vector<Bucket *>::const_iterator it = children.begin();
+         it != children.end(); ++it) {
+        delete *it;
+    }
+}
+
+template <typename T, unsigned BUCKETSIZE>
+void SumTree<T, BUCKETSIZE>::add(const T& element)
+{
+    if (element.size() == 0) {
+        // pilots with size == 0 cannot be picked anyways
+        return;
+    }
+
+    if (add(m_root, element, m_depth)) {
+        // tree wasn't full yet, add succeeded
+        return;
+    }
+
+    // tree is full, move everything one level down
+    ++m_depth;
+    Bucket *b = new Bucket;
+    b->children.push_back(m_root);
+    b->size = m_root->size;
+    m_root = b;
+
+    // retry: the new root has room for a sibling of the old one, so this
+    // must succeed
+    const bool added = add(m_root, element, m_depth);
+    assert(added);
+    (void) added; // silence "unused" warning in NDEBUG builds
+}
+
+template <typename T, unsigned BUCKETSIZE>
+bool SumTree<T, BUCKETSIZE>::add(Bucket *node, const T& element, unsigned depth_remaining)
+{
+    // leaf node: store the element itself, if there is room left
+    if (depth_remaining == 0) {
+        if (node->elements.size() >= BUCKETSIZE) {
+            return false;
+        }
+        node->elements.push_back(element);
+        node->size += element.size();
+        return true;
+    }
+
+    // inner node: only the newest child is tried for free room
+    if (!node->children.empty()
+        && add(node->children.back(), element, depth_remaining - 1)) {
+        node->size += element.size();
+        return true;
+    }
+
+    // newest child full (or no child yet), may we create another one?
+    if (node->children.size() >= BUCKETSIZE) {
+        // recursive add ultimately failed, subtree full
+        return false;
+    }
+    node->children.push_back(new Bucket);
+    // a fresh subtree always has room for one element
+    add(node->children.back(), element, depth_remaining - 1);
+    node->size += element.size();
+    return true;
+}
+
+template <typename T, unsigned BUCKETSIZE>
+T SumTree<T, BUCKETSIZE>::get(typename T::size_type pos, Bucket *node, typename T::size_type sum)
+{
+    // sanity check
+    assert(pos >= sum && pos < sum + node->size);
+
+    // will only be entered for inner nodes
+    for (typename std::vector<Bucket *>::iterator it = node->children.begin();
+         it != node->children.end(); ++it) {
+        if (sum + (*it)->size <= pos) {
+            // pos lies behind this child, jump over it
+            sum += (*it)->size;
+            continue;
+        }
+
+        // found containing bucket, recurse
+        T e = get(pos, *it, sum);
+        node->size -= e.size();
+        // remove empty (or, at least, zero-sized) child?
+        if ((*it)->size == 0) {
+            delete *it;
+            node->children.erase(it);
+        }
+        return e;
+    }
+
+    // will only be entered for leaf nodes
+    for (typename std::vector<T>::iterator it = node->elements.begin();
+         it != node->elements.end(); ++it) {
+        if (sum + it->size() <= pos) {
+            // pos lies behind this element, jump over it
+            sum += it->size();
+            continue;
+        }
+
+        // found pilot
+        T e = *it;
+        node->size -= e.size();
+        node->elements.erase(it);
+        return e;
+    }
+
+    // this should never happen
+    assert(0);
+    return T();
+}
+
+} // namespace
+
+#endif
diff --git a/src/core/util/testing/SumTreeTest.cc b/src/core/util/testing/SumTreeTest.cc
new file mode 100644
index 00000000..1757cd17
--- /dev/null
+++ b/src/core/util/testing/SumTreeTest.cc
@@ -0,0 +1,68 @@
+#include "util/SumTree.hpp"
+
+#include <stdint.h>
+#include <iostream>
+#define LOG std::cerr
+
+using std::endl;
+
+struct Pilot {
+    uint32_t id;
+    uint32_t instr2;
+    uint32_t data_address;
+    uint64_t duration;
+
+    typedef uint64_t size_type;
+    size_type size() const { return duration; }
+};
+
+int main()
+{
+    // tiny buckets, so the pilots below spread over several tree levels
+    // and both the leaf and the inner-node code paths get exercised
+    fail::SumTree<Pilot, 2> tree;
+    uint64_t expected_size = 0;
+    for (int i = 0; i <= 20; ++i) {
+        Pilot p = Pilot(); // value-initialize, only duration matters here
+        p.duration = i;
+        tree.add(p);
+        expected_size += p.duration;
+    }
+
+    if (tree.get_size() != expected_size) {
+        LOG << "MAIN tree.get_size() = " << tree.get_size()
+            << ", expected " << expected_size << endl;
+        return 1;
+    }
+
+    // the pilot with duration 0 is dropped by add(), all others must come
+    // back exactly once
+    bool seen[21] = { false };
+    unsigned retrieved = 0;
+    while (tree.get_size() > 0) {
+        uint64_t pos = tree.get_size() / 2;
+        LOG << "MAIN tree.get_size() = " << tree.get_size()
+            << ", trying to retrieve pos = " << pos << endl;
+        Pilot p = tree.get(pos);
+        LOG << "MAIN retrieved pilot with duration " << p.duration << endl;
+
+        if (p.duration < 1 || p.duration > 20 || seen[p.duration]) {
+            LOG << "MAIN unexpected or duplicate pilot" << endl;
+            return 1;
+        }
+        seen[p.duration] = true;
+        ++retrieved;
+        expected_size -= p.duration;
+        if (tree.get_size() != expected_size) {
+            LOG << "MAIN tree.get_size() = " << tree.get_size()
+                << ", expected " << expected_size << endl;
+            return 1;
+        }
+    }
+
+    if (retrieved != 20) {
+        LOG << "MAIN retrieved " << retrieved << " pilots, expected 20" << endl;
+        return 1;
+    }
+    return 0;
+}