Last active
January 13, 2021 01:09
-
-
Save binkiklou/90a3d77c07cc04461e4e6da2d1260e9a to your computer and use it in GitHub Desktop.
Tests time for finding 100k strings in vector of 1M strings ranging from 1 to 64 chars
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // I started writing this on WSL, but I'm finishing it | |
| // on Visual Studio, cuz I want to see the native performance | |
| // Should work with gcc, but might not. | |
| // btw Opening the file in Visual Studio broke the whole formating | |
| #include <string> | |
| #include <chrono> | |
| #include <vector> | |
| #include <random> | |
| #include <utility> | |
| #include <new> | |
| #include <iostream> | |
| #include <cstring> | |
| #define START_TIME auto t1 = std::chrono::high_resolution_clock::now(); | |
| #define STOP_TIME auto t2 = std::chrono::high_resolution_clock::now(); \ | |
| auto d = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count(); | |
| #define STOP_TIME_MS auto t2 = std::chrono::high_resolution_clock::now(); \ | |
| auto d = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count(); | |
| #define WORD_MIN 1 // Minimum number of chars | |
| #define WORD_MAX 64 // Maximum number of chars | |
| // Number of pseudo-random strings to make | |
| #define SAMPLE_SIZE 1000000 | |
| // not a very fitting name but who cares | |
| // This is the number of strings to find | |
| #define TARGET_SIZE 100000 | |
| class tracker | |
| { | |
| public: | |
| static void reset(); | |
| static void dump(); | |
| static int accesses; | |
| static int nodes; | |
| static int allocs; | |
| }; | |
| // Doesn't take into account std allocations | |
| // cuz I'm too lazy to track that | |
| int tracker::allocs; | |
| // Somewhat an indication of accesses, but not | |
| // the actual value | |
| int tracker::accesses; | |
| // I think that number is right | |
| int tracker::nodes; | |
| void tracker::reset() { | |
| allocs = 0; | |
| accesses = 0; | |
| nodes = 0; | |
| } | |
| void tracker::dump() { | |
| std::cout << "Allocations:" << allocs << "B" << std::endl; | |
| std::cout << "Accesses:" << accesses << std::endl; | |
| std::cout << "Nodes:" << nodes << std::endl; | |
| } | |
| // GENERATION | |
| std::vector<char*> random_strings(int count) | |
| { | |
| std::vector<char*> data; | |
| std::default_random_engine gen; | |
| std::uniform_int_distribution<int> len_dist(WORD_MIN, WORD_MAX); | |
| std::uniform_int_distribution<int> char_dist(48, 90); | |
| for (int i = 0; i < count; i++) { | |
| char len = static_cast<char>(len_dist(gen)); | |
| tracker::allocs += len; | |
| char* s = (char*)malloc(len + 1); | |
| for (int i = 0; i < len; i++) { | |
| s[i] = char_dist(gen); | |
| } | |
| s[len] = '\0'; | |
| data.push_back(s); | |
| } | |
| return data; | |
| } | |
| // DUMB SEARCH | |
| class dumb_searcher | |
| { | |
| public: | |
| ~dumb_searcher() { | |
| data = std::vector<char*>(); | |
| } | |
| bool find(char* x) | |
| { | |
| for (char* v : this->data) { | |
| tracker::accesses++; | |
| if (strcmp(v, x) == 0) { | |
| return true; | |
| } | |
| } | |
| return false; | |
| } | |
| std::vector<char*> data; | |
| }; | |
| // BLOCK(SHARED) | |
| class block | |
| { | |
| public: | |
| unsigned int key; | |
| std::vector<char*> strings; | |
| }; | |
| // SMARTER SEARCH | |
| class smarter_seacher | |
| { | |
| public: | |
| ~smarter_seacher() { | |
| this->data = std::vector<block>(); | |
| } | |
| void setup(std::vector<char*> data) | |
| { | |
| // This only works if you know the | |
| // maximum size. This is also very memory inneficient | |
| block b; | |
| for (int i = 0; i != WORD_MAX + 1; i++) { | |
| b.key = i; | |
| this->data.push_back(b); | |
| } | |
| for (char* str : data) | |
| { | |
| tracker::accesses++; | |
| this->data.at(strlen(str)).strings.push_back(str); | |
| } | |
| } | |
| bool find(char* str) | |
| { | |
| size_t len = strlen(str); | |
| for (char* v : this->data.at(len).strings) | |
| { | |
| tracker::accesses++; | |
| if (strcmp(v, str) == 0) { | |
| return true; | |
| } | |
| } | |
| return false; | |
| } | |
| private: | |
| std::vector<block> data; | |
| }; | |
| // TREE SEARCH | |
| class node | |
| { | |
| public: | |
| node() | |
| { | |
| tracker::nodes++; | |
| l = nullptr; | |
| r = nullptr; | |
| } | |
| ~node() | |
| { | |
| if (r != nullptr) { | |
| delete r; | |
| } | |
| if (l != nullptr) { | |
| delete l; | |
| } | |
| } | |
| void push(int val, char* str) | |
| { | |
| if (this->set == false) { | |
| this->value.key = val; | |
| this->value.strings.push_back(str); | |
| this->set = true; | |
| } | |
| else if (val == this->value.key) { | |
| this->value.strings.push_back(str); | |
| } | |
| else if (val < this->value.key) { | |
| if (this->l == nullptr) { | |
| tracker::allocs++; | |
| this->l = new node(); | |
| } | |
| this->l->push(val, str); | |
| } | |
| else { | |
| if (this->r == nullptr) { | |
| tracker::allocs++; | |
| this->r = new node(); | |
| } | |
| this->r->push(val, str); | |
| } | |
| } | |
| bool find(int val, char* str) | |
| { | |
| if (this->set == false) { | |
| return false; | |
| } | |
| else if (val == this->value.key) { | |
| for (char* v : this->value.strings) | |
| { | |
| tracker::accesses++; | |
| if (strcmp(v, str) == 0) { | |
| return true; | |
| } | |
| } | |
| return false; | |
| } | |
| else if (val < this->value.key) { | |
| if (this->l != nullptr) { | |
| return l->find(val, str); | |
| } | |
| return false; | |
| } | |
| else if (val > this->value.key) { | |
| if (this->r != nullptr) { | |
| return r->find(val, str); | |
| } | |
| return false; | |
| } | |
| return false; | |
| } | |
| block value; | |
| node* l = nullptr; | |
| node* r = nullptr; | |
| bool set = false; | |
| }; | |
| class tree_searcher | |
| { | |
| public: | |
| tree_searcher() { | |
| root = node(); | |
| } | |
| bool find(char* str) | |
| { | |
| return root.find(strlen(str), str); | |
| } | |
| void push(char* str) | |
| { | |
| root.push(strlen(str), str); | |
| } | |
| private: | |
| node root; | |
| std::vector<block> data; | |
| }; | |
| class hashed_tree_searcher | |
| { | |
| public: | |
| hashed_tree_searcher() { | |
| root = node(); | |
| } | |
| bool find(char* str) { | |
| size_t hash = 0; | |
| for (int i = 0; i < strlen(str); i++) { | |
| hash += str[i]; | |
| } | |
| return root.find(hash, str); | |
| } | |
| void push(char* str) { | |
| size_t hash = 0; | |
| for (int i = 0; i < strlen(str); i++) { | |
| hash += str[i]; | |
| } | |
| root.push(hash, str); | |
| } | |
| private: | |
| node root; | |
| std::vector<block> data; | |
| }; | |
| int main(int argc, char* argv[]) | |
| { | |
| tracker::reset(); | |
| std::vector<char*> data; | |
| std::vector<char*> targets; | |
| { | |
| START_TIME | |
| data = random_strings(SAMPLE_SIZE); | |
| std::default_random_engine gen; | |
| std::uniform_int_distribution<int> bruh(1, data.size() - 1); | |
| for (int i = 0; i < TARGET_SIZE; i++) { | |
| targets.push_back(data.at(bruh(gen))); | |
| } | |
| STOP_TIME_MS | |
| std::cout << "Generating "<<SAMPLE_SIZE<<" pseudo-random strings took " << d << "ms" << std::endl; | |
| std::cout << "Choosen "<<TARGET_SIZE<<" strings to find" << std::endl; | |
| tracker::dump(); | |
| } | |
| { | |
| // This isn't fast because it is not designed to retrieve mutliple | |
| // elements at once. So it doesn't index like other techniques do. | |
| // or maybe STL bad :^) | |
| std::cout << "--=== std::find ===---" << std::endl; | |
| { | |
| START_TIME | |
| std::find(data.begin(), data.end(), targets.at(0)); | |
| STOP_TIME | |
| std::cout << "Found single element in " << d <<"us"<< std::endl; | |
| } | |
| START_TIME | |
| int i = 0; | |
| for (char* str : targets) | |
| { | |
| if (i % (TARGET_SIZE / 10) == 0) { | |
| std::cout << "."; | |
| } | |
| if (std::find(data.begin(), data.end(), str) == data.end()) | |
| { | |
| std::cout << "std find failed" << std::endl; | |
| return -1; | |
| } | |
| i++; | |
| } | |
| STOP_TIME_MS | |
| std::cout << "\n"; | |
| std::cout << "std::find took " << d << "ms" << std::endl; | |
| } | |
| tracker::reset(); | |
| { | |
| std::cout << "---=== (slightly)SMARTER SEARCH ===---" << std::endl; | |
| smarter_seacher s; | |
| { | |
| START_TIME | |
| s.setup(data); | |
| STOP_TIME | |
| std::cout << "Smarter Search overhead is " << d << "us" << std::endl; | |
| } | |
| { | |
| START_TIME | |
| s.find(targets.at(0)); | |
| STOP_TIME | |
| std::cout << "Found single element in " << d <<"us"<< std::endl; | |
| } | |
| START_TIME | |
| for (char* str : targets) { | |
| if (!s.find(str)) { | |
| std::cout << "Smarter searcher failed" << std::endl; | |
| return -1; | |
| } | |
| } | |
| STOP_TIME_MS | |
| std::cout << "Smarter search took " << d << "ms" << std::endl; | |
| tracker::dump(); | |
| } | |
| tracker::reset(); | |
| { | |
| std::cout << "---=== TREE SEARCH ===---" << std::endl; | |
| tree_searcher s = tree_searcher(); | |
| { | |
| START_TIME | |
| for (char* str : data) { | |
| s.push(str); | |
| } | |
| STOP_TIME | |
| std::cout << "Tree search overhead is " << d << "us" << std::endl; | |
| } | |
| { | |
| START_TIME | |
| s.find(targets.at(0)); | |
| STOP_TIME | |
| std::cout << "Found single element in " << d <<"us"<< std::endl; | |
| } | |
| START_TIME | |
| for (char* str : targets) { | |
| if (!s.find(str)) { | |
| std::cout << "Tree searcher failed" << std::endl; | |
| return -1; | |
| } | |
| } | |
| STOP_TIME_MS | |
| std::cout << "Tree search took " << d << "ms" << std::endl; | |
| tracker::dump(); | |
| } | |
| tracker::reset(); | |
| { | |
| // Hash tree is probably already a thing | |
| // but this is just, a binary tree composed | |
| // of the "Hash"(which is litteraly the sum of all chars in string) | |
| std::cout << "---=== HASH TREE SEARCH ===---" << std::endl; | |
| hashed_tree_searcher s = hashed_tree_searcher(); | |
| { | |
| START_TIME | |
| for (char* str : data) { | |
| s.push(str); | |
| } | |
| STOP_TIME | |
| std::cout << "Hashed tree search overhead is " << d << "us" << std::endl; | |
| } | |
| { | |
| START_TIME | |
| s.find(targets.at(0)); | |
| STOP_TIME | |
| std::cout << "Found single element in " << d << "us" << std::endl; | |
| } | |
| START_TIME | |
| for (char* str : targets) { | |
| if (!s.find(str)) { | |
| std::cout << "Hash tree searcher failed" << std::endl; | |
| return -1; | |
| } | |
| } | |
| STOP_TIME_MS | |
| std::cout << "Hash tree took " << d << "ms" << std::endl; | |
| tracker::dump(); | |
| } | |
| tracker::reset(); | |
| { | |
| std::cout << "---=== DUMB SEARCH ===---" << std::endl; | |
| dumb_searcher s; | |
| // The "overhead" is just copying the vector, which isn't really needed and would save time | |
| // but I do it anyway cuz it's easy like that | |
| { | |
| START_TIME | |
| s.data = data; | |
| STOP_TIME | |
| std::cout << "Dumb search overhead is " << d << "us" << std::endl; | |
| } | |
| { | |
| START_TIME | |
| s.find(targets.at(0)); | |
| STOP_TIME | |
| std::cout << "Found single element in " << d << "us" << std::endl; | |
| } | |
| START_TIME | |
| int i = 0; | |
| for (char* str : targets) { | |
| if (i % (TARGET_SIZE / 10) == 0) { | |
| std::cout << "."; | |
| } | |
| if (!s.find(str)) { | |
| std::cout << "Dumb searcher failed" << std::endl; | |
| return -1; | |
| } | |
| i++; | |
| } | |
| STOP_TIME_MS | |
| std::cout << "\n"; | |
| std::cout << "Dumb search took " << d << "ms" << std::endl; | |
| tracker::dump(); | |
| } | |
| // just free because why not | |
| for (char* ptr : data) { | |
| free(ptr); | |
| } | |
| data = std::vector<char*>(); | |
| targets = std::vector<char*>(); | |
| return 0; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment