Skip to content

Instantly share code, notes, and snippets.

@binkiklou
Last active January 13, 2021 01:09
Show Gist options
  • Select an option

  • Save binkiklou/90a3d77c07cc04461e4e6da2d1260e9a to your computer and use it in GitHub Desktop.

Select an option

Save binkiklou/90a3d77c07cc04461e4e6da2d1260e9a to your computer and use it in GitHub Desktop.
Tests time for finding 100k strings in vector of 1M strings ranging from 1 to 64 chars
// I started writing this on WSL, but I'm finishing it
// on Visual Studio, cuz I want to see the native performance
// Should work with gcc, but might not.
// btw Opening the file in Visual Studio broke the whole formating
#include <string>
#include <chrono>
#include <vector>
#include <random>
#include <utility>
#include <new>
#include <iostream>
#include <cstring>
#define START_TIME auto t1 = std::chrono::high_resolution_clock::now();
#define STOP_TIME auto t2 = std::chrono::high_resolution_clock::now(); \
auto d = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
#define STOP_TIME_MS auto t2 = std::chrono::high_resolution_clock::now(); \
auto d = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
#define WORD_MIN 1 // Minimum number of chars
#define WORD_MAX 64 // Maximum number of chars
// Number of pseudo-random strings to make
#define SAMPLE_SIZE 1000000
// not a very fitting name but who cares
// This is the number of strings to find
#define TARGET_SIZE 100000
class tracker
{
public:
static void reset();
static void dump();
static int accesses;
static int nodes;
static int allocs;
};
// Doesn't take into account std allocations
// cuz I'm too lazy to track that
int tracker::allocs;
// Somewhat an indication of accesses, but not
// the actual value
int tracker::accesses;
// I think that number is right
int tracker::nodes;
void tracker::reset() {
allocs = 0;
accesses = 0;
nodes = 0;
}
void tracker::dump() {
std::cout << "Allocations:" << allocs << "B" << std::endl;
std::cout << "Accesses:" << accesses << std::endl;
std::cout << "Nodes:" << nodes << std::endl;
}
// GENERATION
std::vector<char*> random_strings(int count)
{
std::vector<char*> data;
std::default_random_engine gen;
std::uniform_int_distribution<int> len_dist(WORD_MIN, WORD_MAX);
std::uniform_int_distribution<int> char_dist(48, 90);
for (int i = 0; i < count; i++) {
char len = static_cast<char>(len_dist(gen));
tracker::allocs += len;
char* s = (char*)malloc(len + 1);
for (int i = 0; i < len; i++) {
s[i] = char_dist(gen);
}
s[len] = '\0';
data.push_back(s);
}
return data;
}
// DUMB SEARCH
class dumb_searcher
{
public:
~dumb_searcher() {
data = std::vector<char*>();
}
bool find(char* x)
{
for (char* v : this->data) {
tracker::accesses++;
if (strcmp(v, x) == 0) {
return true;
}
}
return false;
}
std::vector<char*> data;
};
// BLOCK(SHARED)
class block
{
public:
unsigned int key;
std::vector<char*> strings;
};
// SMARTER SEARCH
class smarter_seacher
{
public:
~smarter_seacher() {
this->data = std::vector<block>();
}
void setup(std::vector<char*> data)
{
// This only works if you know the
// maximum size. This is also very memory inneficient
block b;
for (int i = 0; i != WORD_MAX + 1; i++) {
b.key = i;
this->data.push_back(b);
}
for (char* str : data)
{
tracker::accesses++;
this->data.at(strlen(str)).strings.push_back(str);
}
}
bool find(char* str)
{
size_t len = strlen(str);
for (char* v : this->data.at(len).strings)
{
tracker::accesses++;
if (strcmp(v, str) == 0) {
return true;
}
}
return false;
}
private:
std::vector<block> data;
};
// TREE SEARCH
class node
{
public:
node()
{
tracker::nodes++;
l = nullptr;
r = nullptr;
}
~node()
{
if (r != nullptr) {
delete r;
}
if (l != nullptr) {
delete l;
}
}
void push(int val, char* str)
{
if (this->set == false) {
this->value.key = val;
this->value.strings.push_back(str);
this->set = true;
}
else if (val == this->value.key) {
this->value.strings.push_back(str);
}
else if (val < this->value.key) {
if (this->l == nullptr) {
tracker::allocs++;
this->l = new node();
}
this->l->push(val, str);
}
else {
if (this->r == nullptr) {
tracker::allocs++;
this->r = new node();
}
this->r->push(val, str);
}
}
bool find(int val, char* str)
{
if (this->set == false) {
return false;
}
else if (val == this->value.key) {
for (char* v : this->value.strings)
{
tracker::accesses++;
if (strcmp(v, str) == 0) {
return true;
}
}
return false;
}
else if (val < this->value.key) {
if (this->l != nullptr) {
return l->find(val, str);
}
return false;
}
else if (val > this->value.key) {
if (this->r != nullptr) {
return r->find(val, str);
}
return false;
}
return false;
}
block value;
node* l = nullptr;
node* r = nullptr;
bool set = false;
};
class tree_searcher
{
public:
tree_searcher() {
root = node();
}
bool find(char* str)
{
return root.find(strlen(str), str);
}
void push(char* str)
{
root.push(strlen(str), str);
}
private:
node root;
std::vector<block> data;
};
class hashed_tree_searcher
{
public:
hashed_tree_searcher() {
root = node();
}
bool find(char* str) {
size_t hash = 0;
for (int i = 0; i < strlen(str); i++) {
hash += str[i];
}
return root.find(hash, str);
}
void push(char* str) {
size_t hash = 0;
for (int i = 0; i < strlen(str); i++) {
hash += str[i];
}
root.push(hash, str);
}
private:
node root;
std::vector<block> data;
};
int main(int argc, char* argv[])
{
tracker::reset();
std::vector<char*> data;
std::vector<char*> targets;
{
START_TIME
data = random_strings(SAMPLE_SIZE);
std::default_random_engine gen;
std::uniform_int_distribution<int> bruh(1, data.size() - 1);
for (int i = 0; i < TARGET_SIZE; i++) {
targets.push_back(data.at(bruh(gen)));
}
STOP_TIME_MS
std::cout << "Generating "<<SAMPLE_SIZE<<" pseudo-random strings took " << d << "ms" << std::endl;
std::cout << "Choosen "<<TARGET_SIZE<<" strings to find" << std::endl;
tracker::dump();
}
{
// This isn't fast because it is not designed to retrieve mutliple
// elements at once. So it doesn't index like other techniques do.
// or maybe STL bad :^)
std::cout << "--=== std::find ===---" << std::endl;
{
START_TIME
std::find(data.begin(), data.end(), targets.at(0));
STOP_TIME
std::cout << "Found single element in " << d <<"us"<< std::endl;
}
START_TIME
int i = 0;
for (char* str : targets)
{
if (i % (TARGET_SIZE / 10) == 0) {
std::cout << ".";
}
if (std::find(data.begin(), data.end(), str) == data.end())
{
std::cout << "std find failed" << std::endl;
return -1;
}
i++;
}
STOP_TIME_MS
std::cout << "\n";
std::cout << "std::find took " << d << "ms" << std::endl;
}
tracker::reset();
{
std::cout << "---=== (slightly)SMARTER SEARCH ===---" << std::endl;
smarter_seacher s;
{
START_TIME
s.setup(data);
STOP_TIME
std::cout << "Smarter Search overhead is " << d << "us" << std::endl;
}
{
START_TIME
s.find(targets.at(0));
STOP_TIME
std::cout << "Found single element in " << d <<"us"<< std::endl;
}
START_TIME
for (char* str : targets) {
if (!s.find(str)) {
std::cout << "Smarter searcher failed" << std::endl;
return -1;
}
}
STOP_TIME_MS
std::cout << "Smarter search took " << d << "ms" << std::endl;
tracker::dump();
}
tracker::reset();
{
std::cout << "---=== TREE SEARCH ===---" << std::endl;
tree_searcher s = tree_searcher();
{
START_TIME
for (char* str : data) {
s.push(str);
}
STOP_TIME
std::cout << "Tree search overhead is " << d << "us" << std::endl;
}
{
START_TIME
s.find(targets.at(0));
STOP_TIME
std::cout << "Found single element in " << d <<"us"<< std::endl;
}
START_TIME
for (char* str : targets) {
if (!s.find(str)) {
std::cout << "Tree searcher failed" << std::endl;
return -1;
}
}
STOP_TIME_MS
std::cout << "Tree search took " << d << "ms" << std::endl;
tracker::dump();
}
tracker::reset();
{
// Hash tree is probably already a thing
// but this is just, a binary tree composed
// of the "Hash"(which is litteraly the sum of all chars in string)
std::cout << "---=== HASH TREE SEARCH ===---" << std::endl;
hashed_tree_searcher s = hashed_tree_searcher();
{
START_TIME
for (char* str : data) {
s.push(str);
}
STOP_TIME
std::cout << "Hashed tree search overhead is " << d << "us" << std::endl;
}
{
START_TIME
s.find(targets.at(0));
STOP_TIME
std::cout << "Found single element in " << d << "us" << std::endl;
}
START_TIME
for (char* str : targets) {
if (!s.find(str)) {
std::cout << "Hash tree searcher failed" << std::endl;
return -1;
}
}
STOP_TIME_MS
std::cout << "Hash tree took " << d << "ms" << std::endl;
tracker::dump();
}
tracker::reset();
{
std::cout << "---=== DUMB SEARCH ===---" << std::endl;
dumb_searcher s;
// The "overhead" is just copying the vector, which isn't really needed and would save time
// but I do it anyway cuz it's easy like that
{
START_TIME
s.data = data;
STOP_TIME
std::cout << "Dumb search overhead is " << d << "us" << std::endl;
}
{
START_TIME
s.find(targets.at(0));
STOP_TIME
std::cout << "Found single element in " << d << "us" << std::endl;
}
START_TIME
int i = 0;
for (char* str : targets) {
if (i % (TARGET_SIZE / 10) == 0) {
std::cout << ".";
}
if (!s.find(str)) {
std::cout << "Dumb searcher failed" << std::endl;
return -1;
}
i++;
}
STOP_TIME_MS
std::cout << "\n";
std::cout << "Dumb search took " << d << "ms" << std::endl;
tracker::dump();
}
// just free because why not
for (char* ptr : data) {
free(ptr);
}
data = std::vector<char*>();
targets = std::vector<char*>();
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment