binkiklou/stringsearch.cpp

## stringsearch.cpp
// I started writing this on WSL, but I'm finishing it
// on Visual Studio, cuz I want to see the native performance
// Should work with gcc, but might not.
// btw Opening the file in Visual Studio broke the whole formating

#include <string>
#include <chrono>
#include <vector>
#include <random>
#include <utility>
#include <new>
#include <iostream>
#include <cstring>

#define START_TIME auto t1 = std::chrono::high_resolution_clock::now();
#define STOP_TIME auto t2 = std::chrono::high_resolution_clock::now(); \
auto d = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();

#define STOP_TIME_MS auto t2 = std::chrono::high_resolution_clock::now(); \
auto d = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();

#define WORD_MIN 1 // Minimum number of chars
#define WORD_MAX 64 // Maximum number of chars

// Number of pseudo-random strings to make
#define SAMPLE_SIZE 1000000
// not a very fitting name but who cares
// This is the number of strings to find
#define TARGET_SIZE 100000

class tracker
{
public:
    static void reset();
    static void dump();
    static int accesses;
    static int nodes;
    static int allocs;
};

// Doesn't take into account std allocations
// cuz I'm too lazy to track that
int tracker::allocs;

// Somewhat an indication of accesses, but not
// the actual value
int tracker::accesses;

// I think that number is right
int tracker::nodes;

void tracker::reset() {
    allocs = 0;
    accesses = 0;
    nodes = 0;
}

void tracker::dump() {
    std::cout << "Allocations:" << allocs << "B" << std::endl;
    std::cout << "Accesses:" << accesses << std::endl;
    std::cout << "Nodes:" << nodes << std::endl;
}

// GENERATION
std::vector<char*> random_strings(int count)
{
    std::vector<char*> data;
    std::default_random_engine gen;
    std::uniform_int_distribution<int> len_dist(WORD_MIN, WORD_MAX);
    std::uniform_int_distribution<int> char_dist(48, 90);
    for (int i = 0; i < count; i++) {
        char len = static_cast<char>(len_dist(gen));
        tracker::allocs += len;
        char* s = (char*)malloc(len + 1);
        for (int i = 0; i < len; i++) {
            s[i] = char_dist(gen);
        }
        s[len] = '\0';
        data.push_back(s);
    }
    return data;
}

// DUMB SEARCH
class dumb_searcher
{
public:
    ~dumb_searcher() {
        data = std::vector<char*>();
    }
    bool find(char* x)
    {
        for (char* v : this->data) {
            tracker::accesses++;
            if (strcmp(v, x) == 0) {
                return true;
            }
        }
        return false;
    }
    std::vector<char*> data;
};

// BLOCK(SHARED)
class block
{
public:
    unsigned int key;
    std::vector<char*> strings;
};

// SMARTER SEARCH
class smarter_seacher
{
public:
    ~smarter_seacher() {
        this->data = std::vector<block>();
    }
    void setup(std::vector<char*> data)
    {
        // This only works if you know the
        // maximum size. This is also very memory inneficient
        block b;
        for (int i = 0; i != WORD_MAX + 1; i++) {
            b.key = i;
            this->data.push_back(b);
        }

        for (char* str : data)
        {
            tracker::accesses++;
            this->data.at(strlen(str)).strings.push_back(str);
        }
    }
    bool find(char* str)
    {
        size_t len = strlen(str);
        for (char* v : this->data.at(len).strings)
        {
            tracker::accesses++;
            if (strcmp(v, str) == 0) {
                return true;
            }
        }
        return false;
    }
private:
    std::vector<block> data;
};
// TREE SEARCH
class node
{
public:
    node()
    {
        tracker::nodes++;
        l = nullptr;
        r = nullptr;
    }
    ~node()
    {
        if (r != nullptr) {
            delete r;
        }
        if (l != nullptr) {
            delete l;
        }
    }
    void push(int val, char* str)
    {
        if (this->set == false) {
            this->value.key = val;
            this->value.strings.push_back(str);
            this->set = true;
        }
        else if (val == this->value.key) {
            this->value.strings.push_back(str);
        }
        else if (val < this->value.key) {
            if (this->l == nullptr) {
                tracker::allocs++;
                this->l = new node();
            }
            this->l->push(val, str);
        }
        else {
            if (this->r == nullptr) {
                tracker::allocs++;
                this->r = new node();
            }
            this->r->push(val, str);
        }
    }
    bool find(int val, char* str)
    {
        if (this->set == false) {
            return false;
        }
        else if (val == this->value.key) {
            for (char* v : this->value.strings)
            {
                tracker::accesses++;
                if (strcmp(v, str) == 0) {
                    return true;
                }
            }
            return false;
        }
        else if (val < this->value.key) {
            if (this->l != nullptr) {
                return l->find(val, str);
            }
            return false;
        }
        else if (val > this->value.key) {
            if (this->r != nullptr) {
                return r->find(val, str);
            }
            return false;
        }
        return false;
    }
    block value;
    node* l = nullptr;
    node* r = nullptr;
    bool set = false;
};
class tree_searcher
{
public:
    tree_searcher() {
        root = node();
    }
    bool find(char* str)
    {
        return root.find(strlen(str), str);
    }
    void push(char* str)
    {
        root.push(strlen(str), str);
    }
private:
    node root;
    std::vector<block> data;
};
class hashed_tree_searcher
{
public:
    hashed_tree_searcher() {
        root = node();
    }
    bool find(char* str) {
        size_t hash = 0;
        for (int i = 0; i < strlen(str); i++) {
            hash += str[i];
        }
        return root.find(hash, str);
    }
    void push(char* str) {
        size_t hash = 0;
        for (int i = 0; i < strlen(str); i++) {
            hash += str[i];
        }
        root.push(hash, str);
    }
private:
    node root;
    std::vector<block> data;
};

int main(int argc, char* argv[])
{
    tracker::reset();
    std::vector<char*> data;
    std::vector<char*> targets;

    {
        START_TIME
        data = random_strings(SAMPLE_SIZE);
        std::default_random_engine gen;
        std::uniform_int_distribution<int> bruh(1, data.size() - 1);
        for (int i = 0; i < TARGET_SIZE; i++) {
            targets.push_back(data.at(bruh(gen)));
        }
        STOP_TIME_MS
        std::cout << "Generating "<<SAMPLE_SIZE<<" pseudo-random strings took " << d << "ms" << std::endl;
        std::cout << "Choosen "<<TARGET_SIZE<<" strings to find" << std::endl;
        tracker::dump();
    }


    {
        // This isn't fast because it is not designed to retrieve mutliple
        // elements at once. So it doesn't index like other techniques do.
        // or maybe STL bad :^)
        std::cout << "--=== std::find ===---" << std::endl;
        {
            START_TIME
                std::find(data.begin(), data.end(), targets.at(0));
            STOP_TIME
                std::cout << "Found single element in " << d <<"us"<< std::endl;
        }
        START_TIME
            int i = 0;
            for (char* str : targets)
            {
                if (i % (TARGET_SIZE / 10) == 0) {
                    std::cout << ".";
                }
                if (std::find(data.begin(), data.end(), str) == data.end())
                {
                    std::cout << "std find failed" << std::endl;
                    return -1;
                }
                i++;
            }
        STOP_TIME_MS
            std::cout << "\n";
            std::cout << "std::find took " << d << "ms" << std::endl;
    }

    tracker::reset();

    {
        std::cout << "---=== (slightly)SMARTER SEARCH ===---" << std::endl;
        smarter_seacher s;
        {
            START_TIME
                s.setup(data);
            STOP_TIME
                std::cout << "Smarter Search overhead is " << d << "us" << std::endl;
        }
        {
            START_TIME
                s.find(targets.at(0));
            STOP_TIME
                std::cout << "Found single element in " << d <<"us"<< std::endl;
        }
        START_TIME
            for (char* str : targets) {
                if (!s.find(str)) {
                    std::cout << "Smarter searcher failed" << std::endl;
                    return -1;
                }
            }
        STOP_TIME_MS
            std::cout << "Smarter search took " << d << "ms" << std::endl;
        tracker::dump();
    }

    tracker::reset();

    {
        std::cout << "---=== TREE SEARCH ===---" << std::endl;
        tree_searcher s = tree_searcher();
        {
            START_TIME
                for (char* str : data) {
                    s.push(str);
                }
            STOP_TIME
                std::cout << "Tree search overhead is " << d << "us" << std::endl;
        }
        {
            START_TIME
                s.find(targets.at(0));
            STOP_TIME
                std::cout << "Found single element in " << d <<"us"<< std::endl;
        }
        START_TIME
            for (char* str : targets) {
                if (!s.find(str)) {
                    std::cout << "Tree searcher failed" << std::endl;
                    return -1;
                }
            }
        STOP_TIME_MS
            std::cout << "Tree search took " << d << "ms" << std::endl;
        tracker::dump();
    }

    tracker::reset();

    {
        // Hash tree is probably already a thing
        // but this is just, a binary tree composed
        // of the "Hash"(which is litteraly the sum of all chars in string)
        std::cout << "---=== HASH TREE SEARCH ===---" << std::endl;
        hashed_tree_searcher s = hashed_tree_searcher();
        {
            START_TIME
            for (char* str : data) {
                s.push(str);
            }
            STOP_TIME
                std::cout << "Hashed tree search overhead is " << d << "us" << std::endl;
        }
        {
            START_TIME
            s.find(targets.at(0));
            STOP_TIME
            std::cout << "Found single element in " << d << "us" << std::endl;
        }
        START_TIME
            for (char* str : targets) {
                if (!s.find(str)) {
                    std::cout << "Hash tree searcher failed" << std::endl;
                    return -1;
                }
            }
        STOP_TIME_MS
            std::cout << "Hash tree took " << d << "ms" << std::endl;
        tracker::dump();
    }
    tracker::reset();

    {
        std::cout << "---=== DUMB SEARCH ===---" << std::endl;
        dumb_searcher s;
        // The "overhead" is just copying the vector, which isn't really needed and would save time
        // but I do it anyway cuz it's easy like that
        {
            START_TIME
                s.data = data;
            STOP_TIME
                std::cout << "Dumb search overhead is " << d << "us" << std::endl;
        }
        {
            START_TIME
                s.find(targets.at(0));
            STOP_TIME
                std::cout << "Found single element in " << d << "us" << std::endl;
        }
        START_TIME
            int i = 0;
        for (char* str : targets) {
            if (i % (TARGET_SIZE / 10) == 0) {
                std::cout << ".";
            }
            if (!s.find(str)) {
                std::cout << "Dumb searcher failed" << std::endl;
                return -1;
            }
            i++;
        }
        STOP_TIME_MS
            std::cout << "\n";
        std::cout << "Dumb search took " << d << "ms" << std::endl;
        tracker::dump();
    }
    // just free because why not
    for (char* ptr : data) {
        free(ptr);
    }
    data = std::vector<char*>();
    targets = std::vector<char*>();
    return 0;
}
	// I started writing this on WSL, but I'm finishing it
	// on Visual Studio, cuz I want to see the native performance
	// Should work with gcc, but might not.
	// btw Opening the file in Visual Studio broke the whole formating

	#include <string>
	#include <chrono>
	#include <vector>
	#include <random>
	#include <utility>
	#include <new>
	#include <iostream>
	#include <cstring>

	#define START_TIME auto t1 = std::chrono::high_resolution_clock::now();
	#define STOP_TIME auto t2 = std::chrono::high_resolution_clock::now(); \
	auto d = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();

	#define STOP_TIME_MS auto t2 = std::chrono::high_resolution_clock::now(); \
	auto d = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();

	#define WORD_MIN 1 // Minimum number of chars
	#define WORD_MAX 64 // Maximum number of chars

	// Number of pseudo-random strings to make
	#define SAMPLE_SIZE 1000000
	// not a very fitting name but who cares
	// This is the number of strings to find
	#define TARGET_SIZE 100000

	class tracker
	{
	public:
	static void reset();
	static void dump();
	static int accesses;
	static int nodes;
	static int allocs;
	};

	// Doesn't take into account std allocations
	// cuz I'm too lazy to track that
	int tracker::allocs;

	// Somewhat an indication of accesses, but not
	// the actual value
	int tracker::accesses;

	// I think that number is right
	int tracker::nodes;

	void tracker::reset() {
	allocs = 0;
	accesses = 0;
	nodes = 0;
	}

	void tracker::dump() {
	std::cout << "Allocations:" << allocs << "B" << std::endl;
	std::cout << "Accesses:" << accesses << std::endl;
	std::cout << "Nodes:" << nodes << std::endl;
	}

	// GENERATION
	std::vector<char*> random_strings(int count)
	{
	std::vector<char*> data;
	std::default_random_engine gen;
	std::uniform_int_distribution<int> len_dist(WORD_MIN, WORD_MAX);
	std::uniform_int_distribution<int> char_dist(48, 90);
	for (int i = 0; i < count; i++) {
	char len = static_cast<char>(len_dist(gen));
	tracker::allocs += len;
	char* s = (char*)malloc(len + 1);
	for (int i = 0; i < len; i++) {
	s[i] = char_dist(gen);
	}
	s[len] = '\0';
	data.push_back(s);
	}
	return data;
	}

	// DUMB SEARCH
	class dumb_searcher
	{
	public:
	~dumb_searcher() {
	data = std::vector<char*>();
	}
	bool find(char* x)
	{
	for (char* v : this->data) {
	tracker::accesses++;
	if (strcmp(v, x) == 0) {
	return true;
	}
	}
	return false;
	}
	std::vector<char*> data;
	};

	// BLOCK(SHARED)
	class block
	{
	public:
	unsigned int key;
	std::vector<char*> strings;
	};

	// SMARTER SEARCH
	class smarter_seacher
	{
	public:
	~smarter_seacher() {
	this->data = std::vector<block>();
	}
	void setup(std::vector<char*> data)
	{
	// This only works if you know the
	// maximum size. This is also very memory inneficient
	block b;
	for (int i = 0; i != WORD_MAX + 1; i++) {
	b.key = i;
	this->data.push_back(b);
	}

	for (char* str : data)
	{
	tracker::accesses++;
	this->data.at(strlen(str)).strings.push_back(str);
	}
	}
	bool find(char* str)
	{
	size_t len = strlen(str);
	for (char* v : this->data.at(len).strings)
	{
	tracker::accesses++;
	if (strcmp(v, str) == 0) {
	return true;
	}
	}
	return false;
	}
	private:
	std::vector<block> data;
	};
	// TREE SEARCH
	class node
	{
	public:
	node()
	{
	tracker::nodes++;
	l = nullptr;
	r = nullptr;
	}
	~node()
	{
	if (r != nullptr) {
	delete r;
	}
	if (l != nullptr) {
	delete l;
	}
	}
	void push(int val, char* str)
	{
	if (this->set == false) {
	this->value.key = val;
	this->value.strings.push_back(str);
	this->set = true;
	}
	else if (val == this->value.key) {
	this->value.strings.push_back(str);
	}
	else if (val < this->value.key) {
	if (this->l == nullptr) {
	tracker::allocs++;
	this->l = new node();
	}
	this->l->push(val, str);
	}
	else {
	if (this->r == nullptr) {
	tracker::allocs++;
	this->r = new node();
	}
	this->r->push(val, str);
	}
	}
	bool find(int val, char* str)
	{
	if (this->set == false) {
	return false;
	}
	else if (val == this->value.key) {
	for (char* v : this->value.strings)
	{
	tracker::accesses++;
	if (strcmp(v, str) == 0) {
	return true;
	}
	}
	return false;
	}
	else if (val < this->value.key) {
	if (this->l != nullptr) {
	return l->find(val, str);
	}
	return false;
	}
	else if (val > this->value.key) {
	if (this->r != nullptr) {
	return r->find(val, str);
	}
	return false;
	}
	return false;
	}
	block value;
	node* l = nullptr;
	node* r = nullptr;
	bool set = false;
	};
	class tree_searcher
	{
	public:
	tree_searcher() {
	root = node();
	}
	bool find(char* str)
	{
	return root.find(strlen(str), str);
	}
	void push(char* str)
	{
	root.push(strlen(str), str);
	}
	private:
	node root;
	std::vector<block> data;
	};
	class hashed_tree_searcher
	{
	public:
	hashed_tree_searcher() {
	root = node();
	}
	bool find(char* str) {
	size_t hash = 0;
	for (int i = 0; i < strlen(str); i++) {
	hash += str[i];
	}
	return root.find(hash, str);
	}
	void push(char* str) {
	size_t hash = 0;
	for (int i = 0; i < strlen(str); i++) {
	hash += str[i];
	}
	root.push(hash, str);
	}
	private:
	node root;
	std::vector<block> data;
	};

	int main(int argc, char* argv[])
	{
	tracker::reset();
	std::vector<char*> data;
	std::vector<char*> targets;

	{
	START_TIME
	data = random_strings(SAMPLE_SIZE);
	std::default_random_engine gen;
	std::uniform_int_distribution<int> bruh(1, data.size() - 1);
	for (int i = 0; i < TARGET_SIZE; i++) {
	targets.push_back(data.at(bruh(gen)));
	}
	STOP_TIME_MS
	std::cout << "Generating "<<SAMPLE_SIZE<<" pseudo-random strings took " << d << "ms" << std::endl;
	std::cout << "Choosen "<<TARGET_SIZE<<" strings to find" << std::endl;
	tracker::dump();
	}


	{
	// This isn't fast because it is not designed to retrieve mutliple
	// elements at once. So it doesn't index like other techniques do.
	// or maybe STL bad :^)
	std::cout << "--=== std::find ===---" << std::endl;
	{
	START_TIME
	std::find(data.begin(), data.end(), targets.at(0));
	STOP_TIME
	std::cout << "Found single element in " << d <<"us"<< std::endl;
	}
	START_TIME
	int i = 0;
	for (char* str : targets)
	{
	if (i % (TARGET_SIZE / 10) == 0) {
	std::cout << ".";
	}
	if (std::find(data.begin(), data.end(), str) == data.end())
	{
	std::cout << "std find failed" << std::endl;
	return -1;
	}
	i++;
	}
	STOP_TIME_MS
	std::cout << "\n";
	std::cout << "std::find took " << d << "ms" << std::endl;
	}

	tracker::reset();

	{
	std::cout << "---=== (slightly)SMARTER SEARCH ===---" << std::endl;
	smarter_seacher s;
	{
	START_TIME
	s.setup(data);
	STOP_TIME
	std::cout << "Smarter Search overhead is " << d << "us" << std::endl;
	}
	{
	START_TIME
	s.find(targets.at(0));
	STOP_TIME
	std::cout << "Found single element in " << d <<"us"<< std::endl;
	}
	START_TIME
	for (char* str : targets) {
	if (!s.find(str)) {
	std::cout << "Smarter searcher failed" << std::endl;
	return -1;
	}
	}
	STOP_TIME_MS
	std::cout << "Smarter search took " << d << "ms" << std::endl;
	tracker::dump();
	}

	tracker::reset();

	{
	std::cout << "---=== TREE SEARCH ===---" << std::endl;
	tree_searcher s = tree_searcher();
	{
	START_TIME
	for (char* str : data) {
	s.push(str);
	}
	STOP_TIME
	std::cout << "Tree search overhead is " << d << "us" << std::endl;
	}
	{
	START_TIME
	s.find(targets.at(0));
	STOP_TIME
	std::cout << "Found single element in " << d <<"us"<< std::endl;
	}
	START_TIME
	for (char* str : targets) {
	if (!s.find(str)) {
	std::cout << "Tree searcher failed" << std::endl;
	return -1;
	}
	}
	STOP_TIME_MS
	std::cout << "Tree search took " << d << "ms" << std::endl;
	tracker::dump();
	}

	tracker::reset();

	{
	// Hash tree is probably already a thing
	// but this is just, a binary tree composed
	// of the "Hash"(which is litteraly the sum of all chars in string)
	std::cout << "---=== HASH TREE SEARCH ===---" << std::endl;
	hashed_tree_searcher s = hashed_tree_searcher();
	{
	START_TIME
	for (char* str : data) {
	s.push(str);
	}
	STOP_TIME
	std::cout << "Hashed tree search overhead is " << d << "us" << std::endl;
	}
	{
	START_TIME
	s.find(targets.at(0));
	STOP_TIME
	std::cout << "Found single element in " << d << "us" << std::endl;
	}
	START_TIME
	for (char* str : targets) {
	if (!s.find(str)) {
	std::cout << "Hash tree searcher failed" << std::endl;
	return -1;
	}
	}
	STOP_TIME_MS
	std::cout << "Hash tree took " << d << "ms" << std::endl;
	tracker::dump();
	}
	tracker::reset();

	{
	std::cout << "---=== DUMB SEARCH ===---" << std::endl;
	dumb_searcher s;
	// The "overhead" is just copying the vector, which isn't really needed and would save time
	// but I do it anyway cuz it's easy like that
	{
	START_TIME
	s.data = data;
	STOP_TIME
	std::cout << "Dumb search overhead is " << d << "us" << std::endl;
	}
	{
	START_TIME
	s.find(targets.at(0));
	STOP_TIME
	std::cout << "Found single element in " << d << "us" << std::endl;
	}
	START_TIME
	int i = 0;
	for (char* str : targets) {
	if (i % (TARGET_SIZE / 10) == 0) {
	std::cout << ".";
	}
	if (!s.find(str)) {
	std::cout << "Dumb searcher failed" << std::endl;
	return -1;
	}
	i++;
	}
	STOP_TIME_MS
	std::cout << "\n";
	std::cout << "Dumb search took " << d << "ms" << std::endl;
	tracker::dump();
	}
	// just free because why not
	for (char* ptr : data) {
	free(ptr);
	}
	data = std::vector<char*>();
	targets = std::vector<char*>();
	return 0;
	}
No results found