Created
January 27, 2017 08:05
-
-
Save betatim/9730aa06cd2ec0fb7d0cec7cfec8969b to your computer and use it in GitHub Desktop.
BloomFilter talk at PyDataZRH January 2017
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "<div style=\"position: relative\">\n", | |
| " <img style=\"position: absolute; top: 0; right: 0; max-height: 130px\" src=\"logo.jpg\" />\n", | |
| "</div>\n", | |
| "# Bloom Filters\n", | |
| "\n", | |
| "\n", | |
| "**Tim Head, [Wild Tree Tech](https://www.wildtreetech.com)**\n", | |
| "\n", | |
| "\n", | |
| "**26 January 2017**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "For more have a look at: http://betatim.github.io/posts/genome-hackers/ and http://betatim.github.io/posts/improbable-afternoon/." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import numpy as np" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from sklearn.utils import murmurhash3_32" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def hash_(value, seed=1):\n", | |
| " return murmurhash3_32(value, seed=seed, positive=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "3142237357" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "hash_(\"hello\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "bits = np.zeros(100, dtype=bool)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False, False], dtype=bool)" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "bits" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def insert(bf, value):\n", | |
| " idx = hash_(value) % bf.size\n", | |
| " bf[idx] += 1\n", | |
| " return bf" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, True, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False, False], dtype=bool)" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "insert(bits, \"hello\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, True, False, True, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False, False], dtype=bool)" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "insert(bits, \"world\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def contains(bits, value):\n", | |
| " idx = hash_(value) % bits.size\n", | |
| " return bits[idx]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "contains(bits, \"world\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "class BloomFilter:\n", | |
| " def __init__(self, m, k):\n", | |
| " self.bits = np.zeros(m, dtype=bool)\n", | |
| " self.k = k\n", | |
| " \n", | |
| " def insert(self, value):\n", | |
| " for i in range(self.k):\n", | |
| " idx = hash_(value, seed=1+i) % self.bits.size\n", | |
| " self.bits[idx] += 1\n", | |
| " \n", | |
| " def contains(self, value):\n", | |
| " for i in range(self.k):\n", | |
| " idx = hash_(value, seed=1+i) % self.bits.size\n", | |
| " if not self.bits[idx]:\n", | |
| " return False\n", | |
| " \n", | |
| " return True" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "bf = BloomFilter(100, 3)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "bf.insert(\"world\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, True, False, False, False, True, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, True, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False,\n", | |
| " False, False, False, False, False, False, False, False, False, False], dtype=bool)" | |
| ] | |
| }, | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "bf.bits" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "bf.contains(\"world\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "False" | |
| ] | |
| }, | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "bf.contains(\"hello\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "graph = BloomFilter(1000, 3)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "graph.insert(\"ACGTT\")\n", | |
| "graph.insert(\"CGTTA\")\n", | |
| "graph.insert(\"GTTAT\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "9" | |
| ] | |
| }, | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "graph.bits.sum()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def next_nodes(node):\n", | |
| " for c in \"ACGT\":\n", | |
| " yield node[1:] + c" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def walk(graph, start):\n", | |
| " now = start\n", | |
| " while True:\n", | |
| " print(now, '->', end=\" \")\n", | |
| " for nxt in next_nodes(now):\n", | |
| " if graph.contains(nxt):\n", | |
| " now = nxt\n", | |
| " break\n", | |
| " else:\n", | |
| " break\n", | |
| " " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "ACGTT -> CGTTA -> GTTAT -> " | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "walk(graph, \"ACGTT\")" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.5.2" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment