Last active
May 30, 2023 05:55
-
-
Save patrickspencer/b2afbcf9385dbe745b1ecfd31b449f52 to your computer and use it in GitHub Desktop.
main_huggingface_transformer_embeddings.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "authorship_tag": "ABX9TyPqLOVobNZxltwSHcO5KBLA", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/patrickspencer/b2afbcf9385dbe745b1ecfd31b449f52/main_huggingface_transformer_embeddings.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "%%capture\n", | |
| "# Install each dependency only when it is missing from the kernel environment.\n", | |
| "# `%pip` targets the running kernel; only ImportError triggers an install so\n", | |
| "# unrelated failures are not silently swallowed.\n", | |
| "try:\n", | |
| "    import sentence_transformers\n", | |
| "except ImportError:\n", | |
| "    %pip install sentence_transformers\n", | |
| "\n", | |
| "try:\n", | |
| "    import datasets\n", | |
| "except ImportError:\n", | |
| "    %pip install datasets\n", | |
| "\n", | |
| "try:\n", | |
| "    import redis\n", | |
| "except ImportError:\n", | |
| "    # pip needs the `install` subcommand; without it nothing is installed and\n", | |
| "    # the later `import redis` fails with ModuleNotFoundError.\n", | |
| "    %pip install redis-server redis\n", | |
| "\n", | |
| "# NOTE(review): presumably required by the `wikipedia` dataset loader - confirm.\n", | |
| "%pip install apache_beam mwparserfromhell" | |
| ], | |
| "metadata": { | |
| "id": "3bob4-IPfTVJ" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import sentence_transformers\n", | |
| "from sentence_transformers import SentenceTransformer, util\n", | |
| "import datasets\n", | |
| "from datasets import load_dataset\n", | |
| "import redis\n", | |
| "from redis.commands.search.field import VectorField\n", | |
| "from redis.commands.search.query import Query\n", | |
| "import os\n", | |
| "import numpy as np" | |
| ], | |
| "metadata": { | |
| "id": "pcY5MYQKs9wF", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 381 | |
| }, | |
| "outputId": "af6ca09e-381e-4c30-8cdc-3bd58c5748a5" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "error", | |
| "ename": "ModuleNotFoundError", | |
| "evalue": "ignored", | |
| "traceback": [ | |
| "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", | |
| "\u001b[0;32m<ipython-input-2-7e5233240a8a>\u001b[0m in \u001b[0;36m<cell line: 5>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdatasets\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mdatasets\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mredis\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mredis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommands\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfield\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mVectorField\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mredis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcommands\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msearch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mquery\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mQuery\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", | |
| "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'redis'", | |
| "", | |
| "\u001b[0;31m---------------------------------------------------------------------------\u001b[0;32m\nNOTE: If your import is failing due to a missing package, you can\nmanually install dependencies using either !pip or !apt.\n\nTo view examples of installing some common dependencies, click the\n\"Open Examples\" button below.\n\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n" | |
| ], | |
| "errorDetails": { | |
| "actions": [ | |
| { | |
| "action": "open_url", | |
| "actionText": "Open Examples", | |
| "url": "/notebooks/snippets/importing_libraries.ipynb" | |
| } | |
| ] | |
| } | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "data = load_dataset(\"wikipedia\", \"20220301.simple\")" | |
| ], | |
| "metadata": { | |
| "id": "lIvcGaFNjrS-" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "data" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "V-W9k44Okifs", | |
| "outputId": "4596a400-7262-4201-9176-86868fcb5b80" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "DatasetDict({\n", | |
| " train: Dataset({\n", | |
| " features: ['id', 'url', 'title', 'text'],\n", | |
| " num_rows: 205328\n", | |
| " })\n", | |
| "})" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 5 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "DOC_NUMBER = 100\n", | |
| "# Collect the title of every article in the train split, in dataset order.\n", | |
| "# Only titles are used; the article text is not appended.\n", | |
| "corpus = [article['title'] for article in data['train']]" | |
| ], | |
| "metadata": { | |
| "id": "0ZopSZZvjw1B" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "model = SentenceTransformer('all-MiniLM-L6-v2')" | |
| ], | |
| "metadata": { | |
| "id": "1xOxKPJOlFj-" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "%%time\n", | |
| "# Encode the first DOC_NUMBER titles one at a time, keyed by corpus position.\n", | |
| "embeddings = {\n", | |
| "    position: model.encode(title)\n", | |
| "    for position, title in enumerate(corpus[:DOC_NUMBER])\n", | |
| "}" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "Ln1PoPiwn5mX", | |
| "outputId": "687fc784-b54e-43d2-b771-c3a231b8ec69" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "CPU times: user 1.48 s, sys: 4.81 ms, total: 1.49 s\n", | |
| "Wall time: 1.53 s\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "%%time\n", | |
| "source_embedding = model.encode('How big is London')" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "SYSmdOR9qp52", | |
| "outputId": "9b1320cd-56e8-426c-f876-6f0362d0ad16" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "CPU times: user 22.9 ms, sys: 890 µs, total: 23.8 ms\n", | |
| "Wall time: 23.3 ms\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "source_embedding[:DOC_NUMBER]" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "u4lsBPiJrxFg", | |
| "outputId": "93ce92b3-4495-4bd5-9eb2-66999e9d2423" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "array([ 0.15680279, 0.01084097, 0.05628692, -0.06912572, -0.04371276,\n", | |
| " 0.00547062, -0.00742654, -0.01148547, -0.08175434, -0.01239392,\n", | |
| " -0.06111924, -0.01476254, -0.05164117, -0.00105716, -0.04089508,\n", | |
| " -0.00423087, 0.02094622, -0.1030864 , -0.06753682, -0.03358319,\n", | |
| " 0.06726886, -0.02080584, 0.03697657, 0.06715704, -0.03812541,\n", | |
| " 0.0254552 , 0.00336791, 0.05342964, 0.02025586, 0.01062218,\n", | |
| " 0.02175471, -0.05132837, 0.01895596, 0.00033193, 0.01128084,\n", | |
| " 0.02568322, -0.00223427, 0.05819876, 0.05859486, -0.01609047,\n", | |
| " 0.0680901 , -0.02290127, 0.06352419, 0.01434694, 0.07030372,\n", | |
| " 0.01630854, 0.05723406, 0.01412784, -0.04280038, -0.12563422,\n", | |
| " 0.09168827, 0.03223368, 0.01189326, -0.07284012, -0.06457692,\n", | |
| " -0.01143881, -0.06565946, 0.00821731, -0.03890379, -0.04608008,\n", | |
| " -0.0420728 , 0.08040943, -0.06341829, -0.0118541 , 0.02523196,\n", | |
| " -0.01066248, -0.01198085, -0.02088821, 0.01127287, -0.02137115,\n", | |
| " -0.03684983, -0.03193337, -0.01050104, -0.03511192, 0.03988777,\n", | |
| " -0.0470114 , -0.03762781, -0.02189676, -0.06246338, 0.03126503,\n", | |
| " 0.006937 , -0.01092182, 0.00839958, 0.07916221, -0.02909099,\n", | |
| " -0.05399996, 0.04681252, 0.01613694, -0.03068321, -0.05426995,\n", | |
| " 0.023084 , -0.0373847 , -0.06804448, 0.12304887, -0.00288515,\n", | |
| " 0.01853398, -0.00167669, -0.05787184, 0.00661902, 0.04113097],\n", | |
| " dtype=float32)" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 10 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "(200*63) / 1000" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "hAnunHEDq5bT", | |
| "outputId": "431d250d-96e8-4f2d-d9cc-00f43463aea5" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "12.6" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 13 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "%%time\n", | |
| "# Dot-product score of the query embedding against each stored title embedding.\n", | |
| "for title_embedding in embeddings.values():\n", | |
| "    score = util.dot_score(source_embedding, title_embedding)\n", | |
| "    print(\"Similarity:\", score)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "Zo_qxOUCoIxQ", | |
| "outputId": "740e184b-6e45-46e7-faab-fe048b243a6b" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Similarity: tensor([[0.0273]])\n", | |
| "Similarity: tensor([[-0.0051]])\n", | |
| "Similarity: tensor([[0.0009]])\n", | |
| "Similarity: tensor([[0.0376]])\n", | |
| "Similarity: tensor([[0.0483]])\n", | |
| "Similarity: tensor([[0.1139]])\n", | |
| "Similarity: tensor([[0.1006]])\n", | |
| "Similarity: tensor([[0.0565]])\n", | |
| "Similarity: tensor([[-0.0380]])\n", | |
| "Similarity: tensor([[-0.0245]])\n", | |
| "Similarity: tensor([[-0.0090]])\n", | |
| "Similarity: tensor([[0.1444]])\n", | |
| "Similarity: tensor([[0.0266]])\n", | |
| "Similarity: tensor([[0.1903]])\n", | |
| "Similarity: tensor([[0.1102]])\n", | |
| "Similarity: tensor([[-0.0023]])\n", | |
| "Similarity: tensor([[-0.0426]])\n", | |
| "Similarity: tensor([[0.0073]])\n", | |
| "Similarity: tensor([[0.0464]])\n", | |
| "Similarity: tensor([[-0.1063]])\n", | |
| "Similarity: tensor([[-0.0005]])\n", | |
| "Similarity: tensor([[-0.0183]])\n", | |
| "Similarity: tensor([[0.0476]])\n", | |
| "Similarity: tensor([[0.0118]])\n", | |
| "Similarity: tensor([[0.0541]])\n", | |
| "Similarity: tensor([[0.1319]])\n", | |
| "Similarity: tensor([[0.0248]])\n", | |
| "Similarity: tensor([[0.0228]])\n", | |
| "Similarity: tensor([[0.0938]])\n", | |
| "Similarity: tensor([[-0.0682]])\n", | |
| "Similarity: tensor([[0.0214]])\n", | |
| "Similarity: tensor([[0.1263]])\n", | |
| "Similarity: tensor([[0.1384]])\n", | |
| "Similarity: tensor([[0.0397]])\n", | |
| "Similarity: tensor([[0.0290]])\n", | |
| "Similarity: tensor([[0.0267]])\n", | |
| "Similarity: tensor([[0.0027]])\n", | |
| "Similarity: tensor([[0.0144]])\n", | |
| "Similarity: tensor([[-0.0129]])\n", | |
| "Similarity: tensor([[0.0188]])\n", | |
| "Similarity: tensor([[-0.0607]])\n", | |
| "Similarity: tensor([[0.0576]])\n", | |
| "Similarity: tensor([[0.0831]])\n", | |
| "Similarity: tensor([[0.0127]])\n", | |
| "Similarity: tensor([[0.2517]])\n", | |
| "Similarity: tensor([[0.0166]])\n", | |
| "Similarity: tensor([[0.2739]])\n", | |
| "Similarity: tensor([[0.0332]])\n", | |
| "Similarity: tensor([[-0.0864]])\n", | |
| "Similarity: tensor([[-0.0088]])\n", | |
| "Similarity: tensor([[-0.0182]])\n", | |
| "Similarity: tensor([[0.0247]])\n", | |
| "Similarity: tensor([[0.0424]])\n", | |
| "Similarity: tensor([[0.0434]])\n", | |
| "Similarity: tensor([[-0.0569]])\n", | |
| "Similarity: tensor([[0.1749]])\n", | |
| "Similarity: tensor([[0.0598]])\n", | |
| "Similarity: tensor([[0.0172]])\n", | |
| "Similarity: tensor([[-0.0548]])\n", | |
| "Similarity: tensor([[-0.0072]])\n", | |
| "Similarity: tensor([[0.0567]])\n", | |
| "Similarity: tensor([[0.0500]])\n", | |
| "Similarity: tensor([[0.1615]])\n", | |
| "Similarity: tensor([[-0.0400]])\n", | |
| "Similarity: tensor([[0.1370]])\n", | |
| "Similarity: tensor([[0.1413]])\n", | |
| "Similarity: tensor([[0.2447]])\n", | |
| "Similarity: tensor([[0.1307]])\n", | |
| "Similarity: tensor([[-0.0424]])\n", | |
| "Similarity: tensor([[0.0402]])\n", | |
| "Similarity: tensor([[-0.0483]])\n", | |
| "Similarity: tensor([[0.0035]])\n", | |
| "Similarity: tensor([[0.0097]])\n", | |
| "Similarity: tensor([[0.0313]])\n", | |
| "Similarity: tensor([[0.1572]])\n", | |
| "Similarity: tensor([[0.3134]])\n", | |
| "Similarity: tensor([[-0.0611]])\n", | |
| "Similarity: tensor([[0.0582]])\n", | |
| "Similarity: tensor([[0.0846]])\n", | |
| "Similarity: tensor([[0.0505]])\n", | |
| "Similarity: tensor([[0.0621]])\n", | |
| "Similarity: tensor([[-0.0278]])\n", | |
| "Similarity: tensor([[-0.0229]])\n", | |
| "Similarity: tensor([[0.0292]])\n", | |
| "Similarity: tensor([[0.1297]])\n", | |
| "Similarity: tensor([[0.0940]])\n", | |
| "Similarity: tensor([[0.0563]])\n", | |
| "Similarity: tensor([[0.0502]])\n", | |
| "Similarity: tensor([[0.0554]])\n", | |
| "Similarity: tensor([[0.1013]])\n", | |
| "Similarity: tensor([[0.1072]])\n", | |
| "Similarity: tensor([[-0.0042]])\n", | |
| "Similarity: tensor([[-0.0100]])\n", | |
| "Similarity: tensor([[-0.0229]])\n", | |
| "Similarity: tensor([[0.0912]])\n", | |
| "Similarity: tensor([[0.0099]])\n", | |
| "Similarity: tensor([[0.0062]])\n", | |
| "Similarity: tensor([[0.0511]])\n", | |
| "Similarity: tensor([[0.0534]])\n", | |
| "Similarity: tensor([[0.0290]])\n", | |
| "CPU times: user 156 ms, sys: 15.4 ms, total: 172 ms\n", | |
| "Wall time: 158 ms\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "query_embedding = model.encode('How big is London')\n", | |
| "passage_embedding = model.encode(['London has 9,787,426 inhabitants at the 2011 census',\n", | |
| " 'London is known for its financial district'])\n", | |
| "\n", | |
| "print(\"Similarity:\", util.dot_score(query_embedding, passage_embedding))" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "wSHWGfr3fRm3", | |
| "outputId": "d3a30ea0-1a4b-4397-b790-fb1638378e83" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Similarity: tensor([[0.5627, 0.5645]])\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "len(passage_embedding[0])" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "EMmNVvolgY8K", | |
| "outputId": "6c3acd50-b8fd-4fc7-b1ea-45e387a25be3" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "384" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 5 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## Upload to Redis" | |
| ], | |
| "metadata": { | |
| "id": "GXMWmqXouhrd" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import getpass\n", | |
| "\n", | |
| "%env host=redis-17080.c16.us-east-1-2.ec2.cloud.redislabs.com\n", | |
| "%env port=17080\n", | |
| "# Prompt for the secret at run time so it is never stored in the notebook\n", | |
| "# source or echoed into the cell output (a `%env password=...` line does both).\n", | |
| "# NOTE(review): the password previously committed in this cell should be rotated.\n", | |
| "os.environ['password'] = getpass.getpass('Redis password: ')" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "If1RB6U6gMwe", | |
| "outputId": "f188ef52-5f05-458e-80ae-b1b1c623c76e" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "env: host=redis-17080.c16.us-east-1-2.ec2.cloud.redislabs.com\n", | |
| "env: port=17080\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Read the three connection settings from the environment in one pass.\n", | |
| "host, port, password = (os.getenv(name) for name in ('host', 'port', 'password'))\n", | |
| "\n", | |
| "# Open the client and ping the server; the ping result is the cell output.\n", | |
| "client = redis.Redis(host=host, port=port, password=password)\n", | |
| "client.ping()" | |
| ], | |
| "metadata": { | |
| "id": "AnpGtoYhiYbq", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "outputId": "c8f999cb-e1cc-4da4-8b9b-f6a0fcf331d8" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 14 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "vector_field = \"vector\"\n", | |
| "dim = 128\n", | |
| "\n", | |
| "# Write a random float32 vector, serialised to raw bytes, into the\n", | |
| "# `vector_field` field of the Redis hash stored at 'key'.\n", | |
| "np_vector = np.random.rand(dim).astype(np.float32)\n", | |
| "client.hset('key', vector_field, np_vector.tobytes())" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "k1kPJUDtu46G", | |
| "outputId": "2d9477f1-9118-4f6c-dd83-97afed03833f" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "0" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 15 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# HNSW index over hash field \"v\"; DIM matches the 384-float embeddings the\n", | |
| "# model produces.\n", | |
| "schema = (VectorField(\"v\", \"HNSW\", {\"TYPE\": \"FLOAT32\", \"DIM\": 384, \"DISTANCE_METRIC\": \"L2\"}),)\n", | |
| "try:\n", | |
| "    client.ft().create_index(schema)\n", | |
| "except redis.ResponseError as err:\n", | |
| "    # Re-running the cell against a server that already holds the index must\n", | |
| "    # not abort the notebook; any other server error is still raised.\n", | |
| "    if \"Index already exists\" not in str(err):\n", | |
| "        raise\n", | |
| "# To rebuild from scratch call client.ft().dropindex(); its argument is a\n", | |
| "# delete_documents flag, not the schema." | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "90RT75fRvhd5", | |
| "outputId": "22f9b6ea-7a88-4eb4-ce4b-ea6edd77b91b" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "b'OK'" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 56 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "type(embeddings)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "DqZ1Fs0nwkNS", | |
| "outputId": "f672d941-c5c8-46da-da04-91d97da9f800" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "dict" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 23 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "%%time\n", | |
| "# `embeddings` already holds only the DOC_NUMBER documents to upload, so it is\n", | |
| "# iterated directly instead of being re-capped with a hard-coded 100. Writes\n", | |
| "# are queued on a pipeline and sent together rather than paying one network\n", | |
| "# round trip per hash.\n", | |
| "pipe = client.pipeline(transaction=False)\n", | |
| "for i, e in embeddings.items():\n", | |
| "    if i % 50 == 0:\n", | |
| "        print(i)  # coarse progress marker\n", | |
| "    pipe.hset(str(i), \"v\", e.tobytes())\n", | |
| "replies = pipe.execute()\n", | |
| "# One reply per queued HSET; a mismatch means some writes were not sent.\n", | |
| "assert len(replies) == len(embeddings)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "e1BmdYN9vZ-u", | |
| "outputId": "ebb53f51-130c-42ae-8ae5-fb672fa8e4ce" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "0\n", | |
| "50\n", | |
| "CPU times: user 29.7 ms, sys: 3.56 ms, total: 33.3 ms\n", | |
| "Wall time: 265 ms\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "len(e)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "7qZ3TYnE6plq", | |
| "outputId": "2a51d25f-1455-4f41-ab19-8feaaba8ac8a" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "384" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 58 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| " # test that the embedding was set\n", | |
| " # client.hget(str(i), \"v\")" | |
| ], | |
| "metadata": { | |
| "id": "Vkvrc3tA1X0F" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "%%time\n", | |
| "source_embedding = model.encode('How big is London')" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "i9iiC3nm2IIo", | |
| "outputId": "b9c840c2-7036-40fe-f52a-a98d31066973" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "CPU times: user 24.2 ms, sys: 681 µs, total: 24.9 ms\n", | |
| "Wall time: 29.2 ms\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "source_embedding.tobytes()" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "XbFeyqM92JPz", | |
| "outputId": "8dba0db0-f511-402a-f11d-d351c0fe46cb" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "b'\\xe9\\x90 >R\\x9e1<\\x1c\\x8df=\\xc9\\x91\\x8d\\xbd\\'\\x0c3\\xbd\\xdbB\\xb3;]Z\\xf3\\xbb\\x8d-<\\xbc\\xd2n\\xa7\\xbd\\xd9\\x0fK\\xbc,Xz\\xbd\\x98\\xdeq\\xbc\\xb0\\x85S\\xbd\\\\\\x90\\x8a\\xba\\x99\\x81\\'\\xbd\\x1c\\xa3\\x8a\\xbbf\\x97\\xab<\\xf7\\x1e\\xd3\\xbd\\xbeP\\x8a\\xbd\\x88\\x8e\\t\\xbdB\\xc4\\x89=\\x01q\\xaa\\xbc\\xbft\\x17=\\xa1\\x89\\x89=e)\\x1c\\xbdl\\x87\\xd0< \\xb8\\\\;\\n\\xd9Z=\\x9f\\xef\\xa5<\\xaa\\x08.<\\xf16\\xb2<\\xb2=R\\xbd\\x88I\\x9b<\\x01\\x07\\xae9E\\xd38<\\xa0e\\xd2<\\xddl\\x12\\xbb\\xd4an=*\\x01p=)\\xd0\\x83\\xbc\\xd3r\\x8b=p\\x9b\\xbb\\xbc\\xf9\\x18\\x82=j\\x0fk<f\\xfb\\x8f=|\\x99\\x85<Cnj=yxg<sO/\\xbdB\\xa6\\x00\\xbe\\x0f\\xc7\\xbb=v\\x07\\x04=\\xee\\xdbB<3-\\x95\\xbd\\xe8@\\x84\\xbd\\xd6i;\\xbcxx\\x86\\xbd\\xea\\xa1\\x06<\\x94Y\\x1f\\xbdx\\xbe<\\xbd\\x86T,\\xbd\\xb3\\xad\\xa4=s\\xe1\\x81\\xbd\\xae7B\\xbcB\\xb3\\xce<\\xae\\xb1.\\xbcWKD\\xbc\\xc0\\x1d\\xab\\xbc\\xdb\\xb18<\\x8b\\x12\\xaf\\xbc\\xd8\\xef\\x16\\xbd\\x90\\xcc\\x02\\xbd\\x93\\x0c,\\xbc\\x84\\xd1\\x0f\\xbd]a#=\\x06\\x8f@\\xbd\\x9d\\x1f\\x1a\\xbd\\xd6`\\xb3\\xbc\\x9b\\xd9\\x7f\\xbd\\xc2\\x0f\\x00=\\xc9O\\xe3;l\\xf12\\xbcc\\x9e\\t<\\xcc\\x1f\\xa2=:P\\xee\\xbc\\x0f/]\\xbd|\\xbe?=\\x9d1\\x84<Y[\\xfb\\xbc+J^\\xbd\\xa9\\x1a\\xbd<\\xb2 
\\x19\\xbd\\xe7Z\\x8b\\xbd\\x0c\\x01\\xfc=\\xdd\\x14=\\xbb\\x91\\xd4\\x97<R\\xc4\\xdb\\xba\\x06\\x0bm\\xbdX\\xe4\\xd8;\\xf3x(=\\xe3\\xef\\x9e:\\xaaj\\xa8=\\n\\x07\\x17=\\x0f\\xab\\x9d=\\xf7\\x9f\\xaf;\\xe9\\xdbV\\xbd\\xec\\xf1\\x07\\xbcI\\t\\x82=\\xc4\\x98\\x91=\\x85\\x02\\x03\\xbe\\x14.+=gO\\xc7\\xbcw\\xdd\\r<\\x92\\xd9U=d$\\xa2;\\x05\\xde\\x8b\\xbd5^\\xd5<?5h\\xbd\\xb1\\x9b\\xe6\\xbc\\x17\\x97\\xfb\\xbc\\x13\\xed\\x9c=\\xf2\\xb4\\xd9<0\\xa0`\\xbcG\\xca\\xdb<\\xb3\\xa2U\\xbd\\xe5y\\x93=N\\xe4\\xb8<\\xefP\\xdf\\x89\\xe9}\\x96\\xbd\\x13\\xf2\\xa9\\xbb\\x18_\\xa2=2\\xbef=\\x9c|\\x1c=&`\\x0c\\xbc\\xe2\\x8ap\\xbb\\x14\\xdbK=\\x036_=\\x91e\\x9b=\\xcb\\xc1/=\\xd5F\\xb8\\xbc\\xc3n\\xb1=E\\xba\\xec\\xbc\\xb3\\x1f\\x9d=\\x80z^=\\x99\\x1c\\xa3=\\x11\\xb1\\x12=\\x95\\xc2\\xab\\xbd\\x11\\xfc\\xac\\xbc\\x7f!4\\xbd\\xdc$\\x9f\\xbb\\xb37\\x97=\\x88,\\x9a<\\xf2\\x83\\r\\xbd\\x00,\\x9a\\xbd%\\x1c\\x05\\xbb\\xe8\\x9a.=8\\x97\\x04=\\xa4\\xee\\xae\\xbaT9\\xa9\\xbc\\xce}\\xdc\\xbcp\\xc6\\x1d\\xbba\\xdaK=\\x1cz\\xc5\\xbc\\x0ck}\\xbcy\\xc2\\x14\\xbd\\x87\\xda 
\\xbdy2\\x18=vN&\\xbd\\xbe\\x19\"<\\x86\\xaft\\xbd\\x8e\\xb5\\xd5<\\x96\\x05=\\xbd\\xdc\\xda\\xe29\\xb3\\x1b\\x01=[\\x82\\xe1\\xbc^\\xdb\\xd9\\xbd\\x14~G\\xbc\\x85\\x00\\xd9\\xbb+E\\x16>\\xb4\\x13M;\\xadR\\xaf\\xbd\\x1f\\xd0\\xe4\\xbc\\xa2(\\x87=x\\x93?=BX\\xd4<\\xea\\xd6\\x95<\\xa9#\\x05=\\xddR\\x96=\\x96)\\xd0<oH\\x88=do\\x8b<J\\xa1\\xaf;\\x1b\\xf4\\xa1=\\xf6\\x05\\x13\\xbdl\\x19\\xfc;\\xefo:\\xbd\\xb6\\xc6\\x97\\xbdtI8<R\\xd3K<d\\xb6)=\\x1a\\xe4\\xfe=j\\x8b/;\\xbe.9\\xbc\\xe9\\xe5J\\xbd\\x13ko\\xbd\\xa7\\xc7\\xd4:4\\xf7\\xf0:\\xd3y\\xf0=!\\x99h;\\xd1\\xa6\\xcf\\xbcN\\xeeI<1-*\\xbdg\\x97\\xe6<\\x14\\xf4\\x9c\\xbd\\xfeA\\xdc;j\\x9b\\xd0\\xbc#eR\\xbc\\xb1\\xc6\\x95\\xbd\\x0b\\x93\\xd9\\xbd=Q\\x0b\\xbd\\x87\\xf4\\x939#y\"\\xbd\\xdf\\xe2P\\xbdVPO\\tq\\xc4K\\xbb-\\xce?\\xbc\\x8a|\\x8e\\xbb\\xe5\\xfa\\x9c\\xbc\\xd2\\xb5\\x86\\xbdx\\x84\\x07=\\x1fH^=1\\xd1\\xef=\\xe9\\xc2\\x11\\xbc\\x80M\\x9f=\\xf3\\x87\\xff\\xbc\\x08\\xd0$\\xbc\\x010&>\\x88#\\\\\\xbd\\xe0\\x8f\\xa5=\\xe7\\x8b\\xed\\xbc[\\xed\\x9a=r\\xa5\\x8e\\xbdl.\\x96:#\\xd3F=\\xf2\\x1ag=\\n\\xab\\xd3\\xbd\\x93\\xaa\\x18<\\x8a\\xd5\\xf2<I\\xb2\\x99\\xbb\\xbdp\\x17\\xbd\\x10\\x1f}\\xbd\\x10*\\xa3\\xbd\\x16Z \\xbd\"\\xa9\\x0e\\xbc\\x00;w\\xbd\\xd8\\xa2\\t\\xbc\\xcdB\\x8a<\\xde\\x08\\xf3\\xbc\\x8d?\\x07<Q\\xd5J\\xbd[&d=\\x03J\\x14\\xbd\\x9f\\x1f\\x1a=\\xd5\\x91\\x80<\\xd2\\xf1\\x89\\xbd \\xc0\\x8e\\xbdn\\xb6o<\\x98\\xc6w\\xbc t\\xe4=\\xf4\\x1e>\\xbc\\x8a\\x0b\\x0b\\xbd\\xbb\\x8c\\xc9;\\xf3\\xc6H=\\x85.\\xd8\\xbcQ~\\x1c=\\xed\\xee\\xa5=8\\x9b\\xd4\\xbc\\xf5V\\x90\\xbc\\xae\\xd2G=\\x03\\xb1\\xca;\\x01e\\xcb\\xbb\\xe8\\xf2\\xb4<H=\\x81<\\xa0\\xa8\\x88\\xbd\\x90\\x98\\x00\\xbd\\x94P\\x8e\\xbd\\x99QY\\xbc\\x15\\xdf\\xd9<\\x0e\\x19y\\xbd\\xb8\\x8c:;h 
\\xd6\\xbb\\xac\\xdf\\xc7<\\x04\\x97=\\xbcdWH=\\xff\\'c\\xbd?]\\x0e;\\xc3\\xab@\\xbbZ\\x9f\\x19<0\\x97\\xa2\\xbd\\x9a[;\\xbd\\x8a\\xee\\x9e=\\xa1\\xd2\\\\<%h\\xad=D\\xf9i=\\x8c?\\xc0=\\xa7\\xddj<\\xfc!\\xa8=U\\xfb!\\xbd\\x93Fb=\\x16\\xff\\xd6\\xbcd\\xa1\\x84=\\xeeB\\xf5=C\\x8d\\x1d\\xbds\\xd8\\xba\\xbb\\xbf\\x03\\xe6<\\xef\\xf4\\xed<J\\x979\\xbd\\xeaO\\xc6\\xbd\\xf3\\xf9V=}-c\\xb2\\xa6\\xef\\xae\\xbc\\x9949\\xbc\\x87\\xf0\\x10\\xbd\\xebP\\x10=\\xd4q\\x14\\xbdj?\\xe4\\xbc\\xd6\\x0b\\x90=b\\x11\\x9f=\\xd2\\x9e\\xae\\xbc\\xd15\\x02=9ke<\\xecV\\xca;\\xaa\\xd0\\x99\\xbd\\xc3\\xe5\\xbe=\\xe7#\\xdf\\xbd[>\\xce\\xbc\\x1d\\x02\\xc5\\xbdL\\x12\\x00\\xbe\\xcb\\xc0P</\\xa1\\x1b=\\x1b\\x9e\\xbc\\xbc:\\xb6\\xb6<\\xc7Z>\\xbd\\x08\\x01\\x93\\xbcF\\xc3\\x94=8r\\xfc\\xbd\\xdd\\x0es<\\xe9\\x12l:\\x854M\\xbd\\x04\\xba\\x95\\xbd\\xe0\\xaf\\\\=\\xf8U\\x06;\\xdf\\xa6\\xbb\\xbb\\xbd=\\xb4<\\x1c\\xac\\xd3\\xbc\\xc2\\xcc\\xc6\\xbd;\\xcdk=\\x92\\xf1\\x93=\\xc1~3<\\xc6\\xac\\xe1\\xbdm\\x15\\x06\\xbb\\xb5A\\xa3\\xbd(RJ=a.&\\xbd\\xd0.\\xb8=b\\xcc(\\xbc\"\\xe5\\xe8:\\xb7\\x18\\xfc\\xbbF%o\\xba\\x85g\\\\\\xbd\\x01\\x1b\\x90;\\xaf\\x97N=\\x08-\\xb1=\\xd6\\x1fE<\\xae\\x87\\xaf==\\xf8o\\xb8\\x18f\\x92\\xbd\\n\\xd0\\x1c\\xbd\\xdc\\xfd\\xc4\\xbc\\x8d\\xd5\\xfc<\\xeaJ%\\xbd\\n\\xa7\\xaa;x\\xdcg\\xbd\\x80K\\xe6<'" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 65 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Fetch the 3 nearest stored vectors to the query embedding. Without an explicit\n", | |
| "# sort the hits come back unordered (the recorded run listed the largest\n", | |
| "# distance first), so sort on the distance field: with the L2 metric a\n", | |
| "# smaller `__v_score` means a closer match.\n", | |
| "q = Query(\"*=>[KNN 3 @v $vec]\").sort_by(\"__v_score\").return_field(\"__v_score\").dialect(2)\n", | |
| "client.ft().search(q, query_params={\"vec\": source_embedding.tobytes()})" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "bJPq8qA6velu", | |
| "outputId": "febeffcb-42a0-437f-8afe-89259d8a5486" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "Result{3 total, docs: [Document {'id': '44', 'payload': None, '__v_score': '1.49653553963'}, Document {'id': '46', 'payload': None, '__v_score': '1.45214688778'}, Document {'id': '75', 'payload': None, '__v_score': '1.37314224243'}]}" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "execution_count": 66 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "NY3J-Zuu2eNx" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment