Last active
November 13, 2016 11:57
-
-
Save yaju/d16884c9e06c044f8c95461e2ea050c4 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import word2vec" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Starting training using file text8\n", | |
| "Words processed: 17000K Vocab size: 4399K \n", | |
| "Vocab size (unigrams + bigrams): 2419827\n", | |
| "Words in train file: 17005206\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "word2vec.word2phrase('text8', 'text8-phrases', verbose=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Starting training using file text8-phrases\n", | |
| "Vocab size: 98331\n", | |
| "Words in train file: 15857306\n", | |
| "Alpha: 0.000002 Progress: 100.03% Words/thread/sec: 245.29k " | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "word2vec.word2vec('text8-phrases', 'text8.bin', size=100, verbose=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Starting training using file text8\n", | |
| "Vocab size: 71291\n", | |
| "Words in train file: 16718843\n", | |
| "Alpha: 0.000002 Progress: 100.03% Words/thread/sec: 245.25k " | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "word2vec.word2clusters('text8', 'text8-clusters.txt', 100, verbose=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "model = word2vec.load('text8.bin')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([u'</s>', u'the', u'of', ..., u'denishawn', u'tamiris', u'dolophine'], \n", | |
| " dtype='<U78')" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "model.vocab" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(98331, 100)" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "model.vectors.shape" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([[ 0.14333284, 0.15825514, -0.13715847, ..., 0.05456942,\n", | |
| " 0.1095541 , 0.00693387],\n", | |
| " [ 0.12111571, -0.02320549, -0.02563629, ..., -0.15034008,\n", | |
| " -0.01260874, 0.05176837],\n", | |
| " [ 0.15126687, 0.06889148, 0.06623464, ..., -0.0855635 ,\n", | |
| " 0.07735366, 0.15149982],\n", | |
| " ..., \n", | |
| " [-0.03802459, 0.03345773, -0.00653401, ..., 0.07480404,\n", | |
| " -0.35265082, 0.00257411],\n", | |
| " [-0.02365303, -0.02668318, 0.05093664, ..., 0.11467363,\n", | |
| " -0.23554097, -0.00971382],\n", | |
| " [ 0.08233783, 0.0129103 , 0.16348898, ..., 0.13334648,\n", | |
| " -0.04252408, -0.03372287]])" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "model.vectors" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(100,)" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "model['dog'].shape" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([-0.03324719, 0.00191073, 0.0090235 , 0.0964016 , 0.04208981,\n", | |
| " -0.07228027, 0.01304798, 0.1299261 , 0.06779151, 0.08304796])" | |
| ] | |
| }, | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "model['dog'][:10]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(array([14558, 34047, 20175, 30402, 23666, 31618, 24110, 29181, 20336, 32659]),\n", | |
| " array([ 0.83386324, 0.83339789, 0.82985321, 0.82350754, 0.8211557 ,\n", | |
| " 0.8190668 , 0.81662868, 0.81528289, 0.81164501, 0.80770721]))" | |
| ] | |
| }, | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "indexes, metrics = model.cosine('socks')\n", | |
| "indexes, metrics" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([u'winged', u'nosed', u'hairy', u'gravy', u'striped', u'straps',\n", | |
| " u'petals', u'pumpkin', u'crab', u'jug'], \n", | |
| " dtype='<U78')" | |
| ] | |
| }, | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "model.vocab[indexes]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "rec.array([(u'winged', 0.8338632385860715), (u'nosed', 0.8333978881227401),\n", | |
| " (u'hairy', 0.8298532089128187), (u'gravy', 0.8235075439614752),\n", | |
| " (u'striped', 0.8211556950643765), (u'straps', 0.8190668018507885),\n", | |
| " (u'petals', 0.8166286839971598), (u'pumpkin', 0.8152828876711085),\n", | |
| " (u'crab', 0.8116450062378282), (u'jug', 0.8077072095376159)], \n", | |
| " dtype=[(u'word', '<U312'), (u'metric', '<f8')])" | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "model.generate_response(indexes, metrics)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(u'winged', 0.8338632385860715),\n", | |
| " (u'nosed', 0.8333978881227401),\n", | |
| " (u'hairy', 0.8298532089128187),\n", | |
| " (u'gravy', 0.8235075439614752),\n", | |
| " (u'striped', 0.8211556950643765),\n", | |
| " (u'straps', 0.8190668018507885),\n", | |
| " (u'petals', 0.8166286839971598),\n", | |
| " (u'pumpkin', 0.8152828876711085),\n", | |
| " (u'crab', 0.8116450062378282),\n", | |
| " (u'jug', 0.8077072095376159)]" | |
| ] | |
| }, | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "model.generate_response(indexes, metrics).tolist()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(u'san_francisco', 0.8978871089555688),\n", | |
| " (u'san_diego', 0.8809796846318916),\n", | |
| " (u'las_vegas', 0.8418097928908024),\n", | |
| " (u'miami', 0.8398494845049063),\n", | |
| " (u'seattle', 0.831899455568167),\n", | |
| " (u'chicago', 0.8267953558566568),\n", | |
| " (u'st_louis', 0.8222832383714451),\n", | |
| " (u'california', 0.8211714844110203),\n", | |
| " (u'detroit', 0.8188764034330804),\n", | |
| " (u'cleveland', 0.817251895152392)]" | |
| ] | |
| }, | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "indexes, metrics = model.cosine('los_angeles')\n", | |
| "model.generate_response(indexes, metrics).tolist()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(array([ 1088, 7540, 1145, 1335, 3141, 344, 1827, 6769, 1770, 10311]),\n", | |
| " array([ 0.29054025, 0.27345068, 0.2709168 , 0.26793563, 0.26728774,\n", | |
| " 0.26669041, 0.26571555, 0.26416171, 0.26370962, 0.2619265 ]))" | |
| ] | |
| }, | |
| "execution_count": 19, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'], n=10)\n", | |
| "indexes, metrics" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(u'queen', 0.2905402467729655),\n", | |
| " (u'empress', 0.2734506828577926),\n", | |
| " (u'prince', 0.2709167973829251),\n", | |
| " (u'wife', 0.2679356346054478),\n", | |
| " (u'monarch', 0.26728773557223445),\n", | |
| " (u'son', 0.2666904115522106),\n", | |
| " (u'throne', 0.2657155461401751),\n", | |
| " (u'regent', 0.26416170868397304),\n", | |
| " (u'pope', 0.2637096213206423),\n", | |
| " (u'pharaoh', 0.2619265026976325)]" | |
| ] | |
| }, | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "model.generate_response(indexes, metrics).tolist()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "clusters = word2vec.load_clusters('text8-clusters.txt')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "98" | |
| ] | |
| }, | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "clusters['dog']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(236,)" | |
| ] | |
| }, | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "clusters.get_words_on_cluster(90).shape" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array(['or', 'use', 'based', 'natural', 'making', 'complex', 'physical',\n", | |
| " 'basic', 'simple', 'direct'], dtype=object)" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "clusters.get_words_on_cluster(90)[:10]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "model.clusters = clusters" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "indexes, metrics = model.analogy(pos=['paris', 'germany'], neg=['france'], n=10)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[(u'berlin', 0.3290806534910225, 20),\n", | |
| " (u'vienna', 0.28895394489218945, 82),\n", | |
| " (u'munich', 0.2884696908910993, 2),\n", | |
| " (u'leipzig', 0.28393902444609526, 41),\n", | |
| " (u'st_petersburg', 0.2722535287864745, 63),\n", | |
| " (u'moscow', 0.26823776119594767, 17),\n", | |
| " (u'z_rich', 0.255945681020352, 59),\n", | |
| " (u'prague', 0.2558858091470416, 45),\n", | |
| " (u'dresden', 0.2544421493862952, 86),\n", | |
| " (u'hamburg', 0.24842199803919204, 98)]" | |
| ] | |
| }, | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "model.generate_response(indexes, metrics).tolist()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 2", | |
| "language": "python", | |
| "name": "python2" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.6" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment