Skip to content

Instantly share code, notes, and snippets.

@yaju
Last active November 13, 2016 11:57
Show Gist options
  • Select an option

  • Save yaju/d16884c9e06c044f8c95461e2ea050c4 to your computer and use it in GitHub Desktop.

Select an option

Save yaju/d16884c9e06c044f8c95461e2ea050c4 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import word2vec"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting training using file text8\n",
"Words processed: 17000K Vocab size: 4399K \n",
"Vocab size (unigrams + bigrams): 2419827\n",
"Words in train file: 17005206\n"
]
}
],
"source": [
"word2vec.word2phrase('text8', 'text8-phrases', verbose=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting training using file text8-phrases\n",
"Vocab size: 98331\n",
"Words in train file: 15857306\n",
"Alpha: 0.000002 Progress: 100.03% Words/thread/sec: 245.29k "
]
}
],
"source": [
"word2vec.word2vec('text8-phrases', 'text8.bin', size=100, verbose=True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Starting training using file text8\n",
"Vocab size: 71291\n",
"Words in train file: 16718843\n",
"Alpha: 0.000002 Progress: 100.03% Words/thread/sec: 245.25k "
]
}
],
"source": [
"word2vec.word2clusters('text8', 'text8-clusters.txt', 100, verbose=True)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"model = word2vec.load('text8.bin')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([u'</s>', u'the', u'of', ..., u'denishawn', u'tamiris', u'dolophine'], \n",
" dtype='<U78')"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.vocab"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(98331, 100)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.vectors.shape"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[ 0.14333284, 0.15825514, -0.13715847, ..., 0.05456942,\n",
" 0.1095541 , 0.00693387],\n",
" [ 0.12111571, -0.02320549, -0.02563629, ..., -0.15034008,\n",
" -0.01260874, 0.05176837],\n",
" [ 0.15126687, 0.06889148, 0.06623464, ..., -0.0855635 ,\n",
" 0.07735366, 0.15149982],\n",
" ..., \n",
" [-0.03802459, 0.03345773, -0.00653401, ..., 0.07480404,\n",
" -0.35265082, 0.00257411],\n",
" [-0.02365303, -0.02668318, 0.05093664, ..., 0.11467363,\n",
" -0.23554097, -0.00971382],\n",
" [ 0.08233783, 0.0129103 , 0.16348898, ..., 0.13334648,\n",
" -0.04252408, -0.03372287]])"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.vectors"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(100,)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model['dog'].shape"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([-0.03324719, 0.00191073, 0.0090235 , 0.0964016 , 0.04208981,\n",
" -0.07228027, 0.01304798, 0.1299261 , 0.06779151, 0.08304796])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model['dog'][:10]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(array([14558, 34047, 20175, 30402, 23666, 31618, 24110, 29181, 20336, 32659]),\n",
" array([ 0.83386324, 0.83339789, 0.82985321, 0.82350754, 0.8211557 ,\n",
" 0.8190668 , 0.81662868, 0.81528289, 0.81164501, 0.80770721]))"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"indexes, metrics = model.cosine('socks')\n",
"indexes, metrics"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([u'winged', u'nosed', u'hairy', u'gravy', u'striped', u'straps',\n",
" u'petals', u'pumpkin', u'crab', u'jug'], \n",
" dtype='<U78')"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.vocab[indexes]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"rec.array([(u'winged', 0.8338632385860715), (u'nosed', 0.8333978881227401),\n",
" (u'hairy', 0.8298532089128187), (u'gravy', 0.8235075439614752),\n",
" (u'striped', 0.8211556950643765), (u'straps', 0.8190668018507885),\n",
" (u'petals', 0.8166286839971598), (u'pumpkin', 0.8152828876711085),\n",
" (u'crab', 0.8116450062378282), (u'jug', 0.8077072095376159)], \n",
" dtype=[(u'word', '<U312'), (u'metric', '<f8')])"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.generate_response(indexes, metrics)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(u'winged', 0.8338632385860715),\n",
" (u'nosed', 0.8333978881227401),\n",
" (u'hairy', 0.8298532089128187),\n",
" (u'gravy', 0.8235075439614752),\n",
" (u'striped', 0.8211556950643765),\n",
" (u'straps', 0.8190668018507885),\n",
" (u'petals', 0.8166286839971598),\n",
" (u'pumpkin', 0.8152828876711085),\n",
" (u'crab', 0.8116450062378282),\n",
" (u'jug', 0.8077072095376159)]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.generate_response(indexes, metrics).tolist()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(u'san_francisco', 0.8978871089555688),\n",
" (u'san_diego', 0.8809796846318916),\n",
" (u'las_vegas', 0.8418097928908024),\n",
" (u'miami', 0.8398494845049063),\n",
" (u'seattle', 0.831899455568167),\n",
" (u'chicago', 0.8267953558566568),\n",
" (u'st_louis', 0.8222832383714451),\n",
" (u'california', 0.8211714844110203),\n",
" (u'detroit', 0.8188764034330804),\n",
" (u'cleveland', 0.817251895152392)]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"indexes, metrics = model.cosine('los_angeles')\n",
"model.generate_response(indexes, metrics).tolist()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(array([ 1088, 7540, 1145, 1335, 3141, 344, 1827, 6769, 1770, 10311]),\n",
" array([ 0.29054025, 0.27345068, 0.2709168 , 0.26793563, 0.26728774,\n",
" 0.26669041, 0.26571555, 0.26416171, 0.26370962, 0.2619265 ]))"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"indexes, metrics = model.analogy(pos=['king', 'woman'], neg=['man'], n=10)\n",
"indexes, metrics"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(u'queen', 0.2905402467729655),\n",
" (u'empress', 0.2734506828577926),\n",
" (u'prince', 0.2709167973829251),\n",
" (u'wife', 0.2679356346054478),\n",
" (u'monarch', 0.26728773557223445),\n",
" (u'son', 0.2666904115522106),\n",
" (u'throne', 0.2657155461401751),\n",
" (u'regent', 0.26416170868397304),\n",
" (u'pope', 0.2637096213206423),\n",
" (u'pharaoh', 0.2619265026976325)]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.generate_response(indexes, metrics).tolist()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"clusters = word2vec.load_clusters('text8-clusters.txt')"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"98"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clusters['dog']"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(236,)"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clusters.get_words_on_cluster(90).shape"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array(['or', 'use', 'based', 'natural', 'making', 'complex', 'physical',\n",
" 'basic', 'simple', 'direct'], dtype=object)"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clusters.get_words_on_cluster(90)[:10]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"model.clusters = clusters"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"indexes, metrics = model.analogy(pos=['paris', 'germany'], neg=['france'], n=10)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(u'berlin', 0.3290806534910225, 20),\n",
" (u'vienna', 0.28895394489218945, 82),\n",
" (u'munich', 0.2884696908910993, 2),\n",
" (u'leipzig', 0.28393902444609526, 41),\n",
" (u'st_petersburg', 0.2722535287864745, 63),\n",
" (u'moscow', 0.26823776119594767, 17),\n",
" (u'z_rich', 0.255945681020352, 59),\n",
" (u'prague', 0.2558858091470416, 45),\n",
" (u'dresden', 0.2544421493862952, 86),\n",
" (u'hamburg', 0.24842199803919204, 98)]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.generate_response(indexes, metrics).tolist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment