Last active
July 5, 2016 00:14
-
-
Save moonmilk/691cb5c4d824f65d5e9b0eb77c5d0dca to your computer and use it in GitHub Desktop.
messing around with ofxMSAWord2Vec
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# messing with ofxMSAWord2Vec\n", | |
| "from https://github.com/memo/ofxMSAWord2Vec\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "load_word_vectors_bin : /Users/ranjit/Downloads/GoogleNews-vectors-negative300_trimmed_53K_lowercase.bin ... \n", | |
| "num_words: 53084/53084\n", | |
| "num_dims: 300\n", | |
| "done in 7.57633709908 seconds.\n", | |
| "------------------------------------------------------------\n", | |
| "normalize_word_vectors ... done in 0.50585103035 seconds.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from word2vec_utils import *\n", | |
| "vecs = load_word_vectors_bin('/Users/ranjit/Downloads/GoogleNews-vectors-negative300_trimmed_53K_lowercase.bin')\n", | |
| "vecs_n = normalize_word_vectors(vecs)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('dogs', 0.86804897),\n", | |
| " ('puppy', 0.81064284),\n", | |
| " ('cat', 0.7609458),\n", | |
| " ('beagle', 0.74186218),\n", | |
| " ('pup', 0.74069107),\n", | |
| " ('chihuahua', 0.71739173),\n", | |
| " ('pet', 0.71647859),\n", | |
| " ('canine', 0.69182897),\n", | |
| " ('collie', 0.67144096),\n", | |
| " ('kitten', 0.66598809)]" | |
| ] | |
| }, | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "find_closest_words(vecs_n, \"dog\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('papillon', 0.69635022),\n", | |
| " ('chihuahua', 0.6762343),\n", | |
| " ('dalmatian', 0.65920705),\n", | |
| " ('pug', 0.64561403),\n", | |
| " ('puppy', 0.64243448),\n", | |
| " ('labrador', 0.63804096),\n", | |
| " ('mastiff', 0.62263489),\n", | |
| " ('poodle', 0.62242281),\n", | |
| " ('beagle', 0.62123823),\n", | |
| " ('alsatian', 0.6157546)]" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "find_closest_words(vecs_n, \"pomeranian\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('pomeranian', 1.0),\n", | |
| " ('papillon', 0.69635022),\n", | |
| " ('chihuahua', 0.6762343),\n", | |
| " ('dalmatian', 0.65920705),\n", | |
| " ('pug', 0.64561403),\n", | |
| " ('puppy', 0.64243448),\n", | |
| " ('labrador', 0.63804096),\n", | |
| " ('mastiff', 0.62263489),\n", | |
| " ('poodle', 0.62242281),\n", | |
| " ('beagle', 0.62123823)]" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "pom_n = vecs_n[\"pomeranian\"]\n", | |
| "find_closest_words(vecs_n, pom_n)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "metal_words = \"burn cries veins eternity breathe beast gonna demons ashes soul\".split(\" \")\n", | |
| "unmetal_words = \"particularly indicated secretary committee university relatively noted approximately chairman employees\".split(\" \")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "metal_vecs, metal_vecs_n = [vecs[word] for word in metal_words], [vecs_n[word] for word in metal_words]\n", | |
| "unmetal_vecs, unmetal_vecs_n = [vecs[word] for word in unmetal_words], [vecs_n[word] for word in unmetal_words]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "([('hell', 0.58362615),\n", | |
| " ('eateth', 0.54645419),\n", | |
| " ('souls', 0.54323936),\n", | |
| " ('looketh', 0.53242099),\n", | |
| " ('god', 0.52718186)],\n", | |
| " [('said', 0.53903073),\n", | |
| " ('acknowledged', 0.49876258),\n", | |
| " ('stressed', 0.48518729),\n", | |
| " ('emphasized', 0.46537137),\n", | |
| " ('committees', 0.46324104)])" | |
| ] | |
| }, | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# find the closest words to the average of each word list (ten words, each weighted 0.1)\n", | |
| "do_word_maths(vecs, vecs_n, [(0.1, word) for word in metal_words]),do_word_maths(vecs, vecs_n, [(0.1, word) for word in unmetal_words])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "normalize_word_vectors ... done in 5.19752502441e-05 seconds.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# find average vector\n", | |
| "import numpy as np\n", | |
| "metal_mean = np.mean(metal_vecs, axis=0)\n", | |
| "metal_mean_n = normalize_word_vectors({'metal_af':metal_mean})['metal_af']\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 43, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('soul', 0.63606858),\n", | |
| " ('demons', 0.60098231),\n", | |
| " ('hell', 0.58362621),\n", | |
| " ('eateth', 0.54645419),\n", | |
| " ('souls', 0.54323936),\n", | |
| " ('beast', 0.53437954),\n", | |
| " ('looketh', 0.53242099),\n", | |
| " ('eternity', 0.52865142),\n", | |
| " ('breathe', 0.52801883),\n", | |
| " ('god', 0.52718186)]" | |
| ] | |
| }, | |
| "execution_count": 43, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "find_closest_words(vecs_n, metal_mean_n)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 44, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "normalize_word_vectors ... done in 3.79085540771e-05 seconds.\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "unmetal_mean = np.mean(unmetal_vecs, axis=0)\n", | |
| "unmetal_mean_n = normalize_word_vectors({'unmetal_af':unmetal_mean})['unmetal_af']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 45, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[('chairman', 0.5799759),\n", | |
| " ('noted', 0.55365336),\n", | |
| " ('committee', 0.54178369),\n", | |
| " ('said', 0.53903079),\n", | |
| " ('secretary', 0.52612215),\n", | |
| " ('indicated', 0.50970483),\n", | |
| " ('acknowledged', 0.49876261),\n", | |
| " ('stressed', 0.48518729),\n", | |
| " ('emphasized', 0.46537143),\n", | |
| " ('committees', 0.46324104)]" | |
| ] | |
| }, | |
| "execution_count": 45, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "find_closest_words(vecs_n, unmetal_mean_n)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 198, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " --- less metal --- \n", | |
| "concerned\treceptive\tinvolved\tsaid\tstressed\n", | |
| "concerned\treceptive\tinvolved\tsaid\tamenable\n", | |
| "concerned\treceptive\tinvolved\tamenable\tconsidering\tactively\n", | |
| "concerned\treceptive\tinvolved\tamenable\taverse\tactively\tconsider\n", | |
| "receptive\tconcerned\tinvolved\taverse\tamenable\tintrigued\tactively\n", | |
| "receptive\tinvolved\tconcerned\taverse\tamenable\tintrigued\tconsider\n", | |
| "involved\treceptive\tconcerned\taverse\tintrigued\tamenable\tuninterested\n", | |
| "involved\treceptive\taverse\tconcerned\tintrigued\tamenable\tuninterested\n", | |
| "involved\treceptive\taverse\tintrigued\tconcerned\tamenable\tuninterested\n", | |
| "involved\taverse\tintrigued\treceptive\tuninterested\tamenable\tconcerned\n", | |
| "interested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\n", | |
| "averse\tinvolved\tintrigued\treceptive\tuninterested\tenamored\tamenable\n", | |
| "intrigued\taverse\tinvolved\tenamored\tuninterested\treceptive\tdesirous\n", | |
| "intrigued\taverse\tenamored\tdesirous\tuninterested\tinvolved\treceptive\n", | |
| "intrigued\taverse\tenamored\tdesirous\tuninterested\tfond\tinvolved\n", | |
| "intrigued\tenamored\taverse\tdesirous\tfond\tlove\tdreaming\n", | |
| "love\tintrigued\tenamored\tdreaming\tloves\tdesirous\taverse\n", | |
| "love\tmad\tdreaming\tloves\tintrigued\thell\n", | |
| "love\thell\tmad\tloves\tdreaming\n", | |
| "hell\tlove\tmad\tloves\n", | |
| "hell\tlove\tmad\twarn't\n", | |
| " --- more metal ---\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "word = 'interested'\n", | |
| "inc = 0.03\n", | |
| "print \" --- less metal --- \"\n", | |
| "def wordtable(w):\n", | |
| " return \"\\t\".join([a[0] for a in w])\n", | |
| "for f in range(10,0,-1):\n", | |
| " print wordtable(do_word_maths(vecs, vecs_n, [(1, word)] +[(inc*f, w) for w in unmetal_words], top_k=8))\n", | |
| "print wordtable([(word,) for i in range(0,10)])\n", | |
| "for f in range(0,10):\n", | |
| " print wordtable(do_word_maths(vecs, vecs_n, [(1, word)] +[(inc*f, w) for w in metal_words], top_k=8))\n", | |
| "print \" --- more metal ---\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 46, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 2", | |
| "language": "python", | |
| "name": "python2" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.11" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment