Last active
December 26, 2019 05:16
-
-
Save d2207197/5dcd462ee06aa053b1a23a6eb5518c2c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "toc": true | |
| }, | |
| "source": [ | |
| "<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n", | |
| "<div class=\"toc\"><ul class=\"toc-item\"></ul></div>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-12-26T05:14:04.868076Z", | |
| "start_time": "2019-12-26T05:14:04.569093Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import sklearn" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-12-26T05:14:12.436029Z", | |
| "start_time": "2019-12-26T05:14:12.422009Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "'0.21.3'" | |
| ] | |
| }, | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Show the installed scikit-learn version so the run environment is recorded.\n", | |
| "sklearn.__version__\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "start_time": "2019-12-26T05:15:17.626Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/joe/.asdf/installs/python/miniconda3-latest/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n", | |
| " \"this warning.\", FutureWarning)\n", | |
| "/home/joe/.asdf/installs/python/miniconda3-latest/lib/python3.7/site-packages/sklearn/model_selection/_split.py:657: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=3.\n", | |
| " % (min_groups, self.n_splits)), Warning)\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "from sklearn.compose import ColumnTransformer\n", | |
| "from sklearn.pipeline import Pipeline\n", | |
| "from sklearn.linear_model import LogisticRegressionCV\n", | |
| "from sklearn.preprocessing import LabelEncoder\n", | |
| "from sklearn.feature_extraction.text import CountVectorizer\n", | |
| "import catboost as cb\n", | |
| "\n", | |
| "csv_path = '/home/ryanchao2012/projects/grandchallenge/data/ques-class/v2.csv'\n", | |
| "df = pd.read_csv(csv_path)\n", | |
| "\n", | |
| "le = LabelEncoder()\n", | |
| "y = le.fit_transform(df['LABEL'])\n", | |
| "\n", | |
| "column_trans = ColumnTransformer([\n", | |
| " ('ngrams', CountVectorizer(token_pattern=r'[a-zA-Z]+|[0-9.]+|\\S', ngram_range=(1,2)), 'QTEXT')], \n", | |
| " remainder='drop')\n", | |
| "\n", | |
| "clf = LogisticRegressionCV(cv=3, random_state=0)\n", | |
| "\n", | |
| "pipe = Pipeline(steps=[\n", | |
| " ('feature extraction', column_trans),\n", | |
| " ('clf', clf)\n", | |
| "])\n", | |
| "pipe.fit(df, y)\n", | |
| "pipe.score(df, y)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "start_time": "2019-12-26T05:15:49.539Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Predict an encoded class for every row of df with the fitted pipeline, then\n", | |
| "# map the integer ids back to the original LABEL strings via the LabelEncoder.\n", | |
| "pred_label = le.inverse_transform(pipe.predict(df))\n", | |
| "pred_label\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 75, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-12-25T09:56:17.191964Z", | |
| "start_time": "2019-12-25T09:56:17.187507Z" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Attach the predictions to df as a new column (this mutates df in place).\n", | |
| "df['pred_label'] = pred_label" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 76, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-12-25T09:56:17.864171Z", | |
| "start_time": "2019-12-25T09:56:17.850706Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>QID</th>\n", | |
| " <th>QTEXT</th>\n", | |
| " <th>ANSWER</th>\n", | |
| " <th>LABEL</th>\n", | |
| " <th>pred_label</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td>0</td>\n", | |
| " <td>D001Q01</td>\n", | |
| " <td>蘇東坡在中國歷史上,是哪一個朝代的人?</td>\n", | |
| " <td>北宋</td>\n", | |
| " <td>ERA</td>\n", | |
| " <td>ERA</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>1</td>\n", | |
| " <td>D001Q02</td>\n", | |
| " <td>蘇東坡是中國哪個省份的人?</td>\n", | |
| " <td>四川省</td>\n", | |
| " <td>LOCATION</td>\n", | |
| " <td>LOCATION</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>2</td>\n", | |
| " <td>D001Q03</td>\n", | |
| " <td>蘇東坡的爸爸叫什麼名字?</td>\n", | |
| " <td>蘇洵</td>\n", | |
| " <td>PERSON</td>\n", | |
| " <td>PERSON</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>3</td>\n", | |
| " <td>D001Q04</td>\n", | |
| " <td>蘇文忠公指的是誰?</td>\n", | |
| " <td>蘇軾</td>\n", | |
| " <td>PERSON</td>\n", | |
| " <td>PERSON</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>4</td>\n", | |
| " <td>D001Q05</td>\n", | |
| " <td>《蘇文忠公全集》是由何人編纂?</td>\n", | |
| " <td>王宗稷</td>\n", | |
| " <td>WHO</td>\n", | |
| " <td>WHO</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>1172</td>\n", | |
| " <td>D308Q03</td>\n", | |
| " <td>『池裡不見水,地上沒有泥』,猜一個字,答案為何?</td>\n", | |
| " <td>也</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>1173</td>\n", | |
| " <td>D308Q04</td>\n", | |
| " <td>提供『七十二小時』謎語的同學是誰?</td>\n", | |
| " <td>周偉</td>\n", | |
| " <td>WHO</td>\n", | |
| " <td>WHO</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>1174</td>\n", | |
| " <td>D308Q05</td>\n", | |
| " <td>答對『也』字謎語的同學叫什麼名字?</td>\n", | |
| " <td>于佩佩</td>\n", | |
| " <td>NAME</td>\n", | |
| " <td>NAME</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>1175</td>\n", | |
| " <td>D308Q06</td>\n", | |
| " <td>『七十二小時』,猜一個字,答案為何?</td>\n", | |
| " <td>晶</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>1176</td>\n", | |
| " <td>D308Q07</td>\n", | |
| " <td>猜對『根在水中央,身材細又長,皮膚白又嫩,好吃又營養。』答案是豆芽的是哪一位同學?</td>\n", | |
| " <td>丁小芹</td>\n", | |
| " <td>PERSON</td>\n", | |
| " <td>PERSON</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>1177 rows × 5 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " QID QTEXT ANSWER LABEL \\\n", | |
| "0 D001Q01 蘇東坡在中國歷史上,是哪一個朝代的人? 北宋 ERA \n", | |
| "1 D001Q02 蘇東坡是中國哪個省份的人? 四川省 LOCATION \n", | |
| "2 D001Q03 蘇東坡的爸爸叫什麼名字? 蘇洵 PERSON \n", | |
| "3 D001Q04 蘇文忠公指的是誰? 蘇軾 PERSON \n", | |
| "4 D001Q05 《蘇文忠公全集》是由何人編纂? 王宗稷 WHO \n", | |
| "... ... ... ... ... \n", | |
| "1172 D308Q03 『池裡不見水,地上沒有泥』,猜一個字,答案為何? 也 DOMAIN_TERM \n", | |
| "1173 D308Q04 提供『七十二小時』謎語的同學是誰? 周偉 WHO \n", | |
| "1174 D308Q05 答對『也』字謎語的同學叫什麼名字? 于佩佩 NAME \n", | |
| "1175 D308Q06 『七十二小時』,猜一個字,答案為何? 晶 DOMAIN_TERM \n", | |
| "1176 D308Q07 猜對『根在水中央,身材細又長,皮膚白又嫩,好吃又營養。』答案是豆芽的是哪一位同學? 丁小芹 PERSON \n", | |
| "\n", | |
| " pred_label \n", | |
| "0 ERA \n", | |
| "1 LOCATION \n", | |
| "2 PERSON \n", | |
| "3 PERSON \n", | |
| "4 WHO \n", | |
| "... ... \n", | |
| "1172 DOMAIN_TERM \n", | |
| "1173 WHO \n", | |
| "1174 NAME \n", | |
| "1175 DOMAIN_TERM \n", | |
| "1176 PERSON \n", | |
| "\n", | |
| "[1177 rows x 5 columns]" | |
| ] | |
| }, | |
| "execution_count": 76, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Display df so LABEL and pred_label can be compared side by side.\n", | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 94, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-12-25T10:02:50.743927Z", | |
| "start_time": "2019-12-25T10:02:50.671317Z" | |
| }, | |
| "scrolled": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>LABEL</th>\n", | |
| " <th>n</th>\n", | |
| " <th>pred_n</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td>19</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>165</td>\n", | |
| " <td>313.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>67</td>\n", | |
| " <td>YESNO</td>\n", | |
| " <td>114</td>\n", | |
| " <td>114.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>50</td>\n", | |
| " <td>RELATIVE</td>\n", | |
| " <td>86</td>\n", | |
| " <td>86.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>63</td>\n", | |
| " <td>WHERE</td>\n", | |
| " <td>63</td>\n", | |
| " <td>63.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>48</td>\n", | |
| " <td>QUANTITY</td>\n", | |
| " <td>54</td>\n", | |
| " <td>54.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>64</td>\n", | |
| " <td>WHO</td>\n", | |
| " <td>50</td>\n", | |
| " <td>50.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>65</td>\n", | |
| " <td>YEAR</td>\n", | |
| " <td>49</td>\n", | |
| " <td>49.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>12</td>\n", | |
| " <td>COUNTRY</td>\n", | |
| " <td>48</td>\n", | |
| " <td>48.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>1</td>\n", | |
| " <td>ALIAS</td>\n", | |
| " <td>43</td>\n", | |
| " <td>43.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>44</td>\n", | |
| " <td>PERSON</td>\n", | |
| " <td>43</td>\n", | |
| " <td>43.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>43</td>\n", | |
| " <td>PERIOD</td>\n", | |
| " <td>33</td>\n", | |
| " <td>33.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>56</td>\n", | |
| " <td>SUMMARY</td>\n", | |
| " <td>29</td>\n", | |
| " <td>4.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>62</td>\n", | |
| " <td>WHEN</td>\n", | |
| " <td>26</td>\n", | |
| " <td>26.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>0</td>\n", | |
| " <td>AGE</td>\n", | |
| " <td>25</td>\n", | |
| " <td>25.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>8</td>\n", | |
| " <td>CHOICE</td>\n", | |
| " <td>22</td>\n", | |
| " <td>22.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>13</td>\n", | |
| " <td>DATE</td>\n", | |
| " <td>22</td>\n", | |
| " <td>22.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>38</td>\n", | |
| " <td>NAME</td>\n", | |
| " <td>17</td>\n", | |
| " <td>17.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>33</td>\n", | |
| " <td>LOCATION</td>\n", | |
| " <td>17</td>\n", | |
| " <td>17.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>9</td>\n", | |
| " <td>CITY</td>\n", | |
| " <td>17</td>\n", | |
| " <td>17.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>26</td>\n", | |
| " <td>GROUP</td>\n", | |
| " <td>16</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>20</td>\n", | |
| " <td>ERA</td>\n", | |
| " <td>15</td>\n", | |
| " <td>15.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>27</td>\n", | |
| " <td>HEIGHT</td>\n", | |
| " <td>15</td>\n", | |
| " <td>15.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>5</td>\n", | |
| " <td>ART_WORK</td>\n", | |
| " <td>14</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>16</td>\n", | |
| " <td>DISEASE</td>\n", | |
| " <td>12</td>\n", | |
| " <td>12.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>11</td>\n", | |
| " <td>COMPANY</td>\n", | |
| " <td>12</td>\n", | |
| " <td>12.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>22</td>\n", | |
| " <td>FLOWER</td>\n", | |
| " <td>12</td>\n", | |
| " <td>12.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>7</td>\n", | |
| " <td>BUILDING</td>\n", | |
| " <td>11</td>\n", | |
| " <td>11.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>51</td>\n", | |
| " <td>RIVER</td>\n", | |
| " <td>10</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>32</td>\n", | |
| " <td>LENGTH</td>\n", | |
| " <td>9</td>\n", | |
| " <td>9.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>54</td>\n", | |
| " <td>SEASON</td>\n", | |
| " <td>7</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>18</td>\n", | |
| " <td>DOLLAR</td>\n", | |
| " <td>7</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>57</td>\n", | |
| " <td>TEMPLE</td>\n", | |
| " <td>7</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>30</td>\n", | |
| " <td>JOB</td>\n", | |
| " <td>7</td>\n", | |
| " <td>7.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>60</td>\n", | |
| " <td>VEHICLE</td>\n", | |
| " <td>7</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>10</td>\n", | |
| " <td>COLOR</td>\n", | |
| " <td>7</td>\n", | |
| " <td>7.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>40</td>\n", | |
| " <td>ORDER</td>\n", | |
| " <td>6</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>17</td>\n", | |
| " <td>DISTANCE</td>\n", | |
| " <td>6</td>\n", | |
| " <td>6.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>53</td>\n", | |
| " <td>SCHOOL</td>\n", | |
| " <td>5</td>\n", | |
| " <td>5.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>58</td>\n", | |
| " <td>TIME</td>\n", | |
| " <td>5</td>\n", | |
| " <td>5.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>34</td>\n", | |
| " <td>METAL</td>\n", | |
| " <td>5</td>\n", | |
| " <td>5.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>4</td>\n", | |
| " <td>AREA</td>\n", | |
| " <td>5</td>\n", | |
| " <td>5.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>6</td>\n", | |
| " <td>BODY</td>\n", | |
| " <td>5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>2</td>\n", | |
| " <td>ANIMAL</td>\n", | |
| " <td>5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>41</td>\n", | |
| " <td>PEN</td>\n", | |
| " <td>4</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>52</td>\n", | |
| " <td>ROAD</td>\n", | |
| " <td>4</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>59</td>\n", | |
| " <td>TOOL</td>\n", | |
| " <td>3</td>\n", | |
| " <td>3.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>23</td>\n", | |
| " <td>FOOD</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>36</td>\n", | |
| " <td>MOUNTAIN</td>\n", | |
| " <td>3</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>35</td>\n", | |
| " <td>MONTH</td>\n", | |
| " <td>2</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>14</td>\n", | |
| " <td>DAY</td>\n", | |
| " <td>2</td>\n", | |
| " <td>2.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>29</td>\n", | |
| " <td>ISLAND</td>\n", | |
| " <td>2</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>42</td>\n", | |
| " <td>PERCENT</td>\n", | |
| " <td>2</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>39</td>\n", | |
| " <td>OCEAN</td>\n", | |
| " <td>2</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>3</td>\n", | |
| " <td>ARE</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>61</td>\n", | |
| " <td>WEIGHT</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>15</td>\n", | |
| " <td>DIRECTION</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>66</td>\n", | |
| " <td>YES</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>37</td>\n", | |
| " <td>MOVIE</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>25</td>\n", | |
| " <td>GENDER</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>24</td>\n", | |
| " <td>FRUIT</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>55</td>\n", | |
| " <td>SPEED</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>28</td>\n", | |
| " <td>IDIOM</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>31</td>\n", | |
| " <td>LAKE</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>49</td>\n", | |
| " <td>RACE</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>68</td>\n", | |
| " <td>YESNO_ALT</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>47</td>\n", | |
| " <td>PORT</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>46</td>\n", | |
| " <td>PLANT</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>45</td>\n", | |
| " <td>PHONE</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>21</td>\n", | |
| " <td>FISH</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>69</td>\n", | |
| " <td>YESON</td>\n", | |
| " <td>1</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " LABEL n pred_n\n", | |
| "19 DOMAIN_TERM 165 313.0\n", | |
| "67 YESNO 114 114.0\n", | |
| "50 RELATIVE 86 86.0\n", | |
| "63 WHERE 63 63.0\n", | |
| "48 QUANTITY 54 54.0\n", | |
| "64 WHO 50 50.0\n", | |
| "65 YEAR 49 49.0\n", | |
| "12 COUNTRY 48 48.0\n", | |
| "1 ALIAS 43 43.0\n", | |
| "44 PERSON 43 43.0\n", | |
| "43 PERIOD 33 33.0\n", | |
| "56 SUMMARY 29 4.0\n", | |
| "62 WHEN 26 26.0\n", | |
| "0 AGE 25 25.0\n", | |
| "8 CHOICE 22 22.0\n", | |
| "13 DATE 22 22.0\n", | |
| "38 NAME 17 17.0\n", | |
| "33 LOCATION 17 17.0\n", | |
| "9 CITY 17 17.0\n", | |
| "26 GROUP 16 0.0\n", | |
| "20 ERA 15 15.0\n", | |
| "27 HEIGHT 15 15.0\n", | |
| "5 ART_WORK 14 0.0\n", | |
| "16 DISEASE 12 12.0\n", | |
| "11 COMPANY 12 12.0\n", | |
| "22 FLOWER 12 12.0\n", | |
| "7 BUILDING 11 11.0\n", | |
| "51 RIVER 10 0.0\n", | |
| "32 LENGTH 9 9.0\n", | |
| "54 SEASON 7 0.0\n", | |
| "18 DOLLAR 7 0.0\n", | |
| "57 TEMPLE 7 0.0\n", | |
| "30 JOB 7 7.0\n", | |
| "60 VEHICLE 7 0.0\n", | |
| "10 COLOR 7 7.0\n", | |
| "40 ORDER 6 0.0\n", | |
| "17 DISTANCE 6 6.0\n", | |
| "53 SCHOOL 5 5.0\n", | |
| "58 TIME 5 5.0\n", | |
| "34 METAL 5 5.0\n", | |
| "4 AREA 5 5.0\n", | |
| "6 BODY 5 0.0\n", | |
| "2 ANIMAL 5 0.0\n", | |
| "41 PEN 4 0.0\n", | |
| "52 ROAD 4 0.0\n", | |
| "59 TOOL 3 3.0\n", | |
| "23 FOOD 3 0.0\n", | |
| "36 MOUNTAIN 3 0.0\n", | |
| "35 MONTH 2 0.0\n", | |
| "14 DAY 2 2.0\n", | |
| "29 ISLAND 2 0.0\n", | |
| "42 PERCENT 2 0.0\n", | |
| "39 OCEAN 2 0.0\n", | |
| "3 ARE 1 0.0\n", | |
| "61 WEIGHT 1 0.0\n", | |
| "15 DIRECTION 1 0.0\n", | |
| "66 YES 1 0.0\n", | |
| "37 MOVIE 1 0.0\n", | |
| "25 GENDER 1 0.0\n", | |
| "24 FRUIT 1 0.0\n", | |
| "55 SPEED 1 0.0\n", | |
| "28 IDIOM 1 0.0\n", | |
| "31 LAKE 1 0.0\n", | |
| "49 RACE 1 0.0\n", | |
| "68 YESNO_ALT 1 0.0\n", | |
| "47 PORT 1 0.0\n", | |
| "46 PLANT 1 0.0\n", | |
| "45 PHONE 1 0.0\n", | |
| "21 FISH 1 0.0\n", | |
| "69 YESON 1 0.0" | |
| ] | |
| }, | |
| "execution_count": 94, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "pd.options.display.max_rows = 100\n", | |
| "\n", | |
| "label_summary_df = (\n", | |
| " df >> group_by('LABEL') >> summarize(n = n(X.LABEL))\n", | |
| ")\n", | |
| "label_summary_df >>= left_join(\n", | |
| " other=(df >> group_by('pred_label') >> summarize(pred_n = n(X.LABEL)) >> rename(LABEL=X.pred_label)),\n", | |
| " by='LABEL'\n", | |
| ")\n", | |
| "label_summary_df = label_summary_df.fillna(0)\n", | |
| "label_summary_df >> arrange(desc(X.n))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 100, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-12-25T10:06:13.596151Z", | |
| "start_time": "2019-12-25T10:06:13.506650Z" | |
| }, | |
| "scrolled": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>LABEL</th>\n", | |
| " <th>label_n</th>\n", | |
| " <th>pred_label</th>\n", | |
| " <th>pair_n</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <td>20</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>165</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>165</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>70</td>\n", | |
| " <td>YESNO</td>\n", | |
| " <td>114</td>\n", | |
| " <td>YESNO</td>\n", | |
| " <td>114</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>53</td>\n", | |
| " <td>RELATIVE</td>\n", | |
| " <td>86</td>\n", | |
| " <td>RELATIVE</td>\n", | |
| " <td>86</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>66</td>\n", | |
| " <td>WHERE</td>\n", | |
| " <td>63</td>\n", | |
| " <td>WHERE</td>\n", | |
| " <td>63</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>51</td>\n", | |
| " <td>QUANTITY</td>\n", | |
| " <td>54</td>\n", | |
| " <td>QUANTITY</td>\n", | |
| " <td>54</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>67</td>\n", | |
| " <td>WHO</td>\n", | |
| " <td>50</td>\n", | |
| " <td>WHO</td>\n", | |
| " <td>50</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>68</td>\n", | |
| " <td>YEAR</td>\n", | |
| " <td>49</td>\n", | |
| " <td>YEAR</td>\n", | |
| " <td>49</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>12</td>\n", | |
| " <td>COUNTRY</td>\n", | |
| " <td>48</td>\n", | |
| " <td>COUNTRY</td>\n", | |
| " <td>48</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>1</td>\n", | |
| " <td>ALIAS</td>\n", | |
| " <td>43</td>\n", | |
| " <td>ALIAS</td>\n", | |
| " <td>43</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>47</td>\n", | |
| " <td>PERSON</td>\n", | |
| " <td>43</td>\n", | |
| " <td>PERSON</td>\n", | |
| " <td>43</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>46</td>\n", | |
| " <td>PERIOD</td>\n", | |
| " <td>33</td>\n", | |
| " <td>PERIOD</td>\n", | |
| " <td>33</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>59</td>\n", | |
| " <td>SUMMARY</td>\n", | |
| " <td>29</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>29</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>65</td>\n", | |
| " <td>WHEN</td>\n", | |
| " <td>26</td>\n", | |
| " <td>WHEN</td>\n", | |
| " <td>26</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>0</td>\n", | |
| " <td>AGE</td>\n", | |
| " <td>25</td>\n", | |
| " <td>AGE</td>\n", | |
| " <td>25</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>8</td>\n", | |
| " <td>CHOICE</td>\n", | |
| " <td>22</td>\n", | |
| " <td>CHOICE</td>\n", | |
| " <td>22</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>13</td>\n", | |
| " <td>DATE</td>\n", | |
| " <td>22</td>\n", | |
| " <td>DATE</td>\n", | |
| " <td>22</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>9</td>\n", | |
| " <td>CITY</td>\n", | |
| " <td>17</td>\n", | |
| " <td>CITY</td>\n", | |
| " <td>17</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>34</td>\n", | |
| " <td>LOCATION</td>\n", | |
| " <td>17</td>\n", | |
| " <td>LOCATION</td>\n", | |
| " <td>17</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>39</td>\n", | |
| " <td>NAME</td>\n", | |
| " <td>17</td>\n", | |
| " <td>NAME</td>\n", | |
| " <td>17</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>27</td>\n", | |
| " <td>GROUP</td>\n", | |
| " <td>16</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>16</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>21</td>\n", | |
| " <td>ERA</td>\n", | |
| " <td>15</td>\n", | |
| " <td>ERA</td>\n", | |
| " <td>15</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>28</td>\n", | |
| " <td>HEIGHT</td>\n", | |
| " <td>15</td>\n", | |
| " <td>HEIGHT</td>\n", | |
| " <td>15</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>5</td>\n", | |
| " <td>ART_WORK</td>\n", | |
| " <td>14</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>14</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>11</td>\n", | |
| " <td>COMPANY</td>\n", | |
| " <td>12</td>\n", | |
| " <td>COMPANY</td>\n", | |
| " <td>12</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>16</td>\n", | |
| " <td>DISEASE</td>\n", | |
| " <td>12</td>\n", | |
| " <td>DISEASE</td>\n", | |
| " <td>12</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>23</td>\n", | |
| " <td>FLOWER</td>\n", | |
| " <td>12</td>\n", | |
| " <td>FLOWER</td>\n", | |
| " <td>12</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>7</td>\n", | |
| " <td>BUILDING</td>\n", | |
| " <td>11</td>\n", | |
| " <td>BUILDING</td>\n", | |
| " <td>11</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>54</td>\n", | |
| " <td>RIVER</td>\n", | |
| " <td>10</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>10</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>33</td>\n", | |
| " <td>LENGTH</td>\n", | |
| " <td>9</td>\n", | |
| " <td>LENGTH</td>\n", | |
| " <td>9</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>10</td>\n", | |
| " <td>COLOR</td>\n", | |
| " <td>7</td>\n", | |
| " <td>COLOR</td>\n", | |
| " <td>7</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>31</td>\n", | |
| " <td>JOB</td>\n", | |
| " <td>7</td>\n", | |
| " <td>JOB</td>\n", | |
| " <td>7</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>57</td>\n", | |
| " <td>SEASON</td>\n", | |
| " <td>7</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>7</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>60</td>\n", | |
| " <td>TEMPLE</td>\n", | |
| " <td>7</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>7</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>63</td>\n", | |
| " <td>VEHICLE</td>\n", | |
| " <td>7</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>7</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>18</td>\n", | |
| " <td>DOLLAR</td>\n", | |
| " <td>7</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>6</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>19</td>\n", | |
| " <td>DOLLAR</td>\n", | |
| " <td>7</td>\n", | |
| " <td>SUMMARY</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>17</td>\n", | |
| " <td>DISTANCE</td>\n", | |
| " <td>6</td>\n", | |
| " <td>DISTANCE</td>\n", | |
| " <td>6</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>41</td>\n", | |
| " <td>ORDER</td>\n", | |
| " <td>6</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>42</td>\n", | |
| " <td>ORDER</td>\n", | |
| " <td>6</td>\n", | |
| " <td>SUMMARY</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>2</td>\n", | |
| " <td>ANIMAL</td>\n", | |
| " <td>5</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>4</td>\n", | |
| " <td>AREA</td>\n", | |
| " <td>5</td>\n", | |
| " <td>AREA</td>\n", | |
| " <td>5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>6</td>\n", | |
| " <td>BODY</td>\n", | |
| " <td>5</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>35</td>\n", | |
| " <td>METAL</td>\n", | |
| " <td>5</td>\n", | |
| " <td>METAL</td>\n", | |
| " <td>5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>56</td>\n", | |
| " <td>SCHOOL</td>\n", | |
| " <td>5</td>\n", | |
| " <td>SCHOOL</td>\n", | |
| " <td>5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>61</td>\n", | |
| " <td>TIME</td>\n", | |
| " <td>5</td>\n", | |
| " <td>TIME</td>\n", | |
| " <td>5</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>43</td>\n", | |
| " <td>PEN</td>\n", | |
| " <td>4</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>4</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>55</td>\n", | |
| " <td>ROAD</td>\n", | |
| " <td>4</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>4</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>24</td>\n", | |
| " <td>FOOD</td>\n", | |
| " <td>3</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>37</td>\n", | |
| " <td>MOUNTAIN</td>\n", | |
| " <td>3</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>62</td>\n", | |
| " <td>TOOL</td>\n", | |
| " <td>3</td>\n", | |
| " <td>TOOL</td>\n", | |
| " <td>3</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>14</td>\n", | |
| " <td>DAY</td>\n", | |
| " <td>2</td>\n", | |
| " <td>DAY</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>30</td>\n", | |
| " <td>ISLAND</td>\n", | |
| " <td>2</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>36</td>\n", | |
| " <td>MONTH</td>\n", | |
| " <td>2</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>40</td>\n", | |
| " <td>OCEAN</td>\n", | |
| " <td>2</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>2</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>44</td>\n", | |
| " <td>PERCENT</td>\n", | |
| " <td>2</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>45</td>\n", | |
| " <td>PERCENT</td>\n", | |
| " <td>2</td>\n", | |
| " <td>SUMMARY</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>3</td>\n", | |
| " <td>ARE</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>15</td>\n", | |
| " <td>DIRECTION</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>22</td>\n", | |
| " <td>FISH</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>25</td>\n", | |
| " <td>FRUIT</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>26</td>\n", | |
| " <td>GENDER</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>29</td>\n", | |
| " <td>IDIOM</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>32</td>\n", | |
| " <td>LAKE</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>38</td>\n", | |
| " <td>MOVIE</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>48</td>\n", | |
| " <td>PHONE</td>\n", | |
| " <td>1</td>\n", | |
| " <td>SUMMARY</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>49</td>\n", | |
| " <td>PLANT</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>50</td>\n", | |
| " <td>PORT</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>52</td>\n", | |
| " <td>RACE</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>58</td>\n", | |
| " <td>SPEED</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>64</td>\n", | |
| " <td>WEIGHT</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>69</td>\n", | |
| " <td>YES</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>71</td>\n", | |
| " <td>YESNO_ALT</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>72</td>\n", | |
| " <td>YESON</td>\n", | |
| " <td>1</td>\n", | |
| " <td>DOMAIN_TERM</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " LABEL label_n pred_label pair_n\n", | |
| "20 DOMAIN_TERM 165 DOMAIN_TERM 165\n", | |
| "70 YESNO 114 YESNO 114\n", | |
| "53 RELATIVE 86 RELATIVE 86\n", | |
| "66 WHERE 63 WHERE 63\n", | |
| "51 QUANTITY 54 QUANTITY 54\n", | |
| "67 WHO 50 WHO 50\n", | |
| "68 YEAR 49 YEAR 49\n", | |
| "12 COUNTRY 48 COUNTRY 48\n", | |
| "1 ALIAS 43 ALIAS 43\n", | |
| "47 PERSON 43 PERSON 43\n", | |
| "46 PERIOD 33 PERIOD 33\n", | |
| "59 SUMMARY 29 DOMAIN_TERM 29\n", | |
| "65 WHEN 26 WHEN 26\n", | |
| "0 AGE 25 AGE 25\n", | |
| "8 CHOICE 22 CHOICE 22\n", | |
| "13 DATE 22 DATE 22\n", | |
| "9 CITY 17 CITY 17\n", | |
| "34 LOCATION 17 LOCATION 17\n", | |
| "39 NAME 17 NAME 17\n", | |
| "27 GROUP 16 DOMAIN_TERM 16\n", | |
| "21 ERA 15 ERA 15\n", | |
| "28 HEIGHT 15 HEIGHT 15\n", | |
| "5 ART_WORK 14 DOMAIN_TERM 14\n", | |
| "11 COMPANY 12 COMPANY 12\n", | |
| "16 DISEASE 12 DISEASE 12\n", | |
| "23 FLOWER 12 FLOWER 12\n", | |
| "7 BUILDING 11 BUILDING 11\n", | |
| "54 RIVER 10 DOMAIN_TERM 10\n", | |
| "33 LENGTH 9 LENGTH 9\n", | |
| "10 COLOR 7 COLOR 7\n", | |
| "31 JOB 7 JOB 7\n", | |
| "57 SEASON 7 DOMAIN_TERM 7\n", | |
| "60 TEMPLE 7 DOMAIN_TERM 7\n", | |
| "63 VEHICLE 7 DOMAIN_TERM 7\n", | |
| "18 DOLLAR 7 DOMAIN_TERM 6\n", | |
| "19 DOLLAR 7 SUMMARY 1\n", | |
| "17 DISTANCE 6 DISTANCE 6\n", | |
| "41 ORDER 6 DOMAIN_TERM 5\n", | |
| "42 ORDER 6 SUMMARY 1\n", | |
| "2 ANIMAL 5 DOMAIN_TERM 5\n", | |
| "4 AREA 5 AREA 5\n", | |
| "6 BODY 5 DOMAIN_TERM 5\n", | |
| "35 METAL 5 METAL 5\n", | |
| "56 SCHOOL 5 SCHOOL 5\n", | |
| "61 TIME 5 TIME 5\n", | |
| "43 PEN 4 DOMAIN_TERM 4\n", | |
| "55 ROAD 4 DOMAIN_TERM 4\n", | |
| "24 FOOD 3 DOMAIN_TERM 3\n", | |
| "37 MOUNTAIN 3 DOMAIN_TERM 3\n", | |
| "62 TOOL 3 TOOL 3\n", | |
| "14 DAY 2 DAY 2\n", | |
| "30 ISLAND 2 DOMAIN_TERM 2\n", | |
| "36 MONTH 2 DOMAIN_TERM 2\n", | |
| "40 OCEAN 2 DOMAIN_TERM 2\n", | |
| "44 PERCENT 2 DOMAIN_TERM 1\n", | |
| "45 PERCENT 2 SUMMARY 1\n", | |
| "3 ARE 1 DOMAIN_TERM 1\n", | |
| "15 DIRECTION 1 DOMAIN_TERM 1\n", | |
| "22 FISH 1 DOMAIN_TERM 1\n", | |
| "25 FRUIT 1 DOMAIN_TERM 1\n", | |
| "26 GENDER 1 DOMAIN_TERM 1\n", | |
| "29 IDIOM 1 DOMAIN_TERM 1\n", | |
| "32 LAKE 1 DOMAIN_TERM 1\n", | |
| "38 MOVIE 1 DOMAIN_TERM 1\n", | |
| "48 PHONE 1 SUMMARY 1\n", | |
| "49 PLANT 1 DOMAIN_TERM 1\n", | |
| "50 PORT 1 DOMAIN_TERM 1\n", | |
| "52 RACE 1 DOMAIN_TERM 1\n", | |
| "58 SPEED 1 DOMAIN_TERM 1\n", | |
| "64 WEIGHT 1 DOMAIN_TERM 1\n", | |
| "69 YES 1 DOMAIN_TERM 1\n", | |
| "71 YESNO_ALT 1 DOMAIN_TERM 1\n", | |
| "72 YESON 1 DOMAIN_TERM 1" | |
| ] | |
| }, | |
| "execution_count": 100, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Confusion-style breakdown: pair_n is computed per (LABEL, pred_label) group and\n", | |
| "# label_n per LABEL group; rows are sorted by label_n, then pair_n, both descending.\n", | |
| "(\n", | |
| "    df\n", | |
| "    >> group_by('LABEL', 'pred_label')\n", | |
| "    >> summarise(pair_n=n(X.LABEL))\n", | |
| "    >> left_join(\n", | |
| "        df >> group_by('LABEL') >> summarise(label_n=n(X.LABEL)),\n", | |
| "        by='LABEL'\n", | |
| "    )\n", | |
| "    >> arrange(desc(X.label_n), desc(X.pair_n))\n", | |
| "    >> select(X.LABEL, X.label_n, X.pred_label, X.pair_n)\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 48, | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2019-12-25T09:38:19.309065Z", | |
| "start_time": "2019-12-25T09:38:16.717369Z" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "https://gist.github.com/5dcd462ee06aa053b1a23a6eb5518c2c\r\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Update the existing gist (-u <gist id>) with the current notebook file.\n", | |
| "# The gist CLI is located under the current user's home (~) rather than a\n", | |
| "# hardcoded /home/<user> path, so the cell is not tied to one account.\n", | |
| "!~/.gem/ruby/2.5.0/bin/gist -u 5dcd462ee06aa053b1a23a6eb5518c2c grandchallenge-q-cls.ipynb" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "miniconda", | |
| "language": "python", | |
| "name": "miniconda" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.3" | |
| }, | |
| "toc": { | |
| "base_numbering": 1, | |
| "nav_menu": {}, | |
| "number_sections": true, | |
| "sideBar": true, | |
| "skip_h1_title": false, | |
| "title_cell": "Table of Contents", | |
| "title_sidebar": "Contents", | |
| "toc_cell": true, | |
| "toc_position": {}, | |
| "toc_section_display": true, | |
| "toc_window_display": true | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment