Last active
February 11, 2022 00:58
-
-
Save urbanecm/f92a14586ee8c7bcf1ded3062fb87b39 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "c8a865ad", | |
| "metadata": {}, | |
| "source": [ | |
| "### License statement\n", | |
| "Copyright 2022 Martin Urbanec (martin.urbanec@wikimedia.cz)\n", | |
| "\n", | |
| "Licensed under the Apache License, Version 2.0 (the \"License\"); you may not use this file except in compliance with the License. You may obtain a copy of the License at\n", | |
| "\n", | |
| "http://www.apache.org/licenses/LICENSE-2.0\n", | |
| "\n", | |
| "Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "40700ab4", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from IPython.display import display, Markdown, Latex, HTML\n", | |
| "\n", | |
| "import os\n", | |
| "import requests\n", | |
| "from datetime import datetime\n", | |
| "import json\n", | |
| "\n", | |
| "import wmpaws\n", | |
| "import pandas as pd\n", | |
| "pd.set_option('display.max_rows', None)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "323e5384-d6f9-4d06-96e0-9839fbdd4f39", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# ORES endpoint for cswiki articletopic scores; {revids} is filled with '|'-joined revision IDs.\n", | |
| "ORES_TEMPLATE_URI = 'https://ores.wikimedia.org/scores/cswiki/?models=articletopic&revids={revids}'\n", | |
| "USER_AGENT = 'Urbanecm <urbanecm@tools.wmflabs.org>'\n", | |
| "\n", | |
| "# Mapping from ORES topic names to search topic keywords, taken from\n", | |
| "# https://github.com/wikimedia/mediawiki-extensions-CirrusSearch/blob/master/includes/Query/ArticleTopicFeature.php#L28\n", | |
| "# Read through a context manager so the file handle is closed deterministically.\n", | |
| "with open('data/ores_to_search_map.json', encoding='utf-8') as ores_map_file:\n", | |
| "    ORES_TO_SEARCH_MAP = json.load(ores_map_file)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "cada1fb7-553c-4cf3-a9b4-840cece31202", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Download the Growth topic configuration. Send the same User-Agent as the ORES\n", | |
| "# requests and fail loudly on an HTTP error instead of on a confusing JSON/KeyError.\n", | |
| "growth_topics_response = requests.get(\n", | |
| "    'https://www.mediawiki.org/w/index.php?title=MediaWiki:NewcomerTopicsOres.json&action=raw&ctype=application/json',\n", | |
| "    headers={'User-Agent': USER_AGENT},\n", | |
| ")\n", | |
| "growth_topics_response.raise_for_status()\n", | |
| "growth_topics = growth_topics_response.json()['topics']\n", | |
| "\n", | |
| "# Invert the configuration: every entry of a Growth topic's 'oresTopics' list points\n", | |
| "# back at that Growth topic. If an entry is listed under several Growth topics, the\n", | |
| "# last one wins (same as the plain assignment loop this replaces).\n", | |
| "SEARCH_TO_GROWTH_MAP = {\n", | |
| "    search_topic: growth_topic\n", | |
| "    for growth_topic, topic_config in growth_topics.items()\n", | |
| "    for search_topic in topic_config['oresTopics']\n", | |
| "}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "f8957648-ef05-4ea2-83a5-0571c4d19a0e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def get_ores_predictions(rev_ids):\n", | |
| "    \"\"\"Fetch ORES articletopic predictions for revisions and map them to Growth topics.\n", | |
| "\n", | |
| "    Args:\n", | |
| "        rev_ids: iterable of revision IDs to score with a single ORES request.\n", | |
| "\n", | |
| "    Returns:\n", | |
| "        dict mapping int revision ID to a sorted list of Growth topic IDs.\n", | |
| "        Revisions for which no Growth topic could be derived are omitted.\n", | |
| "        An empty input returns an empty dict without contacting ORES.\n", | |
| "\n", | |
| "    Raises:\n", | |
| "        requests.HTTPError: if ORES answers with an HTTP error status.\n", | |
| "    \"\"\"\n", | |
| "    rev_ids = list(rev_ids)\n", | |
| "    if not rev_ids:\n", | |
| "        # Nothing to score; do not send a request with an empty revids parameter.\n", | |
| "        return {}\n", | |
| "\n", | |
| "    r = requests.get(\n", | |
| "        ORES_TEMPLATE_URI.format(revids='|'.join(str(x) for x in rev_ids)),\n", | |
| "        headers={'User-Agent': USER_AGENT},\n", | |
| "    )\n", | |
| "    # Surface HTTP failures here rather than as a KeyError on an error payload below.\n", | |
| "    r.raise_for_status()\n", | |
| "    data = r.json()\n", | |
| "\n", | |
| "    res = {}\n", | |
| "    for rev_id, scores in data.items():\n", | |
| "        topics = set()\n", | |
| "        # Entries without a 'prediction' key contribute no topics\n", | |
| "        # (presumably per-revision scoring errors; confirm against the ORES docs).\n", | |
| "        for ores_topic in scores['articletopic'].get('prediction', []):\n", | |
| "            # Two-step translation: ORES topic -> search topic -> Growth topic.\n", | |
| "            search_topic = ORES_TO_SEARCH_MAP.get(ores_topic, '')\n", | |
| "            growth_topic = SEARCH_TO_GROWTH_MAP.get(search_topic)\n", | |
| "            if growth_topic is not None:\n", | |
| "                topics.add(growth_topic)\n", | |
| "\n", | |
| "        if topics:\n", | |
| "            # Sorted so that repeated runs produce the same list order.\n", | |
| "            res[int(rev_id)] = sorted(topics)\n", | |
| "    return res" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "157aea71-c0ea-4b3f-8865-428349a4ba27", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/srv/paws/lib/python3.8/site-packages/pandas/io/sql.py:758: UserWarning: pandas only support SQLAlchemy connectable(engine/connection) ordatabase string URI or sqlite3 DBAPI2 connectionother DBAPI2 objects are not tested, please consider using SQLAlchemy\n", | |
| " warnings.warn(\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Query the cswiki database for recentchanges rows flagged rc_new = 1 in namespace 0\n", | |
| "# whose page is not a redirect and whose rc_timestamp starts with '2022'.\n", | |
| "# The result is bound to df; later cells read the revision IDs from df.rc_this_oldid.\n", | |
| "df = wmpaws.run_sql('''\n", | |
| "SELECT\n", | |
| " rc_title,\n", | |
| " rc_cur_id,\n", | |
| " rc_this_oldid\n", | |
| "FROM recentchanges\n", | |
| "JOIN page ON page_id=rc_cur_id\n", | |
| "WHERE\n", | |
| " rc_new = 1\n", | |
| " AND rc_namespace = 0\n", | |
| " AND page_is_redirect = 0\n", | |
| " AND rc_timestamp LIKE '2022%'\n", | |
| "''', 'cswiki')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "a148ffd1-e7d4-42c6-bb39-1d07efd1a15e", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "2458" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "len(df.rc_this_oldid) # number of articles to process (one revision ID per selected row)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "676819dd-8618-40a2-92fb-0104300767cc", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def batch_revids(rev_ids, batch_size=40):\n", | |
| "    \"\"\"Split an iterable of revision IDs into lists of at most batch_size items.\n", | |
| "\n", | |
| "    Args:\n", | |
| "        rev_ids: iterable of revision IDs; it is consumed lazily.\n", | |
| "        batch_size: maximum number of IDs per yielded batch. Defaults to 40,\n", | |
| "            the value that was previously hard-coded here.\n", | |
| "\n", | |
| "    Yields:\n", | |
| "        Lists of revision IDs in input order; only the last one may be shorter\n", | |
| "        than batch_size. Nothing is yielded for empty input.\n", | |
| "\n", | |
| "    Raises:\n", | |
| "        ValueError: if batch_size is smaller than 1 (raised on first iteration,\n", | |
| "            because this is a generator).\n", | |
| "    \"\"\"\n", | |
| "    if batch_size < 1:\n", | |
| "        raise ValueError('batch_size must be a positive integer')\n", | |
| "\n", | |
| "    batch = []\n", | |
| "    for rev_id in rev_ids:\n", | |
| "        batch.append(rev_id)\n", | |
| "        if len(batch) >= batch_size:\n", | |
| "            yield batch\n", | |
| "            batch = []\n", | |
| "\n", | |
| "    # Flush the trailing partial batch, if any.\n", | |
| "    if batch:\n", | |
| "        yield batch" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "94bcb74b-64c3-429a-b835-41d52d20eeec", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Score the revisions batch by batch and merge the per-batch results into one dict\n", | |
| "# keyed by revision ID; revisions without any mapped Growth topic are absent from it.\n", | |
| "topic_map = {}\n", | |
| "for batch in batch_revids(list(df.rc_this_oldid)):\n", | |
| " topic_map.update(get_ores_predictions(batch))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "ffdf2890-369b-4893-9ea2-e8af400595d3", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>count</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>topic</th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>europe</th>\n", | |
| " <td>1452</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>biography</th>\n", | |
| " <td>913</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>sports</th>\n", | |
| " <td>367</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>general-science</th>\n", | |
| " <td>364</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>art</th>\n", | |
| " <td>244</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>women</th>\n", | |
| " <td>217</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>tv-and-film</th>\n", | |
| " <td>182</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>earth-and-environment</th>\n", | |
| " <td>148</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>asia</th>\n", | |
| " <td>135</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>history</th>\n", | |
| " <td>128</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>architecture</th>\n", | |
| " <td>118</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>north-america</th>\n", | |
| " <td>117</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>politics-and-government</th>\n", | |
| " <td>111</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>society</th>\n", | |
| " <td>110</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>music</th>\n", | |
| " <td>102</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>transportation</th>\n", | |
| " <td>102</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>military-and-warfare</th>\n", | |
| " <td>98</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>philosophy-and-religion</th>\n", | |
| " <td>85</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>technology</th>\n", | |
| " <td>69</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>biology</th>\n", | |
| " <td>67</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>physics</th>\n", | |
| " <td>56</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>chemistry</th>\n", | |
| " <td>52</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>literature</th>\n", | |
| " <td>51</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>performing-arts</th>\n", | |
| " <td>44</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>computers-and-internet</th>\n", | |
| " <td>35</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>business-and-economics</th>\n", | |
| " <td>35</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>education</th>\n", | |
| " <td>25</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>engineering</th>\n", | |
| " <td>23</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>oceania</th>\n", | |
| " <td>22</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>medicine-and-health</th>\n", | |
| " <td>19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>africa</th>\n", | |
| " <td>19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>south-america</th>\n", | |
| " <td>19</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>entertainment</th>\n", | |
| " <td>17</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>comics-and-anime</th>\n", | |
| " <td>14</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>food-and-drink</th>\n", | |
| " <td>11</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>video-games</th>\n", | |
| " <td>9</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>central-america</th>\n", | |
| " <td>9</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>fashion</th>\n", | |
| " <td>8</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>mathematics</th>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " count\n", | |
| "topic \n", | |
| "europe 1452\n", | |
| "biography 913\n", | |
| "sports 367\n", | |
| "general-science 364\n", | |
| "art 244\n", | |
| "women 217\n", | |
| "tv-and-film 182\n", | |
| "earth-and-environment 148\n", | |
| "asia 135\n", | |
| "history 128\n", | |
| "architecture 118\n", | |
| "north-america 117\n", | |
| "politics-and-government 111\n", | |
| "society 110\n", | |
| "music 102\n", | |
| "transportation 102\n", | |
| "military-and-warfare 98\n", | |
| "philosophy-and-religion 85\n", | |
| "technology 69\n", | |
| "biology 67\n", | |
| "physics 56\n", | |
| "chemistry 52\n", | |
| "literature 51\n", | |
| "performing-arts 44\n", | |
| "computers-and-internet 35\n", | |
| "business-and-economics 35\n", | |
| "education 25\n", | |
| "engineering 23\n", | |
| "oceania 22\n", | |
| "medicine-and-health 19\n", | |
| "africa 19\n", | |
| "south-america 19\n", | |
| "entertainment 17\n", | |
| "comics-and-anime 14\n", | |
| "food-and-drink 11\n", | |
| "video-games 9\n", | |
| "central-america 9\n", | |
| "fashion 8\n", | |
| "mathematics 1" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Count how many revisions were assigned each Growth topic. The per-revision topic\n", | |
| "# lists are flattened into one Series and counted by pandas; value_counts() already\n", | |
| "# returns the counts in descending order, so the stored frame matches what is shown.\n", | |
| "topic_counts = (\n", | |
| "    pd.Series(\n", | |
| "        [topic for topics in topic_map.values() for topic in topics],\n", | |
| "        dtype='object',  # keeps an empty topic_map from producing a float Series\n", | |
| "    )\n", | |
| "    .value_counts()\n", | |
| "    .rename_axis('topic')\n", | |
| "    .to_frame('count')\n", | |
| ")\n", | |
| "topic_counts" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "bcb4ebac-69c3-4937-ab97-99fb93e2e66f", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.8.10" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| {"Culture.Biography.Biography*":"biography","Culture.Biography.Women":"women","Culture.Food and drink":"food-and-drink","Culture.Internet culture":"internet-culture","Culture.Linguistics":"linguistics","Culture.Literature":"literature","Culture.Media.Books":"books","Culture.Media.Entertainment":"entertainment","Culture.Media.Films":"films","Culture.Media.Media*":"media","Culture.Media.Music":"music","Culture.Media.Radio":"radio","Culture.Media.Software":"software","Culture.Media.Television":"television","Culture.Media.Video games":"video-games","Culture.Performing arts":"performing-arts","Culture.Philosophy and religion":"philosophy-and-religion","Culture.Sports":"sports","Culture.Visual arts.Architecture":"architecture","Culture.Visual arts.Comics and Anime":"comics-and-anime","Culture.Visual arts.Fashion":"fashion","Culture.Visual arts.Visual arts*":"visual-arts","Geography.Geographical":"geographical","Geography.Regions.Africa.Africa*":"africa","Geography.Regions.Africa.Central Africa":"central-africa","Geography.Regions.Africa.Eastern Africa":"eastern-africa","Geography.Regions.Africa.Northern Africa":"northern-africa","Geography.Regions.Africa.Southern Africa":"southern-africa","Geography.Regions.Africa.Western Africa":"western-africa","Geography.Regions.Americas.Central America":"central-america","Geography.Regions.Americas.North America":"north-america","Geography.Regions.Americas.South America":"south-america","Geography.Regions.Asia.Asia*":"asia","Geography.Regions.Asia.Central Asia":"central-asia","Geography.Regions.Asia.East Asia":"east-asia","Geography.Regions.Asia.North Asia":"north-asia","Geography.Regions.Asia.South Asia":"south-asia","Geography.Regions.Asia.Southeast Asia":"southeast-asia","Geography.Regions.Asia.West Asia":"west-asia","Geography.Regions.Europe.Eastern Europe":"eastern-europe","Geography.Regions.Europe.Europe*":"europe","Geography.Regions.Europe.Northern Europe":"northern-europe","Geography.Regions.Europe.Southern 
Europe":"southern-europe","Geography.Regions.Europe.Western Europe":"western-europe","Geography.Regions.Oceania":"oceania","History and Society.Business and economics":"business-and-economics","History and Society.Education":"education","History and Society.History":"history","History and Society.Military and warfare":"military-and-warfare","History and Society.Politics and government":"politics-and-government","History and Society.Society":"society","History and Society.Transportation":"transportation","STEM.Biology":"biology","STEM.Chemistry":"chemistry","STEM.Computing":"computing","STEM.Earth and environment":"earth-and-environment","STEM.Engineering":"engineering","STEM.Libraries & Information":"libraries-and-information","STEM.Mathematics":"mathematics","STEM.Medicine & Health":"medicine-and-health","STEM.Physics":"physics","STEM.STEM*":"stem","STEM.Space":"space","STEM.Technology":"technology"} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment