Last active
March 4, 2026 11:26
-
-
Save Facenapalm/c3350635381a22a23e791b5ec77ed102 to your computer and use it in GitHub Desktop.
WIP script for Wikipedia
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| В main раскомментить нужное: | |
| Может формировать отчёты об отсутствующих/конфликтующих категориях. | |
| Может автопостить его в Википедию. | |
| Может проходиться по статьям из pages.txt и переносит категории компьютерных игр | |
| из энвики в рувики. | |
| """ | |
import json
import os.path
import re
from collections import Counter, defaultdict
from functools import cache
from typing import Optional

import pywikibot
import requests
from pywikibot.data.api import Request
@cache
def get_interwiki(page: pywikibot.Page, new_site: pywikibot.Site) -> str:
    """Return the title of *page*'s interwiki link on *new_site*.

    Returns '' when no matching language link exists or when the page's
    language links reference a site pywikibot does not know about.
    """
    wanted_code = new_site.code
    try:
        candidates = (link.title for link in page.iterlanglinks()
                      if link.site.code == wanted_code)
        return next(candidates, '')
    except pywikibot.exceptions.UnknownSiteError:
        return ''
def remove_prefix(string: str, prefix: str) -> str:
    """Return *string* with *prefix* removed if present, else unchanged.

    Thin wrapper around str.removeprefix (3.9+), which matches the other
    3.9 features already used in this file (built-in generics,
    functools.cache).
    """
    return string.removeprefix(prefix)
class Bot:
    """Category-maintenance bot for video game articles in ru.wikipedia.

    On construction it logs in and pre-loads the category trees that
    get_category_type() needs to classify categories.  See main() for the
    tasks it performs (report generation, report posting, category import).
    """

    # Local JSON cache of {article: [categories]} built by print_report().
    cachefile = 'catdata.json'
    # Plain-text report written by print_report(), read by post_report().
    reportfile = 'report.txt'

    def __init__(self):
        """Log in and pre-load the category sets used for classification."""
        self.site = pywikibot.Site()
        self.site.login()
        print('Initialization:')
        self.opensource_categories = self.get_categories_recursive_fast(
            ['Свободные компьютерные игры'], include_roots=True)
        print('\t[1/4] Opensource categories scraped.')
        self.mod_categories = self.get_categories_recursive_fast(
            ['Модификации компьютерных игр'], include_roots=True)
        print('\t[2/4] Mod categories scraped.')
        self.gamemode_categories = self.get_categories_recursive_fast(
            ['Компьютерные игры по игровому режиму'])
        print('\t[3/4] Gamemode categories scraped.')
        self.genre_categories = self.get_categories_recursive(
            ['Компьютерные игры по жанрам'],
            ['Компьютерные игры по жанру повествования'])
        print('\t[4/4] Genre categories scraped.')

    def get_categories_recursive_fast(self,
                                      categories: list[str],
                                      negcats: Optional[list[str]] = None,
                                      include_roots: bool = False,
                                      depth: int = 5) -> set[str]:
        """
        Get the set of subcategories that are included in one of the
        [categories], but not through any of the [negcats], using PetScan.
        Both [categories] and [negcats] should not have a 'Category:'
        prefix.  The result won't have it either.
        If include_roots is set, the result includes the [categories].
        """
        params = {
            'language': 'ru',
            'project': 'wikipedia',
            'depth': depth,
            'categories': '\n'.join(categories),
            'combination': 'union',
            # None (the default) means "no negative categories"; avoids a
            # mutable default argument.
            'negcats': '\n'.join(negcats or []),
            'ns[14]': '1',
            'format': 'plain',
            'doit': 'Do+it!',
        }
        response = requests.post('https://petscan.wmcloud.org', params=params)
        result = set(categories) if include_roots else set()
        for category in response.text.split('\n'):
            result.add(remove_prefix(category, 'Категория:'))
        return result

    def get_categories_recursive(self,
                                 categories: list[str],
                                 negcats: list[str],
                                 include_roots: bool = False,
                                 depth: int = 5) -> set[str]:
        """
        Get the set of subcategories that are included in one of the
        [categories], but not through any of the [negcats], using the WM API.
        Both [categories] and [negcats] should not have a 'Category:'
        prefix.  The result won't have it either.
        Considerably slower but more precise than
        get_categories_recursive_fast().

        NOTE(review): `depth` is accepted for signature parity with the
        fast variant but is not enforced — the traversal is unbounded.
        NOTE(review): the DFS adds the roots to the result regardless of
        `include_roots`; confirm whether that is intended.
        """
        result = set(categories) if include_roots else set()
        seen = set(negcats)  # negcats act as traversal barriers

        def dfs(category: pywikibot.Category) -> None:
            # Category title without the namespace prefix.
            name = category.title(with_ns=False)
            if name in seen:
                return
            seen.add(name)
            result.add(name)
            for subcategory in category.subcategories():
                dfs(subcategory)

        for catname in categories:
            dfs(pywikibot.Category(self.site, catname))
        return result

    def _collect_titles(self, parameters: dict, listname: str) -> list[str]:
        """Run a paginated 'list=...' API query and return all page titles.

        Handles both the legacy ('query-continue') and the modern
        ('continue') continuation formats, printing a running counter.
        """
        request = Request(self.site, parameters=parameters)
        counter = 0
        result = []
        while True:
            reply = request.submit()
            if listname in reply['query']:
                chunk = reply['query'][listname]
                result.extend(pageinfo['title'] for pageinfo in chunk)
                counter += len(chunk)
                print(f'\t{counter}')
            if 'query-continue' in reply:
                for key, value in reply['query-continue'][listname].items():
                    request[key] = value
            elif 'continue' in reply:
                for key, value in reply['continue'].items():
                    request[key] = value
            else:
                break
        print('\tDone.')
        return result

    def get_category_members(self, catname: str) -> list[str]:
        """
        Return the list of the articles included in given category.
        [catname] is expected with the namespace prefix (see get_pagelist()).
        """
        print(f'Requesting [[Category:{catname}]] members:')
        parameters = {
            'action': 'query',
            'format': 'json',
            'list': 'categorymembers',
            'cmtitle': catname,
            'cmlimit': 'max',
            'cmprop': 'title',
            'cmnamespace': 0,
            'assert': 'bot'
        }
        return self._collect_titles(parameters, 'categorymembers')

    def get_template_usage(self, tempname: str) -> list[str]:
        """
        Return the list of the articles using given template.
        [tempname] is expected with the namespace prefix (see get_pagelist()).
        """
        print(f'Requesting {{{{{tempname}}}}} members:')
        parameters = {
            'action': 'query',
            'format': 'json',
            'list': 'embeddedin',
            'eititle': tempname,
            'eilimit': 'max',
            'assert': 'bot'
        }
        return self._collect_titles(parameters, 'embeddedin')

    def get_pagelist(self) -> list[str]:
        """
        Get the list of the articles about video games (should have
        {{компьютерная игра}} template without |nocat=1 parameter).
        """
        template = set(self.get_template_usage('Шаблон:Компьютерная игра'))
        category = set(self.get_category_members('Категория:Компьютерные игры по алфавиту'))
        return list(template & category)

    def get_pages_categories(self, pagelist: list[str], limit: int = 500) -> dict[str, list[str]]:
        """
        For every page from the list get its list of categories and return a
        {page: [categories]} dictionary.

        [limit] pages are queried per request (500 is the bot API maximum
        for the 'titles' parameter).
        """
        print('Requesting pages categories.')
        # dict.fromkeys(pagelist, []) would share ONE list object between
        # all keys; build an independent list per page instead.
        result = {page: [] for page in pagelist}
        parameters = {
            'action': 'query',
            'prop': 'categories',
            'cllimit': '5000',
            'assert': 'bot'
        }
        for idx in range(0, len(pagelist), limit):
            print(f'\t{idx}/{len(pagelist)}')
            parameters['titles'] = '|'.join(pagelist[idx:idx + limit])
            request = Request(self.site, parameters=parameters)
            while True:
                reply = request.submit()
                # Wikipedia API can return the page list in non-canonical
                # form!  At least when there are two possible canonical forms
                # for one namespace (for instance, 'Участник' – 'Участница'
                # in the Russian Wikipedia).  The query normalizes them, so
                # we must map the titles back to the keys of `result`.
                denormalize = {}
                if 'normalized' in reply['query']:
                    for fix in reply['query']['normalized']:
                        denormalize[fix['to']] = fix['from']
                for pagedata in reply['query']['pages'].values():
                    title = pagedata['title']
                    title = denormalize.get(title, title)
                    if 'categories' in pagedata:
                        result[title].extend(cat['title'] for cat in pagedata['categories'])
                if 'query-continue' in reply:
                    # Fixed: this used to read ['embeddedin'] (copy-paste
                    # from get_template_usage()), which raises KeyError for
                    # the legacy continuation format of prop=categories.
                    for key, value in reply['query-continue']['categories'].items():
                        request[key] = value
                elif 'continue' in reply:
                    for key, value in reply['continue'].items():
                        request[key] = value
                else:
                    break
        print('\tDone.')
        return result

    def get_category_type(self, category: str) -> str:
        """
        Classify a category by its name.

        Returns one of the known type labels ('common', 'platform', 'genre',
        'release_date', ..., 'internal') or 'other'.  Compound types are
        '+'-separated, e.g. 'platform+exclusive'.  The order of the checks
        matters: more specific patterns come first.
        """
        category = remove_prefix(category, 'Категория:')
        if category == 'Компьютерные игры по алфавиту':
            return 'common'
        elif re.match(r'Дополнения', category):
            return 'addon'
        elif re.match(r'Игры для (?!.*\b(Live|Network|VR|Oculus|HTC Vive|WiiWare)\b)', category):
            return 'platform'
        elif category == 'Браузерные игры':
            return 'platform'
        elif re.match(r'Отменённые (компьютерные )?игры для ', category):
            return 'platform_unreleased'
        elif re.match(r'Игры только для ', category):
            return 'platform+exclusive'
        elif category == 'Браузерная многопользовательская ролевая онлайн-игра':
            return 'platform+gamemode+genre'
        elif category == 'BBMMOG':
            return 'platform+gamemode+genre'
        elif re.match(r'Компьютерные игры \d{4} года$', category):
            return 'release_date'
        elif category == 'Компьютерные игры в разработке':
            return 'unreleased'
        elif re.match(r'Компьютерные игры, выпуск которых запланирован на \d{4} год$', category):
            return 'unreleased'
        elif re.match(r'Отменённые (компьютерные )?игры', category):
            return 'unreleased'
        elif re.match(r'Компьютерные игры, разработанные (в|во|на) ', category):
            return 'country_of_origin'
        elif re.match(r'Компьютерные игры, разработанные ', category):
            return 'developer'
        elif re.match(r'Компьютерные игры, изданные ', category):
            return 'publisher'
        elif category == 'Инди-игры':
            return 'publisher'
        elif category in self.genre_categories:
            if category in self.gamemode_categories:
                return 'gamemode+genre'
            else:
                return 'genre'
        elif category in self.gamemode_categories:
            return 'gamemode'
        elif category == 'Фан-игры':
            return 'fan_game'
        elif category in self.mod_categories:
            return 'mod'
        elif category in self.opensource_categories:
            return 'open_source'
        elif re.match(r'Игры на движке ', category):
            return 'engine'
        elif re.match(r'Игры, использующие ', category):
            return 'engine'
        elif re.match(r'Компьютерные игры, профинансированные через ', category):
            # Fixed: the pattern used to start with 'Категория:', which can
            # never match since that prefix is stripped above.
            return 'crowdsource'
        elif re.match(r'(Википедия|Проект|Викиданные|ПРО):', category):
            return 'internal'
        elif re.match(r'(Страницы|Статьи|Шаблоны)', category):
            return 'internal'
        else:
            return 'other'

    def print_report(self, bypass_cache: bool = False) -> None:
        """
        Write the list of articles with missing or conflicting categories
        into self.reportfile.

        Category data is taken from self.cachefile when it exists, unless
        [bypass_cache] is set, in which case it is re-fetched and re-cached.
        """
        if os.path.isfile(self.cachefile) and not bypass_cache:
            with open(self.cachefile, encoding='utf-8') as cache_page:
                catdata = json.load(cache_page)
        else:
            pagelist = self.get_pagelist()
            catdata = self.get_pages_categories(pagelist)
            # Fixed: write to self.cachefile rather than a hard-coded name,
            # so the cache read above always matches what is written here.
            with open(self.cachefile, 'w', encoding='utf-8') as json_file:
                json.dump(catdata, json_file, indent=4, ensure_ascii=False)
        lines = []
        for page in sorted(catdata.keys()):
            categories = catdata[page]
            cattypes = []
            for category in categories:
                # Compound types ('platform+exclusive') count once per part.
                cattypes.extend(self.get_category_type(category).split('+'))
            counts = Counter(cattypes)
            if counts['addon'] > 0:
                # An article about an add-on, not a video game.
                continue
            unreleased = counts['platform_unreleased'] or counts['unreleased']
            # Country categories are not expected for open-source games,
            # fan games and mods.
            dubious_country = counts['open_source'] or counts['fan_game'] or counts['mod']
            errors = []
            if not counts['genre']:
                errors.append('нет жанра')
            if not counts['release_date'] and not unreleased:
                errors.append('нет даты')
            if counts['release_date'] > 1:
                errors.append('несколько дат')
            if counts['release_date'] and counts['unreleased']:
                errors.append('дата у невыпущенной игры')
            if not counts['country_of_origin'] and not dubious_country:
                errors.append('нет страны')
            if counts['platform'] == 0 and not unreleased:
                errors.append('нет платформы')
            if counts['exclusive'] and counts['platform'] >= 2:
                errors.append('не эксклюзив')
            # if counts['exclusive'] == 0 and counts['platform'] == 1:
            #     errors.append('возможный эксклюзив')
            if not counts['gamemode']:
                errors.append('нет режима')
            # if not counts['publisher'] and not unreleased:
            #     errors.append('нет издателя')
            if errors:
                lines.append(f'* [[{page}]] ({", ".join(errors)})')
        with open(self.reportfile, 'w', encoding='utf-8') as output:
            output.write('\n'.join(lines))

    def process_page(self, other_site: pywikibot.Site, pagename: str) -> None:
        """
        Import missing categories from given site on given page.
        Categories won't be imported if a category of the same type already
        exists, to prevent duplication.  E. g. if the page has a genre
        category, other genre categories won't be imported.
        """
        try:
            page = pywikibot.Page(self.site, pagename)
            interwiki_name = get_interwiki(page, other_site)
            if not interwiki_name:
                raise RuntimeError(f'No {other_site.code} interwiki found')
            ru_categories = [cat.title() for cat in page.categories()]
            ru_counts = Counter(self.get_category_type(category) for category in ru_categories)
            # Local (ru) category names of the interwiki's categories,
            # grouped by category type.
            in_categories = defaultdict(list)
            interwiki = pywikibot.Page(other_site, interwiki_name)
            for in_category in interwiki.categories():
                category = get_interwiki(in_category, self.site)
                if not category:
                    continue
                in_categories[self.get_category_type(category)].append(category)
            # If the page is categorized as a Windows-only game, it
            # shouldn't also be categorized as a Windows game, etc.
            for category in in_categories['platform+exclusive']:
                non_exclusive = category.replace(' только ', ' ')
                if non_exclusive in in_categories['platform']:
                    in_categories['platform'].remove(non_exclusive)
            categories_to_add = []
            comments_to_add = []
            # Import platform.
            if ru_counts['platform'] + ru_counts['platform+exclusive'] == 0:
                platforms = in_categories['platform+exclusive'] + in_categories['platform']
                if platforms:
                    categories_to_add.extend(platforms)
                    comments_to_add.append('платформы')
            # Import release date (only when unambiguous).
            if ru_counts['release_date'] + ru_counts['unreleased'] == 0:
                if len(in_categories['release_date']) == 1:
                    categories_to_add.extend(in_categories['release_date'])
                    comments_to_add.append('дата выпуска')
            # Import category types that can only be present once (e. g. a
            # game shouldn't be categorized as a singleplayer exclusive and
            # a multiplayer exclusive at the same time).
            imported_single_types = {
                'gamemode': 'режим'
            }
            # Renamed the loop variable from `type` to avoid shadowing the
            # builtin.
            for cat_type, comment in imported_single_types.items():
                if ru_counts[cat_type]:
                    continue
                if len(in_categories[cat_type]) != 1:
                    continue
                categories_to_add.extend(in_categories[cat_type])
                comments_to_add.append(comment)
            # Import other category types.
            imported_types = {
                'country_of_origin': 'страна',
                'genre': 'жанр',
                'engine': 'движок',
                # 'developer': 'разработчик',
                # 'publisher': 'издатель',
            }
            for cat_type, comment in imported_types.items():
                if ru_counts[cat_type]:
                    continue
                if len(in_categories[cat_type]) == 0:
                    continue
                categories_to_add.extend(in_categories[cat_type])
                comments_to_add.append(comment)
            if not categories_to_add:
                raise RuntimeError('no categories imported')
            code_to_add = ''.join(f'\n[[Категория:{category}]]' for category in categories_to_add)
            comment = f'Импорт категорий из {other_site.code}wiki: {", ".join(comments_to_add)}'
            page.text = page.text.rstrip() + code_to_add
            page.save(comment)
        except RuntimeError as error:
            print(f'{pagename.strip()}: {error}')

    def post_report(self, pagename: str) -> None:
        """
        Regenerate the report (bypassing the cache) and post it on the wiki
        page [pagename], between the <!-- bot --> and <!-- /bot --> labels.

        Raises RuntimeError if the labels are not found on the page.
        """
        self.print_report(True)
        with open(self.reportfile, encoding='utf-8') as report:
            inserted = report.read()
        page = pywikibot.Page(self.site, pagename)
        text, success = re.subn(r'(<!-- bot -->\n)[\s\S]*(\n<!-- /bot -->)', f'\\1{inserted}\\2', page.text)
        if success == 0:
            raise RuntimeError('<!-- bot --> label not found')
        page.text = text
        page.save('Автообновление списка', minor=False)
def main():
    """Entry point: uncomment the task(s) to run (see module docstring)."""
    bot = Bot()
    # Task 1: rebuild the local report, bypassing the cache.
    # bot.print_report(True)
    # Task 2: regenerate the report and post it on the project page.
    bot.post_report('Проект:Компьютерные игры/Проблемы с категоризацией')
    # Task 3: import categories from enwiki for every page in pages.txt.
    # XXX: process_page needs to be rewritten according to the new get_category_type() logic !!!
    # enwiki = pywikibot.Site('en')
    # with open('pages.txt', encoding='utf-8') as pagelist:
    #     for page in pagelist:
    #         bot.process_page(enwiki, page)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment