Skip to content

Instantly share code, notes, and snippets.

@Facenapalm
Last active March 4, 2026 11:26
Show Gist options
  • Select an option

  • Save Facenapalm/c3350635381a22a23e791b5ec77ed102 to your computer and use it in GitHub Desktop.

Select an option

Save Facenapalm/c3350635381a22a23e791b5ec77ed102 to your computer and use it in GitHub Desktop.
WIP script for Wikipedia
"""
Uncomment the needed calls in main():
Can build reports about missing/conflicting categories.
Can auto-post the report to Wikipedia.
Can walk through the articles listed in pages.txt and copy video game
categories from enwiki to ruwiki.
"""
import re
import json
import requests
import os.path
import pywikibot
from functools import cache
from collections import Counter, defaultdict
from pywikibot.data.api import Request
@cache
def get_interwiki(page: pywikibot.Page, new_site: pywikibot.Site) -> str:
try:
for interwiki in page.iterlanglinks():
if interwiki.site.code == new_site.code:
return interwiki.title
except pywikibot.exceptions.UnknownSiteError:
pass
return ''
def remove_prefix(string: str, prefix: str) -> str:
    """Return *string* with *prefix* removed, if present; otherwise unchanged.

    Thin wrapper around str.removeprefix() (Python 3.9+, which this file
    already requires for its builtin-generic annotations); kept as a named
    helper for existing callers.
    """
    return string.removeprefix(prefix)
class Bot():
    """Report and fix category problems in ruwiki video game articles.

    On construction scrapes four category trees (open-source games, mods,
    game modes, genres) that get_category_type() later consults to classify
    an arbitrary category name into a semantic type.
    """

    # Cache of {page: [categories]} produced by a previous run.
    cachefile = 'catdata.json'
    # Output file for the generated wiki-markup report.
    reportfile = 'report.txt'

    def __init__(self):
        self.site = pywikibot.Site()
        self.site.login()
        print('Initialization:')
        self.opensource_categories = self.get_categories_recursive_fast(
            ['Свободные компьютерные игры'], include_roots=True)
        print('\t[1/4] Opensource categories scrapped.')
        self.mod_categories = self.get_categories_recursive_fast(
            ['Модификации компьютерных игр'], include_roots=True)
        print('\t[2/4] Mod categories scrapped.')
        self.gamemode_categories = self.get_categories_recursive_fast(
            ['Компьютерные игры по игровому режиму'])
        print('\t[3/4] Gamemode categories scrapped.')
        self.genre_categories = self.get_categories_recursive(
            ['Компьютерные игры по жанрам'],
            ['Компьютерные игры по жанру повествования'])
        print('\t[4/4] Genre categories scrapped.')

    def get_categories_recursive_fast(self,
                                      categories: list[str],
                                      negcats: list[str] | None = None,
                                      include_roots: bool = False,
                                      depth: int = 5) -> set[str]:
        """
        Get the list of subcategories that are included in the one of the
        [categories], but not through any of the [negcats], using PetScan.
        Both [categories] and [negcats] should not have a 'Category:' prefix.
        Result won't have it either.
        If include_roots is set, the result would include the [categories].
        """
        # FIX: negcats used to default to a shared mutable []; use None.
        params = {
            'language': 'ru',
            'project': 'wikipedia',
            'depth': depth,
            'categories': '\n'.join(categories),
            'combination': 'union',
            'negcats': '\n'.join(negcats or []),
            'ns[14]': '1',
            'format': 'plain',
            'doit': 'Do+it!',
        }
        response = requests.post('https://petscan.wmcloud.org', params=params)
        response.raise_for_status()  # fail loudly on a PetScan outage
        result = set(categories) if include_roots else set()
        for line in response.text.split('\n'):
            # Skip blank lines (e.g. the trailing newline of the plain
            # format) instead of adding '' to the result.
            if line:
                result.add(line.removeprefix('Категория:'))
        return result

    def get_categories_recursive(self,
                                 categories: list[str],
                                 negcats: list[str],
                                 include_roots: bool = False,
                                 depth: int = 5) -> set[str]:
        """
        Get the list of subcategories that are included in the one of the
        [categories], but not through any of the [negcats], using WM API.
        Both [categories] and [negcats] should not have a 'Category:' prefix.
        Result won't have it either.
        If include_roots is set, the result would include the [categories].
        Considerably slower but more precise than
        get_categories_recursive_fast().
        """
        result = set(categories) if include_roots else set()
        seen = set(negcats)

        def dfs(category: pywikibot.Category, remaining: int, is_root: bool) -> None:
            # Depth-first walk; `seen` guards against category cycles.
            name = category.title(with_ns=False)
            if name in seen:
                return
            seen.add(name)
            # FIX: roots are only part of the result when include_roots is
            # set, matching the docstring and the _fast() variant.
            if not is_root or include_roots:
                result.add(name)
            # FIX: honour the depth limit (it used to be ignored).
            if remaining <= 0:
                return
            for subcategory in category.subcategories():
                dfs(subcategory, remaining - 1, False)

        for catname in categories:
            dfs(pywikibot.Category(self.site, catname), depth, True)
        return result

    def _query_titles(self, parameters: dict, listname: str) -> list[str]:
        """Run a list=<listname> API query to exhaustion, collecting titles.

        Shared pagination loop for get_category_members() and
        get_template_usage(); follows both the legacy 'query-continue'
        and the modern 'continue' markers.
        """
        request = Request(self.site, parameters=parameters)
        counter = 0
        result = []
        while True:
            reply = request.submit()
            chunk = reply['query'].get(listname, [])
            result.extend(pageinfo['title'] for pageinfo in chunk)
            counter += len(chunk)
            print(f'\t{counter}')
            if 'query-continue' in reply:
                for key, value in reply['query-continue'][listname].items():
                    request[key] = value
            elif 'continue' in reply:
                for key, value in reply['continue'].items():
                    request[key] = value
            else:
                break
        print('\tDone.')
        return result

    def get_category_members(self, catname: str) -> list[str]:
        """
        Return the list of the articles included in given category.
        """
        print(f'Requesting [[Category:{catname}]] members:')
        parameters = {
            'action': 'query',
            'format': 'json',
            'list': 'categorymembers',
            'cmtitle': catname,
            'cmlimit': 'max',
            'cmprop': 'title',
            'cmnamespace': 0,
            'assert': 'bot'
        }
        return self._query_titles(parameters, 'categorymembers')

    def get_template_usage(self, tempname: str) -> list[str]:
        """
        Return the list of the articles using given template.
        """
        print(f'Requesting {{{{{tempname}}}}} members:')
        parameters = {
            'action': 'query',
            'format': 'json',
            'list': 'embeddedin',
            'eititle': tempname,
            'eilimit': 'max',
            'assert': 'bot'
        }
        return self._query_titles(parameters, 'embeddedin')

    def get_pagelist(self) -> list[str]:
        """
        Get the list of the articles about video games (should have
        {{компьютерная игра}} template without |nocat=1 parameter).
        """
        template = set(self.get_template_usage('Шаблон:Компьютерная игра'))
        category = set(self.get_category_members('Категория:Компьютерные игры по алфавиту'))
        return list(template & category)

    def get_pages_categories(self, pagelist: list[str], limit: int = 500) -> dict[str, list[str]]:
        """
        For every page from the list get list of categories and return
        {page: [categories]}
        dictionary.
        """
        print('Requesting pages categories.')
        # FIX: dict.fromkeys(pagelist, []) shared ONE list between all keys;
        # build an independent list per page instead.
        result = {page: [] for page in pagelist}
        parameters = {
            'action': 'query',
            'prop': 'categories',
            'cllimit': '5000',
            'assert': 'bot'
        }
        for idx in range(0, len(pagelist), limit):
            print(f'\t{idx}/{len(pagelist)}')
            parameters["titles"] = "|".join(pagelist[idx:idx+limit])
            request = Request(self.site, parameters=parameters)
            while True:
                reply = request.submit()
                # Wikipedia API can return page list in non-canonical form!
                # At least when there are two possible canonical forms for one namespace
                # (for instance, 'Участник' – 'Участница' in Russian Wikipedia).
                # This query will normalize them and we need to handle it.
                denormalize = {}
                if 'normalized' in reply['query']:
                    for fix in reply['query']['normalized']:
                        denormalize[fix['to']] = fix['from']
                for value in reply['query']['pages'].values():
                    title = value['title']
                    if title in denormalize:
                        title = denormalize[title]
                    if 'categories' in value:
                        cats = [cat['title'] for cat in value['categories']]
                        result[title].extend(cats)
                if 'query-continue' in reply:
                    # FIX: this used to read ['embeddedin'] (copy-pasted from
                    # get_template_usage) and would KeyError for this
                    # prop=categories query; take whatever module(s) the
                    # server asks to continue.
                    for module in reply['query-continue'].values():
                        for key, value in module.items():
                            request[key] = value
                elif 'continue' in reply:
                    for key, value in reply['continue'].items():
                        request[key] = value
                else:
                    break
        print('\tDone.')
        return result

    def get_category_type(self, category: str) -> str:
        """Classify a category name into a semantic type.

        Compound types are '+'-joined (e.g. 'platform+exclusive') and split
        by callers. The input may carry a 'Категория:' prefix.
        """
        category = category.removeprefix('Категория:')
        if category == 'Компьютерные игры по алфавиту':
            return 'common'
        elif re.match(r'Дополнения', category):
            return 'addon'
        elif re.match(r'Игры для (?!.*\b(Live|Network|VR|Oculus|HTC Vive|WiiWare)\b)', category):
            # 'Игры для X' is a platform unless X is a service or
            # peripheral (Live, Network, VR, ...).
            return 'platform'
        elif category == 'Браузерные игры':
            return 'platform'
        elif re.match(r'Отменённые (компьютерные )?игры для ', category):
            return 'platform_unreleased'
        elif re.match(r'Игры только для ', category):
            return 'platform+exclusive'
        elif category == 'Браузерная многопользовательская ролевая онлайн-игра':
            return 'platform+gamemode+genre'
        elif category == 'BBMMOG':
            return 'platform+gamemode+genre'
        elif re.match(r'Компьютерные игры \d{4} года$', category):
            return 'release_date'
        elif category == 'Компьютерные игры в разработке':
            return 'unreleased'
        elif re.match(r'Компьютерные игры, выпуск которых запланирован на \d{4} год$', category):
            return 'unreleased'
        elif re.match(r'Отменённые (компьютерные )?игры', category):
            return 'unreleased'
        elif re.match(r'Компьютерные игры, разработанные (в|во|на) ', category):
            return 'country_of_origin'
        elif re.match(r'Компьютерные игры, разработанные ', category):
            return 'developer'
        elif re.match(r'Компьютерные игры, изданные ', category):
            return 'publisher'
        elif category == 'Инди-игры':
            return 'publisher'
        elif category in self.genre_categories:
            if category in self.gamemode_categories:
                return 'gamemode+genre'
            else:
                return 'genre'
        elif category in self.gamemode_categories:
            return 'gamemode'
        elif category == 'Фан-игры':
            return 'fan_game'
        elif category in self.mod_categories:
            return 'mod'
        elif category in self.opensource_categories:
            return 'open_source'
        elif re.match(r'Игры на движке ', category):
            return 'engine'
        elif re.match(r'Игры, использующие ', category):
            return 'engine'
        elif re.match(r'Компьютерные игры, профинансированные через ', category):
            # FIX: this pattern used to start with 'Категория:', which can
            # never match after the prefix strip above, making 'crowdsource'
            # unreachable.
            return 'crowdsource'
        elif re.match(r'(Википедия|Проект|Викиданные|ПРО):', category):
            return 'internal'
        elif re.match(r'(Страницы|Статьи|Шаблоны)', category):
            return 'internal'
        else:
            return 'other'

    def print_report(self, bypass_cache: bool = False) -> None:
        """
        Output the list of articles with missing or conflicting categories.
        """
        if os.path.isfile(self.cachefile) and not bypass_cache:
            with open(self.cachefile, encoding="utf-8") as cache_page:
                catdata = json.load(cache_page)
        else:
            pagelist = self.get_pagelist()
            catdata = self.get_pages_categories(pagelist)
            # FIX: write the cache to self.cachefile (was hard-coded).
            with open(self.cachefile, 'w', encoding='utf-8') as json_file:
                json.dump(catdata, json_file, indent=4, ensure_ascii=False)
        lines = []
        for page in sorted(catdata.keys()):
            categories = catdata[page]
            cattypes = []
            for category in categories:
                # Compound types ('a+b') count as one of each.
                cattypes.extend(self.get_category_type(category).split('+'))
            counts = Counter(cattypes)
            if counts['addon'] > 0:
                # An article about an add-on, not video game.
                continue
            unreleased = counts['platform_unreleased'] or counts['unreleased']
            dubious_country = counts['open_source'] or counts['fan_game'] or counts['mod']
            errors = []
            if not counts['genre']:
                errors.append('нет жанра')
            if not counts['release_date'] and not unreleased:
                errors.append('нет даты')
            if counts['release_date'] > 1:
                errors.append('несколько дат')
            if counts['release_date'] and counts['unreleased']:
                errors.append('дата у невыпущенной игры')
            if not counts['country_of_origin'] and not dubious_country:
                errors.append('нет страны')
            if counts['platform'] == 0 and not unreleased:
                errors.append('нет платформы')
            if counts['exclusive'] and counts['platform'] >= 2:
                errors.append('не эксклюзив')
            # if counts['exclusive'] == 0 and counts['platform'] == 1:
            #     errors.append('возможный эксклюзив')
            if not counts['gamemode']:
                errors.append('нет режима')
            # if not counts['publisher'] and not unreleased:
            #     errors.append('нет издателя')
            if errors:
                lines.append(f'* [[{page}]] ({", ".join(errors)})')
        with open(self.reportfile, 'w', encoding='utf-8') as output:
            output.write('\n'.join(lines))

    def process_page(self, other_site: pywikibot.Site, pagename: str) -> None:
        """
        Import missing categories from given site on given page.
        Categories won't be imported if category of the same type already exist
        to prevent duplication. E. g. if the page has a genre category, other
        genre categories won't be imported.
        """
        try:
            page = pywikibot.Page(self.site, pagename)
            interwiki_name = get_interwiki(page, other_site)
            if not interwiki_name:
                raise RuntimeError(f'No {other_site.code} interwiki found')
            ru_categories = [cat.title() for cat in page.categories()]
            ru_counts = Counter(self.get_category_type(category) for category in ru_categories)
            in_categories = defaultdict(list)
            interwiki = pywikibot.Page(other_site, interwiki_name)
            for in_category in interwiki.categories():
                category = get_interwiki(in_category, self.site)
                if not category:
                    continue
                in_categories[self.get_category_type(category)].append(category)
            # If page is categorized as Windows-only game, it shouldn't be
            # categorized as Windows game, etc.
            for category in in_categories['platform+exclusive']:
                non_exclusive = category.replace(' только ', ' ')
                if non_exclusive in in_categories['platform']:
                    in_categories['platform'].remove(non_exclusive)
            categories_to_add = []
            comments_to_add = []
            # Import platform.
            if ru_counts['platform'] + ru_counts['platform+exclusive'] == 0:
                platforms = in_categories['platform+exclusive'] + in_categories['platform']
                if platforms:
                    categories_to_add.extend(platforms)
                    comments_to_add.append('платформы')
            # Import release date, but only when it is unambiguous.
            if ru_counts['release_date'] + ru_counts['unreleased'] == 0:
                if len(in_categories['release_date']) == 1:
                    categories_to_add.extend(in_categories['release_date'])
                    comments_to_add.append('дата выпуска')
            # Import category types that can only be presented once (e. g. a
            # game shouldn't be categorized as a Singleplayer exclusive and a
            # Multiplayer exclusive at the same time).
            imported_single_types = {
                'gamemode': 'режим'
            }
            # `cattype` rather than `type`: avoid shadowing the builtin.
            for cattype, comment in imported_single_types.items():
                if ru_counts[cattype]:
                    continue
                if len(in_categories[cattype]) != 1:
                    continue
                categories_to_add.extend(in_categories[cattype])
                comments_to_add.append(comment)
            # Import other category types.
            imported_types = {
                'country_of_origin': 'страна',
                'genre': 'жанр',
                'engine': 'движок',
                # 'developer': 'разработчик',
                # 'publisher': 'издатель',
            }
            for cattype, comment in imported_types.items():
                if ru_counts[cattype]:
                    continue
                if len(in_categories[cattype]) == 0:
                    continue
                categories_to_add.extend(in_categories[cattype])
                comments_to_add.append(comment)
            if not categories_to_add:
                raise RuntimeError('no categories imported')
            code_to_add = ''.join(f'\n[[Категория:{category}]]' for category in categories_to_add)
            comment = f'Импорт категорий из {other_site.code}wiki: {", ".join(comments_to_add)}'
            page.text = page.text.rstrip() + code_to_add
            page.save(comment)
        except RuntimeError as error:
            print(f'{pagename.strip()}: {error}')

    def post_report(self, pagename: str) -> None:
        """Regenerate the report and paste it into the <!-- bot --> section
        of the given wiki page.

        Raises RuntimeError if the page lacks the bot markers.
        """
        self.print_report(True)
        with open(self.reportfile, encoding='utf-8') as report:
            inserted = report.read()
        page = pywikibot.Page(self.site, pagename)
        text, success = re.subn(r'(<!-- bot -->\n)[\s\S]*(\n<!-- /bot -->)', f'\\1{inserted}\\2', page.text)
        if success == 0:
            raise RuntimeError('<!-- bot --> label not found')
        page.text = text
        page.save('Автообновление списка', minor=False)
def main():
    """Entry point: build the category report and post it on-wiki.

    The commented-out calls are deliberate WIP toggles — uncomment the
    mode you need (see the module docstring).
    """
    bot = Bot()
    # bot.print_report(True)
    bot.post_report('Проект:Компьютерные игры/Проблемы с категоризацией')
    # XXX: process_page needs to be rewritten according to the new get_category_type() logic !!!
    # enwiki = pywikibot.Site('en')
    # with open('pages.txt', encoding='utf-8') as pagelist:
    #     for page in pagelist:
    #         bot.process_page(enwiki, page)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment