Last active
March 4, 2026 11:26
-
-
Save Facenapalm/c3350635381a22a23e791b5ec77ed102 to your computer and use it in GitHub Desktop.
WIP script for Wikipedia
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| В main раскомментить нужное: | |
| Может формировать отчёты об отсутствующих/конфликтующих категориях. | |
| Может автопостить его в Википедию. | |
| Может проходиться по статьям из pages.txt и переносит категории компьютерных игр | |
| из энвики в рувики. | |
| """ | |
import json
import os.path
import re
from collections import Counter, defaultdict
from functools import cache
from typing import Optional

import pywikibot
import requests
from pywikibot.data.api import Request
@cache
def get_interwiki(page: pywikibot.Page, new_site: pywikibot.Site) -> str:
    """Return the title of *page*'s interwiki link on *new_site*.

    Returns '' when no matching language link exists or when the page's
    language links reference a site pywikibot does not know about.
    """
    wanted_code = new_site.code
    try:
        candidates = (link.title for link in page.iterlanglinks()
                      if link.site.code == wanted_code)
        return next(candidates, '')
    except pywikibot.exceptions.UnknownSiteError:
        return ''
def remove_prefix(string: str, prefix: str) -> str:
    """Return *string* with *prefix* removed if present, else unchanged.

    Thin wrapper around str.removeprefix (3.9+), which matches the other
    3.9 features already used in this file (built-in generics,
    functools.cache).
    """
    return string.removeprefix(prefix)
class Bot:
    """Category-maintenance bot for video game articles in ru.wikipedia.

    On construction it logs in and pre-loads the category trees that
    get_category_type() needs to classify categories.  See main() for the
    tasks it performs (report generation, report posting, category import).
    """

    # Local JSON cache of {article: [categories]} built by print_report().
    cachefile = 'catdata.json'
    # Plain-text report written by print_report(), read by post_report().
    reportfile = 'report.txt'

    def __init__(self):
        """Log in and pre-load the category sets used for classification."""
        self.site = pywikibot.Site()
        self.site.login()
        print('Initialization:')
        self.opensource_categories = self.get_categories_recursive_fast(
            ['Свободные компьютерные игры'], include_roots=True)
        print('\t[1/4] Opensource categories scraped.')
        self.mod_categories = self.get_categories_recursive_fast(
            ['Модификации компьютерных игр'], include_roots=True)
        print('\t[2/4] Mod categories scraped.')
        self.gamemode_categories = self.get_categories_recursive_fast(
            ['Компьютерные игры по игровому режиму'])
        print('\t[3/4] Gamemode categories scraped.')
        self.genre_categories = self.get_categories_recursive(
            ['Компьютерные игры по жанрам'],
            ['Компьютерные игры по жанру повествования'])
        print('\t[4/4] Genre categories scraped.')

    def get_categories_recursive_fast(self,
                                      categories: list[str],
                                      negcats: Optional[list[str]] = None,
                                      include_roots: bool = False,
                                      depth: int = 5) -> set[str]:
        """
        Get the set of subcategories that are included in one of the
        [categories], but not through any of the [negcats], using PetScan.
        Both [categories] and [negcats] should not have a 'Category:'
        prefix.  The result won't have it either.
        If include_roots is set, the result includes the [categories].
        """
        params = {
            'language': 'ru',
            'project': 'wikipedia',
            'depth': depth,
            'categories': '\n'.join(categories),
            'combination': 'union',
            # None (the default) means "no negative categories"; avoids a
            # mutable default argument.
            'negcats': '\n'.join(negcats or []),
            'ns[14]': '1',
            'format': 'plain',
            'doit': 'Do+it!',
        }
        response = requests.post('https://petscan.wmcloud.org', params=params)
        result = set(categories) if include_roots else set()
        for category in response.text.split('\n'):
            result.add(remove_prefix(category, 'Категория:'))
        return result

    def get_categories_recursive(self,
                                 categories: list[str],
                                 negcats: list[str],
                                 include_roots: bool = False,
                                 depth: int = 5) -> set[str]:
        """
        Get the set of subcategories that are included in one of the
        [categories], but not through any of the [negcats], using the WM API.
        Both [categories] and [negcats] should not have a 'Category:'
        prefix.  The result won't have it either.
        Considerably slower but more precise than
        get_categories_recursive_fast().

        NOTE(review): `depth` is accepted for signature parity with the
        fast variant but is not enforced — the traversal is unbounded.
        NOTE(review): the DFS adds the roots to the result regardless of
        `include_roots`; confirm whether that is intended.
        """
        result = set(categories) if include_roots else set()
        seen = set(negcats)  # negcats act as traversal barriers

        def dfs(category: pywikibot.Category) -> None:
            # Category title without the namespace prefix.
            name = category.title(with_ns=False)
            if name in seen:
                return
            seen.add(name)
            result.add(name)
            for subcategory in category.subcategories():
                dfs(subcategory)

        for catname in categories:
            dfs(pywikibot.Category(self.site, catname))
        return result

    def _collect_titles(self, parameters: dict, listname: str) -> list[str]:
        """Run a paginated 'list=...' API query and return all page titles.

        Handles both the legacy ('query-continue') and the modern
        ('continue') continuation formats, printing a running counter.
        """
        request = Request(self.site, parameters=parameters)
        counter = 0
        result = []
        while True:
            reply = request.submit()
            if listname in reply['query']:
                chunk = reply['query'][listname]
                result.extend(pageinfo['title'] for pageinfo in chunk)
                counter += len(chunk)
                print(f'\t{counter}')
            if 'query-continue' in reply:
                for key, value in reply['query-continue'][listname].items():
                    request[key] = value
            elif 'continue' in reply:
                for key, value in reply['continue'].items():
                    request[key] = value
            else:
                break
        print('\tDone.')
        return result

    def get_category_members(self, catname: str) -> list[str]:
        """
        Return the list of the articles included in given category.
        [catname] is expected with the namespace prefix (see get_pagelist()).
        """
        print(f'Requesting [[Category:{catname}]] members:')
        parameters = {
            'action': 'query',
            'format': 'json',
            'list': 'categorymembers',
            'cmtitle': catname,
            'cmlimit': 'max',
            'cmprop': 'title',
            'cmnamespace': 0,
            'assert': 'bot'
        }
        return self._collect_titles(parameters, 'categorymembers')

    def get_template_usage(self, tempname: str) -> list[str]:
        """
        Return the list of the articles using given template.
        [tempname] is expected with the namespace prefix (see get_pagelist()).
        """
        print(f'Requesting {{{{{tempname}}}}} members:')
        parameters = {
            'action': 'query',
            'format': 'json',
            'list': 'embeddedin',
            'eititle': tempname,
            'eilimit': 'max',
            'assert': 'bot'
        }
        return self._collect_titles(parameters, 'embeddedin')

    def get_pagelist(self) -> list[str]:
        """
        Get the list of the articles about video games (should have
        {{компьютерная игра}} template without |nocat=1 parameter).
        """
        template = set(self.get_template_usage('Шаблон:Компьютерная игра'))
        category = set(self.get_category_members('Категория:Компьютерные игры по алфавиту'))
        return list(template & category)

    def get_pages_categories(self, pagelist: list[str], limit: int = 500) -> dict[str, list[str]]:
        """
        For every page from the list get its list of categories and return a
        {page: [categories]} dictionary.

        [limit] pages are queried per request (500 is the bot API maximum
        for the 'titles' parameter).
        """
        print('Requesting pages categories.')
        # dict.fromkeys(pagelist, []) would share ONE list object between
        # all keys; build an independent list per page instead.
        result = {page: [] for page in pagelist}
        parameters = {
            'action': 'query',
            'prop': 'categories',
            'cllimit': '5000',
            'assert': 'bot'
        }
        for idx in range(0, len(pagelist), limit):
            print(f'\t{idx}/{len(pagelist)}')
            parameters['titles'] = '|'.join(pagelist[idx:idx + limit])
            request = Request(self.site, parameters=parameters)
            while True:
                reply = request.submit()
                # Wikipedia API can return the page list in non-canonical
                # form!  At least when there are two possible canonical forms
                # for one namespace (for instance, 'Участник' – 'Участница'
                # in the Russian Wikipedia).  The query normalizes them, so
                # we must map the titles back to the keys of `result`.
                denormalize = {}
                if 'normalized' in reply['query']:
                    for fix in reply['query']['normalized']:
                        denormalize[fix['to']] = fix['from']
                for pagedata in reply['query']['pages'].values():
                    title = pagedata['title']
                    title = denormalize.get(title, title)
                    if 'categories' in pagedata:
                        result[title].extend(cat['title'] for cat in pagedata['categories'])
                if 'query-continue' in reply:
                    # Fixed: this used to read ['embeddedin'] (copy-paste
                    # from get_template_usage()), which raises KeyError for
                    # the legacy continuation format of prop=categories.
                    for key, value in reply['query-continue']['categories'].items():
                        request[key] = value
                elif 'continue' in reply:
                    for key, value in reply['continue'].items():
                        request[key] = value
                else:
                    break
        print('\tDone.')
        return result

    def get_category_type(self, category: str) -> str:
        """
        Classify a category by its name.

        Returns one of the known type labels ('common', 'platform', 'genre',
        'release_date', ..., 'internal') or 'other'.  Compound types are
        '+'-separated, e.g. 'platform+exclusive'.  The order of the checks
        matters: more specific patterns come first.
        """
        category = remove_prefix(category, 'Категория:')
        if category == 'Компьютерные игры по алфавиту':
            return 'common'
        elif re.match(r'Дополнения', category):
            return 'addon'
        elif re.match(r'Игры для (?!.*\b(Live|Network|VR|Oculus|HTC Vive|WiiWare)\b)', category):
            return 'platform'
        elif category == 'Браузерные игры':
            return 'platform'
        elif re.match(r'Отменённые (компьютерные )?игры для ', category):
            return 'platform_unreleased'
        elif re.match(r'Игры только для ', category):
            return 'platform+exclusive'
        elif category == 'Браузерная многопользовательская ролевая онлайн-игра':
            return 'platform+gamemode+genre'
        elif category == 'BBMMOG':
            return 'platform+gamemode+genre'
        elif re.match(r'Компьютерные игры \d{4} года$', category):
            return 'release_date'
        elif category == 'Компьютерные игры в разработке':
            return 'unreleased'
        elif re.match(r'Компьютерные игры, выпуск которых запланирован на \d{4} год$', category):
            return 'unreleased'
        elif re.match(r'Отменённые (компьютерные )?игры', category):
            return 'unreleased'
        elif re.match(r'Компьютерные игры, разработанные (в|во|на) ', category):
            return 'country_of_origin'
        elif re.match(r'Компьютерные игры, разработанные ', category):
            return 'developer'
        elif re.match(r'Компьютерные игры, изданные ', category):
            return 'publisher'
        elif category == 'Инди-игры':
            return 'publisher'
        elif category in self.genre_categories:
            if category in self.gamemode_categories:
                return 'gamemode+genre'
            else:
                return 'genre'
        elif category in self.gamemode_categories:
            return 'gamemode'
        elif category == 'Фан-игры':
            return 'fan_game'
        elif category in self.mod_categories:
            return 'mod'
        elif category in self.opensource_categories:
            return 'open_source'
        elif re.match(r'Игры на движке ', category):
            return 'engine'
        elif re.match(r'Игры, использующие ', category):
            return 'engine'
        elif re.match(r'Компьютерные игры, профинансированные через ', category):
            # Fixed: the pattern used to start with 'Категория:', which can
            # never match since that prefix is stripped above.
            return 'crowdsource'
        elif re.match(r'(Википедия|Проект|Викиданные|ПРО):', category):
            return 'internal'
        elif re.match(r'(Страницы|Статьи|Шаблоны)', category):
            return 'internal'
        else:
            return 'other'

    def print_report(self, bypass_cache: bool = False) -> None:
        """
        Write the list of articles with missing or conflicting categories
        into self.reportfile.

        Category data is taken from self.cachefile when it exists, unless
        [bypass_cache] is set, in which case it is re-fetched and re-cached.
        """
        if os.path.isfile(self.cachefile) and not bypass_cache:
            with open(self.cachefile, encoding='utf-8') as cache_page:
                catdata = json.load(cache_page)
        else:
            pagelist = self.get_pagelist()
            catdata = self.get_pages_categories(pagelist)
            # Fixed: write to self.cachefile rather than a hard-coded name,
            # so the cache read above always matches what is written here.
            with open(self.cachefile, 'w', encoding='utf-8') as json_file:
                json.dump(catdata, json_file, indent=4, ensure_ascii=False)
        lines = []
        for page in sorted(catdata.keys()):
            categories = catdata[page]
            cattypes = []
            for category in categories:
                # Compound types ('platform+exclusive') count once per part.
                cattypes.extend(self.get_category_type(category).split('+'))
            counts = Counter(cattypes)
            if counts['addon'] > 0:
                # An article about an add-on, not a video game.
                continue
            unreleased = counts['platform_unreleased'] or counts['unreleased']
            # Country categories are not expected for open-source games,
            # fan games and mods.
            dubious_country = counts['open_source'] or counts['fan_game'] or counts['mod']
            errors = []
            if not counts['genre']:
                errors.append('нет жанра')
            if not counts['release_date'] and not unreleased:
                errors.append('нет даты')
            if counts['release_date'] > 1:
                errors.append('несколько дат')
            if counts['release_date'] and counts['unreleased']:
                errors.append('дата у невыпущенной игры')
            if not counts['country_of_origin'] and not dubious_country:
                errors.append('нет страны')
            if counts['platform'] == 0 and not unreleased:
                errors.append('нет платформы')
            if counts['exclusive'] and counts['platform'] >= 2:
                errors.append('не эксклюзив')
            # if counts['exclusive'] == 0 and counts['platform'] == 1:
            #     errors.append('возможный эксклюзив')
            if not counts['gamemode']:
                errors.append('нет режима')
            # if not counts['publisher'] and not unreleased:
            #     errors.append('нет издателя')
            if errors:
                lines.append(f'* [[{page}]] ({", ".join(errors)})')
        with open(self.reportfile, 'w', encoding='utf-8') as output:
            output.write('\n'.join(lines))

    def process_page(self, other_site: pywikibot.Site, pagename: str) -> None:
        """
        Import missing categories from given site on given page.
        Categories won't be imported if a category of the same type already
        exists, to prevent duplication.  E. g. if the page has a genre
        category, other genre categories won't be imported.
        """
        try:
            page = pywikibot.Page(self.site, pagename)
            interwiki_name = get_interwiki(page, other_site)
            if not interwiki_name:
                raise RuntimeError(f'No {other_site.code} interwiki found')
            ru_categories = [cat.title() for cat in page.categories()]
            ru_counts = Counter(self.get_category_type(category) for category in ru_categories)
            # Local (ru) category names of the interwiki's categories,
            # grouped by category type.
            in_categories = defaultdict(list)
            interwiki = pywikibot.Page(other_site, interwiki_name)
            for in_category in interwiki.categories():
                category = get_interwiki(in_category, self.site)
                if not category:
                    continue
                in_categories[self.get_category_type(category)].append(category)
            # If the page is categorized as a Windows-only game, it
            # shouldn't also be categorized as a Windows game, etc.
            for category in in_categories['platform+exclusive']:
                non_exclusive = category.replace(' только ', ' ')
                if non_exclusive in in_categories['platform']:
                    in_categories['platform'].remove(non_exclusive)
            categories_to_add = []
            comments_to_add = []
            # Import platform.
            if ru_counts['platform'] + ru_counts['platform+exclusive'] == 0:
                platforms = in_categories['platform+exclusive'] + in_categories['platform']
                if platforms:
                    categories_to_add.extend(platforms)
                    comments_to_add.append('платформы')
            # Import release date (only when unambiguous).
            if ru_counts['release_date'] + ru_counts['unreleased'] == 0:
                if len(in_categories['release_date']) == 1:
                    categories_to_add.extend(in_categories['release_date'])
                    comments_to_add.append('дата выпуска')
            # Import category types that can only be present once (e. g. a
            # game shouldn't be categorized as a singleplayer exclusive and
            # a multiplayer exclusive at the same time).
            imported_single_types = {
                'gamemode': 'режим'
            }
            # Renamed the loop variable from `type` to avoid shadowing the
            # builtin.
            for cat_type, comment in imported_single_types.items():
                if ru_counts[cat_type]:
                    continue
                if len(in_categories[cat_type]) != 1:
                    continue
                categories_to_add.extend(in_categories[cat_type])
                comments_to_add.append(comment)
            # Import other category types.
            imported_types = {
                'country_of_origin': 'страна',
                'genre': 'жанр',
                'engine': 'движок',
                # 'developer': 'разработчик',
                # 'publisher': 'издатель',
            }
            for cat_type, comment in imported_types.items():
                if ru_counts[cat_type]:
                    continue
                if len(in_categories[cat_type]) == 0:
                    continue
                categories_to_add.extend(in_categories[cat_type])
                comments_to_add.append(comment)
            if not categories_to_add:
                raise RuntimeError('no categories imported')
            code_to_add = ''.join(f'\n[[Категория:{category}]]' for category in categories_to_add)
            comment = f'Импорт категорий из {other_site.code}wiki: {", ".join(comments_to_add)}'
            page.text = page.text.rstrip() + code_to_add
            page.save(comment)
        except RuntimeError as error:
            print(f'{pagename.strip()}: {error}')

    def post_report(self, pagename: str) -> None:
        """
        Regenerate the report (bypassing the cache) and post it on the wiki
        page [pagename], between the <!-- bot --> and <!-- /bot --> labels.

        Raises RuntimeError if the labels are not found on the page.
        """
        self.print_report(True)
        with open(self.reportfile, encoding='utf-8') as report:
            inserted = report.read()
        page = pywikibot.Page(self.site, pagename)
        text, success = re.subn(r'(<!-- bot -->\n)[\s\S]*(\n<!-- /bot -->)', f'\\1{inserted}\\2', page.text)
        if success == 0:
            raise RuntimeError('<!-- bot --> label not found')
        page.text = text
        page.save('Автообновление списка', minor=False)
def main():
    """Entry point: uncomment the task(s) to run (see module docstring)."""
    bot = Bot()
    # Task 1: rebuild the local report, bypassing the cache.
    # bot.print_report(True)
    # Task 2: regenerate the report and post it on the project page.
    bot.post_report('Проект:Компьютерные игры/Проблемы с категоризацией')
    # Task 3: import categories from enwiki for every page in pages.txt.
    # XXX: process_page needs to be rewritten according to the new get_category_type() logic !!!
    # enwiki = pywikibot.Site('en')
    # with open('pages.txt', encoding='utf-8') as pagelist:
    #     for page in pagelist:
    #         bot.process_page(enwiki, page)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment