Skip to content

Instantly share code, notes, and snippets.

@rkhapov
Created May 8, 2018 18:17
Show Gist options
  • Select an option

  • Save rkhapov/69d251250aa1bb83dae4e41d008575a8 to your computer and use it in GitHub Desktop.

Select an option

Save rkhapov/69d251250aa1bb83dae4e41d008575a8 to your computer and use it in GitHub Desktop.
2
#!/usr/bin/env python3
from enum import Enum
from html.parser import HTMLParser
class Sex(Enum):
    """Sex category assigned to a given name.

    Members hash and compare by their numeric value. Comparison with an
    object that is not a ``Sex`` member is reported as "not equal" rather
    than raising.
    """

    MALE = 1
    FEMALE = 2

    def __hash__(self):
        # Hash by value so hashing stays consistent with __eq__ below.
        return self.value

    def __eq__(self, other):
        # Returning NotImplemented lets Python fall back to its default
        # handling (ultimately False) for foreign types, instead of failing
        # with AttributeError on objects that have no ``value`` attribute.
        if not isinstance(other, Sex):
            return NotImplemented
        return self.value == other.value

    @staticmethod
    def get_sex_by_name(name):
        """Return ``Sex.MALE`` or ``Sex.FEMALE`` guessed from ``name``.

        Every name not recognised as male is classified as female.
        """
        return Sex.MALE if Sex._is_male_name(name) else Sex.FEMALE

    @staticmethod
    def _is_male_name(name):
        """Return True when ``name`` is treated as a male name.

        A name is male when it is one of the listed exceptions, or when it
        does not end with 'а', 'я' or 'ь'.
        """
        name = str(name)
        # The exception list stays local to the method: a plain class-level
        # attribute on an Enum would be turned into an enum member.
        if name in ('Илья', 'Лёва', 'Никита', 'Игорь'):
            return True
        return not name.endswith(('а', 'я', 'ь'))
class HTMLStatisticParser(HTMLParser):
    """Collect per-year name counts from an HTML document.

    While parsing, the number of currently open occurrences of every tag is
    tracked. Text found inside both a ``td`` and an ``h3`` element sets the
    current year; text found inside both a ``td`` and an ``a`` element is
    read as two whitespace-separated words, the second of which is counted
    as a name for the current year.

    The collected result has the shape ``{year: {sex: {name: count}}}``.
    """

    def __init__(self):
        super().__init__()
        # tag name -> how many elements with that tag are open right now
        self._tag_states = {}
        # year that subsequently parsed names are attributed to
        self._current_year = 0
        # year -> sex -> name -> number of occurrences
        self._years_statistic = {}

    def get_statistic(self):
        """Return the collected ``{year: {sex: {name: count}}}`` mapping."""
        return self._years_statistic

    def handle_starttag(self, tag, attrs):
        """Record that one more ``tag`` element is open."""
        dict_name = HTMLStatisticParser._get_dict_name_by_tag(tag, attrs)
        self._tag_states[dict_name] = self._tag_states.get(dict_name, 0) + 1

    def handle_endtag(self, tag):
        """Record that one ``tag`` element was closed.

        A closing tag without a matching open element is ignored, so stray
        closers neither raise KeyError nor push the counter below zero.
        """
        dict_name = HTMLStatisticParser._get_dict_name_by_tag(tag, None)
        open_count = self._tag_states.get(dict_name, 0)
        if open_count > 0:
            self._tag_states[dict_name] = open_count - 1

    def handle_data(self, data):
        """Dispatch text to the name or year handler based on open tags."""
        if not self._is_parser_in_tag('td'):
            return
        if self._is_parser_in_tag('a'):
            self._handle_name(data)
        elif self._is_parser_in_tag('h3'):
            self._handle_year(data)

    def _handle_name(self, data):
        """Count the name contained in ``data`` for the current year.

        Whitespace-only text is skipped.

        Raises:
            ValueError: if non-blank text does not consist of exactly two
                whitespace-separated words.
        """
        words = str(data).split()
        if not words:
            # Formatting whitespace between tags carries no name.
            return
        if len(words) != 2:
            raise ValueError(
                'Expected two words in name entry, got: {!r}'.format(data))
        name = words[1]
        sex = Sex.get_sex_by_name(name)
        names_by_sex = self._years_statistic.setdefault(self._current_year, {})
        counters = names_by_sex.setdefault(sex, {})
        counters[name] = counters.get(name, 0) + 1

    def _handle_year(self, data):
        """Set the current year from ``data``; whitespace-only text is skipped.

        Raises:
            ValueError: if non-blank text is not an integer.
        """
        text = str(data).strip()
        if text:
            self._current_year = int(text)

    def _is_parser_in_tag(self, tag_name):
        """Return True when at least one ``tag_name`` element is open."""
        return self._tag_states.get(tag_name, 0) >= 1

    def error(self, message):
        """Print ``message`` and terminate the program with exit status 1."""
        print(str(message))
        # Same effect as the interactive ``exit`` helper, but does not depend
        # on the ``site`` module having injected it into builtins.
        raise SystemExit(1)

    @staticmethod
    def _get_dict_name_by_tag(tag, attrs):
        """Return the key under which ``tag`` is tracked (the tag itself)."""
        return tag
class Statistic:
    """Name statistics parsed from an HTML file, with optional result caching.

    The underlying data has the shape ``{year: {sex: {name: count}}}`` as
    produced by ``HTMLStatisticParser``.
    """

    def __init__(self, filename, caching=True):
        """Parse ``filename`` and prepare the (lazily filled) caches.

        Args:
            filename: path of the HTML file to parse.
            caching: when true, results of ``get_years`` and
                ``get_name_statistic`` are memoised per instance.
        """
        self._stat = self._make_stat(filename)
        self._caching = caching
        self._years_list = None
        self._caching_dict = None

    def get_years(self):
        """Return all years present in the data as an ascending tuple."""
        if not self._caching:
            return self._make_years_list()
        if self._years_list is None:
            self._years_list = self._make_years_list()
        return self._years_list

    def get_name_statistic(self, year=None, sex=None):
        """Return ``(name, count)`` pairs ordered by descending count.

        Args:
            year: restrict to this year (converted with ``int``); ``None``
                means all years.
            sex: restrict to this sex; ``None`` means both sexes.

        Returns:
            A tuple of ``(name, count)`` tuples. Years or sexes without any
            recorded names contribute nothing, so the result may be empty.
        """
        if not self._caching:
            return self._make_list_for_condition(year, sex)
        if self._caching_dict is None:
            self._caching_dict = {}
        cache_key = (year, sex)
        if cache_key not in self._caching_dict:
            self._caching_dict[cache_key] = self._make_list_for_condition(
                year, sex)
        return self._caching_dict[cache_key]

    def _make_years_list(self):
        """Build the ascending tuple of years found in the data."""
        return tuple(sorted(self._stat))

    def _make_list_for_condition(self, year=None, sex=None):
        """Aggregate name counts over the selected years and sexes."""
        years_list = self.get_years() if year is None else (int(year),)
        sex_list = (Sex.MALE, Sex.FEMALE) if sex is None else (sex,)

        result_dict = {}
        for y in years_list:
            names_by_sex = self._stat.get(y, {})
            for s in sex_list:
                # A year may hold names of only one sex (or not exist at
                # all); treat the missing part as empty instead of raising
                # KeyError in the middle of the aggregation.
                for name, count in names_by_sex.get(s, {}).items():
                    result_dict[name] = result_dict.get(name, 0) + count

        # sorted() is stable, so names with equal counts keep the order in
        # which they were first encountered.
        return tuple(sorted(result_dict.items(),
                            key=lambda node: node[1], reverse=True))

    @staticmethod
    def _make_stat(filename):
        """Read ``filename`` and return the statistic parsed from it."""
        # NOTE(review): the file is decoded with the platform default text
        # encoding -- confirm this matches the encoding of the input file.
        with open(filename, 'r') as file:
            content = file.read()
        parser = HTMLStatisticParser()
        parser.feed(content)
        return parser.get_statistic()
def make_stat(filename):
    """Create a ``Statistic`` object from the file at ``filename``."""
    statistic = Statistic(filename)
    return statistic
def extract_years(stat):
    """Return the years known to ``stat`` as a list of strings.

    Raises:
        TypeError: if ``stat`` is not a ``Statistic`` instance.
    """
    if not isinstance(stat, Statistic):
        raise TypeError('Expected Statistic class')
    return [str(year) for year in stat.get_years()]
def extract_general(stat):
    """Return name statistics over all years and both sexes.

    Raises:
        TypeError: if ``stat`` is not a ``Statistic`` instance.
    """
    if not isinstance(stat, Statistic):
        raise TypeError('Expected Statistic class')
    return stat.get_name_statistic()
def extract_general_male(stat):
    """Return male name statistics over all years.

    Raises:
        TypeError: if ``stat`` is not a ``Statistic`` instance.
    """
    if not isinstance(stat, Statistic):
        raise TypeError('Expected Statistic class')
    wanted_sex = Sex.MALE
    return stat.get_name_statistic(sex=wanted_sex)
def extract_general_female(stat):
    """Return female name statistics over all years.

    Raises:
        TypeError: if ``stat`` is not a ``Statistic`` instance.
    """
    if not isinstance(stat, Statistic):
        raise TypeError('Expected Statistic class')
    wanted_sex = Sex.FEMALE
    return stat.get_name_statistic(sex=wanted_sex)
def extract_year(stat, year):
    """Return name statistics of both sexes for the given ``year``.

    Raises:
        TypeError: if ``stat`` is not a ``Statistic`` instance.
    """
    if not isinstance(stat, Statistic):
        raise TypeError('Expected Statistic class')
    return stat.get_name_statistic(year=year)
def extract_year_male(stat, year):
    """Return male name statistics for the given ``year``.

    Raises:
        TypeError: if ``stat`` is not a ``Statistic`` instance.
    """
    if not isinstance(stat, Statistic):
        raise TypeError('Expected Statistic class')
    wanted_sex = Sex.MALE
    return stat.get_name_statistic(year=year, sex=wanted_sex)
def extract_year_female(stat, year):
    """Return female name statistics for the given ``year``.

    Raises:
        TypeError: if ``stat`` is not a ``Statistic`` instance.
    """
    if not isinstance(stat, Statistic):
        raise TypeError('Expected Statistic class')
    wanted_sex = Sex.FEMALE
    return stat.get_name_statistic(year=year, sex=wanted_sex)
if __name__ == "__main__":
    # Demo run: parse 'home.html' and print the years and overall statistics.
    stat = make_stat('home.html')
    for label, extractor in (('Years:', extract_years),
                             ('General:', extract_general)):
        print(label, extractor(stat))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment