Created
October 12, 2022 13:04
-
-
Save bitcoineazy/a6cad941b5d10b3f8dfe38b706743dd9 to your computer and use it in GitHub Desktop.
wiki_parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def _fetch_soup(url):
    """Download *url* and parse it with lxml, closing the HTTP response afterwards."""
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response, 'lxml')


# Scrape, for each (name, link) pair in ``all_es``, the pages linked from the
# "mw-pages" block whose title contains "Городские", and collect the cells of
# the first ``table.standard`` found on each of those pages.
#
# Result: ``all_es_prop`` is a list of ``(name, {"Города": es_pr})`` tuples.
all_es_prop = []

# NOTE(review): only the first three entries of all_es are processed; this
# looks like a debugging limit -- confirm before relying on the output.
for n, lnk in all_es[:3]:
    es = _fetch_soup(lnk)
    es_pr = {}
    all_cities_in_group = []
    group_cities_links = []

    mw_pages = es.find("div", {"id": "mw-pages"})
    if mw_pages is not None:
        for each_a in mw_pages.find_all("a"):
            # Anchors without a title attribute are ignored instead of raising KeyError.
            title = each_a.get("title", "")
            if "Городские" in title:
                group_cities_links.append(f"https://ru.wikipedia.org{each_a['href']}")
                all_cities_in_group.append(title)

    for link in group_cities_links:
        cities_table = _fetch_soup(link).find("table", class_="standard")
        if cities_table is None:
            continue

        # Headers are collected per table, so a cell is always labelled with a
        # header of its own table rather than one left over from an earlier page.
        saved_th = []
        for row in cities_table.find_all("tr"):
            saved_th.extend(th.text.strip() for th in row.find_all("th"))

            for col, td in enumerate(row.find_all("td")):
                if col >= len(saved_th):
                    # No header is known for this column; skip the cell
                    # instead of failing with IndexError.
                    continue

                # NOTE(review): the image URL is extracted but not stored
                # anywhere yet -- presumably unfinished work.
                for each_img in td.find_all("a", class_="image"):
                    for each_src in each_img.find_all("img"):
                        image_link = each_src["src"]

                header = saved_th[col]
                text = td.text.strip()
                # Every cell becomes its own entry: cells of the "№" column are
                # keyed by their text, all other cells by the current dict size.
                # NOTE(review): equal "№" values from different tables overwrite
                # each other, and cells of one row are not grouped together --
                # confirm this is the intended shape.
                key = text if header == "№" else len(es_pr)
                es_pr[key] = {header: text}

    all_es_prop.append((n, {"Города": es_pr}))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment