Skip to content

Instantly share code, notes, and snippets.

@bitcoineazy
Created October 12, 2022 13:04
Show Gist options
  • Select an option

  • Save bitcoineazy/a6cad941b5d10b3f8dfe38b706743dd9 to your computer and use it in GitHub Desktop.

Select an option

Save bitcoineazy/a6cad941b5d10b3f8dfe38b706743dd9 to your computer and use it in GitHub Desktop.
wiki_parser
# Scrape Russian Wikipedia category pages and collect the cell texts of the
# "standard" table found on each linked sub-page.
#
# Requires, from earlier in the file/notebook:
#   * `all_es` - a sequence whose items unpack into two values; the second is
#     passed to `urlopen` (presumably (name, url) pairs - verify at the
#     definition site).
#   * `urllib.request` and `BeautifulSoup` already imported.
#
# Result: `all_es_prop` is a list with one `(n, {"Города": es_pr})` tuple per
# processed entry, where `es_pr` maps a key to a single `{header: cell_text}`
# dict (see the key rule inside the row loop).
all_es_prop = []

# Only the first three entries of `all_es` are processed.
for n, lnk in all_es[:3]:
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(lnk) as response:
        es = BeautifulSoup(response.read(), 'lxml')

    es_pr = {}
    all_cities_in_group = []
    group_cities_links = []

    # The "mw-pages" div holds the list of member pages of the category.
    mw_pages = es.find("div", {"id": "mw-pages"})
    if mw_pages is not None:
        for each_a in mw_pages.find_all("a"):
            # Anchors without a title/href are skipped rather than raising
            # KeyError.
            title = each_a.get("title", "")
            href = each_a.get("href")
            if href and "Городские" in title:
                group_cities_links.append(f"https://ru.wikipedia.org{href}")
                all_cities_in_group.append(title)

    for link in group_cities_links:
        with urllib.request.urlopen(link) as response:
            cs = BeautifulSoup(response.read(), 'lxml')

        cities_table = cs.find("table", class_="standard")
        if cities_table is None:
            continue

        # Headers are tracked per table so that cells of a later table are
        # never labelled with the headers of an earlier one.
        saved_th = []
        for row in cities_table.find_all("tr"):
            saved_th.extend(th.text.strip() for th in row.find_all("th"))

            # Cells are paired with headers by position; zip() stops at the
            # shorter side, so a cell with no matching header is skipped
            # instead of raising IndexError.
            # NOTE(review): image links inside cells (<a class="image">) are
            # not stored; only the cell text is recorded.
            for header, td in zip(saved_th, row.find_all("td")):
                value = td.text.strip()
                # Cells of the "№" column are keyed by their own text; every
                # other cell gets the current size of es_pr as its key.
                key = value if header == "№" else len(es_pr)
                es_pr[key] = {header: value}

    all_es_prop.append((n, {"Города": es_pr}))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment