Created
June 2, 2024 12:58
-
-
Save sushmithapopuri/46a95fcd93c1306355be37babb3730df to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
| from bs4 import BeautifulSoup | |
| from tqdm import tqdm | |
| import json | |
def get_subpage_links(l):
    """Crawl every link in *l* that has not been visited yet.

    Parameters
    ----------
    l : dict
        Maps link URL -> "Not-checked" or "Checked".

    Returns
    -------
    dict
        The input mapping with crawled links flipped to "Checked" and any
        newly discovered links added with the value "Not-checked".
    """
    # Accumulate newly discovered links here and merge ONCE after the loop.
    # (The original rebound the name being iterated with
    # `l = {**dict_links_subpages, **l}` on every iteration — an O(n) dict
    # copy per link and a fragile iterate-while-rebinding pattern.)
    discovered = {}
    for link in tqdm(l):
        # Only crawl pages we have not visited before.
        if l[link] == "Not-checked":
            discovered.update(get_links(link))
            # Mark as visited so the next pass skips it.
            l[link] = "Checked"
    # Existing entries win the merge, so links that are already "Checked"
    # keep their status even when rediscovered on a sub-page.
    return {**discovered, **l}
# Module-level memo of root-relative hrefs already expanded by get_links().
# Values are always None — only key membership matters (it acts as a set),
# ensuring each "/path" link is printed and expanded at most once per run.
dict_href_links = {}
def get_links(website_link):
    """Collect same-site links from the page at *website_link*.

    Absolute hrefs that start with *website_link* are kept unchanged.
    Root-relative hrefs ("/...") are prefixed with *website_link* and
    recorded in the module-level ``dict_href_links`` memo so each one is
    expanded (and printed) only once across all calls.

    Returns a dict mapping every collected URL to the string "Not-checked".
    """
    page_source = getdata(website_link)
    parsed = BeautifulSoup(page_source, "html.parser")
    found = []
    for anchor in parsed.find_all("a", href=True):
        href_text = str(anchor["href"])
        # Same-site absolute link: keep as-is.
        if href_text.startswith(str(website_link)):
            found.append(anchor["href"])
        # Root-relative link: expand it once, remembering it globally.
        if href_text.startswith("/"):
            if anchor["href"] not in dict_href_links:
                print(anchor["href"])
                dict_href_links[anchor["href"]] = None
                link_with_www = website_link + anchor["href"]
                print("adjusted link =", link_with_www)
                found.append(link_with_www)
    # Every freshly collected URL starts out unvisited.
    return dict.fromkeys(found, "Not-checked")
def getdata(url, timeout=30):
    """Fetch *url* over HTTP(S) and return the response body as text.

    Parameters
    ----------
    url : str
        Address to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        The original call passed no timeout, so one stalled server could
        hang the whole crawl forever.

    Returns
    -------
    str
        Decoded response body.
    """
    # NOTE(review): verify=False disables TLS certificate verification,
    # which exposes the crawler to man-in-the-middle responses.  Kept for
    # backward compatibility with the original; prefer verify=True.
    r = requests.get(url, verify=False, timeout=timeout)
    return r.text
website = "https://unt.edu"
# Seed the crawl frontier: each key maps to "Not-checked" or "Checked".
dict_links = {website: "Not-checked"}

# counter  -> number of links still to crawl (None forces the first pass)
# counter2 -> number of completed crawl passes
counter, counter2 = None, 0
while counter != 0:
    counter2 += 1
    dict_links = get_subpage_links(dict_links)
    # Recompute how many links remain unvisited; the loop terminates once
    # every link is "Checked".  (BUG FIX: the original never updated
    # `counter`, so `while counter != 0` looped forever.)
    counter = sum(1 for status in dict_links.values() if status == "Not-checked")

# Persist the crawl result; the context manager guarantees the file is
# closed even if json.dump raises.
with open("data.json", "w") as a_file:
    json.dump(dict_links, a_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment