Skip to content

Instantly share code, notes, and snippets.

@sushmithapopuri
Created June 2, 2024 12:58
Show Gist options
  • Select an option

  • Save sushmithapopuri/46a95fcd93c1306355be37babb3730df to your computer and use it in GitHub Desktop.

Select an option

Save sushmithapopuri/46a95fcd93c1306355be37babb3730df to your computer and use it in GitHub Desktop.
import json
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
def get_subpage_links(l):
    """Crawl every not-yet-visited link in *l* and merge in what was found.

    Args:
        l: Dict mapping each known URL to its crawl status, either
            "Checked" or "Not-checked". It is read but never modified.

    Returns:
        A new dict holding every URL from *l* plus every URL that
        get_links() returned for the pages crawled in this pass. URLs that
        were "Not-checked" in *l* are "Checked" in the result; URLs that
        are new in this pass keep the status get_links() gave them.
    """
    # Work on a copy so the caller's dict is left untouched.
    statuses = dict(l)
    discovered = []
    for link in tqdm(l):
        # Only pages that have not been crawled yet are fetched.
        if statuses[link] == "Not-checked":
            discovered.append(get_links(link))
            statuses[link] = "Checked"

    # Merge once after the loop instead of rebuilding the whole dict for
    # every link. Walking the finds newest-first keeps the established key
    # order (latest finds in front, already-known URLs last), and letting
    # earlier finds and known statuses overwrite keeps the established
    # values.
    merged = {}
    for found in reversed(discovered):
        merged.update(found)
    merged.update(statuses)
    return merged
# Root-relative hrefs (those starting with "/") that get_links() has already
# handled, shared across all calls; only the keys matter, every value is None.
dict_href_links = {}
def get_links(website_link):
    """Collect the links on one page that belong to the crawled site.

    Two kinds of ``<a href>`` values are kept:

    * absolute URLs that start with *website_link* itself, and
    * root-relative paths (starting with "/"), resolved against
      *website_link*. Each distinct path is handled only once per run,
      tracked in the module-level ``dict_href_links``.

    Args:
        website_link: URL of the page to download and scan.

    Returns:
        Dict mapping every link found to the status "Not-checked".
    """
    html_data = getdata(website_link)
    soup = BeautifulSoup(html_data, "html.parser")
    page_url = str(website_link)
    page_host = urlsplit(page_url).netloc
    list_links = []
    for anchor in soup.find_all("a", href=True):
        href = str(anchor["href"])
        if href.startswith(page_url):
            # Absolute link located underneath the current page URL.
            list_links.append(href)
        elif href.startswith("/"):
            if href in dict_href_links:
                continue
            print(href)
            dict_href_links[href] = None
            # Resolve against the page URL rather than concatenating:
            # on a subpage, "/x" must become <site>/x, not <page>/x.
            link_with_www = urljoin(page_url, href)
            # A protocol-relative href ("//other.host/...") also starts
            # with "/" but points at another host; keep the crawl on-site.
            if urlsplit(link_with_www).netloc != page_host:
                continue
            print("adjusted link =", link_with_www)
            list_links.append(link_with_www)
    # Keys are the links (duplicates collapse); every value is "Not-checked".
    return dict.fromkeys(list_links, "Not-checked")
def getdata(url, timeout=30, verify=False):
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            single unresponsive host cannot stall the crawl indefinitely.
        verify: Forwarded to ``requests.get``. The default ``False`` keeps
            the previous behaviour of skipping TLS certificate verification.

    Returns:
        The decoded response body (``Response.text``). The HTTP status code
        is not inspected.

    Raises:
        requests.exceptions.RequestException: If the request fails, e.g. a
            ``Timeout`` once *timeout* is exceeded.
    """
    # NOTE(security): verify=False accepts any certificate, so responses can
    # be spoofed by a man-in-the-middle. Pass verify=True unless the target
    # site is known to have a broken certificate chain.
    r = requests.get(url, timeout=timeout, verify=verify)
    return r.text
website = "https://unt.edu"
# Every known URL mapped to its crawl status, seeded with the start page.
dict_links = {website: "Not-checked"}
# counter: number of links still "Not-checked" (None until a pass has run).
# counter2: number of crawl passes started so far.
counter, counter2 = None, 0
while counter != 0:
    counter2 += 1
    dict_links2 = get_subpage_links(dict_links)
    # Recount the pending links after each pass; this is what lets the loop
    # end once every discovered page has been crawled.
    counter = sum(status == "Not-checked" for status in dict_links2.values())
    dict_links = dict_links2
    # Save after every pass so partial results survive an interrupted crawl;
    # the context manager closes the file even if json.dump raises.
    with open("data.json", "w", encoding="utf-8") as a_file:
        json.dump(dict_links, a_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment