Skip to content

Instantly share code, notes, and snippets.

@antonl
Created May 16, 2025 20:00
Show Gist options
  • Select an option

  • Save antonl/baa260dbea1f4c28c8bd53e5226fda52 to your computer and use it in GitHub Desktop.

Select an option

Save antonl/baa260dbea1f4c28c8bd53e5226fda52 to your computer and use it in GitHub Desktop.
Webcrawler solutions
# ThreadPool version
from concurrent.futures import ThreadPoolExecutor
import time
class Solution:
    """Web crawler that fetches pages concurrently with a thread pool."""

    @staticmethod
    def _origin(url: str) -> str:
        """Return the ``scheme://host`` prefix of *url*.

        The host is taken to be everything between ``://`` and the first
        following ``/`` (or the end of the string when there is no path).
        """
        scheme, _, rest = url.partition("://")
        return scheme + "://" + rest.split("/", 1)[0]

    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> list[str]:
        """Return every URL reachable from *startUrl* on the same host.

        Args:
            startUrl: URL the crawl starts from; it is always part of the
                result.
            htmlParser: Object exposing ``getUrls(url)``, which returns the
                links found on the page at ``url``.

        Returns:
            The visited URLs, in no particular order.

        Raises:
            Exception: Whatever ``htmlParser.getUrls`` raises is propagated
                to the caller instead of being swallowed.
        """
        # Local import: the file header only imports ThreadPoolExecutor.
        from concurrent.futures import FIRST_COMPLETED, wait

        origin = self._origin(startUrl)
        seen = {startUrl}

        with ThreadPoolExecutor(max_workers=16) as pool:
            pending = {pool.submit(htmlParser.getUrls, startUrl)}
            # All bookkeeping happens on the calling thread, so `seen` and
            # `pending` need no lock and there is no done-callback that could
            # fire before the future has been registered.
            while pending:
                done, pending = wait(pending, return_when=FIRST_COMPLETED)
                for future in done:
                    # .result() re-raises a failure from getUrls here rather
                    # than leaving the crawl waiting forever.
                    for link in future.result():
                        if link in seen:
                            continue
                        # Exact scheme+host comparison: a substring test would
                        # also accept "http://host.evil.org/..." or URLs that
                        # merely mention the host in a query string.
                        if self._origin(link) != origin:
                            continue
                        seen.add(link)
                        pending.add(pool.submit(htmlParser.getUrls, link))

        return list(seen)
# Thread version
from threading import Thread, Lock, Event
from queue import Queue
class Solution:
    """Web crawler that fetches pages with a fixed set of worker threads."""

    @staticmethod
    def _origin(url: str) -> str:
        """Return the ``scheme://host`` prefix of *url*.

        The host is taken to be everything between ``://`` and the first
        following ``/`` (or the end of the string when there is no path).
        """
        scheme, _, rest = url.partition("://")
        return scheme + "://" + rest.split("/", 1)[0]

    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> list[str]:
        """Return every URL reachable from *startUrl* on the same host.

        Args:
            startUrl: URL the crawl starts from; it is always part of the
                result.
            htmlParser: Object exposing ``getUrls(url)``, which returns the
                links found on the page at ``url``.

        Returns:
            The visited URLs, in no particular order.

        Raises:
            Exception: The first exception raised by ``htmlParser.getUrls``
                in a worker is re-raised once the queue has drained.
        """
        origin = self._origin(startUrl)
        work_queue = Queue()
        seen = {startUrl}
        state_lock = Lock()  # guards `seen` and `errors`
        errors = []

        def worker():
            # A None item is the shutdown sentinel; iter() stops on it.
            for url in iter(work_queue.get, None):
                try:
                    links = htmlParser.getUrls(url)
                    with state_lock:
                        for link in links:
                            # Exact scheme+host comparison: a substring test
                            # would also accept "http://host.evil.org/...".
                            if link in seen or self._origin(link) != origin:
                                continue
                            seen.add(link)
                            work_queue.put_nowait(link)
                except Exception as exc:
                    with state_lock:
                        errors.append(exc)
                finally:
                    # Always acknowledge the item, otherwise a failing page
                    # would leave work_queue.join() blocked forever.
                    work_queue.task_done()

        work_queue.put(startUrl)
        threads = [Thread(target=worker, daemon=True) for _ in range(10)]
        for thread in threads:
            thread.start()

        # join() returns once every queued URL has been processed; new URLs
        # are enqueued before their parent is acknowledged, so the count
        # cannot reach zero early.
        work_queue.join()

        for _ in threads:
            work_queue.put(None)
        for thread in threads:
            thread.join()

        if errors:
            raise errors[0]
        return list(seen)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment