Created
May 16, 2025 20:00
-
-
Save antonl/baa260dbea1f4c28c8bd53e5226fda52 to your computer and use it in GitHub Desktop.
Webcrawler solutions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ThreadPool version
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
import time


class Solution:
    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> List[str]:
        """Crawl every URL reachable from startUrl on the same hostname.

        Fetches are fanned out to a ThreadPoolExecutor; each completed
        fetch schedules its unseen same-domain links via a done-callback.
        Returns the list of all discovered URLs (including startUrl),
        in no particular order.
        """
        # "http://host/path" -> keep only "http://host" for the same-domain test.
        domain, *_ = startUrl[7:].split('/')
        domain = "http://" + domain

        def retrieve(item: str) -> tuple[str, list[str]]:
            # Runs in a worker thread: fetch one page's out-links.
            return item, htmlParser.getUrls(item)

        with ThreadPoolExecutor(max_workers=16) as pool:
            lock = Lock()            # guards seen and active_futures
            seen = {startUrl}        # every URL ever scheduled
            active_futures = {}      # url -> in-flight future

            def on_complete(future):
                # Done-callbacks run concurrently in worker threads, so all
                # shared-state mutation happens under the lock.
                url, links = future.result()
                pending = []
                with lock:
                    for next_ in links:
                        if next_ in seen or domain not in next_:
                            continue
                        seen.add(next_)
                        f = pool.submit(retrieve, next_)
                        # Register BEFORE the callback can run: otherwise a
                        # fast future pops an entry that was never added and
                        # the busy-wait below never terminates.
                        active_futures[next_] = f
                        pending.append(f)
                    # New entries are added before this pop, so the map never
                    # looks empty while work is still outstanding.
                    active_futures.pop(url, None)
                # Attach callbacks outside the lock: an already-finished
                # future runs its callback inline in this thread, which
                # would deadlock on the held (non-reentrant) Lock.
                for f in pending:
                    f.add_done_callback(on_complete)

            f = pool.submit(retrieve, startUrl)
            active_futures[startUrl] = f     # register before attaching
            f.add_done_callback(on_complete)

            # Spin until no fetch is in flight; an empty map means done.
            while active_futures:
                time.sleep(0.001)
        return list(seen)
# Thread version
from threading import Thread, Lock, Event
from queue import Queue, Empty


class Solution:
    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> List[str]:
        """Crawl every URL reachable from startUrl on the same hostname.

        A fixed pool of 10 daemon worker threads pulls URLs from a Queue;
        Queue.join() blocks until every enqueued URL has been processed.
        Returns the list of all discovered URLs (including startUrl),
        in no particular order.
        """
        # "http://host/path" -> keep only "http://host" for the same-domain test.
        domain, *_ = startUrl[7:].split('/')
        domain = "http://" + domain

        work_queue = Queue()     # URLs waiting to be fetched
        seen = set()             # every URL ever enqueued (guarded by seen_lock)
        seen_lock = Lock()
        should_exit = Event()    # signals workers to stop once the crawl is done

        def retrieve():
            # Worker loop: fetch one URL at a time until told to exit.
            while not should_exit.is_set():
                try:
                    # Timed get so idle workers periodically re-check the
                    # exit flag instead of blocking forever.
                    url = work_queue.get(timeout=0.005)
                except Empty:
                    continue
                try:
                    links = htmlParser.getUrls(url)
                    with seen_lock:
                        for next_ in links:
                            if domain in next_ and next_ not in seen:
                                work_queue.put_nowait(next_)
                                seen.add(next_)
                finally:
                    # Always account for the item, even if getUrls raises;
                    # otherwise work_queue.join() would hang forever.
                    work_queue.task_done()

        work_queue.put(startUrl)
        seen.add(startUrl)       # no lock needed: workers not started yet
        for _ in range(10):
            Thread(target=retrieve, daemon=True).start()
        work_queue.join()        # returns once every URL has been task_done()'d
        should_exit.set()
        return list(seen)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment