Skip to content

Instantly share code, notes, and snippets.

@wiesty
Last active June 7, 2025 12:50
Show Gist options
  • Select an option

  • Save wiesty/5bebfcd5482c8a0c6a55244263c5a77b to your computer and use it in GitHub Desktop.

Select an option

Save wiesty/5bebfcd5482c8a0c6a55244263c5a77b to your computer and use it in GitHub Desktop.
Extract text from any page (and its same-host subpages); note that the crawler does not consult robots.txt
from playwright.sync_api import sync_playwright
from urllib.parse import urljoin, urlparse
import os
import time
# === config ===
# First URL to crawl; its host also decides which links are followed.
START_URL = "https://www.example.com/somepath"
# Pause between page requests, in seconds (passed to time.sleep()).
WAIT_BETWEEN_REQUESTS = 2.5
# Text file that receives the extracted page contents; it is opened in "w"
# mode, so an existing file is overwritten on every run.
OUTPUT_FILE = "output1.txt"
# === init ===
# URLs that have already been handled and must not be fetched again.
visited = set()
# Pending URLs; entries are taken from the front and new links are appended
# at the back, so the crawl proceeds breadth-first.
to_visit = [START_URL]
# Page texts already written, used to skip pages whose text is identical.
extracted_texts = set()
# Host part (netloc) of START_URL; only links on this exact host are queued.
START_DOMAIN = urlparse(START_URL).netloc
# === func ===
def extract_text(page, wait_ms=1500):
    """Return the text of the page's ``<body>`` element.

    Args:
        page: Playwright page (sync API) that has already been navigated.
        wait_ms: Milliseconds to pause before reading the text, giving the
            page time to settle (presumably for script-rendered content).
            ``None`` or a value <= 0 skips the pause entirely.

    Returns:
        Whatever ``page.inner_text("body")`` returns for the current page.
    """
    if wait_ms and wait_ms > 0:
        page.wait_for_timeout(wait_ms)
    return page.inner_text("body")
def extract_links(page, current_url, domain=None):
    """Collect the crawlable same-host links found on the current page.

    Every ``<a href>`` is resolved against ``current_url``. Query strings
    and fragments are removed so that ``/a``, ``/a?x=1`` and ``/a#top`` all
    map to the same URL and are not crawled more than once.

    Args:
        page: Playwright page (sync API) showing ``current_url``.
        current_url: URL of the page, used as the base for relative hrefs.
        domain: Host (netloc) a link must have to be kept. Defaults to the
            module-level ``START_DOMAIN``.

    Returns:
        A set of absolute http(s) URLs on ``domain``, without query string
        or fragment.
    """
    if domain is None:
        domain = START_DOMAIN
    links = set()
    for anchor in page.query_selector_all("a[href]"):
        href = anchor.get_attribute("href")
        if not href:
            continue
        try:
            parsed = urlparse(urljoin(current_url, href.strip()))
        except ValueError:
            # A malformed href (e.g. a broken IPv6 literal) must not abort
            # link extraction for the rest of the page.
            continue
        # mailto:, javascript:, tel: and similar are not pages to visit.
        if parsed.scheme not in ("http", "https"):
            continue
        if parsed.netloc != domain:
            continue
        links.add(parsed._replace(query="", fragment="").geturl())
    return links
# === Main program ===
# Breadth-first crawl starting at START_URL: every reachable page on
# START_DOMAIN is opened in headless Chromium and its body text is written
# to OUTPUT_FILE, skipping pages whose text has already been written.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    try:
        context = browser.new_context(
            user_agent=(
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            ),
            locale="en-US",
            viewport={"width": 1280, "height": 800},
            timezone_id="Europe/Berlin",
        )
        page = context.new_page()
        # Every URL that was ever queued; keeps to_visit free of duplicates
        # so the queue does not grow with each page linking to the same URL.
        seen = visited | set(to_visit)
        with open(OUTPUT_FILE, "w", encoding="utf-8") as outfile:
            while to_visit:
                current_url = to_visit.pop(0)
                if current_url in visited:
                    continue
                # Mark the URL before the attempt so that a page which fails
                # to load is not retried when it shows up in the queue again.
                visited.add(current_url)
                try:
                    print(f"[+] Visiting: {current_url}")
                    page.goto(current_url, timeout=20000)
                    text = extract_text(page).strip()
                    if text and text not in extracted_texts:
                        outfile.write(f"\n--- Content from: {current_url} ---\n\n")
                        outfile.write(text + "\n")
                        extracted_texts.add(text)
                    # Sorted so the crawl order is reproducible between runs.
                    for link in sorted(extract_links(page, current_url)):
                        if link not in seen:
                            seen.add(link)
                            to_visit.append(link)
                except Exception as e:
                    # Best effort: report the failure and continue crawling.
                    print(f"[!] Error visiting {current_url}: {e}")
                # Throttle after failures as well, but not after the last page.
                if to_visit:
                    time.sleep(WAIT_BETWEEN_REQUESTS)
    finally:
        # Close the browser even if the crawl is interrupted (e.g. Ctrl-C).
        browser.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment