@me-suzy
Created November 29, 2025 06:48
teleport generic.py
#!/usr/bin/env python3
"""
Generic PDF downloader for 455464.ro
Works with any publication page.
Usage:
python download_biblioteca_digitala.py "https://455464.ro/?pub=7758-revista-romana-de-sociologie"
python download_biblioteca_digitala.py "https://455464.ro/?pub=6464-studii-si-cercetari-de-chimie"
"""
import requests
from bs4 import BeautifulSoup
import os
import re
import sys
import time
from urllib.parse import urljoin, urlparse, parse_qs, unquote
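# Note: 'requests' and 'beautifulsoup4' are third-party packages and must be
# installed first (e.g. pip install requests beautifulsoup4); the remaining
# imports above are from the Python standard library.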

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'ro-RO,ro;q=0.9,en;q=0.8',
}

def get_soup(url, session):
    """Fetch page and return BeautifulSoup object"""
    try:
        response = session.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except Exception as e:
        print(f" [ERROR] Failed to fetch {url}: {e}")
        return None

def sanitize_filename(name):
    """Clean filename for filesystem"""
    name = re.sub(r'[<>:"/\\|?*]', '_', name)
    name = re.sub(r'\s+', '_', name)
    name = name.strip('._')
    return name[:200]

def extract_filename_from_url(url):
    """Extract filename from dl.asp?filename=... URL or from path"""
    parsed = urlparse(url)
    # Try to get from query parameter 'filename'
    query_params = parse_qs(parsed.query)
    if 'filename' in query_params:
        filename = query_params['filename'][0]
        return unquote(filename)
    # The onclick attribute (track_pdf URL) is handled separately
    # Fallback: get from path
    path = parsed.path
    if path:
        filename = os.path.basename(path)
        if filename and filename.lower() != 'dl.asp':
            return unquote(filename)
    return None

def extract_filename_from_response(response):
    """Extract filename from Content-Disposition header"""
    cd = response.headers.get('Content-Disposition', '')
    if cd:
        # Try filename* (RFC 5987) first, so the plain filename pattern below
        # does not match "filename*=UTF-8''..." and return the charset instead
        match = re.search(r"filename\*=(?:UTF-8''|utf-8'')([^;\n]+)", cd, re.IGNORECASE)
        if match:
            filename = unquote(match.group(1))
            if filename and filename.lower() != 'dl.asp':
                return filename
        # Try standard filename
        match = re.search(r'filename[^;=\n]*=([\'"]?)([^\'";\n]+)\1', cd, re.IGNORECASE)
        if match:
            filename = unquote(match.group(2))
            if filename and filename.lower() != 'dl.asp' and not filename.lower().startswith('dl.asp'):
                return filename
    return None
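
# Illustrative header values (hypothetical, not captured from the site) that the
# two patterns above are meant to handle:
#   Content-Disposition: attachment; filename="revista_1_2024.pdf"
#   Content-Disposition: attachment; filename*=UTF-8''revista%201%202024.pdf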

def get_publication_title(soup):
    """Extract publication title from page"""
    h2 = soup.find('h2', class_='text-color-light')
    if h2:
        return h2.get_text(strip=True)
    return "Unknown_Publication"

def get_pdf_links_from_page(soup, base_url):
    """Extract all PDF download links from page"""
    pdf_links = []
    # Find download links in the table
    table = soup.find('table', {'id': 'datatable-default'})
    if not table:
        print("[WARNING] Could not find data table, searching entire page...")
        search_area = soup
    else:
        search_area = table
    for link in search_area.find_all('a', href=True):
        href = link['href']
        # Check for PDF links (dl.asp?filename=... or direct .pdf)
        if 'dl.asp' in href.lower() or '.pdf' in href.lower():
            # PRIORITY: Check onclick for track_pdf which has the DIRECT PDF URL
            onclick = link.get('onclick', '')
            if 'track_pdf' in onclick:
                # Extract URL from track_pdf('...')
                match = re.search(r"track_pdf\(['\"]([^'\"]+)['\"]", onclick)
                if match:
                    direct_pdf_url = match.group(1)
                    # This is the real PDF URL, not the dl.asp wrapper
                    filename = os.path.basename(urlparse(direct_pdf_url).path)
                    pdf_links.append((direct_pdf_url, filename))
                    continue
            # Fallback: use href (dl.asp URL)
            full_url = urljoin(base_url, href)
            filename = extract_filename_from_url(full_url)
            pdf_links.append((full_url, filename))
    return pdf_links
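
# Illustrative anchor shape (hypothetical, assumed from the track_pdf regex above):
#   <a href="dl.asp?filename=vol1.pdf" onclick="track_pdf('https://455464.ro/pdf/vol1.pdf')">
# would be collected as ('https://455464.ro/pdf/vol1.pdf', 'vol1.pdf').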

def get_volume_links(soup, base_url):
    """Extract volume links if PDFs are not directly on publication page"""
    volume_links = []
    table = soup.find('table', {'id': 'datatable-default'})
    if not table:
        return volume_links
    for link in table.find_all('a', href=True):
        href = link['href']
        if 'volum=' in href:
            full_url = urljoin(base_url, href)
            volume_name = link.get_text(strip=True)
            volume_links.append((full_url, volume_name))
    return volume_links
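
# Volume pages are recognized purely by a 'volum=' fragment in the href; the
# link text is reused as the volume name (and as a fallback filename in main).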

def download_pdf(url, output_dir, suggested_filename, session):
    """Download PDF file"""
    try:
        response = session.get(url, headers=HEADERS, timeout=60, stream=True)
        response.raise_for_status()
        # Determine filename: URL param > Content-Disposition > suggested
        filename = extract_filename_from_url(url)
        if not filename or filename.lower() == 'dl.asp':
            filename = extract_filename_from_response(response)
        if not filename:
            filename = suggested_filename
        if not filename:
            filename = f"download_{int(time.time())}.pdf"
        if not filename.lower().endswith('.pdf'):
            filename += '.pdf'
        filename = sanitize_filename(filename)
        output_path = os.path.join(output_dir, filename)
        # Check if already exists
        if os.path.exists(output_path):
            print(f" [SKIP] Already exists: {filename}")
            return True, filename
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return True, filename
    except Exception as e:
        print(f" [ERROR] Download failed: {e}")
        return False, None

def main():
    if len(sys.argv) >= 2:
        pub_url = sys.argv[1]
    else:
        print("=" * 70)
        print("Biblioteca Digitală - PDF Downloader")
        print("=" * 70)
        print("\nExample URLs:")
        print(" https://455464.ro/?pub=7758-revista-romana-de-sociologie")
        print(" https://455464.ro/?pub=6464-studii-si-cercetari-de-chimie")
        print()
        pub_url = input("Enter the publication URL: ").strip()
        if not pub_url:
            print("[ERROR] No URL entered!")
            sys.exit(1)
print("=" * 70)
print("Biblioteca Digitală - PDF Downloader")
print("=" * 70)
print(f"Publication URL: {pub_url}")
session = requests.Session()
# Step 1: Get publication page
print(f"\n[1] Fetching publication page...")
soup = get_soup(pub_url, session)
if not soup:
print("[FATAL] Could not load publication page")
return
pub_title = get_publication_title(soup)
print(f" Publication: {pub_title}")
# Create output directory
output_dir = sanitize_filename(pub_title) + "_PDFs"
os.makedirs(output_dir, exist_ok=True)
print(f" Output directory: {output_dir}")
# Step 2: Check if PDFs are directly on page or need to go to volumes
pdf_links = get_pdf_links_from_page(soup, pub_url)
total_pdfs = 0
if pdf_links:
# PDFs are directly on the publication page
print(f"\n[2] Found {len(pdf_links)} PDF links directly on page")
for i, (pdf_url, suggested_name) in enumerate(pdf_links, 1):
print(f"\n [{i}/{len(pdf_links)}] Downloading...")
success, filename = download_pdf(pdf_url, output_dir, suggested_name, session)
if success and filename:
print(f" Saved as: {filename}")
total_pdfs += 1
time.sleep(0.5)
else:
# Need to check volume pages for PDFs
print(f"\n[2] No direct PDF links found, checking volume pages...")
volume_links = get_volume_links(soup, pub_url)
print(f" Found {len(volume_links)} volumes")
for i, (vol_url, vol_name) in enumerate(volume_links, 1):
print(f"\n[{i}/{len(volume_links)}] Processing: {vol_name}")
vol_soup = get_soup(vol_url, session)
if not vol_soup:
continue
vol_pdf_links = get_pdf_links_from_page(vol_soup, vol_url)
for pdf_url, suggested_name in vol_pdf_links:
if not suggested_name:
suggested_name = f"{sanitize_filename(vol_name)}.pdf"
print(f" Downloading: {suggested_name[:50]}...")
success, filename = download_pdf(pdf_url, output_dir, suggested_name, session)
if success and filename:
print(f" Saved as: {filename}")
total_pdfs += 1
time.sleep(0.5)
time.sleep(1)
# Summary
print("\n" + "=" * 70)
print("DOWNLOAD COMPLETE")
print("=" * 70)
print(f"Total PDFs downloaded: {total_pdfs}")
print(f"Output directory: {os.path.abspath(output_dir)}")
if __name__ == "__main__":
main()