teleport generic.py
#!/usr/bin/env python3
"""
Generic PDF downloader for 455464.ro
Works with any publication page.

Usage:
    python download_biblioteca_digitala.py "https://455464.ro/?pub=7758-revista-romana-de-sociologie"
    python download_biblioteca_digitala.py "https://455464.ro/?pub=6464-studii-si-cercetari-de-chimie"
"""
import requests
from bs4 import BeautifulSoup
import os
import re
import sys
import time
from urllib.parse import urljoin, urlparse, parse_qs, unquote

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'ro-RO,ro;q=0.9,en;q=0.8',
}


def get_soup(url, session):
    """Fetch page and return BeautifulSoup object"""
    try:
        response = session.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except Exception as e:
        print(f" [ERROR] Failed to fetch {url}: {e}")
        return None


def sanitize_filename(name):
    """Clean filename for filesystem"""
    name = re.sub(r'[<>:"/\\|?*]', '_', name)
    name = re.sub(r'\s+', '_', name)
    name = name.strip('._')
    return name[:200]
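
# Illustrative example (hypothetical title, not taken from the site):
#   sanitize_filename('Studii si cercetari: vol. I/II') -> 'Studii_si_cercetari__vol._I_II'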


def extract_filename_from_url(url):
    """Extract filename from dl.asp?filename=... URL or from path"""
    parsed = urlparse(url)

    # Try to get from query parameter 'filename'
    query_params = parse_qs(parsed.query)
    if 'filename' in query_params:
        filename = query_params['filename'][0]
        return unquote(filename)

    # Try to get from onclick attribute (track_pdf URL)
    # This is handled separately

    # Fallback: get from path
    path = parsed.path
    if path:
        filename = os.path.basename(path)
        if filename and filename.lower() != 'dl.asp':
            return unquote(filename)

    return None
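
# Illustrative examples (hypothetical URLs, not taken from the site):
#   extract_filename_from_url("https://455464.ro/dl.asp?filename=studiu%201999.pdf") -> "studiu 1999.pdf"
#   extract_filename_from_url("https://455464.ro/reviste/volum-03.pdf") -> "volum-03.pdf"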


def extract_filename_from_response(response):
    """Extract filename from Content-Disposition header"""
    cd = response.headers.get('Content-Disposition', '')
    if cd:
        # Try filename* (RFC 5987) first, so the plain pattern below cannot
        # mistake the "UTF-8''" prefix for the actual filename
        match = re.search(r"filename\*=(?:UTF-8''|utf-8'')([^;\n]+)", cd, re.IGNORECASE)
        if match:
            filename = unquote(match.group(1))
            if filename and filename.lower() != 'dl.asp':
                return filename
        # Fall back to the standard filename= form
        match = re.search(r'filename[^;=\n]*=([\'"]?)([^\'";\n]+)\1', cd, re.IGNORECASE)
        if match:
            filename = unquote(match.group(2))
            if filename and filename.lower() != 'dl.asp' and not filename.lower().startswith('dl.asp'):
                return filename
    return None
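
# Illustrative examples (hypothetical headers, not captured from the server):
#   'attachment; filename="revista-01-2020.pdf"' -> "revista-01-2020.pdf"
#   "attachment; filename*=UTF-8''studii%20si%20cercetari.pdf" -> "studii si cercetari.pdf"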


def get_publication_title(soup):
    """Extract publication title from page"""
    h2 = soup.find('h2', class_='text-color-light')
    if h2:
        return h2.get_text(strip=True)
    return "Unknown_Publication"


def get_pdf_links_from_page(soup, base_url):
    """Extract all PDF download links from page"""
    pdf_links = []

    # Find download links in the table
    table = soup.find('table', {'id': 'datatable-default'})
    if not table:
        print("[WARNING] Could not find data table, searching entire page...")
        search_area = soup
    else:
        search_area = table

    for link in search_area.find_all('a', href=True):
        href = link['href']
        # Check for PDF links (dl.asp?filename=... or direct .pdf)
        if 'dl.asp' in href.lower() or '.pdf' in href.lower():
            # PRIORITY: check onclick for track_pdf, which carries the DIRECT PDF URL
            onclick = link.get('onclick', '')
            if 'track_pdf' in onclick:
                # Extract URL from track_pdf('...')
                match = re.search(r"track_pdf\(['\"]([^'\"]+)['\"]", onclick)
                if match:
                    direct_pdf_url = match.group(1)
                    # This is the real PDF URL, not the dl.asp wrapper
                    filename = os.path.basename(urlparse(direct_pdf_url).path)
                    pdf_links.append((direct_pdf_url, filename))
                    continue
            # Fallback: use href (dl.asp URL)
            full_url = urljoin(base_url, href)
            filename = extract_filename_from_url(full_url)
            pdf_links.append((full_url, filename))

    return pdf_links
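
# Illustrative markup the function expects (hypothetical, simplified):
#   <a href="dl.asp?filename=articol.pdf"
#      onclick="track_pdf('https://455464.ro/reviste/articol.pdf')">PDF</a>
# The onclick branch yields ('https://455464.ro/reviste/articol.pdf', 'articol.pdf');
# without an onclick, the dl.asp href is resolved against base_url instead.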


def get_volume_links(soup, base_url):
    """Extract volume links if PDFs are not directly on publication page"""
    volume_links = []

    table = soup.find('table', {'id': 'datatable-default'})
    if not table:
        return volume_links

    for link in table.find_all('a', href=True):
        href = link['href']
        if 'volum=' in href:
            full_url = urljoin(base_url, href)
            volume_name = link.get_text(strip=True)
            volume_links.append((full_url, volume_name))

    return volume_links
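
# Illustrative markup (hypothetical, simplified):
#   <a href="?pub=7758-revista-romana-de-sociologie&volum=123">Vol. 12 / 2001</a>
# would yield the resolved volume URL paired with the link text "Vol. 12 / 2001".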


def download_pdf(url, output_dir, suggested_filename, session):
    """Download PDF file"""
    try:
        response = session.get(url, headers=HEADERS, timeout=60, stream=True)
        response.raise_for_status()

        # Determine filename: URL param > Content-Disposition > suggested
        filename = extract_filename_from_url(url)
        if not filename or filename.lower() == 'dl.asp':
            filename = extract_filename_from_response(response)
        if not filename:
            filename = suggested_filename
        if not filename:
            filename = f"download_{int(time.time())}.pdf"

        if not filename.lower().endswith('.pdf'):
            filename += '.pdf'
        filename = sanitize_filename(filename)
        output_path = os.path.join(output_dir, filename)

        # Check if already exists
        if os.path.exists(output_path):
            print(f" [SKIP] Already exists: {filename}")
            return True, filename

        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        return True, filename
    except Exception as e:
        print(f" [ERROR] Download failed: {e}")
        return False, None


def main():
    if len(sys.argv) >= 2:
        pub_url = sys.argv[1]
    else:
        print("=" * 70)
        print("Biblioteca Digitală - PDF Downloader")
        print("=" * 70)
        print("\nExample URLs:")
        print(" https://455464.ro/?pub=7758-revista-romana-de-sociologie")
        print(" https://455464.ro/?pub=6464-studii-si-cercetari-de-chimie")
        print()
        pub_url = input("Enter the publication URL: ").strip()
        if not pub_url:
            print("[ERROR] No URL was entered!")
            sys.exit(1)
| print("=" * 70) | |
| print("Biblioteca Digitală - PDF Downloader") | |
| print("=" * 70) | |
| print(f"Publication URL: {pub_url}") | |
| session = requests.Session() | |
| # Step 1: Get publication page | |
| print(f"\n[1] Fetching publication page...") | |
| soup = get_soup(pub_url, session) | |
| if not soup: | |
| print("[FATAL] Could not load publication page") | |
| return | |
| pub_title = get_publication_title(soup) | |
| print(f" Publication: {pub_title}") | |
| # Create output directory | |
| output_dir = sanitize_filename(pub_title) + "_PDFs" | |
| os.makedirs(output_dir, exist_ok=True) | |
| print(f" Output directory: {output_dir}") | |
| # Step 2: Check if PDFs are directly on page or need to go to volumes | |
| pdf_links = get_pdf_links_from_page(soup, pub_url) | |
| total_pdfs = 0 | |
| if pdf_links: | |
| # PDFs are directly on the publication page | |
| print(f"\n[2] Found {len(pdf_links)} PDF links directly on page") | |
| for i, (pdf_url, suggested_name) in enumerate(pdf_links, 1): | |
| print(f"\n [{i}/{len(pdf_links)}] Downloading...") | |
| success, filename = download_pdf(pdf_url, output_dir, suggested_name, session) | |
| if success and filename: | |
| print(f" Saved as: {filename}") | |
| total_pdfs += 1 | |
| time.sleep(0.5) | |
| else: | |
| # Need to check volume pages for PDFs | |
| print(f"\n[2] No direct PDF links found, checking volume pages...") | |
| volume_links = get_volume_links(soup, pub_url) | |
| print(f" Found {len(volume_links)} volumes") | |
| for i, (vol_url, vol_name) in enumerate(volume_links, 1): | |
| print(f"\n[{i}/{len(volume_links)}] Processing: {vol_name}") | |
| vol_soup = get_soup(vol_url, session) | |
| if not vol_soup: | |
| continue | |
| vol_pdf_links = get_pdf_links_from_page(vol_soup, vol_url) | |
| for pdf_url, suggested_name in vol_pdf_links: | |
| if not suggested_name: | |
| suggested_name = f"{sanitize_filename(vol_name)}.pdf" | |
| print(f" Downloading: {suggested_name[:50]}...") | |
| success, filename = download_pdf(pdf_url, output_dir, suggested_name, session) | |
| if success and filename: | |
| print(f" Saved as: {filename}") | |
| total_pdfs += 1 | |
| time.sleep(0.5) | |
| time.sleep(1) | |
| # Summary | |
| print("\n" + "=" * 70) | |
| print("DOWNLOAD COMPLETE") | |
| print("=" * 70) | |
| print(f"Total PDFs downloaded: {total_pdfs}") | |
| print(f"Output directory: {os.path.abspath(output_dir)}") | |
| if __name__ == "__main__": | |
| main() |
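
# Programmatic reuse (sketch): assuming the same page layout, the helpers above can also
# be used to list a publication's PDF links without downloading anything, e.g.:
#
#     session = requests.Session()
#     soup = get_soup("https://455464.ro/?pub=7758-revista-romana-de-sociologie", session)
#     if soup:
#         for pdf_url, name in get_pdf_links_from_page(soup, "https://455464.ro/"):
#             print(name, pdf_url)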