teleport biblioteca diig.py
#!/usr/bin/env python3
"""
Script to download all PDFs from "Studii și Cercetări de Chimie"
on 455464.ro
"""

import requests
from bs4 import BeautifulSoup
import os
import re
import time
from urllib.parse import urljoin, urlparse
from pathlib import Path

BASE_URL = "https://455464.ro/"
PUB_URL = "https://455464.ro/?pub=6464-studii-si-cercetari-de-chimie"
OUTPUT_DIR = "Studii_Cercetari_Chimie_PDFs"
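
# Browser-like request headers: the User-Agent mimics desktop Chrome and the
# Accept-Language prefers Romanian; some servers refuse the default
# python-requests User-Agent.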
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'ro-RO,ro;q=0.9,en;q=0.8',
}


def get_soup(url, session):
    """Fetch page and return BeautifulSoup object"""
    try:
        response = session.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except Exception as e:
        print(f" [ERROR] Failed to fetch {url}: {e}")
        return None


def get_all_volume_links(soup):
    """Extract all volume links from publication page"""
    volume_links = []
    table = soup.find('table', {'id': 'datatable-default'})
    if not table:
        print("[ERROR] Could not find volumes table")
        return volume_links
    for row in table.find_all('tr'):
        for link in row.find_all('a', href=True):
            href = link['href']
            if 'volum=' in href:
                full_url = urljoin(BASE_URL, href)
                volume_name = link.get_text(strip=True)
                volume_links.append((full_url, volume_name))
    return volume_links


def get_pdf_links_from_volume(soup, volume_url):
    """Extract PDF links from a volume page"""
    pdf_links = []
    # Look for PDF links - they might be in different formats
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Check for PDF links
        if '.pdf' in href.lower() or 'download' in href.lower():
            full_url = urljoin(volume_url, href)
            pdf_links.append(full_url)
    # Also check for article links that might lead to PDFs
    for link in soup.find_all('a', href=True):
        href = link['href']
        if 'articol=' in href:
            full_url = urljoin(BASE_URL, href)
            pdf_links.append(('article', full_url))
    return pdf_links


def get_pdf_from_article(soup, article_url):
    """Extract PDF link from an article page"""
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '.pdf' in href.lower():
            return urljoin(article_url, href)
    return None


def sanitize_filename(name):
    """Clean filename for filesystem"""
    # Remove/replace invalid characters
    name = re.sub(r'[<>:"/\\|?*]', '_', name)
    name = re.sub(r'\s+', '_', name)
    name = name.strip('._')
    return name[:200]  # Limit length


def download_pdf(url, output_path, session):
    """Download PDF file"""
    try:
        response = session.get(url, headers=HEADERS, timeout=60, stream=True)
        response.raise_for_status()
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return True
    except Exception as e:
        print(f" [ERROR] Download failed: {e}")
        return False
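

# download_pdf() trusts the server to return a PDF, so an HTML error page
# would still be saved under a .pdf name. The helper below is a minimal
# sketch (an assumption, not part of the original gist) that peeks at the
# "%PDF-" magic bytes of the first chunk before writing anything to disk.
def download_pdf_checked(url, output_path, session):
    """Like download_pdf(), but skip responses that are not real PDFs."""
    try:
        response = session.get(url, headers=HEADERS, timeout=60, stream=True)
        response.raise_for_status()
        chunks = response.iter_content(chunk_size=8192)
        first = next(chunks, b'')
        if not first.startswith(b'%PDF-'):
            print(f" [WARN] Not a PDF, skipping: {url}")
            return False
        with open(output_path, 'wb') as f:
            f.write(first)
            for chunk in chunks:
                f.write(chunk)
        return True
    except Exception as e:
        print(f" [ERROR] Download failed: {e}")
        return False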

def main():
    print("=" * 60)
    print("Studii și Cercetări de Chimie - PDF Downloader")
    print("=" * 60)

    # Create output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    session = requests.Session()

    # Step 1: Get all volume links
    print(f"\n[1] Fetching publication page: {PUB_URL}")
    soup = get_soup(PUB_URL, session)
    if not soup:
        print("[FATAL] Could not load publication page")
        return
    volume_links = get_all_volume_links(soup)
    print(f" Found {len(volume_links)} volumes")

    # Step 2: Process each volume
    total_pdfs = 0
    failed_downloads = []
    for i, (vol_url, vol_name) in enumerate(volume_links, 1):
        print(f"\n[{i}/{len(volume_links)}] Processing: {vol_name}")
        # Create subdirectory for volume
        vol_dir = os.path.join(OUTPUT_DIR, sanitize_filename(vol_name))
        os.makedirs(vol_dir, exist_ok=True)
        vol_soup = get_soup(vol_url, session)
        if not vol_soup:
            failed_downloads.append(vol_url)
            continue
        pdf_links = get_pdf_links_from_volume(vol_soup, vol_url)
        for item in pdf_links:
            if isinstance(item, tuple) and item[0] == 'article':
                # Need to fetch article page to get PDF
                article_url = item[1]
                art_soup = get_soup(article_url, session)
                if art_soup:
                    pdf_url = get_pdf_from_article(art_soup, article_url)
                    if pdf_url:
                        pdf_name = os.path.basename(urlparse(pdf_url).path)
                        if not pdf_name.endswith('.pdf'):
                            pdf_name += '.pdf'
                        pdf_path = os.path.join(vol_dir, sanitize_filename(pdf_name))
                        if not os.path.exists(pdf_path):
                            print(f" Downloading: {pdf_name}")
                            if download_pdf(pdf_url, pdf_path, session):
                                total_pdfs += 1
                            time.sleep(0.5)  # Be polite
            else:
                # Direct PDF link
                pdf_url = item
                pdf_name = os.path.basename(urlparse(pdf_url).path)
                if not pdf_name.endswith('.pdf'):
                    pdf_name += '.pdf'
                pdf_path = os.path.join(vol_dir, sanitize_filename(pdf_name))
                if not os.path.exists(pdf_path):
                    print(f" Downloading: {pdf_name}")
                    if download_pdf(pdf_url, pdf_path, session):
                        total_pdfs += 1
                    time.sleep(0.5)  # Be polite
        time.sleep(1)  # Rate limiting between volumes

    # Summary
    print("\n" + "=" * 60)
    print("DOWNLOAD COMPLETE")
    print("=" * 60)
    print(f"Total PDFs downloaded: {total_pdfs}")
    print(f"Output directory: {os.path.abspath(OUTPUT_DIR)}")
    if failed_downloads:
        print(f"\nFailed to process {len(failed_downloads)} volumes:")
        for url in failed_downloads:
            print(f" - {url}")


if __name__ == "__main__":
    main()
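
The script only needs requests and beautifulsoup4 on top of the standard library. It writes into a Studii_Cercetari_Chimie_PDFs folder in the current working directory, skips files that already exist, and pauses between requests, so an interrupted run can simply be restarted, assuming the site still uses the datatable-default table and the volum=/articol= query parameters the parser looks for.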