@me-suzy
Created November 29, 2025 05:48
teleport biblioteca diig.py
#!/usr/bin/env python3
"""
Script to download all PDFs from "Studii și Cercetări de Chimie"
on 455464.ro
"""
import requests
from bs4 import BeautifulSoup
import os
import re
import time
from urllib.parse import urljoin, urlparse
from pathlib import Path
BASE_URL = "https://455464.ro/"
PUB_URL = "https://455464.ro/?pub=6464-studii-si-cercetari-de-chimie"
OUTPUT_DIR = "Studii_Cercetari_Chimie_PDFs"
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'ro-RO,ro;q=0.9,en;q=0.8',
}

def get_soup(url, session):
    """Fetch a page and return a BeautifulSoup object"""
    try:
        response = session.get(url, headers=HEADERS, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')
    except Exception as e:
        print(f" [ERROR] Failed to fetch {url}: {e}")
        return None

def get_all_volume_links(soup):
    """Extract all volume links from the publication page"""
    volume_links = []
    table = soup.find('table', {'id': 'datatable-default'})
    if not table:
        print("[ERROR] Could not find volumes table")
        return volume_links
    for row in table.find_all('tr'):
        for link in row.find_all('a', href=True):
            href = link['href']
            if 'volum=' in href:
                full_url = urljoin(BASE_URL, href)
                volume_name = link.get_text(strip=True)
                volume_links.append((full_url, volume_name))
    return volume_links

def get_pdf_links_from_volume(soup, volume_url):
    """Extract PDF links from a volume page"""
    pdf_links = []
    # Look for direct PDF links - they might be in different formats
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '.pdf' in href.lower() or 'download' in href.lower():
            full_url = urljoin(volume_url, href)
            pdf_links.append(full_url)
    # Also check for article links that might lead to PDFs
    for link in soup.find_all('a', href=True):
        href = link['href']
        if 'articol=' in href:
            full_url = urljoin(BASE_URL, href)
            pdf_links.append(('article', full_url))
    return pdf_links

def get_pdf_from_article(soup, article_url):
    """Extract the PDF link from an article page"""
    for link in soup.find_all('a', href=True):
        href = link['href']
        if '.pdf' in href.lower():
            return urljoin(article_url, href)
    return None

def sanitize_filename(name):
    """Clean a filename for the filesystem"""
    # Remove/replace invalid characters
    name = re.sub(r'[<>:"/\\|?*]', '_', name)
    name = re.sub(r'\s+', '_', name)
    name = name.strip('._')
    return name[:200]  # Limit length

def download_pdf(url, output_path, session):
    """Download a PDF file to output_path"""
    try:
        response = session.get(url, headers=HEADERS, timeout=60, stream=True)
        response.raise_for_status()
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        return True
    except Exception as e:
        print(f" [ERROR] Download failed: {e}")
        return False

def main():
    print("=" * 60)
    print("Studii și Cercetări de Chimie - PDF Downloader")
    print("=" * 60)

    # Create output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    session = requests.Session()

    # Step 1: Get all volume links
    print(f"\n[1] Fetching publication page: {PUB_URL}")
    soup = get_soup(PUB_URL, session)
    if not soup:
        print("[FATAL] Could not load publication page")
        return

    volume_links = get_all_volume_links(soup)
    print(f" Found {len(volume_links)} volumes")

    # Step 2: Process each volume
    total_pdfs = 0
    failed_downloads = []
    for i, (vol_url, vol_name) in enumerate(volume_links, 1):
        print(f"\n[{i}/{len(volume_links)}] Processing: {vol_name}")

        # Create subdirectory for the volume
        vol_dir = os.path.join(OUTPUT_DIR, sanitize_filename(vol_name))
        os.makedirs(vol_dir, exist_ok=True)

        vol_soup = get_soup(vol_url, session)
        if not vol_soup:
            failed_downloads.append(vol_url)
            continue

        pdf_links = get_pdf_links_from_volume(vol_soup, vol_url)
        for item in pdf_links:
            if isinstance(item, tuple) and item[0] == 'article':
                # Need to fetch the article page to get its PDF
                article_url = item[1]
                art_soup = get_soup(article_url, session)
                if art_soup:
                    pdf_url = get_pdf_from_article(art_soup, article_url)
                    if pdf_url:
                        pdf_name = os.path.basename(urlparse(pdf_url).path)
                        if not pdf_name.endswith('.pdf'):
                            pdf_name += '.pdf'
                        pdf_path = os.path.join(vol_dir, sanitize_filename(pdf_name))
                        if not os.path.exists(pdf_path):
                            print(f" Downloading: {pdf_name}")
                            if download_pdf(pdf_url, pdf_path, session):
                                total_pdfs += 1
                            time.sleep(0.5)  # Be polite
            else:
                # Direct PDF link
                pdf_url = item
                pdf_name = os.path.basename(urlparse(pdf_url).path)
                if not pdf_name.endswith('.pdf'):
                    pdf_name += '.pdf'
                pdf_path = os.path.join(vol_dir, sanitize_filename(pdf_name))
                if not os.path.exists(pdf_path):
                    print(f" Downloading: {pdf_name}")
                    if download_pdf(pdf_url, pdf_path, session):
                        total_pdfs += 1
                    time.sleep(0.5)  # Be polite

        time.sleep(1)  # Rate limiting between volumes

    # Summary
    print("\n" + "=" * 60)
    print("DOWNLOAD COMPLETE")
    print("=" * 60)
    print(f"Total PDFs downloaded: {total_pdfs}")
    print(f"Output directory: {os.path.abspath(OUTPUT_DIR)}")
    if failed_downloads:
        print(f"\nFailed to process {len(failed_downloads)} volumes:")
        for url in failed_downloads:
            print(f" - {url}")


if __name__ == "__main__":
    main()
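
A quick way to sanity-check the scrape before running the full download: the short sketch below reuses the same publication URL and the 'datatable-default' table id that the script assumes, and only prints the volume links it would process; nothing is downloaded. It needs the same dependencies (requests, beautifulsoup4).

#!/usr/bin/env python3
"""Dry run: list the volumes the downloader would process, without downloading anything."""
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

BASE_URL = "https://455464.ro/"
PUB_URL = "https://455464.ro/?pub=6464-studii-si-cercetari-de-chimie"
HEADERS = {'User-Agent': 'Mozilla/5.0'}

response = requests.get(PUB_URL, headers=HEADERS, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Same assumption as the main script: volumes are listed in a table with id 'datatable-default'
table = soup.find('table', {'id': 'datatable-default'})
if table is None:
    print("Table 'datatable-default' not found - the page layout may differ from what the script expects")
else:
    for link in table.find_all('a', href=True):
        if 'volum=' in link['href']:
            print(link.get_text(strip=True), '->', urljoin(BASE_URL, link['href']))

If the dry run prints the expected volume list, the full script can then be run with python3 "teleport biblioteca diig.py" (the quotes are needed because the filename contains spaces).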