@me-suzy
Created November 27, 2025 20:44
telerpot3654.py
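Scrapes the paginated book listing at https://b56hgf.ro/carti/, collects every /download/ link, resolves the real file URL on each download page (via the data-downloadurl attribute, or a wpdmdl= link as a fallback), and saves the files into biblioteca_sociologie_pdfs/, keeping only responses that start with the %PDF- magic header.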
import requests
from bs4 import BeautifulSoup
import os
import time
import re
def download_biblioteca_sociologie():
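    """Download all PDF books listed on https://b56hgf.ro/carti/.

    Walks the paginated listing, collects the /download/ pages, resolves
    the real file URL on each page, and saves verified PDFs locally,
    skipping files that have already been downloaded.
    """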
    base_url = "https://b56hgf.ro/carti/"
    download_folder = "biblioteca_sociologie_pdfs"
    os.makedirs(download_folder, exist_ok=True)
    print(f"📁 Folder: {os.path.abspath(download_folder)}")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'ro-RO,ro;q=0.9,en;q=0.8',
        'Referer': 'https://b56hgf.ro/',
    }
    session = requests.Session()
    session.headers.update(headers)
    # Step 1: Collect every /download/ link from the book listing pages
    print("🔍 Scanning book pages...")
    download_pages = []
    page = 1
    while True:
        url = base_url if page == 1 else f"{base_url}page/{page}/"
        print(f"📄 Page {page}...")
        try:
            response = session.get(url, timeout=30)
            if response.status_code == 404:
                break
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            found = 0
            for link in soup.find_all('a', href=True):
                href = link['href']
                if '/download/' in href and href not in download_pages:
                    if 'b56hgf.ro/download/' in href:
                        download_pages.append(href)
                        found += 1
            print(f"   Found: {found} books")
            if found == 0:
                break
            page += 1
            time.sleep(0.5)
        except Exception as e:
            print(f"   Error: {e}")
            break
    # Remove duplicates
    download_pages = list(set(download_pages))
    print(f"\n🎯 Total unique books: {len(download_pages)}")
    if not download_pages:
        return
    # Step 2: For each book, extract data-downloadurl and download the PDF
    print(f"\n⏬ Starting downloads...\n")
    success = 0
    failed = []
    for i, page_url in enumerate(download_pages, 1):
        try:
            # Derive the book title from the URL slug
            title = page_url.split('/download/')[-1].rstrip('/')[:80]
            filename = re.sub(r'[<>:"/\\|?*]', '_', title) + '.pdf'
            filepath = os.path.join(download_folder, filename)
            if os.path.exists(filepath) and os.path.getsize(filepath) > 10000:
                print(f"{i}. ✓ Already exists: {filename[:50]}...")
                success += 1
                continue

            print(f"{i}. 📖 {title[:50]}...")

            # Open the download page to find the real file URL
            response = session.get(page_url, timeout=30)
            if response.status_code != 200:
                print(f"   ❌ Cannot access the page")
                failed.append(page_url)
                continue

            soup = BeautifulSoup(response.content, 'html.parser')

            # Look for the data-downloadurl attribute
            download_link = soup.find('a', {'data-downloadurl': True})
            if not download_link:
                # Fallback: look for any link containing wpdmdl=
                for link in soup.find_all('a', href=True):
                    if 'wpdmdl=' in link.get('href', ''):
                        real_url = link['href']
                        break
                else:
                    print(f"   ❌ Download link not found")
                    failed.append(page_url)
                    continue
            else:
                real_url = download_link['data-downloadurl']

            print(f"   ⬇️ Downloading...")

            # Download the PDF
            session.headers['Referer'] = page_url
            response = session.get(real_url, timeout=120, stream=True)
            if response.status_code == 200:
                content_type = response.headers.get('Content-Type', '')
                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                size = os.path.getsize(filepath)
                # Verify it is a valid PDF (magic header and plausible size)
                with open(filepath, 'rb') as f:
                    header = f.read(5)
                if header == b'%PDF-' and size > 5000:
                    print(f"   ✅ OK ({size:,} bytes)")
                    success += 1
                else:
                    os.remove(filepath)
                    print(f"   ❌ Not a valid PDF (Content-Type: {content_type})")
                    failed.append(page_url)
            else:
                print(f"   ❌ HTTP {response.status_code}")
                failed.append(page_url)

            time.sleep(2)
        except Exception as e:
            print(f"   ❌ Error: {e}")
            failed.append(page_url)
print(f"\n{'='*50}")
print(f"✅ Descărcate cu succes: {success}")
print(f"❌ Eșuate: {len(failed)}")
print(f"📁 {os.path.abspath(download_folder)}")
if failed:
print(f"\n📋 URL-uri eșuate:")
for url in failed[:10]:
print(f" {url}")
if __name__ == "__main__":
    download_biblioteca_sociologie()
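A minimal way to run the script, assuming Python 3 and the two third-party packages used by the imports (requests and beautifulsoup4):

pip install requests beautifulsoup4
python telerpot3654.py

Downloaded files land in biblioteca_sociologie_pdfs/ next to the script; re-running it skips books whose PDFs already exist.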