telerpot3654.py
import requests
from bs4 import BeautifulSoup
import os
import time
import re


def download_biblioteca_sociologie():
    base_url = "https://b56hgf.ro/carti/"
    download_folder = "biblioteca_sociologie_pdfs"
    os.makedirs(download_folder, exist_ok=True)
    print(f"📁 Folder: {os.path.abspath(download_folder)}")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'ro-RO,ro;q=0.9,en;q=0.8',
        'Referer': 'https://b56hgf.ro/',
    }
    session = requests.Session()
    session.headers.update(headers)

    # Step 1: Collect all /download/ links from the book listing pages
    print("🔍 Scanning book pages...")
    download_pages = []
    page = 1
    while True:
        url = base_url if page == 1 else f"{base_url}page/{page}/"
        print(f"📄 Page {page}...")
        try:
            response = session.get(url, timeout=30)
            if response.status_code == 404:
                break
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            found = 0
            for link in soup.find_all('a', href=True):
                href = link['href']
                if '/download/' in href and href not in download_pages:
                    if 'b56hgf.ro/download/' in href:
                        download_pages.append(href)
                        found += 1
            print(f" Found: {found} books")
            if found == 0:
                break
            page += 1
            time.sleep(0.5)
        except Exception as e:
            print(f" Error: {e}")
            break

    # Remove duplicates
    download_pages = list(set(download_pages))
    print(f"\n🎯 Total unique books: {len(download_pages)}")
    if not download_pages:
        return

    # Step 2: For each book, extract data-downloadurl and download the file
    print("\n⏬ Starting downloads...\n")
    success = 0
    failed = []
    for i, page_url in enumerate(download_pages, 1):
        try:
            # Extract the book title from the URL
            title = page_url.split('/download/')[-1].rstrip('/')[:80]
            filename = re.sub(r'[<>:"/\\|?*]', '_', title) + '.pdf'
            filepath = os.path.join(download_folder, filename)
            if os.path.exists(filepath) and os.path.getsize(filepath) > 10000:
                print(f"{i}. ✓ Already exists: {filename[:50]}...")
                success += 1
                continue
            print(f"{i}. 📖 {title[:50]}...")

            # Visit the download page to get the real file URL
            response = session.get(page_url, timeout=30)
            if response.status_code != 200:
                print(" ❌ Cannot access the page")
                failed.append(page_url)
                continue
            soup = BeautifulSoup(response.content, 'html.parser')

            # Look for an anchor carrying a data-downloadurl attribute
            download_link = soup.find('a', {'data-downloadurl': True})
            if not download_link:
                # Fallback: look for any link containing wpdmdl=
                for link in soup.find_all('a', href=True):
                    if 'wpdmdl=' in link.get('href', ''):
                        real_url = link['href']
                        break
                else:
                    print(" ❌ Download link not found")
                    failed.append(page_url)
                    continue
            else:
                real_url = download_link['data-downloadurl']

            print(" ⬇️ Downloading...")
            # Stream the PDF to disk
            session.headers['Referer'] = page_url
            response = session.get(real_url, timeout=120, stream=True)
            if response.status_code == 200:
                content_type = response.headers.get('Content-Type', '')
                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                size = os.path.getsize(filepath)
                # Check that the file is really a PDF (magic bytes + minimum size)
                with open(filepath, 'rb') as f:
                    header = f.read(5)
                if header == b'%PDF-' and size > 5000:
                    print(f" ✅ OK ({size:,} bytes)")
                    success += 1
                else:
                    os.remove(filepath)
                    print(f" ❌ Not a valid PDF (Content-Type: {content_type})")
                    failed.append(page_url)
            else:
                print(f" ❌ HTTP {response.status_code}")
                failed.append(page_url)
            time.sleep(2)
        except Exception as e:
            print(f" ❌ Error: {e}")
            failed.append(page_url)

    print(f"\n{'='*50}")
    print(f"✅ Successfully downloaded: {success}")
    print(f"❌ Failed: {len(failed)}")
    print(f"📁 {os.path.abspath(download_folder)}")
    if failed:
        print("\n📋 Failed URLs:")
        for url in failed[:10]:
            print(f" {url}")


if __name__ == "__main__":
    download_biblioteca_sociologie()
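As an optional follow-up (not part of the original script; the helper name verify_pdf_folder is an assumption used here for illustration), the same %PDF- magic-byte and minimum-size check the script applies per file can be re-run over the whole biblioteca_sociologie_pdfs folder after a run, to flag downloads that turned out to be truncated files or HTML error pages:

import os

def verify_pdf_folder(folder="biblioteca_sociologie_pdfs", min_size=5000):
    """Flag files that fail the %PDF- magic-byte / minimum-size heuristic."""
    suspect = []
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        if not os.path.isfile(path):
            continue
        with open(path, 'rb') as f:
            header = f.read(5)
        if header != b'%PDF-' or os.path.getsize(path) < min_size:
            suspect.append(name)
    return suspect

# Example usage (run after the script above has populated the folder):
for name in verify_pdf_folder():
    print(f"suspect: {name}")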