@CJCShadowsan
Last active December 3, 2024 13:12
A simple Python script using BeautifulSoup to loop through the PDFs available from the session and download them. It now renames each PDF using the title extracted from the first page of the publication.
#!/usr/bin/env python3
import time
import random
import os

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader


# Function to create a directory if it doesn't exist
def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
def sleep_random(min_seconds=15, max_seconds=45):
    """
    Sleep for a random amount of time between min_seconds and max_seconds.

    Args:
        min_seconds (int): Minimum sleep time in seconds.
        max_seconds (int): Maximum sleep time in seconds.
    """
    sleep_time = random.uniform(min_seconds, max_seconds)
    print(f"Sleeping for {sleep_time:.2f} seconds...")
    time.sleep(sleep_time)
# Function to download a PDF from a given URL
def download_pdf(pdf_url, save_directory):
    try:
        response = requests.get(pdf_url)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        filename = pdf_url.split('/')[-1]
        filepath = os.path.join(save_directory, filename + '.pdf')
        with open(filepath, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded: {filename}")
        sleep_random(20, 45)  # Be polite: pause between downloads
    except Exception as e:
        print(f"Failed to download {pdf_url}: {e}")
# Main script to scrape and download PDFs
def download_all_pdfs(page_url, save_directory="pdfs"):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    try:
        # Request the main page
        response = requests.get(page_url, headers=headers)
        response.raise_for_status()  # Raise an HTTPError for bad responses

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all PDF links (based on ACM's URL structure)
        pdf_links = []
        for link in soup.find_all('a', href=lambda href: href and 'pdf' in href):
            pdf_links.append(link['href'])

        # Ensure we have links to download
        if not pdf_links:
            print("No PDF links found on the page.")
            return

        # Create a directory to save PDFs
        create_directory(save_directory)

        # Download each PDF
        for pdf_link in pdf_links:
            if not pdf_link.startswith('http'):
                pdf_link = f"https://dl.acm.org{pdf_link}"  # Construct full URL if relative
            download_pdf(pdf_link, save_directory)
    except Exception as e:
        print(f"An error occurred: {e}")
# Replace with the desired proceedings URL; the trailing "tocHeading=heading"
# gets a section number appended below (heading1 ... heading36)
base_url = "https://dl.acm.org/doi/proceedings/10.5555/3703596?tocHeading=heading"

urls = []
for i in range(1, 37):  # Loop from 1 to 36, one URL per ToC section
    full_url = f"{base_url}{i}"
    urls.append(full_url)

# Download the resulting PDFs from the list of URLs by number of headings
for url in urls:
    download_all_pdfs(url)
def rename_pdfs_in_directory(directory_path):
    """
    Renames all PDF files in a directory based on the text from the first page.

    :param directory_path: Path to the directory containing PDF files
    """
    try:
        # List all files in the directory
        files = os.listdir(directory_path)

        # Filter to include only PDF files
        pdf_files = [f for f in files if f.lower().endswith('.pdf')]

        if not pdf_files:
            print("No PDF files found in the directory.")
            return

        for file_name in pdf_files:
            file_path = os.path.join(directory_path, file_name)
            try:
                # Read the PDF file
                reader = PdfReader(file_path)

                # Extract text from the first page
                if reader.pages:
                    first_page = reader.pages[0]
                    text = first_page.extract_text()
                else:
                    print(f"No pages found in file: {file_name}. Skipping...")
                    continue

                if not text.strip():
                    print(f"No text found on the first page of file: {file_name}. Skipping...")
                    continue

                # Use the first two lines of text as the title (titles often
                # wrap onto a second line); guard against single-line pages
                lines = text.splitlines()
                title = lines[0] + (" " + lines[1] if len(lines) > 1 else "")

                # Sanitize the title to create a valid file name
                sanitized_title = "".join(c if c.isalnum() or c in " ._-" else "_" for c in title)

                # Create the new file name with the same extension
                new_file_name = f"{sanitized_title}.pdf"
                new_file_path = os.path.join(directory_path, new_file_name)

                # Rename the file
                os.rename(file_path, new_file_path)
                print(f"Renamed '{file_name}' to '{new_file_name}'.")
            except Exception as e:
                print(f"An error occurred while processing '{file_name}': {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
# Example usage:
# Replace 'pdfs' with the actual path to your directory containing PDFs
rename_pdfs_in_directory('pdfs')
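
One caveat worth noting: if two PDFs' extracted titles sanitize to the same string, os.rename will silently overwrite the earlier file on POSIX systems (and raise FileExistsError on Windows). A minimal collision-safe sketch, assuming the same directory layout as above (uniquify is a hypothetical helper, not part of the original gist):

import os

def uniquify(path):
    """Return path unchanged, or with ' (2)', ' (3)', ... inserted before
    the extension until no file with that name exists."""
    base, ext = os.path.splitext(path)
    candidate, counter = path, 2
    while os.path.exists(candidate):
        candidate = f"{base} ({counter}){ext}"
        counter += 1
    return candidate

# Usage inside rename_pdfs_in_directory, in place of the bare os.rename:
#     os.rename(file_path, uniquify(new_file_path))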