@CJCShadowsan
Last active December 3, 2024 13:12
A simple Python script using BeautifulSoup to loop through the PDFs available from the session and download them. It now renames each PDF using the title extracted from the first page of the publication.
#!/usr/bin/env python3
import time
import random
import os

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader


# Function to create a directory if it doesn't exist
def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
def sleep_random(min_seconds=15, max_seconds=45):
    """
    Sleep for a random amount of time between min_seconds and max_seconds.

    Args:
        min_seconds (int): Minimum sleep time in seconds.
        max_seconds (int): Maximum sleep time in seconds.
    """
    sleep_time = random.uniform(min_seconds, max_seconds)
    print(f"Sleeping for {sleep_time:.2f} seconds...")
    time.sleep(sleep_time)
# Function to download a PDF from a given URL
def download_pdf(pdf_url, save_directory):
    try:
        response = requests.get(pdf_url)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        filename = pdf_url.split('/')[-1]
        filepath = os.path.join(save_directory, filename + '.pdf')
        with open(filepath, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded: {filename}")
        sleep_random(20, 45)  # Be polite: pause between downloads
    except Exception as e:
        print(f"Failed to download {pdf_url}: {e}")
# Main script to scrape and download PDFs
def download_all_pdfs(page_url, save_directory="pdfs"):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    try:
        # Request the main page
        response = requests.get(page_url, headers=headers)
        response.raise_for_status()  # Raise an HTTPError for bad responses

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all PDF links (based on ACM's URL structure)
        pdf_links = []
        for link in soup.find_all('a', href=lambda href: href and 'pdf' in href):
            pdf_links.append(link['href'])

        # Ensure we have links to download
        if not pdf_links:
            print("No PDF links found on the page.")
            return

        # Create a directory to save PDFs
        create_directory(save_directory)

        # Download each PDF
        for pdf_link in pdf_links:
            if not pdf_link.startswith('http'):
                pdf_link = f"https://dl.acm.org{pdf_link}"  # Construct full URL if relative
            download_pdf(pdf_link, save_directory)
    except Exception as e:
        print(f"An error occurred: {e}")
# Replace with the desired proceedings URL; the trailing "tocHeading=heading"
# gets a section number appended below (heading1 ... heading36)
base_url = "https://dl.acm.org/doi/proceedings/10.5555/3703596?tocHeading=heading"

urls = []
for i in range(1, 37):  # Loop from 1 to 36, one URL per ToC section
    full_url = f"{base_url}{i}"
    urls.append(full_url)

# Download the resulting PDFs from the list of URLs by number of headings
for url in urls:
    download_all_pdfs(url)
def rename_pdfs_in_directory(directory_path):
    """
    Renames all PDF files in a directory based on the text from the first page.

    :param directory_path: Path to the directory containing PDF files
    """
    try:
        # List all files in the directory
        files = os.listdir(directory_path)

        # Filter to include only PDF files
        pdf_files = [f for f in files if f.lower().endswith('.pdf')]

        if not pdf_files:
            print("No PDF files found in the directory.")
            return

        for file_name in pdf_files:
            file_path = os.path.join(directory_path, file_name)
            try:
                # Read the PDF file
                reader = PdfReader(file_path)

                # Extract text from the first page
                if reader.pages:
                    first_page = reader.pages[0]
                    text = first_page.extract_text()
                else:
                    print(f"No pages found in file: {file_name}. Skipping...")
                    continue

                if not text.strip():
                    print(f"No text found on the first page of file: {file_name}. Skipping...")
                    continue

                # Use the first two lines of text as the title (titles often
                # wrap onto a second line); guard against single-line pages
                lines = text.splitlines()
                title = lines[0] + (" " + lines[1] if len(lines) > 1 else "")

                # Sanitize the title to create a valid file name
                sanitized_title = "".join(c if c.isalnum() or c in " ._-" else "_" for c in title)

                # Create the new file name with the same extension
                new_file_name = f"{sanitized_title}.pdf"
                new_file_path = os.path.join(directory_path, new_file_name)

                # Rename the file
                os.rename(file_path, new_file_path)
                print(f"Renamed '{file_name}' to '{new_file_name}'.")
            except Exception as e:
                print(f"An error occurred while processing '{file_name}': {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
# Example usage:
# Replace 'pdfs' with the actual path to your directory containing PDFs
rename_pdfs_in_directory('pdfs')
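
One caveat worth noting: if two PDFs' extracted titles sanitize to the same string, os.rename will silently overwrite the earlier file on POSIX systems (and raise FileExistsError on Windows). A minimal collision-safe sketch, assuming the same directory layout as above (uniquify is a hypothetical helper, not part of the original gist):

import os

def uniquify(path):
    """Return path unchanged, or with ' (2)', ' (3)', ... inserted before
    the extension until no file with that name exists."""
    base, ext = os.path.splitext(path)
    candidate, counter = path, 2
    while os.path.exists(candidate):
        candidate = f"{base} ({counter}){ext}"
        counter += 1
    return candidate

# Usage inside rename_pdfs_in_directory, in place of the bare os.rename:
#     os.rename(file_path, uniquify(new_file_path))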