petergi/ebook_auditor.py

## ebook_auditor.py
"""
EBOOK AUDIT CENTER (V9 - CSV EXPORT & PORTABILITY)
===================================================

OVERVIEW:
    A professional-grade ebook auditor and cataloger.
    This version adds the ability to export your data for use
    in spreadsheet software like Excel or Google Sheets.

FEATURES:
    - [A]UDIT: Comprehensive library scan (Integrity + Size + Metadata).
    - [S]EARCH: Real-time search by Title or Author.
    - s[T]ATS: Visual breakdown of your format distribution.
    - [E]XPORT: Saves your library data to a .csv file.
    - [H]ELP: On-demand documentation.

AUTHOR: Peter G
DATE: 2026
"""

import os
import shutil
import glob
import datetime
import zipfile
import xml.etree.ElementTree as ET
import csv  # New requirement for spreadsheet export

# --- GLOBAL STORE ---
# Results of the most recent audit_library() run. The search, stats, and
# export commands read these instead of rescanning the disk.
LAST_FLAGGED = []  # entry dicts for files that were given a failure reason
LAST_HEALTHY = []  # entry dicts for files that passed every check
FORMAT_STATS = {}  # extension (e.g. '.epub') -> number of files found

def show_help():
    """Print the on-demand usage guide for the command center.

    The guide is assembled from individual lines; the leading and trailing
    empty entries reproduce the blank line before and after the guide.
    """
    guide_lines = (
        "",
        "--- 💡 EBOOK COMMAND CENTER HELP GUIDE ---",
        "",
        "1. AUDIT [A]: The core engine. It checks if EPUBs are valid ZIPs and",
        "   flags any file that is suspiciously small (<50KB) or large (>100MB).",
        "",
        "2. EXPORT [E]: Saves EVERY book found into 'library_data.csv'.",
        "   This includes Title, Author, Size, Extension, and Status (Healthy/Flagged).",
        "   Use this to manage your library in Excel.",
        "",
        "3. STATS [T]: Provides a high-level view of your collection's file types.",
        "",
        "4. SAFETY: Always backup your files before using 'Live Action' mode.",
        "------------------------------------------",
        "",
    )
    print("\n".join(guide_lines))

def get_epub_metadata(filepath):
    """Extract the title and author from an EPUB's package document.

    The EPUB (a ZIP archive) is opened, ``META-INF/container.xml`` is read to
    locate the package document, and the first ``dc:title`` and ``dc:creator``
    elements found there are used.

    Args:
        filepath: Path to the ``.epub`` file.

    Returns:
        A dict with ``"title"`` and ``"author"`` keys whose values are always
        non-empty strings. Fields that are missing or blank in the package
        document keep the placeholders ``"Unknown Title"`` and
        ``"Unknown Author"``. If the archive cannot be read or parsed at all,
        the file's base name is returned as the title and ``"N/A"`` as the
        author.
    """
    metadata = {"title": "Unknown Title", "author": "Unknown Author"}
    container_ns = {'c': 'urn:oasis:names:tc:opendocument:xmlns:container'}
    dc_ns = {'dc': 'http://purl.org/dc/elements/1.1/'}
    try:
        with zipfile.ZipFile(filepath, 'r') as archive:
            with archive.open('META-INF/container.xml') as f:
                rootfile = ET.parse(f).getroot().find('.//c:rootfile', container_ns)
            # A missing <rootfile> element or full-path attribute raises here
            # and is handled by the fallback below, like any unreadable EPUB.
            rootfile_path = rootfile.attrib['full-path']

            with archive.open(rootfile_path) as f:
                package = ET.parse(f).getroot()

        for field, tag in (("title", "dc:title"), ("author", "dc:creator")):
            elem = package.find(f'.//{tag}', dc_ns)
            # An empty element such as <dc:title/> has text None; keep the
            # placeholder so callers can safely call string methods on it.
            text = (elem.text or "").strip() if elem is not None else ""
            if text:
                metadata[field] = text
    except Exception:
        # Deliberate best-effort: a damaged archive, missing member or
        # malformed XML must not abort the scan of the whole library.
        return {"title": os.path.basename(filepath), "author": "N/A"}
    return metadata

def _unique_destination(folder, filename):
    """Return a path inside *folder* for *filename* that does not exist yet."""
    stem, suffix = os.path.splitext(filename)
    candidate = os.path.join(folder, filename)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(folder, f"{stem}_{counter}{suffix}")
        counter += 1
    return candidate


def audit_library(root_dir, corrupted_folder="CORRUPTED", dry_run=True,
                  min_size_kb=50, max_size_mb=100):
    """Scan a library tree, classify each ebook, and refresh the global stores.

    Every regular file under *root_dir* that the recursive glob reaches and
    whose extension (compared case-insensitively) is .epub, .mobi, .pdf or
    .azw3 is checked. An EPUB that is not a valid ZIP archive is flagged as
    broken; otherwise a file is flagged when it is smaller than *min_size_kb*
    or larger than *max_size_mb*.

    Args:
        root_dir: Directory to scan recursively.
        corrupted_folder: Quarantine directory for flagged files. It is
            created on demand in live mode, and anything already inside it
            is skipped by the scan.
        dry_run: When true (the default) nothing is moved on disk.
        min_size_kb: Files below this size (in KB) are flagged as too small.
        max_size_mb: Files above this size (in MB) are flagged as too large.

    Returns:
        ``(LAST_FLAGGED, LAST_HEALTHY)``: two lists of entry dicts with the
        keys ``title``, ``author``, ``size``, ``reason``, ``ext`` and
        ``path``. For a file moved in live mode, ``path`` is its new location.
    """
    global LAST_FLAGGED, LAST_HEALTHY, FORMAT_STATS
    ebook_extensions = ['.epub', '.mobi', '.pdf', '.azw3']
    LAST_FLAGGED, LAST_HEALTHY = [], []
    FORMAT_STATS = {ext: 0 for ext in ebook_extensions}

    if not dry_run:
        os.makedirs(corrupted_folder, exist_ok=True)
    # Trailing separator so that "CORRUPTED2/..." is not mistaken for a child.
    quarantine_prefix = os.path.join(os.path.abspath(corrupted_folder), '')

    print(f"\n[SCANNING] Analyzing: {root_dir}...")

    # One traversal for all formats. Matching on the lower-cased suffix finds
    # "Book.EPUB" as well, and glob.escape keeps folder names such as
    # "Books [2020]" from being read as glob character classes.
    found = {ext: [] for ext in ebook_extensions}
    pattern = os.path.join(glob.escape(root_dir), '**', '*')
    for filepath in glob.glob(pattern, recursive=True):
        ext = os.path.splitext(filepath)[1].lower()
        if ext not in found or not os.path.isfile(filepath):
            continue
        # Already-quarantined files must not be re-flagged or renamed again.
        if os.path.abspath(filepath).startswith(quarantine_prefix):
            continue
        found[ext].append(filepath)

    for ext in ebook_extensions:
        for filepath in sorted(found[ext]):
            try:
                FORMAT_STATS[ext] += 1
                size_kb = os.path.getsize(filepath) / 1024
                meta = {"title": os.path.basename(filepath), "author": "N/A"}
                reason = None

                if ext == '.epub':
                    if zipfile.is_zipfile(filepath):
                        meta = get_epub_metadata(filepath)
                    else:
                        reason = "Broken File Structure"

                if not reason:
                    if size_kb < min_size_kb:
                        reason = "Too Small (Potential Corrupt)"
                    elif size_kb > (max_size_mb * 1024):
                        reason = "Too Large (Anomalous)"

                final_path = filepath
                if reason and not dry_run:
                    # Two flagged files may share a base name; never overwrite
                    # one that is already in quarantine.
                    final_path = _unique_destination(
                        corrupted_folder, os.path.basename(filepath))
                    shutil.move(filepath, final_path)

                entry = {
                    "title": meta['title'], "author": meta['author'],
                    "size": f"{size_kb:.2f} KB" if size_kb < 1024 else f"{size_kb/1024:.2f} MB",
                    "reason": reason if reason else "Healthy",
                    "ext": ext, "path": final_path
                }
                (LAST_FLAGGED if reason else LAST_HEALTHY).append(entry)

            except OSError as e:
                # Unreadable or vanished file: report it and keep scanning.
                print(f"[ERROR] {filepath}: {e}")

    return LAST_FLAGGED, LAST_HEALTHY

def export_to_csv(filename="library_data.csv"):
    """Write every audited book to one CSV file, healthy entries first.

    Reads the module-level LAST_HEALTHY and LAST_FLAGGED lists. When both are
    empty a hint is printed and no file is created. Any failure while writing
    is reported on stdout instead of being raised.

    Args:
        filename: Destination path of the CSV file (overwritten if present).
    """
    records = LAST_HEALTHY + LAST_FLAGGED
    if not records:
        print("\n[!] No data to export. Please run an [A]udit first.")
        return

    columns = ["title", "author", "size", "ext", "reason", "path"]
    try:
        with open(filename, "w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=columns)
            writer.writeheader()
            writer.writerows(records)
    except Exception as e:
        print(f"[ERROR] Failed to export CSV: {e}")
    else:
        try:
            print(f"\n[SUCCESS] Exported {len(records)} records to '{filename}'")
        except Exception as e:
            print(f"[ERROR] Failed to export CSV: {e}")

def show_stats():
    """Print each format's file count and its share of the last audit.

    Reads the module-level FORMAT_STATS mapping. If it holds no files, a hint
    to run an audit is printed instead of the table.
    """
    total = sum(FORMAT_STATS[key] for key in FORMAT_STATS)
    if not total:
        print("\n[!] No data. Run an [A]udit first.")
        return

    print(f"\n--- 📊 FORMAT BREAKDOWN (Total: {total}) ---")
    for ext in FORMAT_STATS:
        count = FORMAT_STATS[ext]
        pct = (count / total * 100)
        print(f"{ext.upper():<8} | {count:<5} | {pct:.1f}%")

def search_library():
    """Prompt for a query and list healthy books whose title or author contains it.

    Matching is a case-insensitive substring test against the module-level
    LAST_HEALTHY entries. An empty query matches every entry. If the catalog
    is empty, or nothing matches, a short notice is printed instead.
    """
    if not LAST_HEALTHY:
        print("\n[!] Catalog is empty. Run an [A]udit first.")
        return
    query = input("\nSearch Title/Author: ").strip().lower()

    def _text(book, key):
        # Metadata values can be None (e.g. an empty <dc:title/> element);
        # treat them as empty text instead of crashing on .lower().
        return str(book.get(key) or "")

    matches = [
        b for b in LAST_HEALTHY
        if query in _text(b, 'title').lower() or query in _text(b, 'author').lower()
    ]
    if not matches:
        print("[!] No matching books found.")
        return
    for m in matches:
        title = _text(m, 'title') or "Unknown Title"
        author = _text(m, 'author') or "Unknown Author"
        print(f"📖 {title} | 👤 {author} | 📁 {m['size']}")

def main():
    """Run the interactive menu until the user quits or input ends.

    Each pass prints the menu, reads one option and dispatches to the matching
    command. End-of-input or Ctrl-C at any prompt ends the loop cleanly
    instead of surfacing a traceback.
    """
    while True:
        print("\n[MENU] (A)udit | (S)earch | s(T)ats | (E)xport | (H)elp | (Q)uit")
        try:
            choice = input("Option: ").strip().lower()

            if choice == 'a':
                path = input("Enter path: ").strip()
                if os.path.isdir(path):
                    # Anything other than an explicit "n"/"no" stays a dry
                    # run, so a typo can never trigger file moves.
                    answer = input("Dry Run? (Y/n): ").strip().lower()
                    is_dry = answer not in ('n', 'no')
                    flagged, healthy = audit_library(path, dry_run=is_dry)
                    mode = ("dry run, nothing was moved" if is_dry
                            else "flagged files were moved to quarantine")
                    print(f"Audit Complete! {len(healthy)} healthy, "
                          f"{len(flagged)} flagged ({mode}).")
                else:
                    print("Path not found.")
            elif choice == 's':
                search_library()
            elif choice == 't':
                show_stats()
            elif choice == 'e':
                export_to_csv()
            elif choice == 'h':
                show_help()
            elif choice == 'q':
                break
            else:
                print("Unknown option. Press H for help.")
        except (EOFError, KeyboardInterrupt):
            print("\nExiting.")
            break


if __name__ == "__main__":
    main()
	"""
	EBOOK AUDIT CENTER (V9 - CSV EXPORT & PORTABILITY)
	===================================================

	OVERVIEW:
	A professional-grade ebook auditor and cataloger.
	This version adds the ability to export your data for use
	in spreadsheet software like Excel or Google Sheets.

	FEATURES:
	- [A]UDIT: Comprehensive library scan (Integrity + Size + Metadata).
	- [S]EARCH: Real-time search by Title or Author.
	- s[T]ATS: Visual breakdown of your format distribution.
	- [E]XPORT: Saves your library data to a .csv file.
	- [H]ELP: On-demand documentation.

	AUTHOR: Peter G
	DATE: 2026
	"""

	import os
	import shutil
	import glob
	import datetime
	import zipfile
	import xml.etree.ElementTree as ET
	import csv # New requirement for spreadsheet export

	# --- GLOBAL STORE ---
	# Results of the most recent audit_library() run. The search, stats, and
	# export commands read these instead of rescanning the disk.
	LAST_FLAGGED = []  # entry dicts for files that were given a failure reason
	LAST_HEALTHY = []  # entry dicts for files that passed every check
	FORMAT_STATS = {}  # extension (e.g. '.epub') -> number of files found

	def show_help():
		"""Print the on-demand usage guide for the command center.

		The guide is assembled from individual lines so that the source
		indentation can never leak into the printed text; the leading and
		trailing empty entries give a blank line before and after the guide.
		"""
		guide_lines = (
			"",
			"--- 💡 EBOOK COMMAND CENTER HELP GUIDE ---",
			"",
			"1. AUDIT [A]: The core engine. It checks if EPUBs are valid ZIPs and",
			"   flags any file that is suspiciously small (<50KB) or large (>100MB).",
			"",
			"2. EXPORT [E]: Saves EVERY book found into 'library_data.csv'.",
			"   This includes Title, Author, Size, Extension, and Status (Healthy/Flagged).",
			"   Use this to manage your library in Excel.",
			"",
			"3. STATS [T]: Provides a high-level view of your collection's file types.",
			"",
			"4. SAFETY: Always backup your files before using 'Live Action' mode.",
			"------------------------------------------",
			"",
		)
		print("\n".join(guide_lines))

	def get_epub_metadata(filepath):
		"""Extract the title and author from an EPUB's package document.

		The EPUB (a ZIP archive) is opened, ``META-INF/container.xml`` is read
		to locate the package document, and the first ``dc:title`` and
		``dc:creator`` elements found there are used.

		Args:
			filepath: Path to the ``.epub`` file.

		Returns:
			A dict with ``"title"`` and ``"author"`` keys whose values are
			always non-empty strings. Missing or blank fields keep the
			placeholders ``"Unknown Title"`` and ``"Unknown Author"``. If the
			archive cannot be read or parsed at all, the file's base name is
			returned as the title and ``"N/A"`` as the author.
		"""
		metadata = {"title": "Unknown Title", "author": "Unknown Author"}
		container_ns = {'c': 'urn:oasis:names:tc:opendocument:xmlns:container'}
		dc_ns = {'dc': 'http://purl.org/dc/elements/1.1/'}
		try:
			with zipfile.ZipFile(filepath, 'r') as archive:
				with archive.open('META-INF/container.xml') as f:
					rootfile = ET.parse(f).getroot().find('.//c:rootfile', container_ns)
				# A missing <rootfile> element or full-path attribute raises
				# here and is handled by the fallback below.
				rootfile_path = rootfile.attrib['full-path']

				with archive.open(rootfile_path) as f:
					package = ET.parse(f).getroot()

			for field, tag in (("title", "dc:title"), ("author", "dc:creator")):
				elem = package.find(f'.//{tag}', dc_ns)
				# An empty element such as <dc:title/> has text None; keep the
				# placeholder so callers can safely call string methods on it.
				text = (elem.text or "").strip() if elem is not None else ""
				if text:
					metadata[field] = text
		except Exception:
			# Deliberate best-effort: a damaged archive, missing member or
			# malformed XML must not abort the scan of the whole library.
			return {"title": os.path.basename(filepath), "author": "N/A"}
		return metadata

	def _unique_destination(folder, filename):
		"""Return a path inside *folder* for *filename* that does not exist yet."""
		stem, suffix = os.path.splitext(filename)
		candidate = os.path.join(folder, filename)
		counter = 1
		while os.path.exists(candidate):
			candidate = os.path.join(folder, f"{stem}_{counter}{suffix}")
			counter += 1
		return candidate


	def audit_library(root_dir, corrupted_folder="CORRUPTED", dry_run=True,
					  min_size_kb=50, max_size_mb=100):
		"""Scan a library tree, classify each ebook, and refresh the global stores.

		Every regular file under *root_dir* that the recursive glob reaches and
		whose extension (compared case-insensitively) is .epub, .mobi, .pdf or
		.azw3 is checked. An EPUB that is not a valid ZIP archive is flagged as
		broken; otherwise a file is flagged when it is smaller than
		*min_size_kb* or larger than *max_size_mb*.

		Args:
			root_dir: Directory to scan recursively.
			corrupted_folder: Quarantine directory for flagged files. It is
				created on demand in live mode, and anything already inside
				it is skipped by the scan.
			dry_run: When true (the default) nothing is moved on disk.
			min_size_kb: Files below this size (in KB) are flagged as too small.
			max_size_mb: Files above this size (in MB) are flagged as too large.

		Returns:
			``(LAST_FLAGGED, LAST_HEALTHY)``: two lists of entry dicts with the
			keys ``title``, ``author``, ``size``, ``reason``, ``ext`` and
			``path``. For a file moved in live mode, ``path`` is its new
			location.
		"""
		global LAST_FLAGGED, LAST_HEALTHY, FORMAT_STATS
		ebook_extensions = ['.epub', '.mobi', '.pdf', '.azw3']
		LAST_FLAGGED, LAST_HEALTHY = [], []
		FORMAT_STATS = {ext: 0 for ext in ebook_extensions}

		if not dry_run:
			os.makedirs(corrupted_folder, exist_ok=True)
		# Trailing separator so that "CORRUPTED2/..." is not mistaken for a child.
		quarantine_prefix = os.path.join(os.path.abspath(corrupted_folder), '')

		print(f"\n[SCANNING] Analyzing: {root_dir}...")

		# One recursive traversal for all formats ('**' plus '*'). Matching on
		# the lower-cased suffix finds "Book.EPUB" as well, and glob.escape
		# keeps folder names such as "Books [2020]" from being read as glob
		# character classes.
		found = {ext: [] for ext in ebook_extensions}
		pattern = os.path.join(glob.escape(root_dir), '**', '*')
		for filepath in glob.glob(pattern, recursive=True):
			ext = os.path.splitext(filepath)[1].lower()
			if ext not in found or not os.path.isfile(filepath):
				continue
			# Already-quarantined files must not be re-flagged or renamed again.
			if os.path.abspath(filepath).startswith(quarantine_prefix):
				continue
			found[ext].append(filepath)

		for ext in ebook_extensions:
			for filepath in sorted(found[ext]):
				try:
					FORMAT_STATS[ext] += 1
					size_kb = os.path.getsize(filepath) / 1024
					meta = {"title": os.path.basename(filepath), "author": "N/A"}
					reason = None

					if ext == '.epub':
						if zipfile.is_zipfile(filepath):
							meta = get_epub_metadata(filepath)
						else:
							reason = "Broken File Structure"

					if not reason:
						if size_kb < min_size_kb:
							reason = "Too Small (Potential Corrupt)"
						elif size_kb > (max_size_mb * 1024):
							reason = "Too Large (Anomalous)"

					final_path = filepath
					if reason and not dry_run:
						# Two flagged files may share a base name; never
						# overwrite one that is already in quarantine.
						final_path = _unique_destination(
							corrupted_folder, os.path.basename(filepath))
						shutil.move(filepath, final_path)

					entry = {
						"title": meta['title'], "author": meta['author'],
						"size": f"{size_kb:.2f} KB" if size_kb < 1024 else f"{size_kb/1024:.2f} MB",
						"reason": reason if reason else "Healthy",
						"ext": ext, "path": final_path
					}
					(LAST_FLAGGED if reason else LAST_HEALTHY).append(entry)

				except OSError as e:
					# Unreadable or vanished file: report it and keep scanning.
					print(f"[ERROR] {filepath}: {e}")

		return LAST_FLAGGED, LAST_HEALTHY

	def export_to_csv(filename="library_data.csv"):
		"""Write every audited book to one CSV file, healthy entries first.

		Reads the module-level LAST_HEALTHY and LAST_FLAGGED lists. When both
		are empty a hint is printed and no file is created. Any failure while
		writing is reported on stdout instead of being raised.

		Args:
			filename: Destination path of the CSV file (overwritten if present).
		"""
		records = LAST_HEALTHY + LAST_FLAGGED
		if not records:
			print("\n[!] No data to export. Please run an [A]udit first.")
			return

		columns = ["title", "author", "size", "ext", "reason", "path"]
		try:
			# newline="" lets the csv module control line endings itself.
			with open(filename, "w", newline="", encoding="utf-8") as handle:
				writer = csv.DictWriter(handle, fieldnames=columns)
				writer.writeheader()
				writer.writerows(records)
			print(f"\n[SUCCESS] Exported {len(records)} records to '{filename}'")
		except Exception as e:
			print(f"[ERROR] Failed to export CSV: {e}")

	def show_stats():
		"""Print each format's file count and its share of the last audit.

		Reads the module-level FORMAT_STATS mapping. If it holds no files, a
		hint to run an audit is printed instead of the table.
		"""
		total = sum(FORMAT_STATS.values())
		if total == 0:
			print("\n[!] No data. Run an [A]udit first.")
			return

		print(f"\n--- 📊 FORMAT BREAKDOWN (Total: {total}) ---")
		for ext, count in FORMAT_STATS.items():
			pct = (count / total * 100)
			# Plain "|" separators: the "\|" form was a Markdown escape that
			# would print a literal backslash.
			print(f"{ext.upper():<8} | {count:<5} | {pct:.1f}%")

	def search_library():
		"""Prompt for a query and list healthy books whose title or author contains it.

		Matching is a case-insensitive substring test against the module-level
		LAST_HEALTHY entries. An empty query matches every entry. If the
		catalog is empty, or nothing matches, a short notice is printed instead.
		"""
		if not LAST_HEALTHY:
			print("\n[!] Catalog is empty. Run an [A]udit first.")
			return
		query = input("\nSearch Title/Author: ").strip().lower()

		def _text(book, key):
			# Metadata values can be None (e.g. an empty <dc:title/> element);
			# treat them as empty text instead of crashing on .lower().
			return str(book.get(key) or "")

		matches = [
			b for b in LAST_HEALTHY
			if query in _text(b, 'title').lower() or query in _text(b, 'author').lower()
		]
		if not matches:
			print("[!] No matching books found.")
			return
		for m in matches:
			title = _text(m, 'title') or "Unknown Title"
			author = _text(m, 'author') or "Unknown Author"
			print(f"📖 {title} | 👤 {author} | 📁 {m['size']}")

	def main():
		"""Run the interactive menu until the user quits or input ends.

		Each pass prints the menu, reads one option and dispatches to the
		matching command. End-of-input or Ctrl-C at any prompt ends the loop
		cleanly instead of surfacing a traceback.
		"""
		while True:
			print("\n[MENU] (A)udit | (S)earch | s(T)ats | (E)xport | (H)elp | (Q)uit")
			try:
				choice = input("Option: ").strip().lower()

				if choice == 'a':
					path = input("Enter path: ").strip()
					if os.path.isdir(path):
						# Anything other than an explicit "n"/"no" stays a dry
						# run, so a typo can never trigger file moves.
						answer = input("Dry Run? (Y/n): ").strip().lower()
						is_dry = answer not in ('n', 'no')
						flagged, healthy = audit_library(path, dry_run=is_dry)
						mode = ("dry run, nothing was moved" if is_dry
								else "flagged files were moved to quarantine")
						print(f"Audit Complete! {len(healthy)} healthy, "
							  f"{len(flagged)} flagged ({mode}).")
					else:
						print("Path not found.")
				elif choice == 's':
					search_library()
				elif choice == 't':
					show_stats()
				elif choice == 'e':
					export_to_csv()
				elif choice == 'h':
					show_help()
				elif choice == 'q':
					break
				else:
					print("Unknown option. Press H for help.")
			except (EOFError, KeyboardInterrupt):
				print("\nExiting.")
				break


	if __name__ == "__main__":
		main()
No results found