Skip to content

Instantly share code, notes, and snippets.

@petergi
Last active January 20, 2026 19:03
Show Gist options
  • Select an option

  • Save petergi/b6a5785d8c113ca3d0ad12c0605c559a to your computer and use it in GitHub Desktop.

Select an option

Save petergi/b6a5785d8c113ca3d0ad12c0605c559a to your computer and use it in GitHub Desktop.
Total Library Portability: We now have a searchable terminal tool, readable Markdown reports, and a raw-data CSV file. Best-practice tip for CSVs: once you open library_data.csv in Excel, select all data (Ctrl+A), choose Insert > Table, and then use the filter arrows on the "reason" or "ext" columns to quickly see only your flagged files (for example, those marked "Too Large") or only your .mobi books.
"""
EBOOK AUDIT CENTER (V9 - CSV EXPORT & PORTABILITY)
===================================================
OVERVIEW:
A professional-grade ebook auditor and cataloger.
This version adds the ability to export your data for use
in spreadsheet software like Excel or Google Sheets.
FEATURES:
- [A]UDIT: Comprehensive library scan (Integrity + Size + Metadata).
- [S]EARCH: Real-time search by Title or Author.
- s[T]ATS: Visual breakdown of your format distribution.
- [E]XPORT: Saves your library data to a .csv file.
- [H]ELP: On-demand documentation.
AUTHOR: Peter G
DATE: 2026
"""
import os
import shutil
import glob
import datetime
import zipfile
import xml.etree.ElementTree as ET
import csv # New requirement for spreadsheet export
# --- GLOBAL STORE ---
# Module-level results of the most recent audit. audit_library() rebinds all
# three names on every run; the search, stats and export commands read them.
LAST_FLAGGED, LAST_HEALTHY = [], []  # entry dicts for flagged / healthy files
FORMAT_STATS = dict()  # file extension -> number of files found
def show_help():
    """Print the on-demand help guide for the menu commands.

    The guide is written to standard output; nothing is returned.
    """
    # Stored as a tuple of lines so the text can sit at function indentation
    # without that indentation leaking into the printed output. The leading
    # and trailing empty entries reproduce the blank line before and after
    # the guide.
    guide_lines = (
        "",
        # The banner emoji was mis-decoded UTF-8 ("πŸ’‘"); restored to U+1F4A1.
        "--- 💡 EBOOK COMMAND CENTER HELP GUIDE ---",
        "1. AUDIT [A]: The core engine. It checks if EPUBs are valid ZIPs and",
        "flags any file that is suspiciously small (<50KB) or large (>100MB).",
        "2. EXPORT [E]: Saves EVERY book found into 'library_data.csv'.",
        "This includes Title, Author, Size, Extension, and Status (Healthy/Flagged).",
        "Use this to manage your library in Excel.",
        "3. STATS [T]: Provides a high-level view of your collection's file types.",
        "4. SAFETY: Always backup your files before using 'Live Action' mode.",
        "------------------------------------------",
        "",
    )
    print("\n".join(guide_lines))
def get_epub_metadata(filepath):
    """Extract the title and author of an EPUB from its package document.

    ``META-INF/container.xml`` inside the archive is read to find the path
    of the package (OPF) file; the first ``dc:title`` and ``dc:creator``
    elements of that file supply the values.

    Args:
        filepath: Path to the ``.epub`` file.

    Returns:
        A dict with string values under ``"title"`` and ``"author"``.
        When the EPUB is readable but an element is missing, empty or
        whitespace-only, the corresponding value is ``"Unknown Title"`` /
        ``"Unknown Author"``. When the archive or its XML cannot be read,
        the title is the file's base name and the author is ``"N/A"``.
    """
    container_ns = "{urn:oasis:names:tc:opendocument:xmlns:container}"
    dc_ns = {"dc": "http://purl.org/dc/elements/1.1/"}
    metadata = {"title": "Unknown Title", "author": "Unknown Author"}
    try:
        with zipfile.ZipFile(filepath, "r") as archive:
            with archive.open("META-INF/container.xml") as container_file:
                container_root = ET.parse(container_file).getroot()
            # A missing <rootfile> yields None here; the attribute access
            # below then raises and we fall back to the file name.
            rootfile = container_root.find(".//" + container_ns + "rootfile")
            opf_path = rootfile.attrib["full-path"]
            with archive.open(opf_path) as opf_file:
                package_root = ET.parse(opf_file).getroot()
        for key, tag in (("title", "dc:title"), ("author", "dc:creator")):
            element = package_root.find(".//" + tag, dc_ns)
            # An empty element has text=None; storing that would break
            # callers that treat the value as a string, so keep the
            # "Unknown ..." default instead.
            text = (element.text or "").strip() if element is not None else ""
            if text:
                metadata[key] = text
    except Exception:
        # Deliberate best effort: any unreadable, malformed or unsupported
        # archive is reported under its file name rather than aborting.
        return {"title": os.path.basename(filepath), "author": "N/A"}
    return metadata
def audit_library(root_dir, corrupted_folder="CORRUPTED", dry_run=True):
    """Scan *root_dir* recursively and classify every ebook file found.

    Files ending in ``.epub``, ``.mobi``, ``.pdf`` or ``.azw3`` are
    examined. A file is flagged when it is an EPUB that is not a valid ZIP
    archive, when it is smaller than 50 KB, or when it is larger than
    100 MB; every other file is recorded as healthy.

    Side effects: rebinds the module globals ``LAST_FLAGGED``,
    ``LAST_HEALTHY`` and ``FORMAT_STATS``, prints progress and per-file
    errors, and (only when *dry_run* is false) moves flagged files into
    *corrupted_folder*.

    Args:
        root_dir: Directory to scan, including all sub-directories.
        corrupted_folder: Destination for flagged files in live mode.
            Created if it does not exist.
        dry_run: When true (the default) nothing on disk is changed.

    Returns:
        A ``(flagged, healthy)`` tuple of lists. Each item is a dict with
        the keys ``title``, ``author``, ``size``, ``reason``, ``ext`` and
        ``path``. For a file moved in live mode, ``path`` is its new
        location inside *corrupted_folder*.
    """
    global LAST_FLAGGED, LAST_HEALTHY, FORMAT_STATS
    ebook_extensions = ['.epub', '.mobi', '.pdf', '.azw3']
    min_size_kb, max_size_mb = 50, 100
    LAST_FLAGGED, LAST_HEALTHY = [], []
    FORMAT_STATS = {ext: 0 for ext in ebook_extensions}
    if not dry_run and not os.path.exists(corrupted_folder):
        os.makedirs(corrupted_folder)
    print(f"\n[SCANNING] Analyzing: {root_dir}...")
    # Escape the root so characters such as '[' or '*' in folder names are
    # matched literally instead of being interpreted as glob syntax.
    escaped_root = glob.escape(root_dir)
    for ext in ebook_extensions:
        search_pattern = os.path.join(escaped_root, '**/*' + ext)
        for filepath in glob.glob(search_pattern, recursive=True):
            try:
                FORMAT_STATS[ext] += 1
                size_kb = os.path.getsize(filepath) / 1024
                is_epub = ext.lower() == '.epub'
                if is_epub:
                    meta = get_epub_metadata(filepath)
                else:
                    meta = {"title": os.path.basename(filepath), "author": "N/A"}
                # A structural failure takes precedence over size anomalies.
                reason = None
                if is_epub and not zipfile.is_zipfile(filepath):
                    reason = "Broken File Structure"
                elif size_kb < min_size_kb:
                    reason = "Too Small (Potential Corrupt)"
                elif size_kb > (max_size_mb * 1024):
                    reason = "Too Large (Anomalous)"
                entry = {
                    "title": meta['title'], "author": meta['author'],
                    "size": f"{size_kb:.2f} KB" if size_kb < 1024 else f"{size_kb/1024:.2f} MB",
                    "reason": reason if reason else "Healthy",
                    "ext": ext, "path": filepath
                }
                if reason:
                    # Files already sitting in the quarantine folder (it may
                    # live inside root_dir) are reported but not moved again.
                    if not dry_run and not _is_within(filepath, corrupted_folder):
                        dest = _unique_destination(corrupted_folder, os.path.basename(filepath))
                        shutil.move(filepath, dest)
                        entry["path"] = dest
                    LAST_FLAGGED.append(entry)
                else:
                    LAST_HEALTHY.append(entry)
            except Exception as e:
                # Per-file boundary: report the problem and keep scanning.
                print(f"[ERROR] {filepath}: {e}")
    return LAST_FLAGGED, LAST_HEALTHY


def _unique_destination(folder, filename):
    """Return a path for *filename* inside *folder* that does not exist yet.

    Prevents a flagged file from overwriting an earlier one that shares its
    base name: ``book.pdf`` becomes ``book_1.pdf``, ``book_2.pdf``, ...
    """
    stem, suffix = os.path.splitext(filename)
    candidate = os.path.join(folder, filename)
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(folder, f"{stem}_{counter}{suffix}")
        counter += 1
    return candidate


def _is_within(path, folder):
    """Return True when *path* is located inside the directory *folder*."""
    folder_abs = os.path.normcase(os.path.abspath(folder))
    path_abs = os.path.normcase(os.path.abspath(path))
    try:
        return os.path.commonpath([folder_abs, path_abs]) == folder_abs
    except ValueError:
        # commonpath raises when the paths cannot be compared
        # (for example, different drives on Windows).
        return False
def export_to_csv(filename="library_data.csv", encoding="utf-8-sig"):
    """Write every audited book (healthy first, then flagged) to a CSV file.

    Reads the module globals ``LAST_HEALTHY`` and ``LAST_FLAGGED``. If both
    are empty, a notice is printed and no file is written.

    Args:
        filename: Path of the CSV file to create or overwrite.
        encoding: Text encoding of the file. The default ``"utf-8-sig"``
            prepends a byte-order mark so that Excel detects UTF-8 and shows
            non-ASCII titles and authors correctly; pass ``"utf-8"`` for a
            BOM-less file.

    Returns:
        None. Success or failure is reported on standard output.
    """
    if not LAST_FLAGGED and not LAST_HEALTHY:
        print("\n[!] No data to export. Please run an [A]udit first.")
        return
    all_data = LAST_HEALTHY + LAST_FLAGGED
    keys = ["title", "author", "size", "ext", "reason", "path"]
    try:
        with open(filename, "w", newline="", encoding=encoding) as f:
            dict_writer = csv.DictWriter(f, fieldnames=keys)
            dict_writer.writeheader()
            dict_writer.writerows(all_data)
    except (OSError, csv.Error, ValueError) as e:
        # OSError: the file cannot be opened or written.
        # ValueError: unexpected keys in a row, or text that cannot be
        # encoded (UnicodeEncodeError is a ValueError subclass).
        print(f"[ERROR] Failed to export CSV: {e}")
    else:
        print(f"\n[SUCCESS] Exported {len(all_data)} records to '{filename}'")
def show_stats():
    """Print how many files of each format the last audit found.

    Reads the module global ``FORMAT_STATS`` (extension -> count) and prints
    one row per extension with its count and its share of the total. If the
    total is zero, a notice is printed instead.
    """
    total = sum(FORMAT_STATS.values())
    if total == 0:
        print("\n[!] No data. Run an [A]udit first.")
        return
    # The header emoji was mis-decoded UTF-8 ("πŸ“Š"); restored to U+1F4CA.
    print(f"\n--- 📊 FORMAT BREAKDOWN (Total: {total}) ---")
    for ext, count in FORMAT_STATS.items():
        pct = count / total * 100
        print(f"{ext.upper():<8} | {count:<5} | {pct:.1f}%")
def search_library():
    """Prompt for a query and list healthy books whose title or author match.

    Reads the module global ``LAST_HEALTHY``. Matching is a case-insensitive
    substring test against each entry's ``title`` and ``author``. Prints one
    line per match, or a notice when the catalog is empty or nothing
    matches. Returns None.
    """
    if not LAST_HEALTHY:
        print("\n[!] Catalog is empty. Run an [A]udit first.")
        return
    query = input("\nSearch Title/Author: ").strip().lower()
    # `or ""` guards against entries whose title/author is None (an EPUB
    # with an empty metadata element), which would otherwise raise on
    # .lower().
    matches = [
        book for book in LAST_HEALTHY
        if query in (book.get("title") or "").lower()
        or query in (book.get("author") or "").lower()
    ]
    if not matches:
        print(f"[!] No matches found for '{query}'.")
        return
    for book in matches:
        # The glyphs were mis-decoded UTF-8 in the original. The book and
        # person glyphs are restored exactly; the size glyph had lost its
        # final byte, so a ruler (U+1F4CF) is assumed. TODO confirm.
        print(f"📖 {book['title']} | 👤 {book['author']} | 📏 {book['size']}")
def main():
    """Run the interactive menu loop until the user quits.

    Each pass prints the menu, reads one option and dispatches to the
    matching command. End-of-input or Ctrl-C at any prompt exits the loop
    instead of raising a traceback.
    """
    # Commands that take no arguments, keyed by their menu letter.
    simple_commands = {
        's': search_library,
        't': show_stats,
        'e': export_to_csv,
        'h': show_help,
    }
    while True:
        print("\n[MENU] (A)udit | (S)earch | s(T)ats | (E)xport | (H)elp | (Q)uit")
        try:
            choice = input("Option: ").strip().lower()
            if choice == 'q':
                break
            if choice == 'a':
                path = input("Enter path: ").strip()
                if not os.path.isdir(path):
                    print("Path not found.")
                    continue
                # Anything other than an explicit 'n' keeps the safe dry run.
                is_dry = input("Dry Run? (Y/n): ").strip().lower() != 'n'
                flagged, healthy = audit_library(path, dry_run=is_dry)
                # Report what the audit actually produced; this script does
                # not write any report files.
                print(f"Audit Complete! {len(healthy)} healthy, {len(flagged)} flagged.")
            elif choice in simple_commands:
                simple_commands[choice]()
            else:
                print("Unknown option. Choose A, S, T, E, H or Q.")
        except (EOFError, KeyboardInterrupt):
            print()
            break


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment