Skip to content

Instantly share code, notes, and snippets.

@rayvoelker
Last active March 6, 2026 22:28
Show Gist options
  • Select an option

  • Save rayvoelker/558ac1812ff2dc29c22ba4ed9cc1a72c to your computer and use it in GitHub Desktop.

Select an option

Save rayvoelker/558ac1812ff2dc29c22ba4ed9cc1a72c to your computer and use it in GitHub Desktop.
ilsaux Documentation Generator Scripts - CHPL Sierra ILS
#!/usr/bin/env python3
"""Generate archive classification and migration plan from all manifests.
Output: docs/ilsaux/archive-plan.md
"""
import csv
import os
import sys
import time
from collections import Counter, defaultdict
from datetime import datetime
MANIFEST_DIR = "/home/ray/claude/docs/ilsaux/manifests"
OUT_PATH = "/home/ray/claude/docs/ilsaux/archive-plan.md"
def human_size(nbytes):
    """Format a byte count as a short human-readable string, e.g. '1.5 MB'."""
    value = nbytes
    units = ("B", "KB", "MB", "GB", "TB")
    idx = 0
    while idx < len(units):
        if abs(value) < 1024:
            return f"{value:.1f} {units[idx]}"
        value /= 1024
        idx += 1
    # Anything that exhausts the table is reported in petabytes.
    return f"{value:.1f} PB"
def read_csv_file(filename):
    """Load a manifest CSV from MANIFEST_DIR as a list of row dicts.

    Returns [] (after a stderr warning) when the file does not exist.
    """
    full_path = os.path.join(MANIFEST_DIR, filename)
    if not os.path.exists(full_path):
        print(f" WARNING: {full_path} not found", file=sys.stderr)
        return []
    with open(full_path, newline="") as handle:
        reader = csv.DictReader(handle)
        return list(reader)
def main():
    """Generate docs/ilsaux/archive-plan.md from the Phase 1 manifest CSVs.

    Reads report-status, file-manifest, credential-locations and git-summaries
    manifests, classifies every report into a migration category, and writes a
    single markdown plan covering classification, credential rotation, size
    breakdown, migration priorities and recommendations.
    """
    t_start = time.monotonic()
    print("[generate-archive-plan] Generating archive classification ...", file=sys.stderr)
    report_status = read_csv_file("report-status.csv")
    file_manifest = read_csv_file("file-manifest.csv")
    credential_locs = read_csv_file("credential-locations.csv")
    # NOTE(review): git_summaries is loaded but never used below.
    git_summaries = read_csv_file("git-summaries.csv")
    # Build size map per report directory
    report_sizes = defaultdict(int)
    report_file_counts = defaultdict(int)
    top_dir_sizes = defaultdict(int)
    for f in file_manifest:
        parts = f["parent_dir"].split("/")
        # Only "Reports/<name>/..." paths contribute to per-report totals.
        if len(parts) >= 2 and parts[0] == "Reports":
            report_sizes[parts[1]] += int(f["size_bytes"])
            report_file_counts[parts[1]] += 1
        # Every file also contributes to its top-level directory's total.
        top = parts[0] if parts[0] != "." else "(root)"
        top_dir_sizes[top] += int(f["size_bytes"])
    # Classify reports
    categories = {
        "active-critical": [],
        "active": [],
        "inactive-recent": [],
        "inactive-legacy": [],
        "obsolete": [],
    }
    for r in report_status:
        name = r["report_name"]
        status = r["status"]
        # "sl" prefix marks shelf-list reports, the highest-priority group.
        is_sl = name.startswith("sl")
        if status == "active" and is_sl:
            categories["active-critical"].append(r)
        elif status == "active":
            categories["active"].append(r)
        elif status == "inactive-recent":
            categories["inactive-recent"].append(r)
        elif status == "obsolete":
            # Further classify by last run date
            last = r.get("last_run_date", "")
            if last:
                try:
                    dt = datetime.strptime(last, "%Y-%m-%d")
                    if dt.year >= 2020:
                        categories["inactive-recent"].append(r)
                    else:
                        categories["inactive-legacy"].append(r)
                    continue
                except ValueError:
                    # Unparseable date: fall through to "obsolete".
                    pass
            categories["obsolete"].append(r)
        else:
            # Any unrecognized status is treated as obsolete.
            categories["obsolete"].append(r)
    # Credential files per report
    cred_by_report = defaultdict(set)
    for c in credential_locs:
        parts = c["file"].split("/")
        if len(parts) >= 2:
            # Reports/name/file -> name
            if parts[0] == "Reports":
                cred_by_report[parts[1]].add(c["credential_type"])
    with open(OUT_PATH, "w") as f:
        def w(line=""):
            # Helper: emit one markdown line (newline appended).
            f.write(line + "\n")
        w("# ILS Auxiliary Server Archive & Migration Plan")
        w()
        w(f"Generated: {datetime.now().isoformat(timespec='seconds')}")
        w()
        w("---")
        w()
        # Classification table
        w("## Report Classification")
        w()
        for cat, label, desc in [
            ("active-critical", "Active-Critical", "Running in 2026, shelf list reports -- document fully, migrate first"),
            ("active", "Active", "Running in 2026, non-shelf-list -- document, evaluate for migration"),
            ("inactive-recent", "Inactive-Recent", "Last run 2020-2025 -- review with stakeholders before archiving"),
            ("inactive-legacy", "Inactive-Legacy", "Last run before 2020 -- archive only, low priority"),
            ("obsolete", "Obsolete", "No output found or commented out -- archive as historical record"),
        ]:
            reports = categories[cat]
            if not reports:
                continue
            total_size = sum(report_sizes.get(r["report_name"], 0) for r in reports)
            total_files = sum(report_file_counts.get(r["report_name"], 0) for r in reports)
            w(f"### {label} ({len(reports)} reports, {human_size(total_size)}, {total_files:,} files)")
            w()
            w(f"> {desc}")
            w()
            w(f"| Report | Full Name | Last Run | Size | Credentials |")
            w(f"|--------|-----------|----------|------|-------------|")
            for r in sorted(reports, key=lambda x: x["report_name"]):
                name = r["report_name"]
                fullname = r.get("fullname", "") or ""
                last = r.get("last_run_date", "") or "never"
                size = human_size(report_sizes.get(name, 0))
                creds = ", ".join(sorted(cred_by_report.get(name, []))) or "none"
                w(f"| `{name}` | {fullname} | {last} | {size} | {creds} |")
            w()
        # Historical/non-report directories
        w("### Historical Directories")
        w()
        w("These are not report directories but contain historical data:")
        w()
        historical_dirs = ["Symphony_Hist", "Symphony_Bincustom", "webpac"]
        for d in historical_dirs:
            size = top_dir_sizes.get(d, 0)
            if size > 0:
                w(f"- **{d}**: {human_size(size)} -- archive as historical record")
        w()
        # Credential rotation
        w("## Credential Rotation Requirements")
        w()
        w(f"Total credential references found: {len(credential_locs)}")
        w()
        cred_types = Counter(c["credential_type"] for c in credential_locs)
        w("| Type | Count | Action |")
        w("|------|-------|--------|")
        for ctype, count in cred_types.most_common():
            action = "Rotate immediately" if ctype in ("password", "db_connection") else "Review"
            w(f"| {ctype} | {count} | {action} |")
        w()
        w("**All `.cfg` files contain plaintext credentials and must NOT be migrated as-is.**")
        w()
        # Size breakdown
        w("## Size Breakdown")
        w()
        total_size = sum(int(fi["size_bytes"]) for fi in file_manifest)
        w(f"**Total ilsaux size:** {human_size(total_size)}")
        w()
        active_size = sum(
            report_sizes.get(r["report_name"], 0)
            for r in categories["active-critical"] + categories["active"]
        )
        inactive_size = sum(
            report_sizes.get(r["report_name"], 0)
            for r in categories["inactive-recent"] + categories["inactive-legacy"]
        )
        obsolete_size = sum(
            report_sizes.get(r["report_name"], 0)
            for r in categories["obsolete"]
        )
        w(f"| Category | Size | Percentage |")
        w(f"|----------|------|------------|")
        for label, size in [
            ("Active (critical + other)", active_size),
            ("Inactive (recent + legacy)", inactive_size),
            ("Obsolete", obsolete_size),
            # Remainder is everything outside Reports/ (top-level dirs, root files).
            ("Non-report dirs", total_size - active_size - inactive_size - obsolete_size),
        ]:
            pct = (size / total_size * 100) if total_size > 0 else 0
            w(f"| {label} | {human_size(size)} | {pct:.1f}% |")
        w()
        # Migration priorities
        w("## Migration Priorities")
        w()
        w("### Priority 1: Active-Critical (Shelf List Reports)")
        w()
        for r in categories["active-critical"]:
            w(f"1. **`{r['report_name']}`** -- {r.get('fullname', '')}")
        w()
        w("These reports are actively running and serve shelf list operations.")
        w("Document fully, test migration, coordinate with staff.")
        w()
        w("### Priority 2: Active (Other Reports)")
        w()
        for r in categories["active"]:
            w(f"1. **`{r['report_name']}`** -- {r.get('fullname', '')}")
        w()
        w("Running in production. Evaluate each for continued need.")
        w()
        w("### Priority 3: Inactive-Recent")
        w()
        w("Review with stakeholders. Some may need reactivation, others can be archived.")
        w()
        w("### Priority 4: Legacy & Historical")
        w()
        w("Archive for reference. No migration needed.")
        w()
        # Recommendations
        w("## Recommendations")
        w()
        w("1. **Credential rotation**: All plaintext credentials must be rotated before any migration")
        w("2. **Config modernization**: Replace Config::Simple `.cfg` with environment variables or vault")
        w("3. **Consolidate git**: Merge per-report repos into monorepo for easier management")
        w("4. **Archive Symphony data**: The 13 GB of historical Symphony logs can be compressed and cold-stored")
        w("5. **Document sl-reports first**: These are the highest-value, most-used reports")
        w("6. **Test framework**: The generic-cron.sh pattern is sound but should be modernized (systemd timers, structured logging)")
    print(f" Wrote {OUT_PATH}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Document the generic-cron.sh execution framework and .cfg file format.
Output: docs/ilsaux/framework/generic-cron-framework.md, config-file-format.md
"""
import os
import re
import sys
import time
BASE_DIR = "/home/ray/Documents/ilsaux"
OUT_DIR = "/home/ray/claude/docs/ilsaux/framework"
def document_generic_cron():
    """Read and document the generic-cron.sh framework.

    Reads Reports/generic/generic-cron.sh (for the line count only) and writes
    a hand-authored markdown description to OUT_DIR/generic-cron-framework.md.
    Returns without writing anything if the shell script cannot be read.
    """
    cron_path = os.path.join(BASE_DIR, "Reports/generic/generic-cron.sh")
    try:
        with open(cron_path, "r") as f:
            content = f.read()
    except OSError as e:
        print(f" ERROR: {e}", file=sys.stderr)
        return
    # Only the line count is derived from the file; the body below is static prose.
    lines = content.split("\n")
    doc_path = os.path.join(OUT_DIR, "generic-cron-framework.md")
    with open(doc_path, "w") as f:
        f.write("# Generic Cron Framework\n\n")
        f.write(f"**File:** `Reports/generic/generic-cron.sh`\n")
        f.write(f"**Lines:** {len(lines)}\n\n")
        f.write("---\n\n")
        f.write("## Overview\n\n")
        f.write("The generic-cron.sh script is the execution framework for all ilsaux reports.\n")
        f.write("Each report has a thin wrapper (`<name>-cron.sh`) that sets variables and sources this script.\n\n")
        f.write("## Execution Flow\n\n")
        f.write("1. **Variable Setup** -- Date variables (TODAY, WEEKAGO, MONTHAGO, YEARAGO)\n")
        f.write("2. **Defaults** -- Sets REPORTNAME, LOGFILE, JSONFILE, KEEPPERIOD, LINK if not provided by wrapper\n")
        f.write("3. **Run Report** -- `cd` to report dir, run `perl ./$SOURCEFILE >> $LOGFILE-$TODAY.txt`\n")
        f.write("4. **JSON Metadata** -- Creates timestamped JSON with fullName, name, date, timeStarted, timeFinished, logFile, link\n")
        f.write("5. **Cleanup** -- Deletes old log files and JSON based on KEEPPERIOD (WEEK/MONTH/YEAR)\n")
        f.write("6. **MESA Integration** -- Copies JSON + log to `/var/www/html/mesa/`, cleans old files, rebuilds index\n\n")
        f.write("## Required Variables (set by wrapper)\n\n")
        f.write("| Variable | Required | Default | Description |\n")
        f.write("|----------|----------|---------|-------------|\n")
        f.write("| `REPORTNAME` | Yes | `generic` | Directory name and base filename |\n")
        f.write("| `FULLNAME` | Yes | `Generic Report` | Human-readable name for JSON/MESA |\n")
        f.write("| `SOURCEFILE` | Yes | `SierraGenericReport.pl` | Perl script filename |\n")
        f.write("| `LINK` | No | `nil` | URL for the report output |\n")
        f.write("| `KEEPPERIOD` | No | `YEAR` | Retention: WEEK, MONTH, or YEAR |\n")
        f.write("| `LOGFILE` | No | `$REPORTNAME-log` | Log file basename |\n")
        f.write("| `JSONFILE` | No | `$REPORTNAME` | JSON metadata basename |\n\n")
        f.write("## Cron Wrapper Pattern\n\n")
        f.write("Every report follows this pattern:\n\n")
        f.write("```bash\n")
        f.write('#!/bin/bash\n\n')
        f.write('REPORTNAME=slmainmissing\n')
        f.write('FULLNAME="Shelflist - Main Missing"\n')
        f.write('SOURCEFILE=SierraShelfListMainMissing.pl\n')
        f.write('LINK="http://[REDACTED-HOST]/ils/shelflists/mainmissing.asp"\n')
        f.write('KEEPPERIOD=MONTH\n\n')
        f.write('source ~/Reports/generic/generic-cron.sh\n')
        f.write("```\n\n")
        f.write("## JSON Metadata Format\n\n")
        f.write("```json\n")
        f.write('{\n')
        f.write(' "fullName": "Shelflist - Main Missing",\n')
        f.write(' "name": "slmainmissing",\n')
        f.write(' "date": "2026-01-15",\n')
        f.write(' "timeStarted": "1737000000",\n')
        f.write(' "timeFinished": "1737000300",\n')
        f.write(' "logFile": "slmainmissing-log-20260115.txt",\n')
        f.write(' "link": "http://[REDACTED-HOST]/ils/shelflists/mainmissing.asp"\n')
        f.write('}\n')
        f.write("```\n\n")
        f.write("## MESA Dashboard Integration\n\n")
        f.write("- JSON metadata copied to `/var/www/html/mesa/finished/`\n")
        f.write("- Log files copied to `/var/www/html/mesa/logs/`\n")
        f.write("- `json-wn.pl` generates `/var/www/html/mesa/upcoming.json`\n")
        f.write("- `json-index.pl` generates `/var/www/html/mesa/finished/index.json`\n")
        f.write("- Old MESA files cleaned after 32 days\n\n")
        f.write("## Retention Periods\n\n")
        f.write("| Period | Log Cleanup | JSON Cleanup |\n")
        f.write("|--------|-------------|-------------|\n")
        f.write("| WEEK | 7 days | 8 days |\n")
        f.write("| MONTH | 30 days | 32 days |\n")
        f.write("| YEAR | 365 days | 366 days |\n\n")
        f.write("## Historical Note\n\n")
        f.write("The script contains commented-out FTP code that previously transferred files to `[REDACTED-HOST]`.\n")
        f.write("This was replaced by direct file copy to the MESA web directory on the same server.\n")
    print(f" Wrote {doc_path}", file=sys.stderr)
def document_config_format():
    """Document the .cfg file format used by reports.

    Walks BASE_DIR/Reports for .cfg files, tallies which keys appear in which
    files, and writes OUT_DIR/config-file-format.md with a usage-count table
    (descriptions are guessed from key-name substrings).
    """
    doc_path = os.path.join(OUT_DIR, "config-file-format.md")
    # Scan for .cfg files to find common keys
    config_keys = {}  # key -> [files]
    reports_dir = os.path.join(BASE_DIR, "Reports")
    for root, dirs, files in os.walk(reports_dir):
        # Prune .git directories in place so os.walk skips them.
        dirs[:] = [d for d in dirs if d != ".git"]
        for name in files:
            if not name.endswith(".cfg"):
                continue
            path = os.path.join(root, name)
            rel = os.path.relpath(path, BASE_DIR)
            try:
                with open(path, "r", errors="replace") as f:
                    for line in f:
                        line = line.strip()
                        # Config::Simple format: key value or key=value
                        # NOTE(review): despite the comment, this regex only
                        # matches "key=value" / "key: value" separators -- a
                        # bare "key value" line is NOT captured. Confirm
                        # against real .cfg files whether that form occurs.
                        m = re.match(r'^(\w+)\s*[=:]\s*(.+)', line)
                        if m:
                            key = m.group(1)
                            # A key is counted once per matching line, so a file
                            # repeating a key appears multiple times in the list.
                            config_keys.setdefault(key, []).append(rel)
            except OSError:
                # Unreadable file: skip silently, best-effort scan.
                pass
    with open(doc_path, "w") as f:
        f.write("# Config File Format (.cfg)\n\n")
        f.write("Reports use `Config::Simple` to read `.cfg` files.\n")
        f.write("Format: `key value` or `key=value` (one per line).\n\n")
        f.write("---\n\n")
        f.write("## Common Configuration Keys\n\n")
        f.write("| Key | Used By (count) | Description |\n")
        f.write("|-----|----------------|-------------|\n")
        # Most widely used keys first.
        for key in sorted(config_keys, key=lambda k: len(config_keys[k]), reverse=True):
            count = len(config_keys[key])
            # Heuristic description based on substrings of the key name.
            desc = ""
            kl = key.lower()
            if "module" in kl:
                desc = "Path to Sierra:: Perl modules"
            elif "host" in kl or "server" in kl:
                desc = "Database or server hostname"
            elif "database" in kl or "dbname" in kl:
                desc = "Database name"
            elif "user" in kl:
                desc = "Database or service username"
            elif "password" in kl or "passwd" in kl:
                desc = "**CREDENTIAL** -- database or service password"
            elif "port" in kl:
                desc = "Service port number"
            elif "ftp" in kl:
                desc = "FTP-related setting"
            elif "output" in kl or "file" in kl:
                desc = "Output file path"
            f.write(f"| `{key}` | {count} | {desc} |\n")
        f.write("\n## Security Note\n\n")
        f.write("Many `.cfg` files contain **plaintext credentials** (database passwords, FTP credentials).\n")
        f.write("These are NOT documented here and must be rotated as part of any migration.\n")
        f.write("See `credential-locations.csv` for an inventory of affected files.\n")
    print(f" Wrote {doc_path}", file=sys.stderr)
def main():
    """Entry point: time and run both framework-documentation generators."""
    started = time.monotonic()
    print("[generate-framework-doc] Generating framework documentation ...", file=sys.stderr)
    document_generic_cron()
    document_config_format()
    elapsed = time.monotonic() - started
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Generate documentation for each Sierra:: Perl module.
Reads the modules directly plus perl-dependencies.csv for reverse dependency map.
Output: docs/ilsaux/modules/ (one .md per module)
"""
import csv
import os
import re
import sys
import time
BASE_DIR = "/home/ray/Documents/ilsaux/Modules/Sierra"
MANIFEST_DIR = "/home/ray/claude/docs/ilsaux/manifests"
OUT_DIR = "/home/ray/claude/docs/ilsaux/modules"
CREDENTIAL_PATTERNS = [
re.compile(r'password', re.IGNORECASE),
re.compile(r'passwd', re.IGNORECASE),
re.compile(r'secret', re.IGNORECASE),
re.compile(r'DBI->connect', re.IGNORECASE),
]
def extract_module_info(filepath):
    """Extract package, exports, subs, and credential flags from a .pm file.

    Returns a dict with keys: package, export_ok, export, subs (list of dicts
    with name/line/comments/params), has_credentials, credential_lines,
    data_maps, line_count. All line numbers are 1-based.
    """
    with open(filepath, "r", errors="replace") as f:
        content = f.read()
    lines = content.split("\n")
    # Package name
    package = ""
    m = re.search(r'^package\s+([\w:]+)', content, re.MULTILINE)
    if m:
        package = m.group(1)
    # Exports
    export_ok = []
    export = []
    for m in re.finditer(r'@EXPORT_OK\s*=\s*qw\(\s*(.*?)\s*\)', content, re.DOTALL):
        export_ok.extend(m.group(1).split())
    for m in re.finditer(r'@EXPORT\s*=\s*qw\(\s*(.*?)\s*\)', content, re.DOTALL):
        export.extend(m.group(1).split())
    # Subroutines
    subs = []
    for i, line in enumerate(lines):
        sm = re.match(r'^sub\s+(\w+)', line)
        if sm:
            sub_name = sm.group(1)
            # Look back for leading comments (contiguous "#" lines above the sub)
            comments = []
            j = i - 1
            while j >= 0 and lines[j].strip().startswith("#"):
                comments.insert(0, lines[j].strip().lstrip("#").strip())
                j -= 1
            # Parameter unpacking: scan up to 10 lines ahead for `my (...) = @_`
            params = ""
            for k in range(i, min(i + 10, len(lines))):
                pm = re.search(r'my\s*\(([^)]+)\)\s*=\s*@_', lines[k])
                if pm:
                    params = pm.group(1).strip()
                    break
            subs.append({
                "name": sub_name,
                "line": i + 1,
                "comments": comments,
                "params": params,
            })
    # Credential flags: record each line matching any CREDENTIAL_PATTERNS entry
    has_credentials = False
    cred_lines = []
    for i, line in enumerate(lines, 1):
        for pat in CREDENTIAL_PATTERNS:
            if pat.search(line):
                has_credentials = True
                cred_lines.append(i)
                break
    # Data structures (hashes): top-of-line `my %name = (` declarations
    data_maps = []
    for i, line in enumerate(lines):
        hm = re.match(r'my\s+(%\w+)\s*=\s*\(', line)
        if hm:
            data_maps.append({"name": hm.group(1), "line": i + 1})
    return {
        "package": package,
        "export_ok": export_ok,
        "export": export,
        "subs": subs,
        "has_credentials": has_credentials,
        "credential_lines": cred_lines,
        "data_maps": data_maps,
        "line_count": len(lines),
    }
def main():
    """Generate one markdown doc per Sierra:: module found under BASE_DIR.

    Builds a reverse-dependency map from perl-dependencies.csv (who uses each
    module), then writes OUT_DIR/sierra-<name>.md for every non-backup .pm file.
    """
    if not os.path.isdir(BASE_DIR):
        print(f"Error: {BASE_DIR} not found", file=sys.stderr)
        sys.exit(1)
    t_start = time.monotonic()
    print("[generate-module-docs] Generating module documentation ...", file=sys.stderr)
    # Build reverse dependency map from perl-dependencies.csv
    reverse_deps = {}  # module_name -> [files that use it]
    deps_path = os.path.join(MANIFEST_DIR, "perl-dependencies.csv")
    if os.path.exists(deps_path):
        with open(deps_path, newline="") as f:
            for row in csv.DictReader(f):
                # Only project-local (Sierra::) modules go in the map.
                if row["classification"] == "local":
                    reverse_deps.setdefault(row["module"], []).append(row["file"])
    # Process each .pm file
    count = 0
    for name in sorted(os.listdir(BASE_DIR)):
        if not name.endswith(".pm"):
            continue
        # Skip backup files
        if "backup" in name or name.endswith(".orig"):
            continue
        filepath = os.path.join(BASE_DIR, name)
        print(f" {name} ...", file=sys.stderr)
        info = extract_module_info(filepath)
        # Generate slug for output filename
        slug = name.replace(".pm", "").lower()
        slug = f"sierra-{slug}"
        # Find users of this module; fall back to a synthesized package name
        # when the .pm had no `package` declaration.
        module_name = info["package"] or f"Sierra::{name.replace('.pm', '')}"
        users = reverse_deps.get(module_name, [])
        doc_path = os.path.join(OUT_DIR, f"{slug}.md")
        with open(doc_path, "w") as f:
            f.write(f"# {module_name}\n\n")
            f.write(f"**File:** `Modules/Sierra/{name}`\n")
            f.write(f"**Lines:** {info['line_count']}\n")
            if info["has_credentials"]:
                f.write(f"**WARNING:** Contains credential references (lines: {', '.join(map(str, info['credential_lines']))})\n")
            f.write("\n---\n\n")
            # Exports
            f.write("## Exports\n\n")
            if info["export_ok"]:
                f.write("**@EXPORT_OK:**\n")
                for sym in info["export_ok"]:
                    f.write(f"- `{sym}`\n")
            if info["export"]:
                f.write("\n**@EXPORT (auto-imported):**\n")
                for sym in info["export"]:
                    f.write(f"- `{sym}`\n")
            if not info["export_ok"] and not info["export"]:
                f.write("No exports defined.\n")
            f.write("\n")
            # Subroutines
            f.write("## Subroutines\n\n")
            if info["subs"]:
                for s in info["subs"]:
                    params = f"({s['params']})" if s["params"] else "()"
                    f.write(f"### `{s['name']}`{params}\n\n")
                    f.write(f"Line {s['line']}\n\n")
                    if s["comments"]:
                        f.write("> " + " ".join(s["comments"]) + "\n\n")
            else:
                f.write("No subroutines found.\n")
            f.write("\n")
            # Data structures
            if info["data_maps"]:
                f.write("## Data Structures\n\n")
                for dm in info["data_maps"]:
                    f.write(f"- `{dm['name']}` (line {dm['line']})\n")
                f.write("\n")
            # Used by
            f.write("## Used By\n\n")
            if users:
                for u in sorted(users):
                    f.write(f"- `{u}`\n")
            else:
                f.write("No known users found in dependency scan.\n")
            f.write("\n")
        count += 1
    print(f"\n Generated {count} module docs in {OUT_DIR}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Generate per-report markdown documentation from Phase 1 manifests.
Reads report-status.csv, perl-dependencies.csv, git-summaries.csv, and script-content.json
to produce pre-populated documentation for each report.
Output: docs/ilsaux/reports/_template.md + one .md per report
"""
import csv
import json
import os
import re
import sys
import time
from datetime import datetime
BASE_DIR = "/home/ray/Documents/ilsaux"
MANIFEST_DIR = "/home/ray/claude/docs/ilsaux/manifests"
OUT_DIR = "/home/ray/claude/docs/ilsaux/reports"
SL_PREFIX = "sl"
TEMPLATE = """# {fullname}
**Report:** `{report_name}`
**Status:** {status}{priority}
**Last Run:** {last_run_date}
**Retention:** {keepperiod}
**Perl Script:** `{pl_file}` ({pl_lines} lines)
**Link:** {link}
**Has Git:** {has_git}
---
## Purpose
{purpose}
---
## Execution Flow
{execution_narrative}
---
## Dependencies
### Sierra:: Modules
{sierra_deps}
### CPAN Modules
{cpan_deps}
---
## Subroutines
{subroutines}
---
## SQL Queries
{sql_queries}
---
## Domain Data Maps
{data_maps}
---
## Configuration Keys
{config_keys}
---
## Known Issues / TODOs
{todos}
---
## Historical Notes (Commented-out Code)
{historical_notes}
---
## Git History
{git_history}
---
## Database Connections
{db_connections}
---
<!-- TODO: Add business context -->
<!-- TODO: Add stakeholders -->
<!-- TODO: Add migration plan -->
"""
def read_csv_file(filename):
    """Load a manifest CSV from MANIFEST_DIR as a list of row dicts.

    Returns [] (after a stderr warning) when the file does not exist.
    """
    full_path = os.path.join(MANIFEST_DIR, filename)
    if not os.path.exists(full_path):
        print(f" WARNING: {full_path} not found", file=sys.stderr)
        return []
    with open(full_path, newline="") as handle:
        return [row for row in csv.DictReader(handle)]
def read_json_file(filename):
    """Load a manifest JSON file from MANIFEST_DIR.

    Returns [] (after a stderr warning) when the file does not exist.
    """
    full_path = os.path.join(MANIFEST_DIR, filename)
    if not os.path.exists(full_path):
        print(f" WARNING: {full_path} not found", file=sys.stderr)
        return []
    with open(full_path) as handle:
        return json.load(handle)
def format_subroutines(subs):
    """Render extracted subroutine records as a markdown bullet list."""
    if not subs:
        return "No subroutines found."
    rendered = []
    for entry in subs:
        sig = f"({entry['parameters']})" if entry.get("parameters") else "()"
        note = ""
        if entry.get("leading_comments"):
            note = " -- " + " ".join(entry["leading_comments"])
        rendered.append(
            f"- **`{entry['name']}`**{sig} (line {entry['line']}, ~{entry['line_count']} lines){note}"
        )
    return "\n".join(rendered)
def format_sql(queries):
    """Render extracted SQL queries as numbered, fenced markdown code blocks."""
    if not queries:
        return "No SQL queries extracted."
    chunks = []
    for idx, query in enumerate(queries, 1):
        chunks.append(f"### Query {idx} (`${query['variable']}`, line {query['start_line']})")
        chunks.append("```sql")
        chunks.append(query["sql"])
        chunks.append("```")
        chunks.append("")
    return "\n".join(chunks)
def format_data_maps(maps):
    """Render extracted Perl hash data-maps as a markdown bullet list."""
    if not maps:
        return "No data maps found."
    bullets = []
    for entry in maps:
        sample = ", ".join(entry["sample_keys"])
        bullets.append(
            f"- **`{entry['variable']}`** (line {entry['line']}, {entry['key_count']} keys) -- sample: {sample}"
        )
    return "\n".join(bullets)
def format_config_keys(refs):
    """Render config-key references, keeping the first line each key appears on."""
    if not refs:
        return "No config keys found."
    first_line = {}
    for ref in refs:
        # setdefault preserves the earliest (first-seen) line per key.
        first_line.setdefault(ref["key"], ref["line"])
    return "\n".join(
        f"- `{key}` (first used line {line})" for key, line in sorted(first_line.items())
    )
def format_execution_narrative(prints):
    """Summarize a script's print statements as a fenced, line-numbered trace."""
    if not prints:
        return "No print statements extracted."
    shown = []
    for stmt in prints:
        cleaned = sanitize_text(stmt["text"])
        # Drop empty lines and separator-only output such as "-----" or "====".
        if cleaned and not all(ch in "+-=." for ch in cleaned):
            shown.append(f"{stmt['line']:>5}: {cleaned}")
    if not shown:
        return "No meaningful print statements."
    # Cap at 40 lines so the generated doc stays readable.
    return "```\n" + "\n".join(shown[:40]) + "\n```"
def format_todos(todos):
    """Render TODO comments as a markdown bullet list."""
    if not todos:
        return "None found."
    return "\n".join(f"- Line {item['line']}: {item['text']}" for item in todos)
def format_historical_notes(blocks):
    """Render commented-out code blocks (historical remnants) as bullets."""
    if not blocks:
        return "No significant commented-out code blocks found."
    bullets = []
    for blk in blocks:
        # Preview at most the first two sample lines, credential-sanitized.
        preview = sanitize_text(" / ".join(blk["sample"][:2]))
        bullets.append(
            f"- Lines {blk['start_line']}-{blk['end_line']} ({blk['line_count']} lines): `{preview}`"
        )
    return "\n".join(bullets)
def sanitize_text(text):
    """Belt-and-suspenders credential redaction for generated output.

    Applies a series of regex substitutions that replace known credentials,
    internal hostnames, and internal email addresses with [REDACTED-*] markers
    before any extracted text is written into public documentation.
    """
    # Literal passwords
    # NOTE(review): these three patterns appear to have been destroyed when
    # this script was itself published with redaction -- "[REDACTED-PASSWORD]"
    # is now a regex *character class* (and the first two are duplicates), not
    # the original literal passwords. Restore the real literals (ideally loaded
    # from a non-published location) before relying on this redaction.
    text = re.sub(r'\b[REDACTED-PASSWORD]\b', '[REDACTED-PASSWORD]', text, flags=re.IGNORECASE)
    text = re.sub(r'\b[REDACTED-PASSWORD]\b', '[REDACTED-PASSWORD]', text, flags=re.IGNORECASE)
    text = re.sub(r'[REDACTED-PASSWORD]', '[REDACTED-PASSWORD]', text, flags=re.IGNORECASE)
    # DB usernames as quoted strings
    text = re.sub(r'(?<=["\'])(?:sqlaccess|sqllabels\d*|sqldataentryerrors|svc_vmsp1)(?=["\'])',
    '[REDACTED-USER]', text, flags=re.IGNORECASE)
    # Internal hostnames
    text = re.sub(r'\b[\w.-]+\.plch\.net\b', '[REDACTED-HOST]', text, flags=re.IGNORECASE)
    text = re.sub(r'\b[\w.-]+\.iii\.com\b', '[REDACTED-HOST]', text, flags=re.IGNORECASE)
    text = re.sub(r'\b[\w.-]+\.cincinnatilibrary\.org\b', '[REDACTED-HOST]', text, flags=re.IGNORECASE)
    # Internal emails
    text = re.sub(r'\b[\w.+-]+@cincinnatilibrary\.org\b', '[REDACTED-EMAIL]', text, flags=re.IGNORECASE)
    text = re.sub(r'\b[\w.+-]+@plch\.net\b', '[REDACTED-EMAIL]', text, flags=re.IGNORECASE)
    # DBI->connect credential args: keep the DSN, redact user + password
    text = re.sub(r'(DBI->connect\(\s*"[^"]*"\s*,\s*)"[^"]*"\s*,\s*"[^"]*"',
    r'\1"[REDACTED-USER]","[REDACTED-PASSWORD]"', text, flags=re.IGNORECASE)
    # $password/$username = "..."
    text = re.sub(r'(\$password\s*=\s*)"[^"]*"', r'\1"[REDACTED-PASSWORD]"', text, flags=re.IGNORECASE)
    text = re.sub(r'(\$username\s*=\s*)"[^"]*"', r'\1"[REDACTED-USER]"', text, flags=re.IGNORECASE)
    return text
def format_db_connections(conns):
    """Render direct DBI connection sites, with credentials sanitized."""
    if not conns:
        return "No direct DB connections found (may use Sierra::DB module)."
    rendered = []
    for conn in conns:
        cleaned = sanitize_text(conn['pattern'])
        rendered.append(f"- Line {conn['line']}: `{cleaned}`")
    return "\n".join(rendered)
def find_script_content(script_contents, report_name):
    """Return script-content entries whose file lives under Reports/<report_name>/."""
    prefix = f"Reports/{report_name}/"
    return [entry for entry in script_contents if entry["file"].startswith(prefix)]
def main():
    """Generate per-report markdown docs plus a _template.md reference file.

    Reads the Phase 1 manifests, aggregates extracted script content per
    report, and renders TEMPLATE once per report into OUT_DIR/<name>.md.
    """
    t_start = time.monotonic()
    print("[generate-report-docs] Generating report documentation ...", file=sys.stderr)
    # Read manifests
    report_status = read_csv_file("report-status.csv")
    perl_deps = read_csv_file("perl-dependencies.csv")
    git_summaries = read_csv_file("git-summaries.csv")
    script_contents = read_json_file("script-content.json")
    # Build lookup maps
    deps_by_file = {}
    for r in perl_deps:
        deps_by_file.setdefault(r["file"], []).append(r)
    git_by_path = {}
    for g in git_summaries:
        # Normalize "Reports/<name>" repo paths down to just <name>
        path = g["repo_path"]
        if path.startswith("Reports/"):
            name = path.split("/")[1] if "/" in path else path
            git_by_path[name] = g
    # Write the reference template.
    # BUG FIX: the original first wrote a brace-escaped copy of TEMPLATE to
    # this path and then immediately reopened the file and overwrote it with
    # the reference text -- the first write was dead code and is removed.
    template_path = os.path.join(OUT_DIR, "_template.md")
    with open(template_path, "w") as f:
        f.write("# Report Documentation Template\n\n")
        f.write("This template is used by `generate-report-docs.py` to create per-report docs.\n")
        f.write("See any generated report file for the actual structure.\n")
    print(f" Wrote {template_path}", file=sys.stderr)
    # Generate per-report docs
    for report in report_status:
        name = report["report_name"]
        print(f" {name} ...", file=sys.stderr)
        # Find extracted script content for this report's directory
        scs = find_script_content(script_contents, name)
        # Aggregate content from all matching scripts
        all_subs = []
        all_sql = []
        all_maps = []
        all_config = []
        all_prints = []
        all_todos = []
        all_historical = []
        all_db = []
        purpose = ""
        for sc in scs:
            # First non-empty report_identity wins as the purpose text.
            if sc.get("report_identity") and not purpose:
                purpose = sc["report_identity"]
            all_subs.extend(sc.get("subroutines", []))
            all_sql.extend(sc.get("sql_queries", []))
            all_maps.extend(sc.get("data_maps", []))
            all_config.extend(sc.get("config_refs", []))
            all_prints.extend(sc.get("print_statements", []))
            all_todos.extend(sc.get("todo_comments", []))
            all_historical.extend(sc.get("commented_code_blocks", []))
            all_db.extend(sc.get("db_connections", []))
        if not purpose:
            purpose = report.get("fullname") or f"Report: {name}"
        # Collect dependencies for files under this report's directory
        sierra_deps = []
        cpan_deps = []
        for dep_list in deps_by_file.values():
            for d in dep_list:
                if d["file"].startswith(f"Reports/{name}/"):
                    if d["classification"] == "local":
                        sierra_deps.append(f"- `{d['module']}` ({d['imported_symbols'] or 'default'})")
                    elif d["classification"] == "cpan":
                        cpan_deps.append(f"- `{d['module']}`")
        sierra_deps_str = "\n".join(sorted(set(sierra_deps))) if sierra_deps else "None"
        cpan_deps_str = "\n".join(sorted(set(cpan_deps))) if cpan_deps else "None"
        # Git history
        git_info = git_by_path.get(name)
        if git_info:
            git_history = (
                f"- Commits: {git_info['total_commits']}\n"
                f"- First commit: {git_info['first_commit_date']}\n"
                f"- Last commit: {git_info['last_commit_date']}\n"
                f"- Branches: {git_info['branches']}\n"
                f"- Last message: {git_info['last_commit_message']}"
            )
        else:
            git_history = "No git repository found for this report."
        priority = " **[HIGH PRIORITY - Shelf List]**" if name.startswith(SL_PREFIX) else ""
        content = TEMPLATE.format(
            fullname=report.get("fullname") or name,
            report_name=name,
            status=report["status"],
            priority=priority,
            last_run_date=report["last_run_date"] or "Unknown",
            keepperiod=report.get("keepperiod") or "Unknown",
            pl_file=report.get("pl_file") or "Unknown",
            pl_lines=report.get("pl_lines", 0),
            link=sanitize_text(report.get("link") or "N/A"),
            has_git=report.get("has_git", False),
            purpose=sanitize_text(purpose),
            execution_narrative=format_execution_narrative(all_prints),
            sierra_deps=sierra_deps_str,
            cpan_deps=cpan_deps_str,
            subroutines=format_subroutines(all_subs),
            sql_queries=format_sql(all_sql),
            data_maps=format_data_maps(all_maps),
            config_keys=format_config_keys(all_config),
            todos=format_todos(all_todos),
            historical_notes=format_historical_notes(all_historical),
            git_history=git_history,
            db_connections=format_db_connections(all_db),
        )
        doc_path = os.path.join(OUT_DIR, f"{name}.md")
        with open(doc_path, "w") as f:
            f.write(content)
    print(f"\n Generated {len(report_status)} report docs in {OUT_DIR}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)


if __name__ == "__main__":
    main()
{
"version": 1,
"groups": {
"reports": {
"gist_id": "8aad3cf2c3d6c80742604fa76e9045bd",
"gist_url": "https://gist.github.com/rayvoelker/8aad3cf2c3d6c80742604fa76e9045bd",
"file_count": 53
},
"modules": {
"gist_id": "66b8bf43f5d840f9c71724433b20ba56",
"gist_url": "https://gist.github.com/rayvoelker/66b8bf43f5d840f9c71724433b20ba56",
"file_count": 17
},
"framework": {
"gist_id": "5aae40b92a5758a8713360931b2df2e5",
"gist_url": "https://gist.github.com/rayvoelker/5aae40b92a5758a8713360931b2df2e5",
"file_count": 4
},
"manifests-small": {
"gist_id": "28151f90732c9f2484c205602ba17852",
"gist_url": "https://gist.github.com/rayvoelker/28151f90732c9f2484c205602ba17852",
"file_count": 8
},
"manifests-large": {
"gist_id": "c060b43289f560745f77008c56e0a4ee",
"gist_url": "https://gist.github.com/rayvoelker/c060b43289f560745f77008c56e0a4ee",
"file_count": 3
},
"scripts": {
"gist_id": "558ac1812ff2dc29c22ba4ed9cc1a72c",
"gist_url": "https://gist.github.com/rayvoelker/558ac1812ff2dc29c22ba4ed9cc1a72c",
"file_count": 15
}
},
"old_single_gist_id": "cce2e74ff232c461e6c6b0e9a620a24f",
"master_toc_gist_id": "cdb532b9b3d535e76dabf784d09ca4b9",
"master_toc_gist_url": "https://gist.github.com/rayvoelker/cdb532b9b3d535e76dabf784d09ca4b9"
}
#!/usr/bin/env python3
"""Parse all crontab backup files and extract schedule entries.
Output: docs/ilsaux/manifests/cron-schedule.csv
"""
import csv
import os
import re
import sys
import time
BASE_DIR = "/home/ray/Documents/ilsaux/crontab_files"
OUT_DIR = "/home/ray/claude/docs/ilsaux/manifests"
# 0 and 7 both mean Sunday in crontab day-of-week fields.
DOW_NAMES = {0: "Sun", 1: "Mon", 2: "Tue", 3: "Wed", 4: "Thu", 5: "Fri", 6: "Sat", 7: "Sun"}


def human_schedule(minute, hour, dom, month, dow):
    """Convert cron fields (raw crontab string tokens) to a human-readable description.

    Examples: ("30", "2", "*", "*", "1") -> "on Mon at 2:30 AM";
    all-"*" fields -> "every minute".
    """
    parts = []
    # Day of week.
    # BUG FIX: fields arrive as strings but DOW_NAMES is keyed by int, so the
    # original `dow in DOW_NAMES` never matched and every entry fell through
    # to the "dow=N" fallback. Convert before the lookup.
    if dow != "*":
        try:
            parts.append(f"on {DOW_NAMES[int(dow)]}")
        except (ValueError, KeyError):
            if "-" in str(dow):
                parts.append(f"days {dow}")
            else:
                parts.append(f"dow={dow}")
    elif dom != "*":
        parts.append(f"on day {dom}")
    # Time of day
    if hour != "*" and minute != "*":
        try:
            h = int(hour)
            m = int(minute)
            ampm = "AM" if h < 12 else "PM"
            h12 = h % 12 or 12
            parts.append(f"at {h12}:{m:02d} {ampm}")
        except (ValueError, TypeError):
            # Non-numeric fields such as "*/5" fall back to the raw values.
            parts.append(f"at {hour}:{minute}")
    elif hour != "*":
        parts.append(f"at hour {hour}")
    if month != "*":
        parts.append(f"month={month}")
    return " ".join(parts) if parts else "every minute"
def extract_report_name(command):
    """Best-effort extraction of a report name from a cron command string.

    Returns "" when no known naming pattern matches.
    """
    # Ordered most-specific first; the transform normalizes the capture.
    candidates = (
        (r'Reports/(\w+)/\w+-cron\.sh', str),   # Reports/<name>/<x>-cron.sh
        (r'/(\w+)-cron\.sh', str),              # any /<name>-cron.sh
        (r'Sierra(\w+)\.pl', str.lower),        # SierraFoo.pl -> "foo"
    )
    for pattern, transform in candidates:
        match = re.search(pattern, command)
        if match:
            return transform(match.group(1))
    return ""
def parse_crontab(filepath):
    """Parse a single crontab backup file into a list of entry dicts.

    Commented-out lines that still look like cron entries ("#30 2 * * * ...")
    are kept and flagged via "is_commented"; pure prose comments, blank
    lines, and environment assignments (SHELL=, PATH=, ...) are dropped.
    Returns [] (with a message on stderr) when the file cannot be read.
    """
    filename = os.path.basename(filepath)
    # Extract date from filename: crontab.backup.YYYYMMDD
    m = re.search(r'(\d{8})$', filename)
    crontab_date = m.group(1) if m else ""
    entries = []
    try:
        with open(filepath, "r", errors="replace") as f:
            lines = f.readlines()
    except OSError as e:
        print(f" SKIP: {filepath}: {e}", file=sys.stderr)
        return entries
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        # Skip variable assignments and shell settings (the "#\s*m\s+h"
        # alternative drops the standard "# m h dom mon dow" header).
        if re.match(r'^(SHELL|PATH|MAILTO|HOME|#\s*m\s+h)', stripped):
            continue
        is_commented = stripped.startswith("#")
        notes = ""
        # Extract inline comment/notes
        if is_commented:
            # Remove leading # and check if it's a cron entry
            uncommented = stripped.lstrip("#").strip()
            # Check if there's a note after the command
            if re.match(r'^\d', uncommented) or re.match(r'^\*', uncommented):
                stripped = uncommented
            else:
                # Pure comment line - extract as note if relevant
                continue
        # Parse cron fields: five whitespace-separated fields + command.
        m = re.match(
            r'^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)$',
            stripped
        )
        if not m:
            continue
        minute, hour, dom, month, dow, command = m.groups()
        # Check for inline comment in command
        if "#" in command:
            command, _, notes = command.partition("#")
            command = command.strip()
            notes = notes.strip()
        report_name = extract_report_name(command)
        entries.append({
            "crontab_file": filename,
            "crontab_date": crontab_date,
            "minute": minute,
            "hour": hour,
            "dom": dom,
            "month": month,
            "dow": dow,
            "command": command,
            "report_name": report_name,
            "is_commented": is_commented,
            "human_schedule": human_schedule(minute, hour, dom, month, dow),
            "notes": notes,
        })
    return entries
def main():
    """Parse every crontab backup in BASE_DIR and write cron-schedule.csv."""
    if not os.path.isdir(BASE_DIR):
        print(f"Error: {BASE_DIR} not found", file=sys.stderr)
        sys.exit(1)
    t_start = time.monotonic()
    print(f"[manifest-cron] Parsing crontab files in {BASE_DIR} ...", file=sys.stderr)
    all_entries = []
    for name in sorted(os.listdir(BASE_DIR)):
        path = os.path.join(BASE_DIR, name)
        if not os.path.isfile(path):
            continue
        entries = parse_crontab(path)
        all_entries.extend(entries)
        print(f" {name}: {len(entries)} entries", file=sys.stderr)
    csv_path = os.path.join(OUT_DIR, "cron-schedule.csv")
    fieldnames = [
        "crontab_file", "crontab_date", "minute", "hour", "dom", "month",
        "dow", "command", "report_name", "is_commented", "human_schedule", "notes"
    ]
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_entries)
    print(f" Wrote {csv_path} ({len(all_entries)} rows)", file=sys.stderr)
    # Summarize the most recent crontab backup.
    latest = [e for e in all_entries if e["crontab_date"] == "20190709"]
    if latest:
        active = [e for e in latest if not e["is_commented"]]
        commented = [e for e in latest if e["is_commented"]]
        print(f"\n Latest crontab (20190709): {len(active)} active, {len(commented)} commented", file=sys.stderr)
        for e in active:
            print(f" {e['human_schedule']}: {e['report_name'] or e['command'][:50]}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f"\n Done in {elapsed:.1f}s", file=sys.stderr)
if __name__ == "__main__":
main()
#!/usr/bin/env python3
"""Find all git repos in ilsaux and extract summary information.
Output: docs/ilsaux/manifests/git-summaries.csv
"""
import csv
import os
import subprocess
import sys
import time
BASE_DIR = "/home/ray/Documents/ilsaux"
OUT_DIR = "/home/ray/claude/docs/ilsaux/manifests"
TIMEOUT = 10
def git_cmd(repo_dir, args):
    """Run `git <args>` in *repo_dir*; return stripped stdout or "" on failure.

    Failures covered: git missing, repo_dir unusable, or the command
    exceeding TIMEOUT seconds. A non-zero git exit status is not treated
    as a failure here -- whatever git wrote to stdout is still returned.
    """
    try:
        proc = subprocess.run(
            ["git", *args],
            cwd=repo_dir,
            capture_output=True,
            text=True,
            timeout=TIMEOUT,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return ""
    return proc.stdout.strip()
def main():
    """Find every git repo under BASE_DIR and write git-summaries.csv.

    For each repo: commit count, first/last commit dates, branch list,
    and the last commit message. Repos that fail a git call just get
    empty/zero fields (git_cmd returns "" on failure).
    """
    if not os.path.isdir(BASE_DIR):
        print(f"Error: {BASE_DIR} not found", file=sys.stderr)
        sys.exit(1)
    t_start = time.monotonic()
    print(f"[manifest-git] Finding git repos in {BASE_DIR} ...", file=sys.stderr)
    # Find all .git directories
    git_dirs = []
    for root, dirs, files in os.walk(BASE_DIR):
        if ".git" in dirs:
            git_dirs.append(root)
            dirs.remove(".git")  # Don't recurse into .git
        # Don't recurse into .cpan
        if ".cpan" in dirs:
            dirs.remove(".cpan")
    print(f" Found {len(git_dirs)} git repos", file=sys.stderr)
    rows = []
    for repo_dir in sorted(git_dirs):
        rel_path = os.path.relpath(repo_dir, BASE_DIR)
        print(f" {rel_path} ...", file=sys.stderr)
        # Total commits
        log_count = git_cmd(repo_dir, ["rev-list", "--count", "HEAD"])
        total_commits = int(log_count) if log_count.isdigit() else 0
        # First commit date
        first_date = git_cmd(repo_dir, [
            "log", "--reverse", "--format=%aI", "--max-count=1"
        ])
        # Last commit date and message ("|||" is a separator unlikely to
        # appear in a commit subject)
        last_info = git_cmd(repo_dir, [
            "log", "--format=%aI|||%s", "--max-count=1"
        ])
        last_date = ""
        last_msg = ""
        if "|||" in last_info:
            last_date, last_msg = last_info.split("|||", 1)
        # Branches
        branches_raw = git_cmd(repo_dir, ["branch", "--format=%(refname:short)"])
        branches = ", ".join(branches_raw.split("\n")) if branches_raw else ""
        rows.append({
            "repo_path": rel_path,
            "total_commits": total_commits,
            "first_commit_date": first_date,
            "last_commit_date": last_date,
            "branches": branches,
            "last_commit_message": last_msg,
        })
    csv_path = os.path.join(OUT_DIR, "git-summaries.csv")
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "repo_path", "total_commits", "first_commit_date",
            "last_commit_date", "branches", "last_commit_message"
        ])
        writer.writeheader()
        writer.writerows(rows)
    print(f" Wrote {csv_path} ({len(rows)} repos)", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)
if __name__ == "__main__":
main()
#!/usr/bin/env python3
"""Extract Perl module dependencies from all .pl/.pm files in ilsaux.
Output: docs/ilsaux/manifests/perl-dependencies.csv
Includes reverse dependency map (which reports use which Sierra:: modules).
"""
import csv
import os
import re
import sys
import time
BASE_DIR = "/home/ray/Documents/ilsaux"
OUT_DIR = "/home/ray/claude/docs/ilsaux/manifests"
# Core Perl modules (common ones seen in this codebase)
CORE_MODULES = {
"strict", "warnings", "Carp", "DBI", "Exporter", "POSIX",
"File::Basename", "File::Copy", "File::Path", "File::Find",
"Getopt::Long", "Getopt::Std", "Data::Dumper", "Scalar::Util",
"List::Util", "Time::Local", "Time::HiRes", "IO::File",
"Encode", "utf8",
}
# Known CPAN modules
CPAN_MODULES = {
"Config::Simple", "XML::Simple", "Net::FTP", "SQL::Beautify",
"Net::SFTP::Foreign", "Text::CSV", "Text::CSV_XS", "JSON",
"JSON::XS", "LWP::UserAgent", "HTTP::Request", "HTTP::Response",
"SOAP::Lite", "MARC::Record", "MARC::Field", "MARC::Batch",
"MARC::File::USMARC", "DBIx::Class", "Excel::Writer::XLSX",
"Spreadsheet::WriteExcel", "CGI", "Template",
}
USE_RE = re.compile(r'^\s*use\s+([\w:]+)(?:\s+qw\(([^)]*)\))?', re.MULTILINE)
REQUIRE_RE = re.compile(r'^\s*require\s+([\w:]+)', re.MULTILINE)
EXPORT_OK_RE = re.compile(r'@EXPORT_OK\s*=\s*qw\(\s*(.*?)\s*\)', re.DOTALL)
EXPORT_RE = re.compile(r'@EXPORT\s*=\s*qw\(\s*(.*?)\s*\)', re.DOTALL)
def classify_module(name):
    """Bucket a Perl module name as local / core / cpan / pragma."""
    if name.startswith("Sierra::"):
        return "local"  # in-house Sierra:: namespace
    for known, label in ((CORE_MODULES, "core"), (CPAN_MODULES, "cpan")):
        if name in known:
            return label
    # "use 5.008007"-style minimum-version declarations and all-lowercase
    # names (Perl pragma convention) are both tagged as pragmas.
    if re.match(r'^\d', name) or name[0].islower():
        return "pragma"
    # Unknown mixed-case module: assume it came from CPAN.
    return "cpan"
def main():
    """Scan every .pl/.pm under BASE_DIR, write perl-dependencies.csv,
    and print a reverse-dependency map of the in-house Sierra:: modules.
    """
    if not os.path.isdir(BASE_DIR):
        print(f"Error: {BASE_DIR} not found", file=sys.stderr)
        sys.exit(1)
    t_start = time.monotonic()
    print("[manifest-perl-deps] Scanning .pl/.pm files ...", file=sys.stderr)
    rows = []
    exports = {}  # module_file -> {"export_ok": [...], "export": [...]}
    for root, dirs, files in os.walk(BASE_DIR):
        # Skip .git internals
        dirs[:] = [d for d in dirs if d != ".git"]
        for name in files:
            if not (name.endswith(".pl") or name.endswith(".pm")):
                continue
            path = os.path.join(root, name)
            rel_path = os.path.relpath(path, BASE_DIR)
            try:
                with open(path, "r", errors="replace") as f:
                    content = f.read()
            except OSError as e:
                print(f" SKIP: {path}: {e}", file=sys.stderr)
                continue
            # Extract use statements (module plus optional qw(...) list)
            for m in USE_RE.finditer(content):
                module = m.group(1)
                symbols = m.group(2) or ""
                symbols = " ".join(symbols.split())  # collapse whitespace
                classification = classify_module(module)
                rows.append({
                    "file": rel_path,
                    "module": module,
                    "import_type": "use",
                    "imported_symbols": symbols,
                    "classification": classification,
                })
            # Extract require statements
            for m in REQUIRE_RE.finditer(content):
                module = m.group(1)
                classification = classify_module(module)
                rows.append({
                    "file": rel_path,
                    "module": module,
                    "import_type": "require",
                    "imported_symbols": "",
                    "classification": classification,
                })
            # Extract exports from .pm files
            if name.endswith(".pm"):
                export_ok = []
                export = []
                for m in EXPORT_OK_RE.finditer(content):
                    export_ok.extend(m.group(1).split())
                for m in EXPORT_RE.finditer(content):
                    export.extend(m.group(1).split())
                if export_ok or export:
                    # NOTE(review): collected here but never written out
                    # below -- confirm whether this is intentional.
                    exports[rel_path] = {
                        "export_ok": export_ok,
                        "export": export,
                    }
    print(f" Found {len(rows)} dependency entries", file=sys.stderr)
    # Write main CSV
    csv_path = os.path.join(OUT_DIR, "perl-dependencies.csv")
    rows.sort(key=lambda r: (r["file"], r["module"]))
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "file", "module", "import_type", "imported_symbols", "classification"
        ])
        writer.writeheader()
        writer.writerows(rows)
    print(f" Wrote {csv_path} ({len(rows)} rows)", file=sys.stderr)
    # Print reverse dependency summary
    sierra_users = {}  # sierra_module -> [files that use it]
    for r in rows:
        if r["classification"] == "local":
            sierra_users.setdefault(r["module"], []).append(r["file"])
    print("\n Reverse dependency map (Sierra:: modules):", file=sys.stderr)
    for mod in sorted(sierra_users):
        users = sierra_users[mod]
        print(f" {mod}: used by {len(users)} files", file=sys.stderr)
        for u in users:
            print(f" - {u}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f"\n Done in {elapsed:.1f}s", file=sys.stderr)
if __name__ == "__main__":
main()
#!/usr/bin/env python3
"""Determine status of each report: active, inactive, or obsolete.
Parses cron wrappers, JSON metadata files, and .cfg files.
Output: docs/ilsaux/manifests/report-status.csv, credential-locations.csv
"""
import csv
import json
import os
import re
import sys
import time
from datetime import datetime
BASE_DIR = "/home/ray/Documents/ilsaux/Reports"
OUT_DIR = "/home/ray/claude/docs/ilsaux/manifests"
# Patterns that indicate credentials (file+line only, NO values)
CREDENTIAL_PATTERNS = [
(re.compile(r'password', re.IGNORECASE), "password"),
(re.compile(r'passwd', re.IGNORECASE), "password"),
(re.compile(r'secret', re.IGNORECASE), "secret"),
(re.compile(r'api[_-]?key', re.IGNORECASE), "api_key"),
(re.compile(r'token', re.IGNORECASE), "token"),
(re.compile(r'DBI->connect\s*\(', re.IGNORECASE), "db_connection"),
(re.compile(r'host\s*=', re.IGNORECASE), "host_config"),
(re.compile(r'user\s*=', re.IGNORECASE), "user_config"),
]
# Now threshold: 60 days for "active"
ACTIVE_DAYS = 60
INACTIVE_YEAR = 2020
def parse_cron_wrapper(path):
    """Extract shell-variable assignments from a <report>-cron.sh wrapper.

    Looks for REPORTNAME/FULLNAME/SOURCEFILE/LINK/KEEPPERIOD lines and
    returns them in a dict keyed by the lower-cased variable name, with
    surrounding quotes stripped. Unreadable files yield whatever was
    collected so far (usually {}).
    """
    wanted = ("REPORTNAME", "FULLNAME", "SOURCEFILE", "LINK", "KEEPPERIOD")
    info = {}
    try:
        with open(path, "r") as fh:
            for raw in fh:
                text = raw.strip()
                for var in wanted:
                    m = re.match(rf'^{var}=(.+)', text)
                    if m:
                        # Drop surrounding double then single quotes.
                        info[var.lower()] = m.group(1).strip().strip('"').strip("'")
    except OSError:
        pass
    return info
def find_newest_json(report_dir):
    """Return the run date recorded by the newest per-run JSON metadata file.

    Only files named like "<report>-<suffix>.json" are considered; the
    most recently modified one wins. The JSON's "date" field is preferred,
    then "timeFinished" (epoch seconds); if the JSON is unusable the
    file's mtime is used instead. Returns None when nothing matches.
    """
    newest_time = 0
    newest_date = None
    for name in os.listdir(report_dir):
        if not (name.endswith(".json") and "-" in name):
            continue
        path = os.path.join(report_dir, name)
        try:
            mtime = os.stat(path).st_mtime
        except OSError:
            continue
        if mtime <= newest_time:
            continue
        newest_time = mtime
        try:
            with open(path) as fh:
                data = json.load(fh)
            if "date" in data:
                newest_date = data["date"]
            elif "timeFinished" in data:
                ts = int(data["timeFinished"])
                newest_date = datetime.fromtimestamp(ts).strftime("%Y-%m-%d")
        except (json.JSONDecodeError, ValueError, KeyError):
            # Unparseable metadata: fall back to the file's mtime.
            newest_date = datetime.fromtimestamp(mtime).strftime("%Y-%m-%d")
        except OSError:
            continue
    return newest_date
def count_pl_lines(report_dir):
    """Return (a_.pl_filename_or_None, total_lines_across_all_.pl_files).

    When several .pl files exist the returned name is simply the last
    one encountered; line counts are summed over all of them. Files
    that cannot be opened contribute nothing to the count.
    """
    pl_file = None
    total = 0
    for name in os.listdir(report_dir):
        if not name.endswith(".pl"):
            continue
        pl_file = name
        try:
            with open(os.path.join(report_dir, name)) as fh:
                total += sum(1 for _ in fh)
        except OSError:
            pass
    return pl_file, total
def scan_credentials(report_dir):
    """Record file+line locations that look credential-related.

    Only config/code extensions are scanned, and only the location and
    pattern category are recorded -- never the matched value. At most
    one hit is recorded per line.
    """
    hits = []
    scannable = (".cfg", ".pl", ".pm", ".conf")
    for name in os.listdir(report_dir):
        if not name.endswith(scannable):
            continue
        path = os.path.join(report_dir, name)
        rel_path = os.path.relpath(path, os.path.dirname(BASE_DIR))
        try:
            with open(path, "r", errors="replace") as fh:
                for lineno, text in enumerate(fh, 1):
                    for pattern, cred_type in CREDENTIAL_PATTERNS:
                        if pattern.search(text):
                            hits.append({
                                "file": rel_path,
                                "line_number": lineno,
                                "credential_type": cred_type,
                            })
                            break  # first matching pattern wins for this line
        except OSError:
            pass
    return hits
def classify_status(last_run_date):
    """Map a report's last-run date ("YYYY-MM-DD" or falsy) to a status.

    - missing/empty date            -> "obsolete"
    - unparseable date              -> "unknown"
    - ran within ACTIVE_DAYS        -> "active"
    - ran in/after INACTIVE_YEAR    -> "inactive-recent"
    - anything older                -> "obsolete"
    """
    if not last_run_date:
        return "obsolete"
    try:
        last_run = datetime.strptime(last_run_date, "%Y-%m-%d")
    except ValueError:
        return "unknown"
    age_days = (datetime.now() - last_run).days
    if age_days <= ACTIVE_DAYS:
        return "active"
    if last_run.year >= INACTIVE_YEAR:
        return "inactive-recent"
    return "obsolete"
def main():
    """Scan each report directory under BASE_DIR and write
    report-status.csv plus credential-locations.csv (locations only,
    never credential values).
    """
    if not os.path.isdir(BASE_DIR):
        print(f"Error: {BASE_DIR} not found", file=sys.stderr)
        sys.exit(1)
    t_start = time.monotonic()
    print(f"[manifest-report-status] Scanning {BASE_DIR} ...", file=sys.stderr)
    report_rows = []
    all_credentials = []
    for name in sorted(os.listdir(BASE_DIR)):
        report_dir = os.path.join(BASE_DIR, name)
        if not os.path.isdir(report_dir):
            continue
        # Skip tar.gz entries
        if name.endswith(".tar.gz"):
            continue
        print(f" {name} ...", file=sys.stderr)
        # Parse cron wrapper
        cron_files = [f for f in os.listdir(report_dir) if f.endswith("-cron.sh")]
        cron_info = {}
        # NOTE(review): `schedule` is never reassigned; the CSV "schedule"
        # column below is filled from keepperiod instead -- confirm intent.
        schedule = ""
        for cf in cron_files:
            # If several wrappers exist, the last one parsed wins.
            cron_info = parse_cron_wrapper(os.path.join(report_dir, cf))
        # Find last run date from JSON metadata
        last_run_date = find_newest_json(report_dir)
        # Count .pl lines
        pl_file, pl_lines = count_pl_lines(report_dir)
        # Check for .git
        has_git = os.path.isdir(os.path.join(report_dir, ".git"))
        # Scan for credentials
        creds = scan_credentials(report_dir)
        all_credentials.extend(creds)
        status = classify_status(last_run_date)
        report_rows.append({
            "report_name": name,
            "status": status,
            "last_run_date": last_run_date or "",
            # NOTE(review): duplicates the "keepperiod" column below.
            "schedule": cron_info.get("keepperiod", ""),
            "pl_file": pl_file or "",
            "pl_lines": pl_lines,
            "fullname": cron_info.get("fullname", ""),
            "link": cron_info.get("link", ""),
            "keepperiod": cron_info.get("keepperiod", ""),
            "has_git": has_git,
            "credential_count": len(creds),
        })
    # Write report-status.csv
    csv_path = os.path.join(OUT_DIR, "report-status.csv")
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "report_name", "status", "last_run_date", "schedule",
            "pl_file", "pl_lines", "fullname", "link", "keepperiod",
            "has_git", "credential_count"
        ])
        writer.writeheader()
        writer.writerows(report_rows)
    print(f" Wrote {csv_path} ({len(report_rows)} reports)", file=sys.stderr)
    # Write credential-locations.csv (NO values!)
    cred_path = os.path.join(OUT_DIR, "credential-locations.csv")
    with open(cred_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["file", "line_number", "credential_type"])
        writer.writeheader()
        writer.writerows(all_credentials)
    print(f" Wrote {cred_path} ({len(all_credentials)} entries, NO values)", file=sys.stderr)
    # Summary
    status_counts = {}
    for r in report_rows:
        status_counts[r["status"]] = status_counts.get(r["status"], 0) + 1
    print("\n Status summary:", file=sys.stderr)
    for s, c in sorted(status_counts.items()):
        print(f" {s}: {c}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f"\n Done in {elapsed:.1f}s", file=sys.stderr)
if __name__ == "__main__":
main()
#!/usr/bin/env python3
"""Extract self-documenting content from Perl scripts: comments, SQL, subs, data maps, etc.
Output: docs/ilsaux/manifests/script-content.json
This is the richest source of documentation since formal docs are sparse.
"""
import json
import os
import re
import sys
import time
BASE_DIR = "/home/ray/Documents/ilsaux"
OUT_DIR = "/home/ray/claude/docs/ilsaux/manifests"
# Sensitive patterns to redact from all extracted content
_SANITIZE_PATTERNS = [
# Literal passwords
(re.compile(r'\b[REDACTED-PASSWORD]\b', re.IGNORECASE), '[REDACTED-PASSWORD]'),
(re.compile(r'\b[REDACTED-PASSWORD]\b', re.IGNORECASE), '[REDACTED-PASSWORD]'),
(re.compile(r'[REDACTED-PASSWORD]', re.IGNORECASE), '[REDACTED-PASSWORD]'),
# DB usernames as quoted strings
(re.compile(r'(?<=["\'])(?:sqlaccess|sqllabels\d*|sqldataentryerrors|svc_vmsp1)(?=["\'])', re.IGNORECASE), '[REDACTED-USER]'),
# Internal hostnames
(re.compile(r'\b[\w.-]+\.plch\.net\b', re.IGNORECASE), '[REDACTED-HOST]'),
(re.compile(r'\b[\w.-]+\.iii\.com\b', re.IGNORECASE), '[REDACTED-HOST]'),
(re.compile(r'\b[\w.-]+\.cincinnatilibrary\.org\b', re.IGNORECASE), '[REDACTED-HOST]'),
# Email addresses at internal domains
(re.compile(r'\b[\w.+-]+@cincinnatilibrary\.org\b', re.IGNORECASE), '[REDACTED-EMAIL]'),
(re.compile(r'\b[\w.+-]+@plch\.net\b', re.IGNORECASE), '[REDACTED-EMAIL]'),
# DBI->connect credential args
(re.compile(r'(DBI->connect\(\s*"[^"]*"\s*,\s*)"[^"]*"\s*,\s*"[^"]*"', re.IGNORECASE), r'\1"[REDACTED-USER]","[REDACTED-PASSWORD]"'),
# $password = "..." and $username = "..."
(re.compile(r'(\$password\s*=\s*)"[^"]*"', re.IGNORECASE), r'\1"[REDACTED-PASSWORD]"'),
(re.compile(r'(\$username\s*=\s*)"[^"]*"', re.IGNORECASE), r'\1"[REDACTED-USER]"'),
# $ua->credentials
(re.compile(r"(\$ua->credentials\([^)]*)'[^']*'\s*,\s*'[^']*'\s*\)", re.IGNORECASE), r"\1'[REDACTED-USER]', '[REDACTED-PASSWORD]')"),
]
def sanitize_text(text):
    """Apply every redaction rule in _SANITIZE_PATTERNS to *text*."""
    result = text
    for pattern, replacement in _SANITIZE_PATTERNS:
        result = pattern.sub(replacement, result)
    return result
def extract_report_identity(lines):
    """Pull a human-readable report title out of a Perl BEGIN block.

    Scans for `print "...";` statements inside BEGIN { ... } whose text
    mentions "begin" or "Report" (case-insensitive); returns the first
    cleaned match, or None.
    """
    title_re = re.compile(r'print\s+"([^"]*(?:begin|Report)[^"]*)"\s*;', re.IGNORECASE)
    inside_begin = False
    for raw in lines:
        text = raw.strip()
        if text.startswith("BEGIN"):
            inside_begin = True
        if inside_begin:
            found = title_re.search(text)
            if found:
                # Strip banner punctuation ('+', backslashes, newlines).
                return re.sub(r'[\+\n\\]', '', found.group(1)).strip()
            if text == "}":
                inside_begin = False
    return None
def extract_comments(lines):
    """Categorize comments; returns (todo, section, inline) lists.

    todo:    comments whose body starts with TODO/FIXME/HACK
    section: full-line banner comments ("#-----")
    inline:  all other comments; a trailing comment on a code line also
             records the code part in a "code" field
    """
    todo, section, inline = [], [], []
    todo_re = re.compile(r'TODO|FIXME|HACK', re.IGNORECASE)
    for lineno, raw in enumerate(lines, 1):
        text = raw.strip()
        if not text:
            continue
        if text.startswith("#"):
            body = text.lstrip("#").strip()
            if todo_re.match(body):
                todo.append({"line": lineno, "text": body})
            elif re.match(r'-{3,}', text.lstrip("#")):
                section.append({"line": lineno, "text": body})
            else:
                inline.append({"line": lineno, "text": body})
            continue
        if "#" not in text:
            continue
        code_part, _, trailing = text.partition("#")
        # Crude string-literal guard: an odd number of quotes before the
        # '#' means it is probably inside a string, so skip the line.
        if code_part.count('"') % 2 or code_part.count("'") % 2:
            continue
        trailing = trailing.strip()
        if not trailing:
            continue
        if todo_re.match(trailing):
            todo.append({"line": lineno, "text": trailing})
        else:
            inline.append({
                "line": lineno,
                "text": trailing,
                "code": code_part.strip(),
            })
    return todo, section, inline
def extract_sql(lines):
    """Reconstruct SQL from $sql_query .= "..." concatenation patterns.

    Accumulates consecutive string assignments/appends to Perl scalars
    whose name contains "sql" or "query"; each finished run is emitted
    as {"variable", "start_line", "sql"}.
    """
    queries = []
    current_sql = []   # string pieces of the query being accumulated
    current_var = None  # scalar name the pieces belong to
    start_line = None   # 1-indexed line where the accumulation began
    for i, line in enumerate(lines, 1):
        stripped = line.strip()
        # Match: $sql_query = "..."; or $sql .= "...";
        m = re.match(r'\$(\w+)\s*\.?=\s*"(.*?)"\s*;', stripped)
        if m:
            var_name = m.group(1)
            sql_part = m.group(2)
            if "sql" in var_name.lower() or "query" in var_name.lower():
                if ".=" not in stripped and current_sql:
                    # New assignment, save previous
                    queries.append({
                        "variable": current_var,
                        "start_line": start_line,
                        "sql": "\n".join(current_sql),
                    })
                    current_sql = []
                if not current_sql:
                    current_var = var_name
                    start_line = i
                current_sql.append(sql_part)
                continue
        # Also match heredoc-style SQL
        if re.match(r'\$\w+\s*=\s*<<', stripped):
            # Heredoc start - capture until delimiter
            # NOTE(review): placeholder only -- heredoc bodies are never
            # actually collected.
            pass
        # If we were building SQL and hit a non-continuation line, save.
        # NOTE(review): this guard only recognizes "sql"-named variables,
        # so a "$query"-style accumulation is flushed by any non-string
        # continuation line -- confirm whether that asymmetry is intended.
        if current_sql and not re.match(r'\$\w*sql\w*\s*\.?=', stripped, re.IGNORECASE):
            queries.append({
                "variable": current_var,
                "start_line": start_line,
                "sql": "\n".join(current_sql),
            })
            current_sql = []
            current_var = None
    # Don't forget last one
    if current_sql:
        queries.append({
            "variable": current_var,
            "start_line": start_line,
            "sql": "\n".join(current_sql),
        })
    return queries
def extract_subroutines(lines):
    """Extract Perl `sub` declarations with leading comments and parameters.

    For each sub: its name, 1-indexed start line, an approximate body
    length found by brace balancing, the run of '#' comment lines
    directly above the declaration, and the `my (...) = @_` parameter
    list if one appears within the first 10 lines of the body.
    """
    subs = []
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        m = re.match(r'^sub\s+(\w+)', line)
        if m:
            sub_name = m.group(1)
            sub_start = i + 1  # 1-indexed
            # Look back for leading comments
            leading_comments = []
            j = i - 1
            while j >= 0 and lines[j].strip().startswith("#"):
                leading_comments.insert(0, lines[j].strip().lstrip("#").strip())
                j -= 1
            # Count lines until closing brace (approximate -- braces inside
            # strings or regexes will skew the count)
            brace_depth = 0
            sub_end = i
            for k in range(i, len(lines)):
                brace_depth += lines[k].count("{") - lines[k].count("}")
                if brace_depth <= 0 and k > i:
                    sub_end = k
                    break
            else:
                # Braces never balanced: assume the sub runs to end of file
                sub_end = len(lines) - 1
            line_count = sub_end - i + 1
            # Look for parameter unpacking
            params = ""
            for k in range(i, min(i + 10, len(lines))):
                pm = re.search(r'my\s*\(([^)]+)\)\s*=\s*@_', lines[k])
                if pm:
                    params = pm.group(1).strip()
                    break
            subs.append({
                "name": sub_name,
                "line": sub_start,
                "line_count": line_count,
                "leading_comments": leading_comments,
                "parameters": params,
            })
        i += 1
    return subs
def extract_data_maps(lines):
    """Find `my %name = ( ... )` hash literals; report size and sample keys.

    Hashes with no `key => value` pairs are skipped. Each entry records
    the variable name, 1-indexed start line, total pair count, and up to
    five sample keys.
    """
    maps = []
    total = len(lines)
    idx = 0
    while idx < total:
        text = lines[idx].strip()
        header = re.match(r'my\s+(%\w+)\s*=\s*\(', text)
        if not header:
            idx += 1
            continue
        # Swallow lines until the parentheses balance out.
        depth = text.count("(") - text.count(")")
        body = [text]
        scan = idx + 1
        while scan < total and depth > 0:
            body.append(lines[scan])
            depth += lines[scan].count("(") - lines[scan].count(")")
            scan += 1
        keys = re.findall(r'["\']?(\w+)["\']?\s*=>', "\n".join(body))
        if keys:  # only include hashes with actual key => value pairs
            maps.append({
                "variable": header.group(1),
                "line": idx + 1,
                "key_count": len(keys),
                "sample_keys": keys[:5],
            })
        idx = scan
    return maps
def extract_config_refs(lines):
    """List every $cfg->param("key") lookup as {"line", "key"}."""
    param_re = re.compile(r'\$cfg->param\(\s*["\']([^"\']+)["\']\s*\)')
    return [
        {"line": lineno, "key": m.group(1)}
        for lineno, text in enumerate(lines, 1)
        for m in param_re.finditer(text)
    ]
def extract_print_stmts(lines):
    """Collect `print "..."` string literals as an execution narrative.

    Pure separator strings (only '+', '-', '=' characters) and empty
    strings are ignored; literal "\\n" escape sequences are stripped
    from the captured text.
    """
    found = []
    for lineno, raw in enumerate(lines, 1):
        m = re.search(r'print\s+"([^"]+)"', raw.strip())
        if not m:
            continue
        text = m.group(1)
        if not text.strip() or re.match(r'^[\+\-\=]+$', text.strip()):
            continue
        found.append({"line": lineno, "text": text.replace("\\n", "").strip()})
    return found
def extract_commented_code(lines):
    """Detect runs of 3+ consecutive comment lines that look like
    commented-out Perl code (candidates for review or deletion).

    Returns blocks of {"start_line", "end_line", "line_count", "sample"}
    where "sample" holds the first three comment bodies of the run.
    """
    blocks = []
    current_block = []
    current_start = None
    # Tokens that make a comment look like code rather than prose.
    code_indicators = re.compile(r'[\$\@\%]|->|=\s|;\s*$|if\s*\(|while|foreach|sub\s')

    def flush():
        # Record the pending run if it is long enough, then reset.
        # (Originally this logic was duplicated in three places.)
        nonlocal current_block
        if len(current_block) >= 3:
            blocks.append({
                "start_line": current_start,
                "end_line": current_block[-1]["line"],
                "line_count": len(current_block),
                "sample": [b["text"] for b in current_block[:3]],
            })
        current_block = []

    for i, line in enumerate(lines, 1):
        stripped = line.strip()
        # Shebang lines are comments syntactically but never dead code.
        if stripped.startswith("#") and not stripped.startswith("#!"):
            comment_content = stripped.lstrip("#")
            if code_indicators.search(comment_content):
                if not current_block:
                    current_start = i
                current_block.append({"line": i, "text": comment_content.strip()})
                continue
        # Any non-code-like line (code, blank, prose comment) ends the run.
        flush()
    # Final block at end of input.
    flush()
    return blocks
def extract_db_connections(lines):
    """Extract DBI->connect / DSN lines with credentials and hosts redacted.

    A line qualifies if it contains "DBI->connect" or a Pg/MySQL DSN
    marker. NOTE: the DSN needles must be lower-case because they are
    compared against line.lower() -- the previous "dbi:Pg" needle could
    never match.
    """
    connections = []
    for i, line in enumerate(lines, 1):
        lowered = line.lower()
        if "DBI->connect" in line or "dbi:pg" in lowered or "dbi:mysql" in lowered:
            sanitized = line.strip()
            # Redact host= values in DSN strings
            sanitized = re.sub(
                r'(host=)[\w.-]+\.(plch\.net|iii\.com|cincinnatilibrary\.org)',
                r'\1[REDACTED-HOST]',
                sanitized,
                flags=re.IGNORECASE,
            )
            # Redact the user/password positional args of
            # DBI->connect("dsn", "user", "password", ...)
            sanitized = re.sub(
                r'(DBI->connect\(\s*"[^"]*"\s*,\s*)"[^"]*"\s*,\s*"[^"]*"',
                r'\1"[REDACTED-USER]","[REDACTED-PASSWORD]"',
                sanitized,
                flags=re.IGNORECASE,
            )
            connections.append({"line": i, "pattern": sanitized})
    return connections
def sanitize_result(obj):
    """Recursively redact every string inside nested dicts/lists.

    Non-container, non-string values pass through unchanged.
    """
    if isinstance(obj, dict):
        return {key: sanitize_result(val) for key, val in obj.items()}
    if isinstance(obj, list):
        return [sanitize_result(item) for item in obj]
    if isinstance(obj, str):
        return sanitize_text(obj)
    return obj
def process_file(path, rel_path):
    """Run every extractor over one .pl/.pm file.

    Returns a dict of extracted documentation artifacts keyed by
    category, fully sanitized (credentials/hostnames redacted), or None
    when the file cannot be read.
    """
    try:
        with open(path, "r", errors="replace") as f:
            content = f.read()
    except OSError as e:
        print(f" SKIP: {path}: {e}", file=sys.stderr)
        return None
    lines = content.split("\n")
    todo_comments, section_comments, inline_comments = extract_comments(lines)
    result = {
        "file": rel_path,
        "line_count": len(lines),
        "report_identity": extract_report_identity(lines),
        "todo_comments": todo_comments,
        "section_comments": section_comments,
        # Inline comments can be numerous; keep the count plus a sample.
        "inline_comment_count": len(inline_comments),
        "inline_comments_sample": inline_comments[:20],
        "sql_queries": extract_sql(lines),
        "subroutines": extract_subroutines(lines),
        "data_maps": extract_data_maps(lines),
        "config_refs": extract_config_refs(lines),
        "print_statements": extract_print_stmts(lines),
        "commented_code_blocks": extract_commented_code(lines),
        "db_connections": extract_db_connections(lines),
    }
    # Sanitize all string content to redact credentials/hostnames
    return sanitize_result(result)
def main():
    """Extract self-documentation from every .pl/.pm file under BASE_DIR
    and write script-content.json (all strings sanitized in process_file).
    """
    if not os.path.isdir(BASE_DIR):
        print(f"Error: {BASE_DIR} not found", file=sys.stderr)
        sys.exit(1)
    t_start = time.monotonic()
    print("[manifest-script-content] Extracting self-documentation ...", file=sys.stderr)
    results = []
    file_count = 0
    for root, dirs, files in os.walk(BASE_DIR):
        dirs[:] = [d for d in dirs if d != ".git"]  # skip .git internals
        for name in files:
            if not (name.endswith(".pl") or name.endswith(".pm")):
                continue
            path = os.path.join(root, name)
            rel_path = os.path.relpath(path, BASE_DIR)
            result = process_file(path, rel_path)
            if result:
                results.append(result)
                file_count += 1
    # Sort by file path
    results.sort(key=lambda r: r["file"])
    out_path = os.path.join(OUT_DIR, "script-content.json")
    with open(out_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f" Processed {file_count} files", file=sys.stderr)
    print(f" Wrote {out_path}", file=sys.stderr)
    # Summary stats
    total_subs = sum(len(r["subroutines"]) for r in results)
    total_sql = sum(len(r["sql_queries"]) for r in results)
    total_maps = sum(len(r["data_maps"]) for r in results)
    total_todos = sum(len(r["todo_comments"]) for r in results)
    print(f" Totals: {total_subs} subs, {total_sql} SQL queries, "
          f"{total_maps} data maps, {total_todos} TODOs", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)
if __name__ == "__main__":
main()
#!/usr/bin/env python3
"""Read all Phase 1 CSVs and produce a human-readable summary report.
Output: docs/ilsaux/manifests/summary-report.txt
"""
import csv
import os
import sys
import time
from collections import Counter, defaultdict
from datetime import datetime
OUT_DIR = "/home/ray/claude/docs/ilsaux/manifests"
def human_size(nbytes):
    """Format a byte count as a short human-readable string (1 decimal)."""
    value = nbytes
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if abs(value) < 1024:
            return f"{value:.1f} {unit}"
        value /= 1024
    # Anything past TB is reported in petabytes.
    return f"{value:.1f} PB"
def read_csv(filename):
    """Load a manifest CSV from OUT_DIR as a list of row dicts.

    Missing files produce a stderr warning and an empty list instead of
    an exception, so the summary can be built from partial manifests.
    """
    path = os.path.join(OUT_DIR, filename)
    if not os.path.exists(path):
        print(f" WARNING: {path} not found", file=sys.stderr)
        return []
    with open(path, newline="") as fh:
        rows = list(csv.DictReader(fh))
    return rows
def main():
    """Read the Phase 1 manifest CSVs and write summary-report.txt.

    Pulls six CSVs from OUT_DIR (missing files come back as empty lists
    from read_csv) and renders a sectioned, fixed-width text report.
    """
    t_start = time.monotonic()
    print("[manifest-summary] Generating summary report ...", file=sys.stderr)
    # Read all CSVs
    file_manifest = read_csv("file-manifest.csv")
    perl_deps = read_csv("perl-dependencies.csv")
    cron_schedule = read_csv("cron-schedule.csv")
    report_status = read_csv("report-status.csv")
    git_summaries = read_csv("git-summaries.csv")
    credential_locs = read_csv("credential-locations.csv")
    out_path = os.path.join(OUT_DIR, "summary-report.txt")
    with open(out_path, "w") as f:
        # Small writer helper: one call per output line.
        def w(line=""):
            f.write(line + "\n")
        w("=" * 72)
        w("ILS AUXILIARY SERVER (ilsaux) DOCUMENTATION SUMMARY")
        w(f"Generated: {datetime.now().isoformat(timespec='seconds')}")
        w("=" * 72)
        # 1. File Statistics
        w()
        w("1. FILE STATISTICS")
        w("-" * 50)
        total_files = len(file_manifest)
        total_size = sum(int(r["size_bytes"]) for r in file_manifest)
        w(f" Total files: {total_files:,}")
        w(f" Total size: {human_size(total_size)}")
        # Extension breakdown
        ext_counts = Counter()
        ext_sizes = Counter()
        for r in file_manifest:
            ext = r["extension"] or "(none)"
            ext_counts[ext] += 1
            ext_sizes[ext] += int(r["size_bytes"])
        w()
        w(" Top extensions by count:")
        for ext, count in ext_counts.most_common(15):
            size = ext_sizes[ext]
            w(f" {ext:<15} {count:>6} files {human_size(size):>10}")
        # 2. Report Status
        w()
        w("2. REPORT STATUS")
        w("-" * 50)
        status_counts = Counter(r["status"] for r in report_status)
        # Fixed ordering of status buckets; unknown statuses are skipped.
        for status in ["active", "inactive-recent", "obsolete", "unknown"]:
            if status in status_counts:
                reports = [r for r in report_status if r["status"] == status]
                w(f"\n {status.upper()} ({status_counts[status]}):")
                for r in sorted(reports, key=lambda x: x["report_name"]):
                    last = r["last_run_date"] or "never"
                    fullname = r["fullname"]
                    name = r["report_name"]
                    # "sl*" = shelf-list reports, flagged as high priority.
                    sl = " [HIGH PRIORITY]" if name.startswith("sl") else ""
                    w(f" {name:<30} last run: {last:<12} {fullname}{sl}")
        # 3. Perl Dependency Frequency
        w()
        w("3. PERL DEPENDENCY FREQUENCY")
        w("-" * 50)
        dep_class = Counter()
        sierra_usage = Counter()
        for r in perl_deps:
            dep_class[r["classification"]] += 1
            # "local" classification marks the custom Sierra:: modules.
            if r["classification"] == "local":
                sierra_usage[r["module"]] += 1
        w(" By classification:")
        for cls, count in dep_class.most_common():
            w(f" {cls:<15} {count:>4} imports")
        w()
        w(" Sierra:: module usage (most to least):")
        for mod, count in sierra_usage.most_common():
            w(f" {mod:<30} used by {count} files")
        # 4. Cron Timeline
        w()
        w("4. CRON SCHEDULE (latest crontab)")
        w("-" * 50)
        # NOTE(review): the crontab snapshot date is hardcoded; update this
        # filter if a newer crontab capture is added to cron-schedule.csv.
        latest_cron = [e for e in cron_schedule if e.get("crontab_date") == "20190709"]
        active_cron = [e for e in latest_cron if e["is_commented"] == "False"]
        commented_cron = [e for e in latest_cron if e["is_commented"] == "True"]
        w(f" Active entries: {len(active_cron)}")
        w(f" Commented out: {len(commented_cron)}")
        w()
        w(" Active schedule:")
        # NOTE(review): hour/minute come from CSV as strings, so this sort is
        # lexicographic ("10" sorts before "2"); confirm whether numeric
        # ordering was intended (cron fields like "*" would break int()).
        for e in sorted(active_cron, key=lambda x: (x["hour"], x["minute"])):
            name = e["report_name"] or e["command"][:40]
            w(f" {e['human_schedule']:<30} {name}")
        if commented_cron:
            w()
            w(" Commented out (historical):")
            for e in sorted(commented_cron, key=lambda x: x.get("report_name", "")):
                name = e["report_name"] or e["command"][:40]
                notes = f" -- {e['notes']}" if e.get("notes") else ""
                w(f" {name}{notes}")
        # 5. Git Repositories
        w()
        w("5. GIT REPOSITORIES")
        w("-" * 50)
        w(f" Total repos: {len(git_summaries)}")
        w()
        # ISO date strings sort correctly lexicographically; newest first.
        for g in sorted(git_summaries, key=lambda x: x.get("last_commit_date", ""), reverse=True):
            commits = g["total_commits"]
            last = g["last_commit_date"][:10] if g["last_commit_date"] else "unknown"
            w(f" {g['repo_path']:<40} {commits:>4} commits last: {last}")
        # 6. Credential Exposure
        w()
        w("6. CREDENTIAL EXPOSURE SUMMARY")
        w("-" * 50)
        w(f" Total credential references found: {len(credential_locs)}")
        w(" (File + line number only -- NO values stored)")
        w()
        cred_types = Counter(c["credential_type"] for c in credential_locs)
        for ctype, count in cred_types.most_common():
            w(f" {ctype:<20} {count:>4} occurrences")
        cred_files = Counter(c["file"] for c in credential_locs)
        w()
        w(" Files with most credential references:")
        for cfile, count in cred_files.most_common(10):
            w(f" {cfile:<50} {count:>3}")
        # 7. Size Breakdown
        w()
        w("7. SIZE BREAKDOWN BY DIRECTORY")
        w("-" * 50)
        dir_sizes = defaultdict(int)
        for r in file_manifest:
            # Get top-level parent
            parts = r["parent_dir"].split("/")
            top = parts[0] if parts[0] != "." else "(root)"
            dir_sizes[top] += int(r["size_bytes"])
        for d, size in sorted(dir_sizes.items(), key=lambda x: x[1], reverse=True):
            w(f" {d:<40} {human_size(size):>10}")
        w()
        w("=" * 72)
        w(f"Report based on manifests in {OUT_DIR}")
        w("=" * 72)
    print(f" Wrote {out_path}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)
# Entry point when executed as a script; importing stays side-effect free.
if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Walk ilsaux directory tree and produce directory-tree.txt + file-manifest.csv.
Skips .git/ and .cpan/ internals but notes their presence.
Output: docs/ilsaux/manifests/directory-tree.txt, file-manifest.csv
"""
import csv
import os
import sys
import time
from datetime import datetime
# Root of the copied ilsaux server tree to inventory.
BASE_DIR = "/home/ray/Documents/ilsaux"
# Destination directory for the generated manifest files.
OUT_DIR = "/home/ray/claude/docs/ilsaux/manifests"
# Directory names whose contents are skipped (their presence is still noted).
SKIP_DIRS = {".git", ".cpan"}
def human_size(nbytes):
    """Render a byte count with a binary-scaled unit suffix, one decimal."""
    labels = ("B", "KB", "MB", "GB", "TB")
    idx = 0
    while idx < len(labels) and abs(nbytes) >= 1024:
        nbytes /= 1024
        idx += 1
    if idx < len(labels):
        return f"{nbytes:.1f} {labels[idx]}"
    # Fell off the end of the table: report in petabytes.
    return f"{nbytes:.1f} PB"
def main():
    """Walk BASE_DIR and emit file-manifest.csv plus directory-tree.txt.

    The contents of .git/.cpan directories are skipped entirely, but the
    directories that contain them are recorded so the tree output can
    annotate them with [git]/[cpan].
    """
    if not os.path.isdir(BASE_DIR):
        print(f"Error: {BASE_DIR} not found", file=sys.stderr)
        sys.exit(1)
    t_start = time.monotonic()
    print(f"[manifest-tree] Walking {BASE_DIR} ...", file=sys.stderr)
    # Collect all files
    all_files = []
    dir_sizes = {}  # dir_path -> total size of files DIRECTLY in that dir (no rollup)
    dir_has_git = set()
    dir_has_cpan = set()
    errors = 0
    for root, dirs, files in os.walk(BASE_DIR):
        rel_root = os.path.relpath(root, BASE_DIR)
        # Note and skip .git/.cpan
        skip = []
        for d in dirs:
            if d in SKIP_DIRS:
                skip.append(d)
                # NOTE(review): 'full' is computed but never used.
                full = os.path.join(root, d)
                if d == ".git":
                    dir_has_git.add(rel_root)
                elif d == ".cpan":
                    dir_has_cpan.add(rel_root)
        # Prune in place so os.walk never descends into skipped dirs.
        for s in skip:
            dirs.remove(s)
        for name in files:
            path = os.path.join(root, name)
            try:
                st = os.stat(path)
                size = st.st_size
                mtime = st.st_mtime
            except OSError as e:
                # Unstat-able file (permissions, broken symlink, ...): count and move on.
                print(f" SKIP: {path}: {e}", file=sys.stderr)
                errors += 1
                continue
            rel_path = os.path.relpath(path, BASE_DIR)
            _, ext = os.path.splitext(name)
            parent = os.path.relpath(root, BASE_DIR)
            all_files.append({
                "path": rel_path,
                "size_bytes": size,
                "size_human": human_size(size),
                "mtime_iso": datetime.fromtimestamp(mtime).isoformat(timespec="seconds"),
                "extension": ext,
                "parent_dir": parent,
            })
            # Accumulate dir sizes
            dir_sizes[parent] = dir_sizes.get(parent, 0) + size
    print(f" Found {len(all_files):,} files, {errors} errors", file=sys.stderr)
    # Write file-manifest.csv
    csv_path = os.path.join(OUT_DIR, "file-manifest.csv")
    all_files.sort(key=lambda f: f["path"])
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "path", "size_bytes", "size_human", "mtime_iso", "extension", "parent_dir"
        ])
        writer.writeheader()
        writer.writerows(all_files)
    print(f" Wrote {csv_path} ({len(all_files):,} rows)", file=sys.stderr)
    # Write directory-tree.txt
    tree_path = os.path.join(OUT_DIR, "directory-tree.txt")
    # Collect unique directories with their total sizes
    # NOTE(review): all_dirs is built but not read afterward in this function.
    all_dirs = set()
    for f in all_files:
        p = f["parent_dir"]
        while p and p != ".":
            all_dirs.add(p)
            p = os.path.dirname(p)
    all_dirs.add(".")
    with open(tree_path, "w") as f:
        f.write(f"Directory tree of {BASE_DIR}\n")
        f.write(f"Generated: {datetime.now().isoformat(timespec='seconds')}\n")
        f.write(f"Total files: {len(all_files):,}\n")
        total_size = sum(fi["size_bytes"] for fi in all_files)
        f.write(f"Total size: {human_size(total_size)}\n")
        f.write("=" * 72 + "\n\n")
        # Second walk renders the tree; a fresh walk is needed because the
        # first one consumed its generators.
        for root, dirs, files in os.walk(BASE_DIR):
            rel = os.path.relpath(root, BASE_DIR)
            depth = 0 if rel == "." else rel.count(os.sep) + 1
            indent = " " * depth
            dirname = os.path.basename(root) if rel != "." else BASE_DIR
            # Direct-file size only (not recursive), per dir_sizes above.
            size = dir_sizes.get(rel, 0)
            # Skip internals of .git/.cpan
            # (defensive: pruning below already prevents descending into them)
            parts = rel.split(os.sep)
            if any(p in SKIP_DIRS for p in parts):
                continue
            annotations = []
            if rel in dir_has_git:
                annotations.append("[git]")
            if rel in dir_has_cpan:
                annotations.append("[cpan]")
            ann = " ".join(annotations)
            if ann:
                ann = " " + ann
            f.write(f"{indent}{dirname}/ ({human_size(size)}){ann}\n")
            # Remove skip dirs from walk; sorting makes traversal order
            # (and thus the rendered tree) deterministic/alphabetical.
            dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
            dirs.sort()
        f.write("\n")
        f.write("Per-directory size rollup (top 30):\n")
        f.write("-" * 60 + "\n")
        sorted_dirs = sorted(dir_sizes.items(), key=lambda x: x[1], reverse=True)
        for dpath, dsize in sorted_dirs[:30]:
            f.write(f" {human_size(dsize):>10} {dpath}\n")
    print(f" Wrote {tree_path}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)
# Entry point when executed as a script; importing stays side-effect free.
if __name__ == "__main__":
    main()
#!/bin/bash
set -e
# Publish sanitized ilsaux documentation to a GitHub gist.
#
# Pipeline:
#   1. sanitize-for-gist.py copies the docs into a staging directory with
#      credentials/hostnames redacted.
#   2. A final grep re-checks the staging copy for sensitive patterns.
#   3. Files are flattened (gists have no subdirectories) and uploaded
#      with `gh gist create`.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
WORKSPACE="$(cd "$SCRIPT_DIR/../.." && pwd)"
echo "=== ilsaux Documentation Gist Publisher ==="
echo ""
# Step 1: Run sanitizer. It prints the staging dir on stdout and its log
# on stderr; `|| true` keeps `set -e` from aborting so we can report the
# failure ourselves with the captured log.
echo "[1/3] Sanitizing documentation ..."
SANITIZE_LOG=$(mktemp)
STAGING_DIR=$(python3 "$SCRIPT_DIR/sanitize-for-gist.py" 2>"$SANITIZE_LOG") || true
if [ -z "$STAGING_DIR" ] || [ ! -d "$STAGING_DIR" ]; then
    echo "ERROR: Sanitization failed. Details:"
    cat "$SANITIZE_LOG"
    rm -f "$SANITIZE_LOG"
    exit 1
fi
echo " Staging directory: $STAGING_DIR"
echo " $(grep 'Processed\|CLEAN' "$SANITIZE_LOG" || true)"
rm -f "$SANITIZE_LOG"
# Step 2: Belt-and-suspenders verification.
# NOTE(review): this pattern list is itself sanitized in the published copy
# ([REDACTED-...] is a literal bracket expression here, not the real
# pattern); confirm the original pattern list is intact when running from
# the private workspace.
echo ""
echo "[2/3] Final verification grep ..."
GREP_HITS=$(grep -ri '[REDACTED-PASSWORD]\|sqlaccess\|sqllabels\|sqldataentryerrors\|[REDACTED-PASSWORD]\|eS3cuRe\|svc_vmsp\|cinci-db\|sierra-db\|\.plch\.net\|\.iii\.com\|cincinnatilibrary\.org' "$STAGING_DIR" \
  --include='*.md' --include='*.csv' --include='*.txt' --include='*.json' \
  2>/dev/null | grep -v 'sanitize-for-gist\.py' | grep -v 'manifest-script-content\.py' | grep -v 'generate-report-docs\.py' || true)
if [ -n "$GREP_HITS" ]; then
    echo "ERROR: Sensitive patterns still found in staging directory!"
    echo "$GREP_HITS"
    echo ""
    echo "Aborting. Fix sanitization rules before publishing."
    exit 1
fi
echo " CLEAN: No sensitive patterns found."
# Step 3: Flatten files with prefixed names for gist (gists have no subdirs)
echo ""
echo "[3/3] Creating gist ..."
FLAT_DIR="/tmp/ilsaux-gist-flat"
rm -rf "$FLAT_DIR"
mkdir -p "$FLAT_DIR"
# Copy every regular file in $1 into FLAT_DIR, prefixing its name with $2.
flatten_dir() {
    local src_dir="$1"
    local prefix="$2"
    local base
    [ -d "$src_dir" ] || return
    for file in "$src_dir"/*; do
        [ -f "$file" ] || continue
        # Declaration split from assignment so a basename failure is not
        # masked by `local` (shellcheck SC2155).
        base=$(basename "$file")
        cp "$file" "$FLAT_DIR/${prefix}${base}"
    done
}
flatten_dir "$STAGING_DIR/docs/manifests" "manifests--"
flatten_dir "$STAGING_DIR/docs/reports" "reports--"
flatten_dir "$STAGING_DIR/docs/modules" "modules--"
flatten_dir "$STAGING_DIR/docs/framework" "framework--"
flatten_dir "$STAGING_DIR/scripts" "scripts--"
# Top-level docs
[ -f "$STAGING_DIR/docs/00-INDEX.md" ] && cp "$STAGING_DIR/docs/00-INDEX.md" "$FLAT_DIR/00-INDEX.md"
[ -f "$STAGING_DIR/docs/archive-plan.md" ] && cp "$STAGING_DIR/docs/archive-plan.md" "$FLAT_DIR/archive-plan.md"
[ -f "$STAGING_DIR/ilsaux-documentation.md" ] && cp "$STAGING_DIR/ilsaux-documentation.md" "$FLAT_DIR/ilsaux-documentation.md"
FILE_COUNT=$(find "$FLAT_DIR" -type f | wc -l)
echo " Prepared $FILE_COUNT files in flat layout ..."
if [ "$FILE_COUNT" -eq 0 ]; then
    echo "ERROR: No files found to publish."
    exit 1
fi
# Exclude very large files that have low sharing value
# file-manifest.csv (~2MB, 17K rows of paths/sizes) is borderline
# script-content.json (~3MB) has high value -- keep it
# Create gist with all flattened files
# gh gist create uses basename, so the prefixed names become the gist filenames
GIST_FILES=()
for file in "$FLAT_DIR"/*; do
    [ -f "$file" ] || continue
    GIST_FILES+=("$file")
done
echo " Uploading to GitHub gist ..."
# BUGFIX: under `set -e`, a failing command substitution in a plain
# assignment aborts the script immediately, so the error-recovery branch
# below was unreachable. Capturing the status on the `||` branch keeps the
# script alive on failure (commands in a `||` list are exempt from errexit).
GH_EXIT=0
GIST_OUTPUT=$(gh gist create --public \
  --desc "ilsaux ILS Auxiliary System - Documentation & Analysis Scripts (CHPL)" \
  "${GIST_FILES[@]}" 2>&1) || GH_EXIT=$?
if [ $GH_EXIT -ne 0 ]; then
    echo "ERROR: gh gist create failed:"
    echo "$GIST_OUTPUT"
    echo ""
    echo "Staging directory preserved at: $STAGING_DIR"
    echo "Flat directory preserved at: $FLAT_DIR"
    echo "You can inspect or manually publish from there."
    exit 1
fi
# Extract URL from output (last line typically)
GIST_URL=$(echo "$GIST_OUTPUT" | grep -o 'https://gist.github.com/[^ ]*' | tail -1)
echo ""
echo "=== SUCCESS ==="
echo "Gist URL: $GIST_URL"
echo "Files published: $FILE_COUNT"
echo ""
echo "Staging directory: $STAGING_DIR"
echo "Flat directory: $FLAT_DIR"
echo "(Clean up with: rm -rf $STAGING_DIR $FLAT_DIR)"
#!/usr/bin/env python3
"""Publish sanitized ilsaux documentation as multiple GitHub gists.
Breaks the documentation into category-based gists to avoid GitHub's
rendering limits, then creates a master TOC gist linking them all.
Requires: gh CLI authenticated with gist scope.
Usage:
python3 scripts/ilsaux/publish-ilsaux-gists.py [--dry-run] [--delete-old]
"""
import json
import os
import re
import subprocess
import sys
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
WORKSPACE = os.path.dirname(os.path.dirname(SCRIPT_DIR))
STAGING_DIR = "/tmp/ilsaux-gist"
MANIFEST_PATH = os.path.join(SCRIPT_DIR, "gist-manifest.json")
# GitHub username for constructing gist URLs
GITHUB_USER = "rayvoelker"
# Old single gist to optionally delete
OLD_SINGLE_GIST_ID = "cce2e74ff232c461e6c6b0e9a620a24f"
# Gist group definitions: (group_name, description, source_subdir, file_filter)
# source_subdir is relative to STAGING_DIR
# file_filter: None means all files, or a callable(filename) -> bool
# Each group becomes one published gist; see collect_group_files() for how
# source_dirs/prefix/only/exclude/extra_files are interpreted.
GIST_GROUPS = [
    {
        "name": "reports",
        "description": "ilsaux Report Documentation (52 reports) - CHPL Sierra ILS",
        "source_dirs": ["docs/reports"],
        "prefix": "reports--",
    },
    {
        "name": "modules",
        "description": "ilsaux Sierra:: Perl Module Documentation (16 modules) - CHPL",
        "source_dirs": ["docs/modules"],
        "prefix": "modules--",
    },
    {
        "name": "framework",
        "description": "ilsaux Framework & Archive Plan - CHPL Sierra ILS",
        "source_dirs": ["docs/framework"],
        "extra_files": ["docs/archive-plan.md"],
        "prefix": "framework--",
    },
    {
        "name": "manifests-small",
        "description": "ilsaux Manifests (CSVs & Text) - CHPL Sierra ILS",
        "source_dirs": ["docs/manifests"],
        "prefix": "manifests--",
        # The two biggest artifacts are split into their own gist below.
        "exclude": {"file-manifest.csv", "script-content.json"},
    },
    {
        "name": "manifests-large",
        "description": "ilsaux Large Manifests (file listing & script content) - CHPL",
        "source_dirs": ["docs/manifests"],
        "prefix": "manifests--",
        "only": {"file-manifest.csv", "script-content.json"},
    },
    {
        "name": "scripts",
        "description": "ilsaux Documentation Generator Scripts - CHPL Sierra ILS",
        "source_dirs": ["scripts"],
        "prefix": "scripts--",
    },
]
# Used in per-gist READMEs when the master TOC gist URL is not yet known.
MASTER_TOC_PLACEHOLDER = "{{MASTER_TOC_URL}}"
def run_sanitizer():
    """Invoke sanitize-for-gist.py and return its staging directory path.

    Exits the process if the sanitizer fails or the reported staging
    directory does not exist.
    """
    script_path = os.path.join(SCRIPT_DIR, "sanitize-for-gist.py")
    proc = subprocess.run(
        [sys.executable, script_path],
        capture_output=True,
        text=True,
    )
    # The sanitizer logs progress on stderr and emits the staging path on stdout.
    sys.stderr.write(proc.stderr)
    if proc.returncode != 0:
        print("ERROR: Sanitization failed.", file=sys.stderr)
        sys.exit(1)
    staging_path = proc.stdout.strip()
    if not os.path.isdir(staging_path):
        print(f"ERROR: Staging directory not found: {staging_path}", file=sys.stderr)
        sys.exit(1)
    return staging_path
def collect_group_files(group, staging_dir):
    """Gather the files belonging to one gist group.

    Returns a mapping of prefixed gist filename -> local path, honoring
    the group's optional 'only' whitelist and 'exclude' set, plus any
    individually listed 'extra_files'.
    """
    prefix = group.get("prefix", "")
    allow = group.get("only", None)
    deny = group.get("exclude", set())
    collected = {}
    for rel_dir in group.get("source_dirs", []):
        abs_dir = os.path.join(staging_dir, rel_dir)
        if not os.path.isdir(abs_dir):
            print(f" WARNING: {abs_dir} not found", file=sys.stderr)
            continue
        for entry in sorted(os.listdir(abs_dir)):
            entry_path = os.path.join(abs_dir, entry)
            if not os.path.isfile(entry_path):
                continue
            if allow is not None and entry not in allow:
                continue
            if entry in deny:
                continue
            collected[f"{prefix}{entry}"] = entry_path
    # Individually listed extras living outside the source dirs.
    for rel_file in group.get("extra_files", []):
        extra_path = os.path.join(staging_dir, rel_file)
        if os.path.isfile(extra_path):
            collected[f"{prefix}{os.path.basename(extra_path)}"] = extra_path
    return collected
def filename_to_anchor(filename):
    """Return GitHub's gist anchor slug for *filename* (format: #file-{slug}).

    Gist anchors are lowercased, dots become hyphens, and runs of hyphens
    collapse to a single one.
    """
    slug = re.sub(r"-+", "-", filename.lower().replace(".", "-")).strip("-")
    return f"#file-{slug}"
def gist_file_url(gist_url, filename):
    """Return a deep-link URL pointing at *filename* inside the gist."""
    return gist_url + filename_to_anchor(filename)
def generate_group_readme(group, files, master_toc_url=None):
    """Build the 00-README.md markdown body for one sub-gist.

    Lists every file with an in-gist anchor link and points back at the
    master TOC (or a placeholder when its URL is not yet known).
    """
    toc = master_toc_url if master_toc_url else MASTER_TOC_PLACEHOLDER
    header = [
        f"# {group['description']}",
        "",
        f"**Category:** {group['name']}",
        f"**Files:** {len(files)}",
        f"**Master Index:** [{toc}]({toc})",
        "",
        "---",
        "",
        "## Files in This Gist",
        "",
    ]
    entries = [
        f"- [{fname}]({filename_to_anchor(fname)})" for fname in sorted(files)
    ]
    # Trailing "" yields a final newline-terminated line after the join.
    return "\n".join(header + entries + [""])
def load_manifest():
    """Return the persisted gist manifest, or a fresh skeleton if absent."""
    if not os.path.exists(MANIFEST_PATH):
        return {"version": 1, "groups": {}, "old_single_gist_id": OLD_SINGLE_GIST_ID}
    with open(MANIFEST_PATH) as handle:
        return json.load(handle)
def save_manifest(manifest):
    """Persist *manifest* as pretty-printed JSON with a trailing newline."""
    with open(MANIFEST_PATH, "w") as handle:
        handle.write(json.dumps(manifest, indent=2) + "\n")
def gist_exists(gist_id):
    """Return True iff the GitHub API says gist *gist_id* exists."""
    probe = ["gh", "api", f"/gists/{gist_id}", "--silent"]
    return subprocess.run(probe, capture_output=True).returncode == 0
def create_gist(files_dict, description, dry_run=False):
    """Create a new public gist and return (gist_id, gist_url).

    Values in *files_dict* may be paths to existing files (copied as-is)
    or raw content strings (written out). Returns (None, None) when the
    gh CLI call fails; dry-run short-circuits with placeholder values.
    """
    if dry_run:
        print(f" [DRY RUN] Would create gist: {description}", file=sys.stderr)
        print(f" [DRY RUN] Files: {len(files_dict)}", file=sys.stderr)
        return "dry-run-id", "https://gist.github.com/dry-run"
    # Materialize everything in a temp dir so gh can pick up the filenames.
    import shutil
    import tempfile
    with tempfile.TemporaryDirectory() as workdir:
        staged = []
        for gist_name, value in files_dict.items():
            target = os.path.join(workdir, gist_name)
            if isinstance(value, str) and os.path.isfile(value):
                shutil.copy2(value, target)  # value is a path on disk
            else:
                with open(target, "w") as out:
                    out.write(value)  # value is literal content
            staged.append(target)
        proc = subprocess.run(
            ["gh", "gist", "create", "--public", "--desc", description] + staged,
            capture_output=True,
            text=True,
        )
    if proc.returncode != 0:
        print(f"ERROR: gh gist create failed: {proc.stderr}", file=sys.stderr)
        return None, None
    url = proc.stdout.strip()
    # The gist ID is the final path component of the returned URL.
    return url.rstrip("/").split("/")[-1], url
def update_gist(gist_id, files_dict, description=None, dry_run=False):
    """PATCH an existing gist's files (and optionally its description).

    Returns True on success, False when the API call fails. Values in
    *files_dict* may be paths (their contents are read) or raw strings.
    """
    if dry_run:
        print(f" [DRY RUN] Would update gist {gist_id}", file=sys.stderr)
        print(f" [DRY RUN] Files: {len(files_dict)}", file=sys.stderr)
        return True
    body = {"files": {}}
    if description:
        body["description"] = description
    for gist_name, value in files_dict.items():
        if isinstance(value, str) and os.path.isfile(value):
            # Path on disk: inline its contents, tolerating bad bytes.
            with open(value, "r", errors="replace") as handle:
                body["files"][gist_name] = {"content": handle.read()}
        else:
            body["files"][gist_name] = {"content": value}
    proc = subprocess.run(
        ["gh", "api", "--method", "PATCH", f"/gists/{gist_id}", "--input", "-"],
        input=json.dumps(body),
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        print(f"ERROR: Failed to update gist {gist_id}: {proc.stderr}", file=sys.stderr)
        return False
    return True
def create_or_update_gist(gist_id, files_dict, description, dry_run=False):
    """Update gist *gist_id* if it exists, otherwise create a new one.

    Returns (gist_id, gist_url, created_new). A failed update falls
    through to creating a replacement gist.
    """
    if gist_id and gist_exists(gist_id):
        if update_gist(gist_id, files_dict, description, dry_run):
            return gist_id, f"https://gist.github.com/{GITHUB_USER}/{gist_id}", False
        print(f" Update failed for {gist_id}, will recreate", file=sys.stderr)
    # Create new
    new_id, new_url = create_gist(files_dict, description, dry_run)
    return new_id, new_url, True
def generate_master_toc(group_data, staging_dir):
    """Generate the master 00-INDEX.md with real gist URLs and deep links.

    *group_data* maps group name -> {"gist_id", "gist_url", "file_count"};
    *staging_dir* must contain the sanitized docs/00-INDEX.md, whose tables
    are parsed by the _rewrite_* helpers and re-emitted with deep links
    into the published sub-gists. Returns the full markdown text.
    """
    # Read the original 00-INDEX.md as a base for content
    original_index = os.path.join(staging_dir, "docs/00-INDEX.md")
    if not os.path.exists(original_index):
        print("ERROR: docs/00-INDEX.md not found in staging", file=sys.stderr)
        sys.exit(1)
    with open(original_index) as f:
        original_content = f.read()
    # Now rewrite the index with multi-gist links
    lines = []
    lines.append("# ilsaux -- ILS Auxiliary Server Documentation")
    lines.append("")
    # NOTE(review): the counts in the next two lines are hardcoded and will
    # drift if the documentation set changes; confirm before publishing.
    lines.append("**System:** Sierra ILS report automation server at Cincinnati & Hamilton County Public Library (CHPL)")
    lines.append("**Contents:** 94 documentation files covering 51 reports, 16 Perl modules, cron framework, and migration plan")
    lines.append("**Credentials:** All sensitive values replaced with `[REDACTED-*]` markers (see bottom of this file)")
    lines.append("")
    lines.append("---")
    lines.append("")
    lines.append("## Documentation Gists")
    lines.append("")
    lines.append("This documentation is split across multiple gists to stay within GitHub's rendering limits.")
    lines.append("")
    lines.append("| Category | Files | Description | Link |")
    lines.append("|----------|-------|-------------|------|")
    group_order = ["reports", "modules", "framework", "manifests-small", "manifests-large", "scripts"]
    group_labels = {
        "reports": ("Reports", "52 report docs (51 reports + template)"),
        "modules": ("Modules", "16 Sierra:: Perl module docs"),
        "framework": ("Framework", "Cron framework, config format, archive plan"),
        "manifests-small": ("Manifests (Small)", "7 renderable CSVs and text files"),
        "manifests-large": ("Manifests (Large)", "Full file listing + script content JSON"),
        "scripts": ("Scripts", "13 Python/bash generator scripts"),
    }
    for gname in group_order:
        gd = group_data[gname]
        label, desc = group_labels[gname]
        url = gd["gist_url"]
        count = gd["file_count"]
        lines.append(f"| **{label}** | {count} | {desc} | [View gist]({url}) |")
    lines.append("")
    lines.append("---")
    lines.append("")
    # Extract and rewrite the report tables with deep links into the reports gist
    reports_url = group_data["reports"]["gist_url"]
    modules_url = group_data["modules"]["gist_url"]
    framework_url = group_data["framework"]["gist_url"]
    manifests_small_url = group_data["manifests-small"]["gist_url"]
    manifests_large_url = group_data["manifests-large"]["gist_url"]
    scripts_url = group_data["scripts"]["gist_url"]
    # Rewrite the Active Reports section with deep links
    lines.append("## Active Reports -- Quick Reference")
    lines.append("")
    lines.append("### Shelf-List Reports (6 -- HIGH PRIORITY)")
    lines.append("")
    lines.append("These are the highest-value reports, actively used for collection management.")
    lines.append("")
    lines.append("| Report | Full Name | Schedule | Last Run | Doc File |")
    lines.append("|--------|-----------|----------|----------|----------|")
    # Parse original content for report tables
    # We'll extract from the original and rewrite links
    _rewrite_report_tables(original_content, lines, reports_url)
    lines.append("")
    lines.append("---")
    lines.append("")
    # Sierra:: Modules section
    lines.append("## Sierra:: Modules (16)")
    lines.append("")
    lines.append("Custom Perl modules in `Modules/Sierra/` providing database access, location mapping, and ILS integration.")
    lines.append("")
    lines.append("| Module | Purpose | Used By | Doc File |")
    lines.append("|--------|---------|---------|----------|")
    _rewrite_module_table(original_content, lines, modules_url)
    lines.append("")
    lines.append("---")
    lines.append("")
    # Manifest files section
    lines.append("## Manifest Files (9)")
    lines.append("")
    lines.append("Machine-readable data files generated by the analysis scripts.")
    lines.append("")
    lines.append("| File | Format | Contents | Gist |")
    lines.append("|------|--------|----------|------|")
    _rewrite_manifest_table(original_content, lines, manifests_small_url, manifests_large_url)
    lines.append("")
    lines.append("---")
    lines.append("")
    # Scripts section
    lines.append("## Scripts (13)")
    lines.append("")
    lines.append("Python scripts (stdlib-only, rerunnable) that generated this documentation from the live ilsaux server.")
    lines.append("")
    _rewrite_scripts_section(original_content, lines, scripts_url)
    lines.append("")
    lines.append("---")
    lines.append("")
    # Framework section
    lines.append("## Framework & Architecture")
    lines.append("")
    lines.append("All reports follow the same execution pattern through `generic-cron.sh`:")
    lines.append("")
    lines.append("```")
    lines.append("cron schedule -> <report>-cron.sh -> generic-cron.sh -> perl ./$SOURCEFILE -> Sierra::DB -> PostgreSQL")
    lines.append("```")
    lines.append("")
    fw_cron = gist_file_url(framework_url, "framework--generic-cron-framework.md")
    fw_cfg = gist_file_url(framework_url, "framework--config-file-format.md")
    lines.append(f"See [generic-cron-framework.md]({fw_cron}) for the full execution flow.")
    lines.append(f"See [config-file-format.md]({fw_cfg}) for the Config::Simple `.cfg` credential format.")
    lines.append("")
    lines.append("---")
    lines.append("")
    # Migration section
    lines.append("## Migration & Archive Plan")
    lines.append("")
    archive_link = gist_file_url(framework_url, "framework--archive-plan.md")
    lines.append(f"See [archive-plan.md]({archive_link}) for the full classification and migration priorities.")
    lines.append("")
    lines.append("**Key numbers:**")
    lines.append("- **6** active-critical shelf-list reports (migrate first)")
    lines.append("- **16** other active reports (evaluate for migration)")
    lines.append("- **2** inactive-recent (review with stakeholders)")
    lines.append("- **27** obsolete (archive as historical record)")
    lines.append("- **49.6 GB** total server size; 65.9% is active report data")
    lines.append("- **316** credential references that need rotation before any migration")
    lines.append("")
    lines.append("---")
    lines.append("")
    # Credential safety note
    lines.append("## Credential Safety Note")
    lines.append("")
    sanitize_link = gist_file_url(scripts_url, "scripts--sanitize-for-gist.py")
    lines.append(f"All sensitive values in these gists have been replaced by the sanitizer ([sanitize-for-gist.py]({sanitize_link})):")
    lines.append("")
    lines.append("| Marker | Meaning |")
    lines.append("|--------|---------|")
    lines.append("| `[REDACTED-PASSWORD]` | Database or service password |")
    lines.append("| `[REDACTED-USER]` | Database username or service account |")
    lines.append("| `[REDACTED-HOST]` | Internal hostname or domain (*.plch.net, *.iii.com, etc.) |")
    lines.append("| `[REDACTED-EMAIL]` | Internal email address |")
    lines.append("| `[INTERNAL-HOST]` | Short internal hostname reference |")
    lines.append("")
    cred_link = gist_file_url(manifests_small_url, "manifests--credential-locations.csv")
    lines.append(f"The [credential-locations.csv]({cred_link}) file lists where credentials appear (file + line + type) but contains **no actual credential values**.")
    return "\n".join(lines) + "\n"
def _rewrite_report_tables(original, lines, reports_url):
    """Extract report table rows from *original* and append rewritten rows
    to *lines* with deep links into the published reports gist.

    Rows of the form "| name | full | schedule | last | [reports--file](...) |"
    keep their data columns but have the doc-file link rebuilt. When the
    scan crosses the "Other Active Reports" heading, the second table's
    header is emitted before its first row.
    """
    # Match table rows like: | slitemdata | Item Data ... | ... | [reports--slitemdata.md](...) |
    pattern = re.compile(
        r"^\| (\S+) \| (.+?) \| (.+?) \| (.+?) \| \[reports--(\S+?)\]\([^)]*\) \|$",
        re.MULTILINE,
    )
    found_other = False
    for m in pattern.finditer(original):
        report, full_name, schedule, last_run, filename = m.groups()
        # BUGFIX: use the captured filename instead of a literal "(unknown)"
        # placeholder so link text and anchors resolve correctly.
        gist_fname = f"reports--{filename}"
        deep_link = gist_file_url(reports_url, gist_fname)
        # Detect transition from shelf-list to other reports by peeking at
        # the text immediately preceding this row in the original index.
        preceding = original[max(0, m.start() - 200):m.start()]
        if "Other Active Reports" in preceding and not found_other:
            found_other = True
            lines.append("")
            lines.append("### Other Active Reports (16)")
            lines.append("")
            lines.append("| Report | Full Name | Schedule | Last Run | Doc File |")
            lines.append("|--------|-----------|----------|----------|----------|")
        lines.append(f"| {report} | {full_name} | {schedule} | {last_run} | [{gist_fname}]({deep_link}) |")
    # Add note about inactive reports. (A dead, self-admittedly broken
    # archive-link computation and an unused in_shelf flag were removed.)
    lines.append("")
    lines.append("**Inactive and obsolete reports** (2 inactive-recent + 27 obsolete) are classified in the Framework gist.")
def _rewrite_module_table(original, lines, modules_url):
    """Extract Sierra:: module table rows from *original* and append them to
    *lines* with doc-file links rewritten as deep links into the modules gist.
    """
    pattern = re.compile(
        r"^\| (Sierra::\S+) \| (.+?) \| (.+?) \| \[modules--(\S+?)\]\([^)]*\) \|$",
        re.MULTILINE,
    )
    for m in pattern.finditer(original):
        module, purpose, used_by, filename = m.groups()
        # BUGFIX: use the captured filename instead of a literal "(unknown)"
        # placeholder so link text and anchors resolve correctly.
        gist_fname = f"modules--{filename}"
        deep_link = gist_file_url(modules_url, gist_fname)
        lines.append(f"| {module} | {purpose} | {used_by} | [{gist_fname}]({deep_link}) |")
def _rewrite_manifest_table(original, lines, small_url, large_url):
    """Extract manifest table rows from *original* and append them to *lines*,
    routing each file's deep link to the small or large manifests gist.
    """
    pattern = re.compile(
        r"^\| \[manifests--(\S+?)\]\([^)]*\) \| (\S+) \| (.+?) \|$",
        re.MULTILINE,
    )
    # These two are split into their own gist to dodge rendering limits;
    # keep in sync with the GIST_GROUPS only/exclude sets.
    large_files = {"file-manifest.csv", "script-content.json"}
    for m in pattern.finditer(original):
        filename, fmt, contents = m.groups()
        # BUGFIX: use the captured filename instead of a literal "(unknown)"
        # placeholder so link text and anchors resolve correctly.
        gist_fname = f"manifests--{filename}"
        url = large_url if filename in large_files else small_url
        deep_link = gist_file_url(url, gist_fname)
        gist_label = "Large" if filename in large_files else "Small"
        lines.append(f"| [{gist_fname}]({deep_link}) | {fmt} | {contents} | {gist_label} |")
def _rewrite_scripts_section(original, lines, scripts_url):
    """Append the three script tables (manifest generators, documentation
    generators, utilities) to *lines*, each row deep-linking into the
    scripts gist.

    *original* is accepted for signature parity with the other _rewrite_*
    helpers but is not consulted here — the script lists are hardcoded
    below and must be kept in sync with the files under scripts/.
    """
    # Manifest generators
    lines.append("### Manifest Generators (7)")
    lines.append("")
    lines.append("| Script | Output |")
    lines.append("|--------|--------|")
    script_manifest = [
        ("manifest-tree.py", "`file-manifest.csv`, `directory-tree.txt`"),
        ("manifest-perl-deps.py", "`perl-dependencies.csv`"),
        ("manifest-script-content.py", "`script-content.json`"),
        ("manifest-cron.py", "`cron-schedule.csv`"),
        ("manifest-report-status.py", "`report-status.csv`"),
        ("manifest-git.py", "`git-summaries.csv`"),
        ("manifest-summary.py", "`summary-report.txt`"),
    ]
    for script, output in script_manifest:
        gist_fname = f"scripts--{script}"
        deep_link = gist_file_url(scripts_url, gist_fname)
        lines.append(f"| [{gist_fname}]({deep_link}) | {output} |")
    lines.append("")
    lines.append("### Documentation Generators (3)")
    lines.append("")
    lines.append("| Script | Output |")
    lines.append("|--------|--------|")
    doc_generators = [
        ("generate-report-docs.py", "51 report docs in `reports--*.md`"),
        ("generate-module-docs.py", "16 module docs in `modules--*.md`"),
        ("generate-framework-doc.py", "Framework docs"),
    ]
    for script, output in doc_generators:
        gist_fname = f"scripts--{script}"
        deep_link = gist_file_url(scripts_url, gist_fname)
        lines.append(f"| [{gist_fname}]({deep_link}) | {output} |")
    lines.append("")
    lines.append("### Utilities (3)")
    lines.append("")
    lines.append("| Script | Purpose |")
    lines.append("|--------|---------|")
    utilities = [
        ("generate-archive-plan.py", "Generates archive-plan.md (migration classification and priorities)"),
        ("sanitize-for-gist.py", "Redacts credentials and internal hostnames for safe publishing"),
        ("publish-ilsaux-gist.sh", "Original single-gist publisher (kept for reference)"),
    ]
    for script, purpose in utilities:
        gist_fname = f"scripts--{script}"
        deep_link = gist_file_url(scripts_url, gist_fname)
        lines.append(f"| [{gist_fname}]({deep_link}) | {purpose} |")
def delete_gist(gist_id, dry_run=False):
    """Delete gist *gist_id* via the gh CLI; return True on success."""
    if dry_run:
        print(f" [DRY RUN] Would delete gist {gist_id}", file=sys.stderr)
        return True
    proc = subprocess.run(
        ["gh", "gist", "delete", gist_id],
        capture_output=True,
        text=True,
    )
    if proc.returncode == 0:
        return True
    print(f"ERROR: Failed to delete gist {gist_id}: {proc.stderr}", file=sys.stderr)
    return False
def main():
    """Publish the ilsaux documentation set as a family of GitHub gists.

    Pipeline (8 phases):
      1. sanitize docs into a staging directory,
      2. partition staged files into the configured gist groups,
      3. generate a per-gist 00-README.md (master-TOC link patched later),
      4. create/update one sub-gist per group (reusing IDs from the manifest),
      5. build the master table-of-contents document,
      6. create/update the master TOC gist,
      7. back-patch each sub-gist README with the master TOC URL
         (the URL is only known after phase 6),
      8. persist gist IDs/URLs to the manifest so future runs update
         instead of creating.

    Flags: --dry-run skips all gh API calls; --delete-old removes the
    legacy single-gist publication after a successful publish.
    Exits 1 if any gist create/update fails.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Publish ilsaux docs as multiple gists")
    parser.add_argument("--dry-run", action="store_true", help="Show what would happen without API calls")
    parser.add_argument("--delete-old", action="store_true", help="Delete the old single gist after publish")
    args = parser.parse_args()
    dry_run = args.dry_run
    print("=== ilsaux Multi-Gist Publisher ===\n")
    # Phase 1: Sanitize
    print("[1/8] Sanitizing documentation ...")
    staging_dir = run_sanitizer()
    print(f"  Staging: {staging_dir}\n")
    # Phase 2: Group files
    print("[2/8] Grouping files ...")
    grouped = {}
    for group in GIST_GROUPS:
        name = group["name"]
        files = collect_group_files(group, staging_dir)
        grouped[name] = {"group": group, "files": files}
        print(f"  {name}: {len(files)} files")
    print()
    # Phase 3: Generate per-gist 00-README.md (with placeholder; the real
    # master TOC URL does not exist until phase 6).
    print("[3/8] Generating per-gist README files ...")
    for name, data in grouped.items():
        readme_content = generate_group_readme(data["group"], data["files"])
        data["files"]["00-README.md"] = readme_content
        print(f"  {name}: 00-README.md added ({len(data['files'])} total)")
    print()
    # Phase 4: Create or update sub-gists
    print("[4/8] Creating/updating sub-gists ...")
    manifest = load_manifest()
    group_data = {}
    for name, data in grouped.items():
        existing_id = manifest.get("groups", {}).get(name, {}).get("gist_id")
        desc = data["group"]["description"]
        gist_id, gist_url, created = create_or_update_gist(
            existing_id, data["files"], desc, dry_run
        )
        if not gist_id:
            print(f"ERROR: Failed to create/update gist for {name}", file=sys.stderr)
            sys.exit(1)
        action = "Created" if created else "Updated"
        print(f"  {action} {name}: {gist_url}")
        group_data[name] = {
            "gist_id": gist_id,
            "gist_url": gist_url,
            "file_count": len(data["files"]),
        }
    print()
    # Phase 5: Generate master TOC
    print("[5/8] Generating master TOC ...")
    master_toc_content = generate_master_toc(group_data, staging_dir)
    if dry_run:
        print(f"  [DRY RUN] Master TOC: {len(master_toc_content)} bytes")
    print()
    # Phase 6: Create or update master TOC gist
    print("[6/8] Creating/updating master TOC gist ...")
    master_id = manifest.get("master_toc_gist_id")
    master_files = {"00-INDEX.md": master_toc_content}
    master_desc = "ilsaux ILS Auxiliary Server - Master Documentation Index (CHPL)"
    master_gist_id, master_gist_url, master_created = create_or_update_gist(
        master_id, master_files, master_desc, dry_run
    )
    if not master_gist_id:
        print("ERROR: Failed to create/update master TOC gist", file=sys.stderr)
        sys.exit(1)
    action = "Created" if master_created else "Updated"
    print(f"  {action} master TOC: {master_gist_url}\n")
    # Phase 7: Back-patch sub-gist READMEs now that the TOC URL is known.
    print("[7/8] Back-patching sub-gist READMEs with master TOC URL ...")
    for name, data in grouped.items():
        readme = generate_group_readme(
            data["group"], data["files"], master_toc_url=master_gist_url
        )
        gist_id = group_data[name]["gist_id"]
        if dry_run:
            print(f"  [DRY RUN] Would patch {name} 00-README.md")
        else:
            ok = update_gist(gist_id, {"00-README.md": readme}, dry_run=False)
            if ok:
                print(f"  Patched {name}")
            else:
                print(f"  WARNING: Failed to patch {name} README", file=sys.stderr)
    print()
    # Phase 8: Save manifest
    print("[8/8] Saving manifest ...")
    manifest["version"] = 1
    manifest["master_toc_gist_id"] = master_gist_id
    manifest["master_toc_gist_url"] = master_gist_url
    manifest["old_single_gist_id"] = OLD_SINGLE_GIST_ID
    manifest["groups"] = group_data
    if dry_run:
        print(f"  [DRY RUN] Would save manifest to {MANIFEST_PATH}")
        print("  [DRY RUN] Manifest content:")
        print(json.dumps(manifest, indent=2))
    else:
        save_manifest(manifest)
        print(f"  Saved: {MANIFEST_PATH}")
    print()
    # Optional: delete old gist
    if args.delete_old:
        print(f"Deleting old single gist {OLD_SINGLE_GIST_ID} ...")
        if delete_gist(OLD_SINGLE_GIST_ID, dry_run):
            print("  Deleted.")
        else:
            print("  WARNING: Could not delete old gist.", file=sys.stderr)
        print()
    # Summary. FIX: iterate group_data itself (insertion order follows
    # GIST_GROUPS) instead of a hard-coded name list, which could drift
    # from the configuration and raise KeyError after a full publish.
    print("=== DONE ===")
    print(f"Master TOC: {master_gist_url}")
    for name, gd in group_data.items():
        print(f"  {name} ({gd['file_count']} files): {gd['gist_url']}")
    if not dry_run:
        print(f"\nManifest saved to: {MANIFEST_PATH}")
        print("Commit it with: git add scripts/ilsaux/gist-manifest.json")
# Script entry point: run the multi-gist publisher.
if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Sanitize ilsaux documentation for gist publishing.
Copies docs/ilsaux/ and scripts/ilsaux/ to a staging directory, then scans
and redacts credentials, internal hostnames, and other sensitive data.
Output: /tmp/ilsaux-gist/ (staging directory ready for gist publish)
"""
import os
import re
import shutil
import sys
# Root of the local workspace containing the docs and scripts to publish.
WORKSPACE = "/home/ray/claude"
# Staging area for the sanitized copy; wiped and rebuilt on every run.
STAGING_DIR = "/tmp/ilsaux-gist"
# Source directories to copy:
# (path relative to WORKSPACE, destination name under STAGING_DIR)
SOURCES = [
    ("docs/ilsaux", "docs"),
    ("scripts/ilsaux", "scripts"),
]
# --- Redaction rules ---
#
# NOTE(review): this published copy of the script has itself been run
# through its own sanitizer, so the real secret literals below now read
# as their redaction placeholders (hence the duplicate-looking entries).
# Consult the un-sanitized original before editing these rules.

# Literal strings to redact (case-insensitive matching, replaced entirely)
LITERAL_PASSWORDS = [
    "[REDACTED-PASSWORD]",
    "[REDACTED-PASSWORD]",
    "[REDACTED-PASSWORD]",
]

# DB usernames used in DBI->connect strings (not generic words)
DB_USERNAMES = [
    "[REDACTED-USER]",
    "[REDACTED-USER]",
    "[REDACTED-USER]",
    "[REDACTED-USER]",
    "[REDACTED-USER]",
    "[REDACTED-USER]",
]

# Internal hostnames / domains to redact.
# Order matters: more specific patterns first, catch-alls last.
HOST_PATTERNS = [
    (re.compile(r'sierra-train\.cincinnatilibrary\.org', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'sierra-train\.plch\.net', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'sierra-db\.plch\.net', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'cinci-db\.iii\.com', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'cinci\.iii\.com', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'partner\.iii\.com', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'host-ilsaux\.plch\.net', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'webtools\.plch\.net', re.IGNORECASE), "[REDACTED-HOST]"),
    # NOTE(review): "[INTERNAL-HOST]" here is a sanitization artifact --
    # inside a regex the brackets form a character class, not a literal
    # hostname. Restore the real host name from the un-sanitized original.
    (re.compile(r'[INTERNAL-HOST]\.plch\.net', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'www2\.plch\.net', re.IGNORECASE), "[REDACTED-HOST]"),
    # Catch-all for any remaining *.plch.net or *.iii.com subdomains
    (re.compile(r'\b[\w.-]+\.plch\.net\b', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'\b[\w.-]+\.iii\.com\b', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'\b[\w.-]+\.cincinnatilibrary\.org\b', re.IGNORECASE), "[REDACTED-HOST]"),
]

# Regex patterns for credential constructs.
# Each entry is (compiled pattern, replacement template); group \1 keeps
# the surrounding call syntax intact while the user/password arguments
# are replaced.
CREDENTIAL_REGEXES = [
    # DBI->connect with inline credentials: redact user and password args
    # Matches: DBI->connect("dsn","[REDACTED-USER]","[REDACTED-PASSWORD]", ...)
    (
        re.compile(
            r'(DBI->connect\(\s*"[^"]*"\s*,\s*)"([^"]*?)"\s*,\s*"([^"]*?)"',
            re.IGNORECASE,
        ),
        r'\1"[REDACTED-USER]","[REDACTED-PASSWORD]"',
    ),
    # $ua->credentials('host:port', 'realm', '[REDACTED-USER]', '[REDACTED-PASSWORD]')
    (
        re.compile(
            r"(\$ua->credentials\(\s*'[^']*'\s*,\s*'[^']*'\s*,\s*)'([^']*)'\s*,\s*'([^']*)'",
            re.IGNORECASE,
        ),
        r"\1'[REDACTED-USER]', '[REDACTED-PASSWORD]'",
    ),
    # my $password = "[REDACTED-PASSWORD]";
    (
        re.compile(r'(my\s+\$password\s*=\s*)"[^"]*"', re.IGNORECASE),
        r'\1"[REDACTED-PASSWORD]"',
    ),
    # my $username = "[REDACTED-USER]"; (only in credential context, not generic)
    (
        re.compile(r'(my\s+\$username\s*=\s*)"[^"]*"', re.IGNORECASE),
        r'\1"[REDACTED-USER]"',
    ),
]

# Email addresses at internal domains
EMAIL_PATTERNS = [
    (re.compile(r'\b[\w.+-]+@cincinnatilibrary\.org\b', re.IGNORECASE), "[REDACTED-EMAIL]"),
    (re.compile(r'\b[\w.+-]+@plch\.net\b', re.IGNORECASE), "[REDACTED-EMAIL]"),
]

# Hostname references that appear as just the short name (e.g., "[INTERNAL-HOST]" in print stmts)
# These are less sensitive but still internal infrastructure names
SHORT_HOST_REFS = [
    (re.compile(r'\bMain12\b', re.IGNORECASE), "[INTERNAL-HOST]"),
]

# Escaped variants for JSON content (backslash-escaped quotes).
# NOTE(review): not referenced by redact_line() in this file -- confirm
# whether it is applied elsewhere or is dead configuration.
JSON_CREDENTIAL_REGEXES = [
    # DBI->connect with escaped quotes in JSON
    (
        re.compile(
            r'(DBI->connect\(\\?"[^"\\]*(?:\\.[^"\\]*)*\\?"\s*,\s*\\?)"([^"\\]*)"(\\?\s*,\s*\\?)"([^"\\]*)"',
            re.IGNORECASE,
        ),
        r'\1"[REDACTED-USER]"\3"[REDACTED-PASSWORD]"',
    ),
]
def redact_line(line, filepath):
    """Run every redaction rule over one line of text.

    Returns ``(possibly-rewritten line, list of redaction tags)``. The
    tag list is non-empty only when the line text actually changed.
    ``filepath`` is accepted for symmetry with the caller but is not
    used here.

    Rule order is significant: emails are redacted before hostnames
    because an email address contains a domain the host rules would
    also match.
    """
    hits = []
    before = line
    # 1. Whole-string password literals (case-insensitive).
    for pw in LITERAL_PASSWORDS:
        pat = re.compile(re.escape(pw), re.IGNORECASE)
        if pat.search(line):
            line = pat.sub("[REDACTED-PASSWORD]", line)
            hits.append(f"literal-password: {pw}")
    # 2. DB usernames, matched only as quoted tokens so ordinary words
    #    are never clobbered.
    for uname in DB_USERNAMES:
        for quote in ('"', "'"):
            token = quote + uname + quote
            if token in line:
                line = line.replace(token, f'{quote}[REDACTED-USER]{quote}')
                hits.append(f"db-username: {uname}")
        # JSON-embedded variant with backslash-escaped quotes: \"user\"
        escaped_token = f'\\"{uname}\\"'
        if escaped_token in line:
            line = line.replace(escaped_token, '\\"[REDACTED-USER]\\"')
            hits.append(f"db-username-json: {uname}")
        # Windows-style service account reference (DOMAIN\\user).
        escaped_backslash = f"plchnet\\\\{uname}" if uname == "[REDACTED-USER]" else None
        if escaped_backslash and escaped_backslash in line:
            line = line.replace(escaped_backslash, "[REDACTED-USER]")
            hits.append(f"service-account: plchnet\\{uname}")
    # 3. Email addresses (must run before host patterns; see docstring).
    for pat, repl in EMAIL_PATTERNS:
        if pat.search(line):
            line = pat.sub(repl, line)
            hits.append(f"email: {pat.pattern}")
    # 4. Internal hostnames / domains.
    for pat, repl in HOST_PATTERNS:
        if pat.search(line):
            line = pat.sub(repl, line)
            hits.append(f"hostname: {pat.pattern}")
    # 5. Structured credential constructs (DBI->connect, $ua->credentials, ...).
    for pat, repl in CREDENTIAL_REGEXES:
        if pat.search(line):
            line = pat.sub(repl, line)
            hits.append(f"credential-pattern: {pat.pattern[:60]}")
    # 6. Bare internal machine names.
    for pat, repl in SHORT_HOST_REFS:
        if pat.search(line):
            line = pat.sub(repl, line)
            hits.append(f"short-host: {pat.pattern}")
    # Only report redactions when the text actually changed: placeholder
    # self-replacements above can register hits without altering the line.
    if line != before:
        return line, hits
    return line, []
def process_file(filepath):
    """Redact one file in place.

    Reads the file, applies redact_line() to every line, logs each
    redaction to stderr, and rewrites the file only when at least one
    line changed. Returns the number of *lines* that were redacted
    (not the number of individual redactions). Unreadable files are
    skipped with a warning and count as 0.
    """
    try:
        with open(filepath, "r", errors="replace") as fh:
            source_lines = fh.readlines()
    except OSError as err:
        print(f"  SKIP: {filepath}: {err}", file=sys.stderr)
        return 0
    rel_path = os.path.relpath(filepath, STAGING_DIR)
    cleaned = []
    changed_count = 0
    for lineno, text in enumerate(source_lines, 1):
        new_text, tags = redact_line(text, filepath)
        cleaned.append(new_text)
        if not tags:
            continue
        changed_count += 1
        for tag in tags:
            print(f"  REDACT {rel_path}:{lineno} [{tag}]", file=sys.stderr)
    # Rewrite only when something changed, leaving untouched files as-is.
    if changed_count > 0:
        with open(filepath, "w") as fh:
            fh.writelines(cleaned)
    return changed_count
def verify_clean(staging_dir):
    """Post-sanitization safety net: re-scan *staging_dir* for leftovers.

    Walks every file (except ``.py``/``.sh``, whose redaction-rule
    definitions and grep strings would self-match) and returns a list of
    ``"path:line: match (pattern: ...)"`` strings for anything that
    still looks sensitive. An empty list means the tree is publishable.

    NOTE(review): this published copy is itself sanitized, so several
    patterns below now read as their redaction placeholders; inside a
    regex, a bracketed placeholder like [REDACTED-PASSWORD] behaves as a
    character class -- confirm against the un-sanitized original.
    """
    # Patterns that should NOT appear in clean output
    verify_patterns = [
        re.compile(r'[REDACTED-PASSWORD]', re.IGNORECASE),
        re.compile(r'[REDACTED-PASSWORD]', re.IGNORECASE),
        re.compile(r'eS3cuRe', re.IGNORECASE),
        re.compile(r'[REDACTED-USER]', re.IGNORECASE),
        re.compile(r'sqllabels\d*', re.IGNORECASE),
        re.compile(r'[REDACTED-USER]', re.IGNORECASE),
        re.compile(r'svc_vmsp', re.IGNORECASE),
        re.compile(r'cinci-db\.iii\.com', re.IGNORECASE),
        re.compile(r'sierra-db\.plch\.net', re.IGNORECASE),
        re.compile(r'sierra-train', re.IGNORECASE),
        re.compile(r'host-ilsaux', re.IGNORECASE),
        re.compile(r'webtools\.plch\.net', re.IGNORECASE),
        re.compile(r'partner\.iii\.com', re.IGNORECASE),
        re.compile(r'cincinnatilibrary\.org', re.IGNORECASE),
        # Catch-all domain patterns
        re.compile(r'\b\w+\.plch\.net\b', re.IGNORECASE),
        re.compile(r'\b\w+\.iii\.com\b', re.IGNORECASE),
    ]
    # Skip Python/bash scripts that contain pattern definitions or grep
    # strings; residual fragments in their regex/grep literals would
    # trigger false positives here.
    skip_extensions = {".py", ".sh"}
    findings = []
    for root, _dirs, filenames in os.walk(staging_dir):
        for filename in filenames:
            if os.path.splitext(filename)[1] in skip_extensions:
                continue
            full_path = os.path.join(root, filename)
            rel = os.path.relpath(full_path, staging_dir)
            try:
                with open(full_path, "r", errors="replace") as fh:
                    for lineno, text in enumerate(fh, 1):
                        for pat in verify_patterns:
                            hit = pat.search(text)
                            if hit:
                                findings.append(
                                    f"{rel}:{lineno}: {hit.group()} (pattern: {pat.pattern})"
                                )
            except OSError:
                # Unreadable file: nothing we can verify, move on.
                continue
    return findings
def main():
    """Build a sanitized copy of the ilsaux docs under STAGING_DIR.

    Steps: recreate the staging directory from scratch, copy the source
    trees (plus the top-level overview doc when present), redact every
    staged file in place, then independently re-scan for anything the
    rules missed. Exits 1 if the verification scan still finds
    sensitive patterns; on success prints the staging path on stdout
    for consumption by calling scripts (all logging goes to stderr).
    """
    print("[sanitize-for-gist] Starting sanitization ...", file=sys.stderr)
    # Start from an empty staging tree on every run.
    if os.path.exists(STAGING_DIR):
        shutil.rmtree(STAGING_DIR)
    os.makedirs(STAGING_DIR)
    # Copy each configured source tree into staging.
    for src_rel, dst_name in SOURCES:
        src_path = os.path.join(WORKSPACE, src_rel)
        dst_path = os.path.join(STAGING_DIR, dst_name)
        if not os.path.isdir(src_path):
            print(f"  WARNING: {src_path} not found, skipping", file=sys.stderr)
            continue
        shutil.copytree(src_path, dst_path)
        print(f"  Copied {src_rel} -> {dst_name}/", file=sys.stderr)
    # The top-level overview document is optional.
    ilsaux_doc = os.path.join(WORKSPACE, "llore/ilsaux-documentation.md")
    if os.path.exists(ilsaux_doc):
        shutil.copy2(ilsaux_doc, os.path.join(STAGING_DIR, "ilsaux-documentation.md"))
        print("  Copied llore/ilsaux-documentation.md", file=sys.stderr)
    # Redact every staged file in place.
    total_files = 0
    total_redacted_lines = 0
    for root, _dirs, names in os.walk(STAGING_DIR):
        for name in names:
            total_files += 1
            total_redacted_lines += process_file(os.path.join(root, name))
    print(f"\n  Processed {total_files} files, redacted {total_redacted_lines} lines", file=sys.stderr)
    # Independent re-scan: the publish step must never see a dirty tree.
    print("\n[sanitize-for-gist] Verification scan ...", file=sys.stderr)
    leftovers = verify_clean(STAGING_DIR)
    if leftovers:
        print(f"\n  WARNING: {len(leftovers)} remaining sensitive patterns found!", file=sys.stderr)
        for item in leftovers:
            print(f"    {item}", file=sys.stderr)
        print("\n  Staging directory NOT clean. Review and update redaction rules.", file=sys.stderr)
        sys.exit(1)
    else:
        print("  CLEAN: No sensitive patterns found in staging directory.", file=sys.stderr)
    print(f"\n  Staging directory: {STAGING_DIR}", file=sys.stderr)
    # Machine-readable output for the caller: the staging path, stdout only.
    print(STAGING_DIR)
# Script entry point: run the sanitizer.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment