Skip to content

Instantly share code, notes, and snippets.

@rayvoelker
Last active March 6, 2026 22:28
Show Gist options
  • Select an option

  • Save rayvoelker/558ac1812ff2dc29c22ba4ed9cc1a72c to your computer and use it in GitHub Desktop.

Select an option

Save rayvoelker/558ac1812ff2dc29c22ba4ed9cc1a72c to your computer and use it in GitHub Desktop.
ilsaux Documentation Generator Scripts - CHPL Sierra ILS
#!/usr/bin/env python3
"""Generate archive classification and migration plan from all manifests.
Output: docs/ilsaux/archive-plan.md
"""
import csv
import os
import sys
import time
from collections import Counter, defaultdict
from datetime import datetime
MANIFEST_DIR = "/home/ray/claude/docs/ilsaux/manifests"
OUT_PATH = "/home/ray/claude/docs/ilsaux/archive-plan.md"
def human_size(nbytes):
    """Format a byte count as a short human-readable string, e.g. '1.5 MB'."""
    value = nbytes
    units = ("B", "KB", "MB", "GB", "TB")
    idx = 0
    while idx < len(units):
        if abs(value) < 1024:
            return f"{value:.1f} {units[idx]}"
        value /= 1024
        idx += 1
    # Anything that exhausts the table is reported in petabytes.
    return f"{value:.1f} PB"
def read_csv_file(filename):
    """Load a manifest CSV from MANIFEST_DIR as a list of row dicts.

    Returns [] (after a stderr warning) when the file does not exist.
    """
    full_path = os.path.join(MANIFEST_DIR, filename)
    if not os.path.exists(full_path):
        print(f" WARNING: {full_path} not found", file=sys.stderr)
        return []
    with open(full_path, newline="") as handle:
        reader = csv.DictReader(handle)
        return list(reader)
def main():
    """Generate docs/ilsaux/archive-plan.md from the Phase 1 manifest CSVs.

    Reads report-status, file-manifest, credential-locations and git-summaries
    manifests, classifies every report into a migration category, and writes a
    single markdown plan covering classification, credential rotation, size
    breakdown, migration priorities and recommendations.
    """
    t_start = time.monotonic()
    print("[generate-archive-plan] Generating archive classification ...", file=sys.stderr)
    report_status = read_csv_file("report-status.csv")
    file_manifest = read_csv_file("file-manifest.csv")
    credential_locs = read_csv_file("credential-locations.csv")
    # NOTE(review): git_summaries is loaded but never used below.
    git_summaries = read_csv_file("git-summaries.csv")
    # Build size map per report directory
    report_sizes = defaultdict(int)
    report_file_counts = defaultdict(int)
    top_dir_sizes = defaultdict(int)
    for f in file_manifest:
        parts = f["parent_dir"].split("/")
        # Only "Reports/<name>/..." paths contribute to per-report totals.
        if len(parts) >= 2 and parts[0] == "Reports":
            report_sizes[parts[1]] += int(f["size_bytes"])
            report_file_counts[parts[1]] += 1
        # Every file also contributes to its top-level directory's total.
        top = parts[0] if parts[0] != "." else "(root)"
        top_dir_sizes[top] += int(f["size_bytes"])
    # Classify reports
    categories = {
        "active-critical": [],
        "active": [],
        "inactive-recent": [],
        "inactive-legacy": [],
        "obsolete": [],
    }
    for r in report_status:
        name = r["report_name"]
        status = r["status"]
        # "sl" prefix marks shelf-list reports, the highest-priority group.
        is_sl = name.startswith("sl")
        if status == "active" and is_sl:
            categories["active-critical"].append(r)
        elif status == "active":
            categories["active"].append(r)
        elif status == "inactive-recent":
            categories["inactive-recent"].append(r)
        elif status == "obsolete":
            # Further classify by last run date
            last = r.get("last_run_date", "")
            if last:
                try:
                    dt = datetime.strptime(last, "%Y-%m-%d")
                    if dt.year >= 2020:
                        categories["inactive-recent"].append(r)
                    else:
                        categories["inactive-legacy"].append(r)
                    continue
                except ValueError:
                    # Unparseable date: fall through to "obsolete".
                    pass
            categories["obsolete"].append(r)
        else:
            # Any unrecognized status is treated as obsolete.
            categories["obsolete"].append(r)
    # Credential files per report
    cred_by_report = defaultdict(set)
    for c in credential_locs:
        parts = c["file"].split("/")
        if len(parts) >= 2:
            # Reports/name/file -> name
            if parts[0] == "Reports":
                cred_by_report[parts[1]].add(c["credential_type"])
    with open(OUT_PATH, "w") as f:
        def w(line=""):
            # Helper: emit one markdown line (newline appended).
            f.write(line + "\n")
        w("# ILS Auxiliary Server Archive & Migration Plan")
        w()
        w(f"Generated: {datetime.now().isoformat(timespec='seconds')}")
        w()
        w("---")
        w()
        # Classification table
        w("## Report Classification")
        w()
        for cat, label, desc in [
            ("active-critical", "Active-Critical", "Running in 2026, shelf list reports -- document fully, migrate first"),
            ("active", "Active", "Running in 2026, non-shelf-list -- document, evaluate for migration"),
            ("inactive-recent", "Inactive-Recent", "Last run 2020-2025 -- review with stakeholders before archiving"),
            ("inactive-legacy", "Inactive-Legacy", "Last run before 2020 -- archive only, low priority"),
            ("obsolete", "Obsolete", "No output found or commented out -- archive as historical record"),
        ]:
            reports = categories[cat]
            if not reports:
                continue
            total_size = sum(report_sizes.get(r["report_name"], 0) for r in reports)
            total_files = sum(report_file_counts.get(r["report_name"], 0) for r in reports)
            w(f"### {label} ({len(reports)} reports, {human_size(total_size)}, {total_files:,} files)")
            w()
            w(f"> {desc}")
            w()
            w(f"| Report | Full Name | Last Run | Size | Credentials |")
            w(f"|--------|-----------|----------|------|-------------|")
            for r in sorted(reports, key=lambda x: x["report_name"]):
                name = r["report_name"]
                fullname = r.get("fullname", "") or ""
                last = r.get("last_run_date", "") or "never"
                size = human_size(report_sizes.get(name, 0))
                creds = ", ".join(sorted(cred_by_report.get(name, []))) or "none"
                w(f"| `{name}` | {fullname} | {last} | {size} | {creds} |")
            w()
        # Historical/non-report directories
        w("### Historical Directories")
        w()
        w("These are not report directories but contain historical data:")
        w()
        historical_dirs = ["Symphony_Hist", "Symphony_Bincustom", "webpac"]
        for d in historical_dirs:
            size = top_dir_sizes.get(d, 0)
            if size > 0:
                w(f"- **{d}**: {human_size(size)} -- archive as historical record")
        w()
        # Credential rotation
        w("## Credential Rotation Requirements")
        w()
        w(f"Total credential references found: {len(credential_locs)}")
        w()
        cred_types = Counter(c["credential_type"] for c in credential_locs)
        w("| Type | Count | Action |")
        w("|------|-------|--------|")
        for ctype, count in cred_types.most_common():
            action = "Rotate immediately" if ctype in ("password", "db_connection") else "Review"
            w(f"| {ctype} | {count} | {action} |")
        w()
        w("**All `.cfg` files contain plaintext credentials and must NOT be migrated as-is.**")
        w()
        # Size breakdown
        w("## Size Breakdown")
        w()
        total_size = sum(int(fi["size_bytes"]) for fi in file_manifest)
        w(f"**Total ilsaux size:** {human_size(total_size)}")
        w()
        active_size = sum(
            report_sizes.get(r["report_name"], 0)
            for r in categories["active-critical"] + categories["active"]
        )
        inactive_size = sum(
            report_sizes.get(r["report_name"], 0)
            for r in categories["inactive-recent"] + categories["inactive-legacy"]
        )
        obsolete_size = sum(
            report_sizes.get(r["report_name"], 0)
            for r in categories["obsolete"]
        )
        w(f"| Category | Size | Percentage |")
        w(f"|----------|------|------------|")
        for label, size in [
            ("Active (critical + other)", active_size),
            ("Inactive (recent + legacy)", inactive_size),
            ("Obsolete", obsolete_size),
            # Remainder is everything outside Reports/ (top-level dirs, root files).
            ("Non-report dirs", total_size - active_size - inactive_size - obsolete_size),
        ]:
            pct = (size / total_size * 100) if total_size > 0 else 0
            w(f"| {label} | {human_size(size)} | {pct:.1f}% |")
        w()
        # Migration priorities
        w("## Migration Priorities")
        w()
        w("### Priority 1: Active-Critical (Shelf List Reports)")
        w()
        for r in categories["active-critical"]:
            w(f"1. **`{r['report_name']}`** -- {r.get('fullname', '')}")
        w()
        w("These reports are actively running and serve shelf list operations.")
        w("Document fully, test migration, coordinate with staff.")
        w()
        w("### Priority 2: Active (Other Reports)")
        w()
        for r in categories["active"]:
            w(f"1. **`{r['report_name']}`** -- {r.get('fullname', '')}")
        w()
        w("Running in production. Evaluate each for continued need.")
        w()
        w("### Priority 3: Inactive-Recent")
        w()
        w("Review with stakeholders. Some may need reactivation, others can be archived.")
        w()
        w("### Priority 4: Legacy & Historical")
        w()
        w("Archive for reference. No migration needed.")
        w()
        # Recommendations
        w("## Recommendations")
        w()
        w("1. **Credential rotation**: All plaintext credentials must be rotated before any migration")
        w("2. **Config modernization**: Replace Config::Simple `.cfg` with environment variables or vault")
        w("3. **Consolidate git**: Merge per-report repos into monorepo for easier management")
        w("4. **Archive Symphony data**: The 13 GB of historical Symphony logs can be compressed and cold-stored")
        w("5. **Document sl-reports first**: These are the highest-value, most-used reports")
        w("6. **Test framework**: The generic-cron.sh pattern is sound but should be modernized (systemd timers, structured logging)")
    print(f" Wrote {OUT_PATH}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Document the generic-cron.sh execution framework and .cfg file format.
Output: docs/ilsaux/framework/generic-cron-framework.md, config-file-format.md
"""
import os
import re
import sys
import time
BASE_DIR = "/home/ray/Documents/ilsaux"
OUT_DIR = "/home/ray/claude/docs/ilsaux/framework"
def document_generic_cron():
    """Read and document the generic-cron.sh framework.

    Reads Reports/generic/generic-cron.sh (for the line count only) and writes
    a hand-authored markdown description to OUT_DIR/generic-cron-framework.md.
    Returns without writing anything if the shell script cannot be read.
    """
    cron_path = os.path.join(BASE_DIR, "Reports/generic/generic-cron.sh")
    try:
        with open(cron_path, "r") as f:
            content = f.read()
    except OSError as e:
        print(f" ERROR: {e}", file=sys.stderr)
        return
    # Only the line count is derived from the file; the body below is static prose.
    lines = content.split("\n")
    doc_path = os.path.join(OUT_DIR, "generic-cron-framework.md")
    with open(doc_path, "w") as f:
        f.write("# Generic Cron Framework\n\n")
        f.write(f"**File:** `Reports/generic/generic-cron.sh`\n")
        f.write(f"**Lines:** {len(lines)}\n\n")
        f.write("---\n\n")
        f.write("## Overview\n\n")
        f.write("The generic-cron.sh script is the execution framework for all ilsaux reports.\n")
        f.write("Each report has a thin wrapper (`<name>-cron.sh`) that sets variables and sources this script.\n\n")
        f.write("## Execution Flow\n\n")
        f.write("1. **Variable Setup** -- Date variables (TODAY, WEEKAGO, MONTHAGO, YEARAGO)\n")
        f.write("2. **Defaults** -- Sets REPORTNAME, LOGFILE, JSONFILE, KEEPPERIOD, LINK if not provided by wrapper\n")
        f.write("3. **Run Report** -- `cd` to report dir, run `perl ./$SOURCEFILE >> $LOGFILE-$TODAY.txt`\n")
        f.write("4. **JSON Metadata** -- Creates timestamped JSON with fullName, name, date, timeStarted, timeFinished, logFile, link\n")
        f.write("5. **Cleanup** -- Deletes old log files and JSON based on KEEPPERIOD (WEEK/MONTH/YEAR)\n")
        f.write("6. **MESA Integration** -- Copies JSON + log to `/var/www/html/mesa/`, cleans old files, rebuilds index\n\n")
        f.write("## Required Variables (set by wrapper)\n\n")
        f.write("| Variable | Required | Default | Description |\n")
        f.write("|----------|----------|---------|-------------|\n")
        f.write("| `REPORTNAME` | Yes | `generic` | Directory name and base filename |\n")
        f.write("| `FULLNAME` | Yes | `Generic Report` | Human-readable name for JSON/MESA |\n")
        f.write("| `SOURCEFILE` | Yes | `SierraGenericReport.pl` | Perl script filename |\n")
        f.write("| `LINK` | No | `nil` | URL for the report output |\n")
        f.write("| `KEEPPERIOD` | No | `YEAR` | Retention: WEEK, MONTH, or YEAR |\n")
        f.write("| `LOGFILE` | No | `$REPORTNAME-log` | Log file basename |\n")
        f.write("| `JSONFILE` | No | `$REPORTNAME` | JSON metadata basename |\n\n")
        f.write("## Cron Wrapper Pattern\n\n")
        f.write("Every report follows this pattern:\n\n")
        f.write("```bash\n")
        f.write('#!/bin/bash\n\n')
        f.write('REPORTNAME=slmainmissing\n')
        f.write('FULLNAME="Shelflist - Main Missing"\n')
        f.write('SOURCEFILE=SierraShelfListMainMissing.pl\n')
        f.write('LINK="http://[REDACTED-HOST]/ils/shelflists/mainmissing.asp"\n')
        f.write('KEEPPERIOD=MONTH\n\n')
        f.write('source ~/Reports/generic/generic-cron.sh\n')
        f.write("```\n\n")
        f.write("## JSON Metadata Format\n\n")
        f.write("```json\n")
        f.write('{\n')
        f.write(' "fullName": "Shelflist - Main Missing",\n')
        f.write(' "name": "slmainmissing",\n')
        f.write(' "date": "2026-01-15",\n')
        f.write(' "timeStarted": "1737000000",\n')
        f.write(' "timeFinished": "1737000300",\n')
        f.write(' "logFile": "slmainmissing-log-20260115.txt",\n')
        f.write(' "link": "http://[REDACTED-HOST]/ils/shelflists/mainmissing.asp"\n')
        f.write('}\n')
        f.write("```\n\n")
        f.write("## MESA Dashboard Integration\n\n")
        f.write("- JSON metadata copied to `/var/www/html/mesa/finished/`\n")
        f.write("- Log files copied to `/var/www/html/mesa/logs/`\n")
        f.write("- `json-wn.pl` generates `/var/www/html/mesa/upcoming.json`\n")
        f.write("- `json-index.pl` generates `/var/www/html/mesa/finished/index.json`\n")
        f.write("- Old MESA files cleaned after 32 days\n\n")
        f.write("## Retention Periods\n\n")
        f.write("| Period | Log Cleanup | JSON Cleanup |\n")
        f.write("|--------|-------------|-------------|\n")
        f.write("| WEEK | 7 days | 8 days |\n")
        f.write("| MONTH | 30 days | 32 days |\n")
        f.write("| YEAR | 365 days | 366 days |\n\n")
        f.write("## Historical Note\n\n")
        f.write("The script contains commented-out FTP code that previously transferred files to `[REDACTED-HOST]`.\n")
        f.write("This was replaced by direct file copy to the MESA web directory on the same server.\n")
    print(f" Wrote {doc_path}", file=sys.stderr)
def document_config_format():
    """Document the .cfg file format used by reports.

    Walks BASE_DIR/Reports for .cfg files, tallies which keys appear in which
    files, and writes OUT_DIR/config-file-format.md with a usage-count table
    (descriptions are guessed from key-name substrings).
    """
    doc_path = os.path.join(OUT_DIR, "config-file-format.md")
    # Scan for .cfg files to find common keys
    config_keys = {}  # key -> [files]
    reports_dir = os.path.join(BASE_DIR, "Reports")
    for root, dirs, files in os.walk(reports_dir):
        # Prune .git directories in place so os.walk skips them.
        dirs[:] = [d for d in dirs if d != ".git"]
        for name in files:
            if not name.endswith(".cfg"):
                continue
            path = os.path.join(root, name)
            rel = os.path.relpath(path, BASE_DIR)
            try:
                with open(path, "r", errors="replace") as f:
                    for line in f:
                        line = line.strip()
                        # Config::Simple format: key value or key=value
                        # NOTE(review): despite the comment, this regex only
                        # matches "key=value" / "key: value" separators -- a
                        # bare "key value" line is NOT captured. Confirm
                        # against real .cfg files whether that form occurs.
                        m = re.match(r'^(\w+)\s*[=:]\s*(.+)', line)
                        if m:
                            key = m.group(1)
                            # A key is counted once per matching line, so a file
                            # repeating a key appears multiple times in the list.
                            config_keys.setdefault(key, []).append(rel)
            except OSError:
                # Unreadable file: skip silently, best-effort scan.
                pass
    with open(doc_path, "w") as f:
        f.write("# Config File Format (.cfg)\n\n")
        f.write("Reports use `Config::Simple` to read `.cfg` files.\n")
        f.write("Format: `key value` or `key=value` (one per line).\n\n")
        f.write("---\n\n")
        f.write("## Common Configuration Keys\n\n")
        f.write("| Key | Used By (count) | Description |\n")
        f.write("|-----|----------------|-------------|\n")
        # Most widely used keys first.
        for key in sorted(config_keys, key=lambda k: len(config_keys[k]), reverse=True):
            count = len(config_keys[key])
            # Heuristic description based on substrings of the key name.
            desc = ""
            kl = key.lower()
            if "module" in kl:
                desc = "Path to Sierra:: Perl modules"
            elif "host" in kl or "server" in kl:
                desc = "Database or server hostname"
            elif "database" in kl or "dbname" in kl:
                desc = "Database name"
            elif "user" in kl:
                desc = "Database or service username"
            elif "password" in kl or "passwd" in kl:
                desc = "**CREDENTIAL** -- database or service password"
            elif "port" in kl:
                desc = "Service port number"
            elif "ftp" in kl:
                desc = "FTP-related setting"
            elif "output" in kl or "file" in kl:
                desc = "Output file path"
            f.write(f"| `{key}` | {count} | {desc} |\n")
        f.write("\n## Security Note\n\n")
        f.write("Many `.cfg` files contain **plaintext credentials** (database passwords, FTP credentials).\n")
        f.write("These are NOT documented here and must be rotated as part of any migration.\n")
        f.write("See `credential-locations.csv` for an inventory of affected files.\n")
    print(f" Wrote {doc_path}", file=sys.stderr)
def main():
    """Entry point: time and run both framework-documentation generators."""
    started = time.monotonic()
    print("[generate-framework-doc] Generating framework documentation ...", file=sys.stderr)
    document_generic_cron()
    document_config_format()
    elapsed = time.monotonic() - started
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Generate documentation for each Sierra:: Perl module.
Reads the modules directly plus perl-dependencies.csv for reverse dependency map.
Output: docs/ilsaux/modules/ (one .md per module)
"""
import csv
import os
import re
import sys
import time
BASE_DIR = "/home/ray/Documents/ilsaux/Modules/Sierra"
MANIFEST_DIR = "/home/ray/claude/docs/ilsaux/manifests"
OUT_DIR = "/home/ray/claude/docs/ilsaux/modules"
CREDENTIAL_PATTERNS = [
re.compile(r'password', re.IGNORECASE),
re.compile(r'passwd', re.IGNORECASE),
re.compile(r'secret', re.IGNORECASE),
re.compile(r'DBI->connect', re.IGNORECASE),
]
def extract_module_info(filepath):
    """Extract package, exports, subs, and credential flags from a .pm file.

    Returns a dict with keys: package, export_ok, export, subs (list of dicts
    with name/line/comments/params), has_credentials, credential_lines,
    data_maps, line_count. All line numbers are 1-based.
    """
    with open(filepath, "r", errors="replace") as f:
        content = f.read()
    lines = content.split("\n")
    # Package name
    package = ""
    m = re.search(r'^package\s+([\w:]+)', content, re.MULTILINE)
    if m:
        package = m.group(1)
    # Exports
    export_ok = []
    export = []
    for m in re.finditer(r'@EXPORT_OK\s*=\s*qw\(\s*(.*?)\s*\)', content, re.DOTALL):
        export_ok.extend(m.group(1).split())
    for m in re.finditer(r'@EXPORT\s*=\s*qw\(\s*(.*?)\s*\)', content, re.DOTALL):
        export.extend(m.group(1).split())
    # Subroutines
    subs = []
    for i, line in enumerate(lines):
        sm = re.match(r'^sub\s+(\w+)', line)
        if sm:
            sub_name = sm.group(1)
            # Look back for leading comments (contiguous "#" lines above the sub)
            comments = []
            j = i - 1
            while j >= 0 and lines[j].strip().startswith("#"):
                comments.insert(0, lines[j].strip().lstrip("#").strip())
                j -= 1
            # Parameter unpacking: scan up to 10 lines ahead for `my (...) = @_`
            params = ""
            for k in range(i, min(i + 10, len(lines))):
                pm = re.search(r'my\s*\(([^)]+)\)\s*=\s*@_', lines[k])
                if pm:
                    params = pm.group(1).strip()
                    break
            subs.append({
                "name": sub_name,
                "line": i + 1,
                "comments": comments,
                "params": params,
            })
    # Credential flags: record each line matching any CREDENTIAL_PATTERNS entry
    has_credentials = False
    cred_lines = []
    for i, line in enumerate(lines, 1):
        for pat in CREDENTIAL_PATTERNS:
            if pat.search(line):
                has_credentials = True
                cred_lines.append(i)
                break
    # Data structures (hashes): top-of-line `my %name = (` declarations
    data_maps = []
    for i, line in enumerate(lines):
        hm = re.match(r'my\s+(%\w+)\s*=\s*\(', line)
        if hm:
            data_maps.append({"name": hm.group(1), "line": i + 1})
    return {
        "package": package,
        "export_ok": export_ok,
        "export": export,
        "subs": subs,
        "has_credentials": has_credentials,
        "credential_lines": cred_lines,
        "data_maps": data_maps,
        "line_count": len(lines),
    }
def main():
    """Generate one markdown doc per Sierra:: module found under BASE_DIR.

    Builds a reverse-dependency map from perl-dependencies.csv (who uses each
    module), then writes OUT_DIR/sierra-<name>.md for every non-backup .pm file.
    """
    if not os.path.isdir(BASE_DIR):
        print(f"Error: {BASE_DIR} not found", file=sys.stderr)
        sys.exit(1)
    t_start = time.monotonic()
    print("[generate-module-docs] Generating module documentation ...", file=sys.stderr)
    # Build reverse dependency map from perl-dependencies.csv
    reverse_deps = {}  # module_name -> [files that use it]
    deps_path = os.path.join(MANIFEST_DIR, "perl-dependencies.csv")
    if os.path.exists(deps_path):
        with open(deps_path, newline="") as f:
            for row in csv.DictReader(f):
                # Only project-local (Sierra::) modules go in the map.
                if row["classification"] == "local":
                    reverse_deps.setdefault(row["module"], []).append(row["file"])
    # Process each .pm file
    count = 0
    for name in sorted(os.listdir(BASE_DIR)):
        if not name.endswith(".pm"):
            continue
        # Skip backup files
        if "backup" in name or name.endswith(".orig"):
            continue
        filepath = os.path.join(BASE_DIR, name)
        print(f" {name} ...", file=sys.stderr)
        info = extract_module_info(filepath)
        # Generate slug for output filename
        slug = name.replace(".pm", "").lower()
        slug = f"sierra-{slug}"
        # Find users of this module; fall back to a synthesized package name
        # when the .pm had no `package` declaration.
        module_name = info["package"] or f"Sierra::{name.replace('.pm', '')}"
        users = reverse_deps.get(module_name, [])
        doc_path = os.path.join(OUT_DIR, f"{slug}.md")
        with open(doc_path, "w") as f:
            f.write(f"# {module_name}\n\n")
            f.write(f"**File:** `Modules/Sierra/{name}`\n")
            f.write(f"**Lines:** {info['line_count']}\n")
            if info["has_credentials"]:
                f.write(f"**WARNING:** Contains credential references (lines: {', '.join(map(str, info['credential_lines']))})\n")
            f.write("\n---\n\n")
            # Exports
            f.write("## Exports\n\n")
            if info["export_ok"]:
                f.write("**@EXPORT_OK:**\n")
                for sym in info["export_ok"]:
                    f.write(f"- `{sym}`\n")
            if info["export"]:
                f.write("\n**@EXPORT (auto-imported):**\n")
                for sym in info["export"]:
                    f.write(f"- `{sym}`\n")
            if not info["export_ok"] and not info["export"]:
                f.write("No exports defined.\n")
            f.write("\n")
            # Subroutines
            f.write("## Subroutines\n\n")
            if info["subs"]:
                for s in info["subs"]:
                    params = f"({s['params']})" if s["params"] else "()"
                    f.write(f"### `{s['name']}`{params}\n\n")
                    f.write(f"Line {s['line']}\n\n")
                    if s["comments"]:
                        f.write("> " + " ".join(s["comments"]) + "\n\n")
            else:
                f.write("No subroutines found.\n")
            f.write("\n")
            # Data structures
            if info["data_maps"]:
                f.write("## Data Structures\n\n")
                for dm in info["data_maps"]:
                    f.write(f"- `{dm['name']}` (line {dm['line']})\n")
                f.write("\n")
            # Used by
            f.write("## Used By\n\n")
            if users:
                for u in sorted(users):
                    f.write(f"- `{u}`\n")
            else:
                f.write("No known users found in dependency scan.\n")
            f.write("\n")
        count += 1
    print(f"\n Generated {count} module docs in {OUT_DIR}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Generate per-report markdown documentation from Phase 1 manifests.
Reads report-status.csv, perl-dependencies.csv, git-summaries.csv, and script-content.json
to produce pre-populated documentation for each report.
Output: docs/ilsaux/reports/_template.md + one .md per report
"""
import csv
import json
import os
import re
import sys
import time
from datetime import datetime
BASE_DIR = "/home/ray/Documents/ilsaux"
MANIFEST_DIR = "/home/ray/claude/docs/ilsaux/manifests"
OUT_DIR = "/home/ray/claude/docs/ilsaux/reports"
SL_PREFIX = "sl"
TEMPLATE = """# {fullname}
**Report:** `{report_name}`
**Status:** {status}{priority}
**Last Run:** {last_run_date}
**Retention:** {keepperiod}
**Perl Script:** `{pl_file}` ({pl_lines} lines)
**Link:** {link}
**Has Git:** {has_git}
---
## Purpose
{purpose}
---
## Execution Flow
{execution_narrative}
---
## Dependencies
### Sierra:: Modules
{sierra_deps}
### CPAN Modules
{cpan_deps}
---
## Subroutines
{subroutines}
---
## SQL Queries
{sql_queries}
---
## Domain Data Maps
{data_maps}
---
## Configuration Keys
{config_keys}
---
## Known Issues / TODOs
{todos}
---
## Historical Notes (Commented-out Code)
{historical_notes}
---
## Git History
{git_history}
---
## Database Connections
{db_connections}
---
<!-- TODO: Add business context -->
<!-- TODO: Add stakeholders -->
<!-- TODO: Add migration plan -->
"""
def read_csv_file(filename):
    """Load a manifest CSV from MANIFEST_DIR as a list of row dicts.

    Returns [] (after a stderr warning) when the file does not exist.
    """
    full_path = os.path.join(MANIFEST_DIR, filename)
    if not os.path.exists(full_path):
        print(f" WARNING: {full_path} not found", file=sys.stderr)
        return []
    with open(full_path, newline="") as handle:
        return [row for row in csv.DictReader(handle)]
def read_json_file(filename):
    """Load a manifest JSON file from MANIFEST_DIR.

    Returns [] (after a stderr warning) when the file does not exist.
    """
    full_path = os.path.join(MANIFEST_DIR, filename)
    if not os.path.exists(full_path):
        print(f" WARNING: {full_path} not found", file=sys.stderr)
        return []
    with open(full_path) as handle:
        return json.load(handle)
def format_subroutines(subs):
    """Render extracted subroutine records as a markdown bullet list."""
    if not subs:
        return "No subroutines found."
    rendered = []
    for entry in subs:
        sig = f"({entry['parameters']})" if entry.get("parameters") else "()"
        note = ""
        if entry.get("leading_comments"):
            note = " -- " + " ".join(entry["leading_comments"])
        rendered.append(
            f"- **`{entry['name']}`**{sig} (line {entry['line']}, ~{entry['line_count']} lines){note}"
        )
    return "\n".join(rendered)
def format_sql(queries):
    """Render extracted SQL queries as numbered, fenced markdown code blocks."""
    if not queries:
        return "No SQL queries extracted."
    chunks = []
    for idx, query in enumerate(queries, 1):
        chunks.append(f"### Query {idx} (`${query['variable']}`, line {query['start_line']})")
        chunks.append("```sql")
        chunks.append(query["sql"])
        chunks.append("```")
        chunks.append("")
    return "\n".join(chunks)
def format_data_maps(maps):
    """Render extracted Perl hash data-maps as a markdown bullet list."""
    if not maps:
        return "No data maps found."
    bullets = []
    for entry in maps:
        sample = ", ".join(entry["sample_keys"])
        bullets.append(
            f"- **`{entry['variable']}`** (line {entry['line']}, {entry['key_count']} keys) -- sample: {sample}"
        )
    return "\n".join(bullets)
def format_config_keys(refs):
    """Render config-key references, keeping the first line each key appears on."""
    if not refs:
        return "No config keys found."
    first_line = {}
    for ref in refs:
        # setdefault preserves the earliest (first-seen) line per key.
        first_line.setdefault(ref["key"], ref["line"])
    return "\n".join(
        f"- `{key}` (first used line {line})" for key, line in sorted(first_line.items())
    )
def format_execution_narrative(prints):
    """Summarize a script's print statements as a fenced, line-numbered trace."""
    if not prints:
        return "No print statements extracted."
    shown = []
    for stmt in prints:
        cleaned = sanitize_text(stmt["text"])
        # Drop empty lines and separator-only output such as "-----" or "====".
        if cleaned and not all(ch in "+-=." for ch in cleaned):
            shown.append(f"{stmt['line']:>5}: {cleaned}")
    if not shown:
        return "No meaningful print statements."
    # Cap at 40 lines so the generated doc stays readable.
    return "```\n" + "\n".join(shown[:40]) + "\n```"
def format_todos(todos):
    """Render TODO comments as a markdown bullet list."""
    if not todos:
        return "None found."
    return "\n".join(f"- Line {item['line']}: {item['text']}" for item in todos)
def format_historical_notes(blocks):
    """Render commented-out code blocks (historical remnants) as bullets."""
    if not blocks:
        return "No significant commented-out code blocks found."
    bullets = []
    for blk in blocks:
        # Preview at most the first two sample lines, credential-sanitized.
        preview = sanitize_text(" / ".join(blk["sample"][:2]))
        bullets.append(
            f"- Lines {blk['start_line']}-{blk['end_line']} ({blk['line_count']} lines): `{preview}`"
        )
    return "\n".join(bullets)
def sanitize_text(text):
    """Belt-and-suspenders credential redaction for generated output.

    Applies a series of regex substitutions that replace known credentials,
    internal hostnames, and internal email addresses with [REDACTED-*] markers
    before any extracted text is written into public documentation.
    """
    # Literal passwords
    # NOTE(review): these three patterns appear to have been destroyed when
    # this script was itself published with redaction -- "[REDACTED-PASSWORD]"
    # is now a regex *character class* (and the first two are duplicates), not
    # the original literal passwords. Restore the real literals (ideally loaded
    # from a non-published location) before relying on this redaction.
    text = re.sub(r'\b[REDACTED-PASSWORD]\b', '[REDACTED-PASSWORD]', text, flags=re.IGNORECASE)
    text = re.sub(r'\b[REDACTED-PASSWORD]\b', '[REDACTED-PASSWORD]', text, flags=re.IGNORECASE)
    text = re.sub(r'[REDACTED-PASSWORD]', '[REDACTED-PASSWORD]', text, flags=re.IGNORECASE)
    # DB usernames as quoted strings
    text = re.sub(r'(?<=["\'])(?:sqlaccess|sqllabels\d*|sqldataentryerrors|svc_vmsp1)(?=["\'])',
    '[REDACTED-USER]', text, flags=re.IGNORECASE)
    # Internal hostnames
    text = re.sub(r'\b[\w.-]+\.plch\.net\b', '[REDACTED-HOST]', text, flags=re.IGNORECASE)
    text = re.sub(r'\b[\w.-]+\.iii\.com\b', '[REDACTED-HOST]', text, flags=re.IGNORECASE)
    text = re.sub(r'\b[\w.-]+\.cincinnatilibrary\.org\b', '[REDACTED-HOST]', text, flags=re.IGNORECASE)
    # Internal emails
    text = re.sub(r'\b[\w.+-]+@cincinnatilibrary\.org\b', '[REDACTED-EMAIL]', text, flags=re.IGNORECASE)
    text = re.sub(r'\b[\w.+-]+@plch\.net\b', '[REDACTED-EMAIL]', text, flags=re.IGNORECASE)
    # DBI->connect credential args: keep the DSN, redact user + password
    text = re.sub(r'(DBI->connect\(\s*"[^"]*"\s*,\s*)"[^"]*"\s*,\s*"[^"]*"',
    r'\1"[REDACTED-USER]","[REDACTED-PASSWORD]"', text, flags=re.IGNORECASE)
    # $password/$username = "..."
    text = re.sub(r'(\$password\s*=\s*)"[^"]*"', r'\1"[REDACTED-PASSWORD]"', text, flags=re.IGNORECASE)
    text = re.sub(r'(\$username\s*=\s*)"[^"]*"', r'\1"[REDACTED-USER]"', text, flags=re.IGNORECASE)
    return text
def format_db_connections(conns):
    """Render direct DBI connection sites, with credentials sanitized."""
    if not conns:
        return "No direct DB connections found (may use Sierra::DB module)."
    rendered = []
    for conn in conns:
        cleaned = sanitize_text(conn['pattern'])
        rendered.append(f"- Line {conn['line']}: `{cleaned}`")
    return "\n".join(rendered)
def find_script_content(script_contents, report_name):
    """Return script-content entries whose file lives under Reports/<report_name>/."""
    prefix = f"Reports/{report_name}/"
    return [entry for entry in script_contents if entry["file"].startswith(prefix)]
def main():
    """Generate per-report markdown docs plus a _template.md reference file.

    Reads the Phase 1 manifests, aggregates extracted script content per
    report, and renders TEMPLATE once per report into OUT_DIR/<name>.md.
    """
    t_start = time.monotonic()
    print("[generate-report-docs] Generating report documentation ...", file=sys.stderr)
    # Read manifests
    report_status = read_csv_file("report-status.csv")
    perl_deps = read_csv_file("perl-dependencies.csv")
    git_summaries = read_csv_file("git-summaries.csv")
    script_contents = read_json_file("script-content.json")
    # Build lookup maps
    deps_by_file = {}
    for r in perl_deps:
        deps_by_file.setdefault(r["file"], []).append(r)
    git_by_path = {}
    for g in git_summaries:
        # Normalize "Reports/<name>" repo paths down to just <name>
        path = g["repo_path"]
        if path.startswith("Reports/"):
            name = path.split("/")[1] if "/" in path else path
            git_by_path[name] = g
    # Write the reference template.
    # BUG FIX: the original first wrote a brace-escaped copy of TEMPLATE to
    # this path and then immediately reopened the file and overwrote it with
    # the reference text -- the first write was dead code and is removed.
    template_path = os.path.join(OUT_DIR, "_template.md")
    with open(template_path, "w") as f:
        f.write("# Report Documentation Template\n\n")
        f.write("This template is used by `generate-report-docs.py` to create per-report docs.\n")
        f.write("See any generated report file for the actual structure.\n")
    print(f" Wrote {template_path}", file=sys.stderr)
    # Generate per-report docs
    for report in report_status:
        name = report["report_name"]
        print(f" {name} ...", file=sys.stderr)
        # Find extracted script content for this report's directory
        scs = find_script_content(script_contents, name)
        # Aggregate content from all matching scripts
        all_subs = []
        all_sql = []
        all_maps = []
        all_config = []
        all_prints = []
        all_todos = []
        all_historical = []
        all_db = []
        purpose = ""
        for sc in scs:
            # First non-empty report_identity wins as the purpose text.
            if sc.get("report_identity") and not purpose:
                purpose = sc["report_identity"]
            all_subs.extend(sc.get("subroutines", []))
            all_sql.extend(sc.get("sql_queries", []))
            all_maps.extend(sc.get("data_maps", []))
            all_config.extend(sc.get("config_refs", []))
            all_prints.extend(sc.get("print_statements", []))
            all_todos.extend(sc.get("todo_comments", []))
            all_historical.extend(sc.get("commented_code_blocks", []))
            all_db.extend(sc.get("db_connections", []))
        if not purpose:
            purpose = report.get("fullname") or f"Report: {name}"
        # Collect dependencies for files under this report's directory
        sierra_deps = []
        cpan_deps = []
        for dep_list in deps_by_file.values():
            for d in dep_list:
                if d["file"].startswith(f"Reports/{name}/"):
                    if d["classification"] == "local":
                        sierra_deps.append(f"- `{d['module']}` ({d['imported_symbols'] or 'default'})")
                    elif d["classification"] == "cpan":
                        cpan_deps.append(f"- `{d['module']}`")
        sierra_deps_str = "\n".join(sorted(set(sierra_deps))) if sierra_deps else "None"
        cpan_deps_str = "\n".join(sorted(set(cpan_deps))) if cpan_deps else "None"
        # Git history
        git_info = git_by_path.get(name)
        if git_info:
            git_history = (
                f"- Commits: {git_info['total_commits']}\n"
                f"- First commit: {git_info['first_commit_date']}\n"
                f"- Last commit: {git_info['last_commit_date']}\n"
                f"- Branches: {git_info['branches']}\n"
                f"- Last message: {git_info['last_commit_message']}"
            )
        else:
            git_history = "No git repository found for this report."
        priority = " **[HIGH PRIORITY - Shelf List]**" if name.startswith(SL_PREFIX) else ""
        content = TEMPLATE.format(
            fullname=report.get("fullname") or name,
            report_name=name,
            status=report["status"],
            priority=priority,
            last_run_date=report["last_run_date"] or "Unknown",
            keepperiod=report.get("keepperiod") or "Unknown",
            pl_file=report.get("pl_file") or "Unknown",
            pl_lines=report.get("pl_lines", 0),
            link=sanitize_text(report.get("link") or "N/A"),
            has_git=report.get("has_git", False),
            purpose=sanitize_text(purpose),
            execution_narrative=format_execution_narrative(all_prints),
            sierra_deps=sierra_deps_str,
            cpan_deps=cpan_deps_str,
            subroutines=format_subroutines(all_subs),
            sql_queries=format_sql(all_sql),
            data_maps=format_data_maps(all_maps),
            config_keys=format_config_keys(all_config),
            todos=format_todos(all_todos),
            historical_notes=format_historical_notes(all_historical),
            git_history=git_history,
            db_connections=format_db_connections(all_db),
        )
        doc_path = os.path.join(OUT_DIR, f"{name}.md")
        with open(doc_path, "w") as f:
            f.write(content)
    print(f"\n Generated {len(report_status)} report docs in {OUT_DIR}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)


if __name__ == "__main__":
    main()
{
"version": 1,
"groups": {
"reports": {
"gist_id": "8aad3cf2c3d6c80742604fa76e9045bd",
"gist_url": "https://gist.github.com/rayvoelker/8aad3cf2c3d6c80742604fa76e9045bd",
"file_count": 53
},
"modules": {
"gist_id": "66b8bf43f5d840f9c71724433b20ba56",
"gist_url": "https://gist.github.com/rayvoelker/66b8bf43f5d840f9c71724433b20ba56",
"file_count": 17
},
"framework": {
"gist_id": "5aae40b92a5758a8713360931b2df2e5",
"gist_url": "https://gist.github.com/rayvoelker/5aae40b92a5758a8713360931b2df2e5",
"file_count": 4
},
"manifests-small": {
"gist_id": "28151f90732c9f2484c205602ba17852",
"gist_url": "https://gist.github.com/rayvoelker/28151f90732c9f2484c205602ba17852",
"file_count": 8
},
"manifests-large": {
"gist_id": "c060b43289f560745f77008c56e0a4ee",
"gist_url": "https://gist.github.com/rayvoelker/c060b43289f560745f77008c56e0a4ee",
"file_count": 3
},
"scripts": {
"gist_id": "558ac1812ff2dc29c22ba4ed9cc1a72c",
"gist_url": "https://gist.github.com/rayvoelker/558ac1812ff2dc29c22ba4ed9cc1a72c",
"file_count": 15
}
},
"old_single_gist_id": "cce2e74ff232c461e6c6b0e9a620a24f",
"master_toc_gist_id": "cdb532b9b3d535e76dabf784d09ca4b9",
"master_toc_gist_url": "https://gist.github.com/rayvoelker/cdb532b9b3d535e76dabf784d09ca4b9"
}
#!/usr/bin/env python3
"""Parse all crontab backup files and extract schedule entries.
Output: docs/ilsaux/manifests/cron-schedule.csv
"""
import csv
import os
import re
import sys
import time
BASE_DIR = "/home/ray/Documents/ilsaux/crontab_files"
OUT_DIR = "/home/ray/claude/docs/ilsaux/manifests"
# 0 and 7 both mean Sunday in crontab day-of-week fields.
DOW_NAMES = {0: "Sun", 1: "Mon", 2: "Tue", 3: "Wed", 4: "Thu", 5: "Fri", 6: "Sat", 7: "Sun"}


def human_schedule(minute, hour, dom, month, dow):
    """Convert cron fields (raw crontab string tokens) to a human-readable description.

    Examples: ("30", "2", "*", "*", "1") -> "on Mon at 2:30 AM";
    all-"*" fields -> "every minute".
    """
    parts = []
    # Day of week.
    # BUG FIX: fields arrive as strings but DOW_NAMES is keyed by int, so the
    # original `dow in DOW_NAMES` never matched and every entry fell through
    # to the "dow=N" fallback. Convert before the lookup.
    if dow != "*":
        try:
            parts.append(f"on {DOW_NAMES[int(dow)]}")
        except (ValueError, KeyError):
            if "-" in str(dow):
                parts.append(f"days {dow}")
            else:
                parts.append(f"dow={dow}")
    elif dom != "*":
        parts.append(f"on day {dom}")
    # Time of day
    if hour != "*" and minute != "*":
        try:
            h = int(hour)
            m = int(minute)
            ampm = "AM" if h < 12 else "PM"
            h12 = h % 12 or 12
            parts.append(f"at {h12}:{m:02d} {ampm}")
        except (ValueError, TypeError):
            # Non-numeric fields such as "*/5" fall back to the raw values.
            parts.append(f"at {hour}:{minute}")
    elif hour != "*":
        parts.append(f"at hour {hour}")
    if month != "*":
        parts.append(f"month={month}")
    return " ".join(parts) if parts else "every minute"
def extract_report_name(command):
    """Best-effort extraction of a report name from a cron command string.

    Returns "" when no known naming pattern matches.
    """
    # Ordered most-specific first; the transform normalizes the capture.
    candidates = (
        (r'Reports/(\w+)/\w+-cron\.sh', str),   # Reports/<name>/<x>-cron.sh
        (r'/(\w+)-cron\.sh', str),              # any /<name>-cron.sh
        (r'Sierra(\w+)\.pl', str.lower),        # SierraFoo.pl -> "foo"
    )
    for pattern, transform in candidates:
        match = re.search(pattern, command)
        if match:
            return transform(match.group(1))
    return ""
def parse_crontab(filepath):
    """Parse a single crontab backup file into a list of entry dicts.

    Commented-out lines that still look like cron entries ("#30 2 * * * ...")
    are kept and flagged via "is_commented"; pure prose comments, blank
    lines, and environment assignments (SHELL=, PATH=, ...) are dropped.
    Returns [] (with a message on stderr) when the file cannot be read.
    """
    filename = os.path.basename(filepath)
    # Extract date from filename: crontab.backup.YYYYMMDD
    m = re.search(r'(\d{8})$', filename)
    crontab_date = m.group(1) if m else ""
    entries = []
    try:
        with open(filepath, "r", errors="replace") as f:
            lines = f.readlines()
    except OSError as e:
        print(f" SKIP: {filepath}: {e}", file=sys.stderr)
        return entries
    for line in lines:
        stripped = line.strip()
        if not stripped:
            continue
        # Skip variable assignments and shell settings (the "#\s*m\s+h"
        # alternative drops the standard "# m h dom mon dow" header).
        if re.match(r'^(SHELL|PATH|MAILTO|HOME|#\s*m\s+h)', stripped):
            continue
        is_commented = stripped.startswith("#")
        notes = ""
        # Extract inline comment/notes
        if is_commented:
            # Remove leading # and check if it's a cron entry
            uncommented = stripped.lstrip("#").strip()
            # Check if there's a note after the command
            if re.match(r'^\d', uncommented) or re.match(r'^\*', uncommented):
                stripped = uncommented
            else:
                # Pure comment line - extract as note if relevant
                continue
        # Parse cron fields: five whitespace-separated fields + command.
        m = re.match(
            r'^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)$',
            stripped
        )
        if not m:
            continue
        minute, hour, dom, month, dow, command = m.groups()
        # Check for inline comment in command
        if "#" in command:
            command, _, notes = command.partition("#")
            command = command.strip()
            notes = notes.strip()
        report_name = extract_report_name(command)
        entries.append({
            "crontab_file": filename,
            "crontab_date": crontab_date,
            "minute": minute,
            "hour": hour,
            "dom": dom,
            "month": month,
            "dow": dow,
            "command": command,
            "report_name": report_name,
            "is_commented": is_commented,
            "human_schedule": human_schedule(minute, hour, dom, month, dow),
            "notes": notes,
        })
    return entries
def main():
    """Parse every crontab backup in BASE_DIR and write cron-schedule.csv."""
    if not os.path.isdir(BASE_DIR):
        print(f"Error: {BASE_DIR} not found", file=sys.stderr)
        sys.exit(1)
    t_start = time.monotonic()
    print(f"[manifest-cron] Parsing crontab files in {BASE_DIR} ...", file=sys.stderr)
    all_entries = []
    for name in sorted(os.listdir(BASE_DIR)):
        path = os.path.join(BASE_DIR, name)
        if not os.path.isfile(path):
            continue
        entries = parse_crontab(path)
        all_entries.extend(entries)
        print(f" {name}: {len(entries)} entries", file=sys.stderr)
    csv_path = os.path.join(OUT_DIR, "cron-schedule.csv")
    fieldnames = [
        "crontab_file", "crontab_date", "minute", "hour", "dom", "month",
        "dow", "command", "report_name", "is_commented", "human_schedule", "notes"
    ]
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_entries)
    print(f" Wrote {csv_path} ({len(all_entries)} rows)", file=sys.stderr)
    # Summarize the most recent crontab backup.
    latest = [e for e in all_entries if e["crontab_date"] == "20190709"]
    if latest:
        active = [e for e in latest if not e["is_commented"]]
        commented = [e for e in latest if e["is_commented"]]
        print(f"\n Latest crontab (20190709): {len(active)} active, {len(commented)} commented", file=sys.stderr)
        for e in active:
            print(f" {e['human_schedule']}: {e['report_name'] or e['command'][:50]}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f"\n Done in {elapsed:.1f}s", file=sys.stderr)
if __name__ == "__main__":
main()
#!/usr/bin/env python3
"""Find all git repos in ilsaux and extract summary information.
Output: docs/ilsaux/manifests/git-summaries.csv
"""
import csv
import os
import subprocess
import sys
import time
BASE_DIR = "/home/ray/Documents/ilsaux"
OUT_DIR = "/home/ray/claude/docs/ilsaux/manifests"
TIMEOUT = 10
def git_cmd(repo_dir, args):
    """Run `git <args>` in *repo_dir*; return stripped stdout or "" on failure.

    Failures covered: git missing, repo_dir unusable, or the command
    exceeding TIMEOUT seconds. A non-zero git exit status is not treated
    as a failure here -- whatever git wrote to stdout is still returned.
    """
    try:
        proc = subprocess.run(
            ["git", *args],
            cwd=repo_dir,
            capture_output=True,
            text=True,
            timeout=TIMEOUT,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return ""
    return proc.stdout.strip()
def main():
    """Find every git repo under BASE_DIR and write git-summaries.csv.

    For each repo: commit count, first/last commit dates, branch list,
    and the last commit message. Repos that fail a git call just get
    empty/zero fields (git_cmd returns "" on failure).
    """
    if not os.path.isdir(BASE_DIR):
        print(f"Error: {BASE_DIR} not found", file=sys.stderr)
        sys.exit(1)
    t_start = time.monotonic()
    print(f"[manifest-git] Finding git repos in {BASE_DIR} ...", file=sys.stderr)
    # Find all .git directories
    git_dirs = []
    for root, dirs, files in os.walk(BASE_DIR):
        if ".git" in dirs:
            git_dirs.append(root)
            dirs.remove(".git")  # Don't recurse into .git
        # Don't recurse into .cpan
        if ".cpan" in dirs:
            dirs.remove(".cpan")
    print(f" Found {len(git_dirs)} git repos", file=sys.stderr)
    rows = []
    for repo_dir in sorted(git_dirs):
        rel_path = os.path.relpath(repo_dir, BASE_DIR)
        print(f" {rel_path} ...", file=sys.stderr)
        # Total commits
        log_count = git_cmd(repo_dir, ["rev-list", "--count", "HEAD"])
        total_commits = int(log_count) if log_count.isdigit() else 0
        # First commit date
        first_date = git_cmd(repo_dir, [
            "log", "--reverse", "--format=%aI", "--max-count=1"
        ])
        # Last commit date and message ("|||" is a separator unlikely to
        # appear in a commit subject)
        last_info = git_cmd(repo_dir, [
            "log", "--format=%aI|||%s", "--max-count=1"
        ])
        last_date = ""
        last_msg = ""
        if "|||" in last_info:
            last_date, last_msg = last_info.split("|||", 1)
        # Branches
        branches_raw = git_cmd(repo_dir, ["branch", "--format=%(refname:short)"])
        branches = ", ".join(branches_raw.split("\n")) if branches_raw else ""
        rows.append({
            "repo_path": rel_path,
            "total_commits": total_commits,
            "first_commit_date": first_date,
            "last_commit_date": last_date,
            "branches": branches,
            "last_commit_message": last_msg,
        })
    csv_path = os.path.join(OUT_DIR, "git-summaries.csv")
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "repo_path", "total_commits", "first_commit_date",
            "last_commit_date", "branches", "last_commit_message"
        ])
        writer.writeheader()
        writer.writerows(rows)
    print(f" Wrote {csv_path} ({len(rows)} repos)", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)
if __name__ == "__main__":
main()
#!/usr/bin/env python3
"""Extract Perl module dependencies from all .pl/.pm files in ilsaux.
Output: docs/ilsaux/manifests/perl-dependencies.csv
Includes reverse dependency map (which reports use which Sierra:: modules).
"""
import csv
import os
import re
import sys
import time
BASE_DIR = "/home/ray/Documents/ilsaux"
OUT_DIR = "/home/ray/claude/docs/ilsaux/manifests"
# Core Perl modules (common ones seen in this codebase)
CORE_MODULES = {
"strict", "warnings", "Carp", "DBI", "Exporter", "POSIX",
"File::Basename", "File::Copy", "File::Path", "File::Find",
"Getopt::Long", "Getopt::Std", "Data::Dumper", "Scalar::Util",
"List::Util", "Time::Local", "Time::HiRes", "IO::File",
"Encode", "utf8",
}
# Known CPAN modules
CPAN_MODULES = {
"Config::Simple", "XML::Simple", "Net::FTP", "SQL::Beautify",
"Net::SFTP::Foreign", "Text::CSV", "Text::CSV_XS", "JSON",
"JSON::XS", "LWP::UserAgent", "HTTP::Request", "HTTP::Response",
"SOAP::Lite", "MARC::Record", "MARC::Field", "MARC::Batch",
"MARC::File::USMARC", "DBIx::Class", "Excel::Writer::XLSX",
"Spreadsheet::WriteExcel", "CGI", "Template",
}
USE_RE = re.compile(r'^\s*use\s+([\w:]+)(?:\s+qw\(([^)]*)\))?', re.MULTILINE)
REQUIRE_RE = re.compile(r'^\s*require\s+([\w:]+)', re.MULTILINE)
EXPORT_OK_RE = re.compile(r'@EXPORT_OK\s*=\s*qw\(\s*(.*?)\s*\)', re.DOTALL)
EXPORT_RE = re.compile(r'@EXPORT\s*=\s*qw\(\s*(.*?)\s*\)', re.DOTALL)
def classify_module(name):
    """Bucket a Perl module name as local / core / cpan / pragma."""
    if name.startswith("Sierra::"):
        return "local"  # in-house Sierra:: namespace
    for known, label in ((CORE_MODULES, "core"), (CPAN_MODULES, "cpan")):
        if name in known:
            return label
    # "use 5.008007"-style minimum-version declarations and all-lowercase
    # names (Perl pragma convention) are both tagged as pragmas.
    if re.match(r'^\d', name) or name[0].islower():
        return "pragma"
    # Unknown mixed-case module: assume it came from CPAN.
    return "cpan"
def main():
    """Scan every .pl/.pm under BASE_DIR, write perl-dependencies.csv,
    and print a reverse-dependency map of the in-house Sierra:: modules.
    """
    if not os.path.isdir(BASE_DIR):
        print(f"Error: {BASE_DIR} not found", file=sys.stderr)
        sys.exit(1)
    t_start = time.monotonic()
    print("[manifest-perl-deps] Scanning .pl/.pm files ...", file=sys.stderr)
    rows = []
    exports = {}  # module_file -> {"export_ok": [...], "export": [...]}
    for root, dirs, files in os.walk(BASE_DIR):
        # Skip .git internals
        dirs[:] = [d for d in dirs if d != ".git"]
        for name in files:
            if not (name.endswith(".pl") or name.endswith(".pm")):
                continue
            path = os.path.join(root, name)
            rel_path = os.path.relpath(path, BASE_DIR)
            try:
                with open(path, "r", errors="replace") as f:
                    content = f.read()
            except OSError as e:
                print(f" SKIP: {path}: {e}", file=sys.stderr)
                continue
            # Extract use statements (module plus optional qw(...) list)
            for m in USE_RE.finditer(content):
                module = m.group(1)
                symbols = m.group(2) or ""
                symbols = " ".join(symbols.split())  # collapse whitespace
                classification = classify_module(module)
                rows.append({
                    "file": rel_path,
                    "module": module,
                    "import_type": "use",
                    "imported_symbols": symbols,
                    "classification": classification,
                })
            # Extract require statements
            for m in REQUIRE_RE.finditer(content):
                module = m.group(1)
                classification = classify_module(module)
                rows.append({
                    "file": rel_path,
                    "module": module,
                    "import_type": "require",
                    "imported_symbols": "",
                    "classification": classification,
                })
            # Extract exports from .pm files
            if name.endswith(".pm"):
                export_ok = []
                export = []
                for m in EXPORT_OK_RE.finditer(content):
                    export_ok.extend(m.group(1).split())
                for m in EXPORT_RE.finditer(content):
                    export.extend(m.group(1).split())
                if export_ok or export:
                    # NOTE(review): collected here but never written out
                    # below -- confirm whether this is intentional.
                    exports[rel_path] = {
                        "export_ok": export_ok,
                        "export": export,
                    }
    print(f" Found {len(rows)} dependency entries", file=sys.stderr)
    # Write main CSV
    csv_path = os.path.join(OUT_DIR, "perl-dependencies.csv")
    rows.sort(key=lambda r: (r["file"], r["module"]))
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "file", "module", "import_type", "imported_symbols", "classification"
        ])
        writer.writeheader()
        writer.writerows(rows)
    print(f" Wrote {csv_path} ({len(rows)} rows)", file=sys.stderr)
    # Print reverse dependency summary
    sierra_users = {}  # sierra_module -> [files that use it]
    for r in rows:
        if r["classification"] == "local":
            sierra_users.setdefault(r["module"], []).append(r["file"])
    print("\n Reverse dependency map (Sierra:: modules):", file=sys.stderr)
    for mod in sorted(sierra_users):
        users = sierra_users[mod]
        print(f" {mod}: used by {len(users)} files", file=sys.stderr)
        for u in users:
            print(f" - {u}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f"\n Done in {elapsed:.1f}s", file=sys.stderr)
if __name__ == "__main__":
main()
#!/usr/bin/env python3
"""Determine status of each report: active, inactive, or obsolete.
Parses cron wrappers, JSON metadata files, and .cfg files.
Output: docs/ilsaux/manifests/report-status.csv, credential-locations.csv
"""
import csv
import json
import os
import re
import sys
import time
from datetime import datetime
BASE_DIR = "/home/ray/Documents/ilsaux/Reports"
OUT_DIR = "/home/ray/claude/docs/ilsaux/manifests"
# Patterns that indicate credentials (file+line only, NO values)
CREDENTIAL_PATTERNS = [
(re.compile(r'password', re.IGNORECASE), "password"),
(re.compile(r'passwd', re.IGNORECASE), "password"),
(re.compile(r'secret', re.IGNORECASE), "secret"),
(re.compile(r'api[_-]?key', re.IGNORECASE), "api_key"),
(re.compile(r'token', re.IGNORECASE), "token"),
(re.compile(r'DBI->connect\s*\(', re.IGNORECASE), "db_connection"),
(re.compile(r'host\s*=', re.IGNORECASE), "host_config"),
(re.compile(r'user\s*=', re.IGNORECASE), "user_config"),
]
# Now threshold: 60 days for "active"
ACTIVE_DAYS = 60
INACTIVE_YEAR = 2020
def parse_cron_wrapper(path):
    """Extract shell-variable assignments from a <report>-cron.sh wrapper.

    Looks for REPORTNAME/FULLNAME/SOURCEFILE/LINK/KEEPPERIOD lines and
    returns them in a dict keyed by the lower-cased variable name, with
    surrounding quotes stripped. Unreadable files yield whatever was
    collected so far (usually {}).
    """
    wanted = ("REPORTNAME", "FULLNAME", "SOURCEFILE", "LINK", "KEEPPERIOD")
    info = {}
    try:
        with open(path, "r") as fh:
            for raw in fh:
                text = raw.strip()
                for var in wanted:
                    m = re.match(rf'^{var}=(.+)', text)
                    if m:
                        # Drop surrounding double then single quotes.
                        info[var.lower()] = m.group(1).strip().strip('"').strip("'")
    except OSError:
        pass
    return info
def find_newest_json(report_dir):
    """Return the run date recorded by the newest per-run JSON metadata file.

    Only files named like "<report>-<suffix>.json" are considered; the
    most recently modified one wins. The JSON's "date" field is preferred,
    then "timeFinished" (epoch seconds); if the JSON is unusable the
    file's mtime is used instead. Returns None when nothing matches.
    """
    newest_time = 0
    newest_date = None
    for name in os.listdir(report_dir):
        if not (name.endswith(".json") and "-" in name):
            continue
        path = os.path.join(report_dir, name)
        try:
            mtime = os.stat(path).st_mtime
        except OSError:
            continue
        if mtime <= newest_time:
            continue
        newest_time = mtime
        try:
            with open(path) as fh:
                data = json.load(fh)
            if "date" in data:
                newest_date = data["date"]
            elif "timeFinished" in data:
                ts = int(data["timeFinished"])
                newest_date = datetime.fromtimestamp(ts).strftime("%Y-%m-%d")
        except (json.JSONDecodeError, ValueError, KeyError):
            # Unparseable metadata: fall back to the file's mtime.
            newest_date = datetime.fromtimestamp(mtime).strftime("%Y-%m-%d")
        except OSError:
            continue
    return newest_date
def count_pl_lines(report_dir):
    """Return (a_.pl_filename_or_None, total_lines_across_all_.pl_files).

    When several .pl files exist the returned name is simply the last
    one encountered; line counts are summed over all of them. Files
    that cannot be opened contribute nothing to the count.
    """
    pl_file = None
    total = 0
    for name in os.listdir(report_dir):
        if not name.endswith(".pl"):
            continue
        pl_file = name
        try:
            with open(os.path.join(report_dir, name)) as fh:
                total += sum(1 for _ in fh)
        except OSError:
            pass
    return pl_file, total
def scan_credentials(report_dir):
    """Record file+line locations that look credential-related.

    Only config/code extensions are scanned, and only the location and
    pattern category are recorded -- never the matched value. At most
    one hit is recorded per line.
    """
    hits = []
    scannable = (".cfg", ".pl", ".pm", ".conf")
    for name in os.listdir(report_dir):
        if not name.endswith(scannable):
            continue
        path = os.path.join(report_dir, name)
        rel_path = os.path.relpath(path, os.path.dirname(BASE_DIR))
        try:
            with open(path, "r", errors="replace") as fh:
                for lineno, text in enumerate(fh, 1):
                    for pattern, cred_type in CREDENTIAL_PATTERNS:
                        if pattern.search(text):
                            hits.append({
                                "file": rel_path,
                                "line_number": lineno,
                                "credential_type": cred_type,
                            })
                            break  # first matching pattern wins for this line
        except OSError:
            pass
    return hits
def classify_status(last_run_date):
    """Map a report's last-run date ("YYYY-MM-DD" or falsy) to a status.

    - missing/empty date            -> "obsolete"
    - unparseable date              -> "unknown"
    - ran within ACTIVE_DAYS        -> "active"
    - ran in/after INACTIVE_YEAR    -> "inactive-recent"
    - anything older                -> "obsolete"
    """
    if not last_run_date:
        return "obsolete"
    try:
        last_run = datetime.strptime(last_run_date, "%Y-%m-%d")
    except ValueError:
        return "unknown"
    age_days = (datetime.now() - last_run).days
    if age_days <= ACTIVE_DAYS:
        return "active"
    if last_run.year >= INACTIVE_YEAR:
        return "inactive-recent"
    return "obsolete"
def main():
    """Scan each report directory under BASE_DIR and write
    report-status.csv plus credential-locations.csv (locations only,
    never credential values).
    """
    if not os.path.isdir(BASE_DIR):
        print(f"Error: {BASE_DIR} not found", file=sys.stderr)
        sys.exit(1)
    t_start = time.monotonic()
    print(f"[manifest-report-status] Scanning {BASE_DIR} ...", file=sys.stderr)
    report_rows = []
    all_credentials = []
    for name in sorted(os.listdir(BASE_DIR)):
        report_dir = os.path.join(BASE_DIR, name)
        if not os.path.isdir(report_dir):
            continue
        # Skip tar.gz entries
        if name.endswith(".tar.gz"):
            continue
        print(f" {name} ...", file=sys.stderr)
        # Parse cron wrapper
        cron_files = [f for f in os.listdir(report_dir) if f.endswith("-cron.sh")]
        cron_info = {}
        # NOTE(review): `schedule` is never reassigned; the CSV "schedule"
        # column below is filled from keepperiod instead -- confirm intent.
        schedule = ""
        for cf in cron_files:
            # If several wrappers exist, the last one parsed wins.
            cron_info = parse_cron_wrapper(os.path.join(report_dir, cf))
        # Find last run date from JSON metadata
        last_run_date = find_newest_json(report_dir)
        # Count .pl lines
        pl_file, pl_lines = count_pl_lines(report_dir)
        # Check for .git
        has_git = os.path.isdir(os.path.join(report_dir, ".git"))
        # Scan for credentials
        creds = scan_credentials(report_dir)
        all_credentials.extend(creds)
        status = classify_status(last_run_date)
        report_rows.append({
            "report_name": name,
            "status": status,
            "last_run_date": last_run_date or "",
            # NOTE(review): duplicates the "keepperiod" column below.
            "schedule": cron_info.get("keepperiod", ""),
            "pl_file": pl_file or "",
            "pl_lines": pl_lines,
            "fullname": cron_info.get("fullname", ""),
            "link": cron_info.get("link", ""),
            "keepperiod": cron_info.get("keepperiod", ""),
            "has_git": has_git,
            "credential_count": len(creds),
        })
    # Write report-status.csv
    csv_path = os.path.join(OUT_DIR, "report-status.csv")
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "report_name", "status", "last_run_date", "schedule",
            "pl_file", "pl_lines", "fullname", "link", "keepperiod",
            "has_git", "credential_count"
        ])
        writer.writeheader()
        writer.writerows(report_rows)
    print(f" Wrote {csv_path} ({len(report_rows)} reports)", file=sys.stderr)
    # Write credential-locations.csv (NO values!)
    cred_path = os.path.join(OUT_DIR, "credential-locations.csv")
    with open(cred_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["file", "line_number", "credential_type"])
        writer.writeheader()
        writer.writerows(all_credentials)
    print(f" Wrote {cred_path} ({len(all_credentials)} entries, NO values)", file=sys.stderr)
    # Summary
    status_counts = {}
    for r in report_rows:
        status_counts[r["status"]] = status_counts.get(r["status"], 0) + 1
    print("\n Status summary:", file=sys.stderr)
    for s, c in sorted(status_counts.items()):
        print(f" {s}: {c}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f"\n Done in {elapsed:.1f}s", file=sys.stderr)
if __name__ == "__main__":
main()
#!/usr/bin/env python3
"""Extract self-documenting content from Perl scripts: comments, SQL, subs, data maps, etc.
Output: docs/ilsaux/manifests/script-content.json
This is the richest source of documentation since formal docs are sparse.
"""
import json
import os
import re
import sys
import time
BASE_DIR = "/home/ray/Documents/ilsaux"
OUT_DIR = "/home/ray/claude/docs/ilsaux/manifests"
# Sensitive patterns to redact from all extracted content
_SANITIZE_PATTERNS = [
# Literal passwords
(re.compile(r'\b[REDACTED-PASSWORD]\b', re.IGNORECASE), '[REDACTED-PASSWORD]'),
(re.compile(r'\b[REDACTED-PASSWORD]\b', re.IGNORECASE), '[REDACTED-PASSWORD]'),
(re.compile(r'[REDACTED-PASSWORD]', re.IGNORECASE), '[REDACTED-PASSWORD]'),
# DB usernames as quoted strings
(re.compile(r'(?<=["\'])(?:sqlaccess|sqllabels\d*|sqldataentryerrors|svc_vmsp1)(?=["\'])', re.IGNORECASE), '[REDACTED-USER]'),
# Internal hostnames
(re.compile(r'\b[\w.-]+\.plch\.net\b', re.IGNORECASE), '[REDACTED-HOST]'),
(re.compile(r'\b[\w.-]+\.iii\.com\b', re.IGNORECASE), '[REDACTED-HOST]'),
(re.compile(r'\b[\w.-]+\.cincinnatilibrary\.org\b', re.IGNORECASE), '[REDACTED-HOST]'),
# Email addresses at internal domains
(re.compile(r'\b[\w.+-]+@cincinnatilibrary\.org\b', re.IGNORECASE), '[REDACTED-EMAIL]'),
(re.compile(r'\b[\w.+-]+@plch\.net\b', re.IGNORECASE), '[REDACTED-EMAIL]'),
# DBI->connect credential args
(re.compile(r'(DBI->connect\(\s*"[^"]*"\s*,\s*)"[^"]*"\s*,\s*"[^"]*"', re.IGNORECASE), r'\1"[REDACTED-USER]","[REDACTED-PASSWORD]"'),
# $password = "..." and $username = "..."
(re.compile(r'(\$password\s*=\s*)"[^"]*"', re.IGNORECASE), r'\1"[REDACTED-PASSWORD]"'),
(re.compile(r'(\$username\s*=\s*)"[^"]*"', re.IGNORECASE), r'\1"[REDACTED-USER]"'),
# $ua->credentials
(re.compile(r"(\$ua->credentials\([^)]*)'[^']*'\s*,\s*'[^']*'\s*\)", re.IGNORECASE), r"\1'[REDACTED-USER]', '[REDACTED-PASSWORD]')"),
]
def sanitize_text(text):
    """Apply every redaction rule in _SANITIZE_PATTERNS to *text*."""
    result = text
    for pattern, replacement in _SANITIZE_PATTERNS:
        result = pattern.sub(replacement, result)
    return result
def extract_report_identity(lines):
    """Pull a human-readable report title out of a Perl BEGIN block.

    Scans for `print "...";` statements inside BEGIN { ... } whose text
    mentions "begin" or "Report" (case-insensitive); returns the first
    cleaned match, or None.
    """
    title_re = re.compile(r'print\s+"([^"]*(?:begin|Report)[^"]*)"\s*;', re.IGNORECASE)
    inside_begin = False
    for raw in lines:
        text = raw.strip()
        if text.startswith("BEGIN"):
            inside_begin = True
        if inside_begin:
            found = title_re.search(text)
            if found:
                # Strip banner punctuation ('+', backslashes, newlines).
                return re.sub(r'[\+\n\\]', '', found.group(1)).strip()
            if text == "}":
                inside_begin = False
    return None
def extract_comments(lines):
    """Categorize comments; returns (todo, section, inline) lists.

    todo:    comments whose body starts with TODO/FIXME/HACK
    section: full-line banner comments ("#-----")
    inline:  all other comments; a trailing comment on a code line also
             records the code part in a "code" field
    """
    todo, section, inline = [], [], []
    todo_re = re.compile(r'TODO|FIXME|HACK', re.IGNORECASE)
    for lineno, raw in enumerate(lines, 1):
        text = raw.strip()
        if not text:
            continue
        if text.startswith("#"):
            body = text.lstrip("#").strip()
            if todo_re.match(body):
                todo.append({"line": lineno, "text": body})
            elif re.match(r'-{3,}', text.lstrip("#")):
                section.append({"line": lineno, "text": body})
            else:
                inline.append({"line": lineno, "text": body})
            continue
        if "#" not in text:
            continue
        code_part, _, trailing = text.partition("#")
        # Crude string-literal guard: an odd number of quotes before the
        # '#' means it is probably inside a string, so skip the line.
        if code_part.count('"') % 2 or code_part.count("'") % 2:
            continue
        trailing = trailing.strip()
        if not trailing:
            continue
        if todo_re.match(trailing):
            todo.append({"line": lineno, "text": trailing})
        else:
            inline.append({
                "line": lineno,
                "text": trailing,
                "code": code_part.strip(),
            })
    return todo, section, inline
def extract_sql(lines):
    """Reconstruct SQL from $sql_query .= "..." concatenation patterns.

    Accumulates consecutive string assignments/appends to Perl scalars
    whose name contains "sql" or "query"; each finished run is emitted
    as {"variable", "start_line", "sql"}.
    """
    queries = []
    current_sql = []   # string pieces of the query being accumulated
    current_var = None  # scalar name the pieces belong to
    start_line = None   # 1-indexed line where the accumulation began
    for i, line in enumerate(lines, 1):
        stripped = line.strip()
        # Match: $sql_query = "..."; or $sql .= "...";
        m = re.match(r'\$(\w+)\s*\.?=\s*"(.*?)"\s*;', stripped)
        if m:
            var_name = m.group(1)
            sql_part = m.group(2)
            if "sql" in var_name.lower() or "query" in var_name.lower():
                if ".=" not in stripped and current_sql:
                    # New assignment, save previous
                    queries.append({
                        "variable": current_var,
                        "start_line": start_line,
                        "sql": "\n".join(current_sql),
                    })
                    current_sql = []
                if not current_sql:
                    current_var = var_name
                    start_line = i
                current_sql.append(sql_part)
                continue
        # Also match heredoc-style SQL
        if re.match(r'\$\w+\s*=\s*<<', stripped):
            # Heredoc start - capture until delimiter
            # NOTE(review): placeholder only -- heredoc bodies are never
            # actually collected.
            pass
        # If we were building SQL and hit a non-continuation line, save.
        # NOTE(review): this guard only recognizes "sql"-named variables,
        # so a "$query"-style accumulation is flushed by any non-string
        # continuation line -- confirm whether that asymmetry is intended.
        if current_sql and not re.match(r'\$\w*sql\w*\s*\.?=', stripped, re.IGNORECASE):
            queries.append({
                "variable": current_var,
                "start_line": start_line,
                "sql": "\n".join(current_sql),
            })
            current_sql = []
            current_var = None
    # Don't forget last one
    if current_sql:
        queries.append({
            "variable": current_var,
            "start_line": start_line,
            "sql": "\n".join(current_sql),
        })
    return queries
def extract_subroutines(lines):
    """Extract Perl `sub` declarations with leading comments and parameters.

    For each sub: its name, 1-indexed start line, an approximate body
    length found by brace balancing, the run of '#' comment lines
    directly above the declaration, and the `my (...) = @_` parameter
    list if one appears within the first 10 lines of the body.
    """
    subs = []
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        m = re.match(r'^sub\s+(\w+)', line)
        if m:
            sub_name = m.group(1)
            sub_start = i + 1  # 1-indexed
            # Look back for leading comments
            leading_comments = []
            j = i - 1
            while j >= 0 and lines[j].strip().startswith("#"):
                leading_comments.insert(0, lines[j].strip().lstrip("#").strip())
                j -= 1
            # Count lines until closing brace (approximate -- braces inside
            # strings or regexes will skew the count)
            brace_depth = 0
            sub_end = i
            for k in range(i, len(lines)):
                brace_depth += lines[k].count("{") - lines[k].count("}")
                if brace_depth <= 0 and k > i:
                    sub_end = k
                    break
            else:
                # Braces never balanced: assume the sub runs to end of file
                sub_end = len(lines) - 1
            line_count = sub_end - i + 1
            # Look for parameter unpacking
            params = ""
            for k in range(i, min(i + 10, len(lines))):
                pm = re.search(r'my\s*\(([^)]+)\)\s*=\s*@_', lines[k])
                if pm:
                    params = pm.group(1).strip()
                    break
            subs.append({
                "name": sub_name,
                "line": sub_start,
                "line_count": line_count,
                "leading_comments": leading_comments,
                "parameters": params,
            })
        i += 1
    return subs
def extract_data_maps(lines):
    """Find `my %name = ( ... )` hash literals; report size and sample keys.

    Hashes with no `key => value` pairs are skipped. Each entry records
    the variable name, 1-indexed start line, total pair count, and up to
    five sample keys.
    """
    maps = []
    total = len(lines)
    idx = 0
    while idx < total:
        text = lines[idx].strip()
        header = re.match(r'my\s+(%\w+)\s*=\s*\(', text)
        if not header:
            idx += 1
            continue
        # Swallow lines until the parentheses balance out.
        depth = text.count("(") - text.count(")")
        body = [text]
        scan = idx + 1
        while scan < total and depth > 0:
            body.append(lines[scan])
            depth += lines[scan].count("(") - lines[scan].count(")")
            scan += 1
        keys = re.findall(r'["\']?(\w+)["\']?\s*=>', "\n".join(body))
        if keys:  # only include hashes with actual key => value pairs
            maps.append({
                "variable": header.group(1),
                "line": idx + 1,
                "key_count": len(keys),
                "sample_keys": keys[:5],
            })
        idx = scan
    return maps
def extract_config_refs(lines):
    """List every $cfg->param("key") lookup as {"line", "key"}."""
    param_re = re.compile(r'\$cfg->param\(\s*["\']([^"\']+)["\']\s*\)')
    return [
        {"line": lineno, "key": m.group(1)}
        for lineno, text in enumerate(lines, 1)
        for m in param_re.finditer(text)
    ]
def extract_print_stmts(lines):
    """Collect `print "..."` string literals as an execution narrative.

    Pure separator strings (only '+', '-', '=' characters) and empty
    strings are ignored; literal "\\n" escape sequences are stripped
    from the captured text.
    """
    found = []
    for lineno, raw in enumerate(lines, 1):
        m = re.search(r'print\s+"([^"]+)"', raw.strip())
        if not m:
            continue
        text = m.group(1)
        if not text.strip() or re.match(r'^[\+\-\=]+$', text.strip()):
            continue
        found.append({"line": lineno, "text": text.replace("\\n", "").strip()})
    return found
def extract_commented_code(lines):
    """Detect runs of 3+ consecutive comment lines that look like
    commented-out Perl code (candidates for review or deletion).

    Returns blocks of {"start_line", "end_line", "line_count", "sample"}
    where "sample" holds the first three comment bodies of the run.
    """
    blocks = []
    current_block = []
    current_start = None
    # Tokens that make a comment look like code rather than prose.
    code_indicators = re.compile(r'[\$\@\%]|->|=\s|;\s*$|if\s*\(|while|foreach|sub\s')

    def flush():
        # Record the pending run if it is long enough, then reset.
        # (Originally this logic was duplicated in three places.)
        nonlocal current_block
        if len(current_block) >= 3:
            blocks.append({
                "start_line": current_start,
                "end_line": current_block[-1]["line"],
                "line_count": len(current_block),
                "sample": [b["text"] for b in current_block[:3]],
            })
        current_block = []

    for i, line in enumerate(lines, 1):
        stripped = line.strip()
        # Shebang lines are comments syntactically but never dead code.
        if stripped.startswith("#") and not stripped.startswith("#!"):
            comment_content = stripped.lstrip("#")
            if code_indicators.search(comment_content):
                if not current_block:
                    current_start = i
                current_block.append({"line": i, "text": comment_content.strip()})
                continue
        # Any non-code-like line (code, blank, prose comment) ends the run.
        flush()
    # Final block at end of input.
    flush()
    return blocks
def extract_db_connections(lines):
    """Extract DBI->connect / DSN lines with credentials and hosts redacted.

    A line qualifies if it contains "DBI->connect" or a Pg/MySQL DSN
    marker. NOTE: the DSN needles must be lower-case because they are
    compared against line.lower() -- the previous "dbi:Pg" needle could
    never match.
    """
    connections = []
    for i, line in enumerate(lines, 1):
        lowered = line.lower()
        if "DBI->connect" in line or "dbi:pg" in lowered or "dbi:mysql" in lowered:
            sanitized = line.strip()
            # Redact host= values in DSN strings
            sanitized = re.sub(
                r'(host=)[\w.-]+\.(plch\.net|iii\.com|cincinnatilibrary\.org)',
                r'\1[REDACTED-HOST]',
                sanitized,
                flags=re.IGNORECASE,
            )
            # Redact the user/password positional args of
            # DBI->connect("dsn", "user", "password", ...)
            sanitized = re.sub(
                r'(DBI->connect\(\s*"[^"]*"\s*,\s*)"[^"]*"\s*,\s*"[^"]*"',
                r'\1"[REDACTED-USER]","[REDACTED-PASSWORD]"',
                sanitized,
                flags=re.IGNORECASE,
            )
            connections.append({"line": i, "pattern": sanitized})
    return connections
def sanitize_result(obj):
    """Recursively redact every string inside nested dicts/lists.

    Non-container, non-string values pass through unchanged.
    """
    if isinstance(obj, dict):
        return {key: sanitize_result(val) for key, val in obj.items()}
    if isinstance(obj, list):
        return [sanitize_result(item) for item in obj]
    if isinstance(obj, str):
        return sanitize_text(obj)
    return obj
def process_file(path, rel_path):
    """Run every extractor over one .pl/.pm file.

    Returns a dict of extracted documentation artifacts keyed by
    category, fully sanitized (credentials/hostnames redacted), or None
    when the file cannot be read.
    """
    try:
        with open(path, "r", errors="replace") as f:
            content = f.read()
    except OSError as e:
        print(f" SKIP: {path}: {e}", file=sys.stderr)
        return None
    lines = content.split("\n")
    todo_comments, section_comments, inline_comments = extract_comments(lines)
    result = {
        "file": rel_path,
        "line_count": len(lines),
        "report_identity": extract_report_identity(lines),
        "todo_comments": todo_comments,
        "section_comments": section_comments,
        # Inline comments can be numerous; keep the count plus a sample.
        "inline_comment_count": len(inline_comments),
        "inline_comments_sample": inline_comments[:20],
        "sql_queries": extract_sql(lines),
        "subroutines": extract_subroutines(lines),
        "data_maps": extract_data_maps(lines),
        "config_refs": extract_config_refs(lines),
        "print_statements": extract_print_stmts(lines),
        "commented_code_blocks": extract_commented_code(lines),
        "db_connections": extract_db_connections(lines),
    }
    # Sanitize all string content to redact credentials/hostnames
    return sanitize_result(result)
def main():
    """Extract self-documentation from every .pl/.pm file under BASE_DIR
    and write script-content.json (all strings sanitized in process_file).
    """
    if not os.path.isdir(BASE_DIR):
        print(f"Error: {BASE_DIR} not found", file=sys.stderr)
        sys.exit(1)
    t_start = time.monotonic()
    print("[manifest-script-content] Extracting self-documentation ...", file=sys.stderr)
    results = []
    file_count = 0
    for root, dirs, files in os.walk(BASE_DIR):
        dirs[:] = [d for d in dirs if d != ".git"]  # skip .git internals
        for name in files:
            if not (name.endswith(".pl") or name.endswith(".pm")):
                continue
            path = os.path.join(root, name)
            rel_path = os.path.relpath(path, BASE_DIR)
            result = process_file(path, rel_path)
            if result:
                results.append(result)
                file_count += 1
    # Sort by file path
    results.sort(key=lambda r: r["file"])
    out_path = os.path.join(OUT_DIR, "script-content.json")
    with open(out_path, "w") as f:
        json.dump(results, f, indent=2)
    print(f" Processed {file_count} files", file=sys.stderr)
    print(f" Wrote {out_path}", file=sys.stderr)
    # Summary stats
    total_subs = sum(len(r["subroutines"]) for r in results)
    total_sql = sum(len(r["sql_queries"]) for r in results)
    total_maps = sum(len(r["data_maps"]) for r in results)
    total_todos = sum(len(r["todo_comments"]) for r in results)
    print(f" Totals: {total_subs} subs, {total_sql} SQL queries, "
          f"{total_maps} data maps, {total_todos} TODOs", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)
if __name__ == "__main__":
main()
#!/usr/bin/env python3
"""Read all Phase 1 CSVs and produce a human-readable summary report.
Output: docs/ilsaux/manifests/summary-report.txt
"""
import csv
import os
import sys
import time
from collections import Counter, defaultdict
from datetime import datetime
OUT_DIR = "/home/ray/claude/docs/ilsaux/manifests"
def human_size(nbytes):
    """Format a byte count as a short human-readable string (1 decimal)."""
    value = nbytes
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if abs(value) < 1024:
            return f"{value:.1f} {unit}"
        value /= 1024
    # Anything past TB is reported in petabytes.
    return f"{value:.1f} PB"
def read_csv(filename):
    """Load a manifest CSV from OUT_DIR as a list of row dicts.

    Missing files produce a stderr warning and an empty list instead of
    an exception, so the summary can be built from partial manifests.
    """
    path = os.path.join(OUT_DIR, filename)
    if not os.path.exists(path):
        print(f" WARNING: {path} not found", file=sys.stderr)
        return []
    with open(path, newline="") as fh:
        rows = list(csv.DictReader(fh))
    return rows
def main():
    """Read the Phase 1 manifest CSVs and write summary-report.txt.

    Pulls six CSVs from OUT_DIR (missing files come back as empty lists
    from read_csv) and renders a sectioned, fixed-width text report.
    """
    t_start = time.monotonic()
    print("[manifest-summary] Generating summary report ...", file=sys.stderr)
    # Read all CSVs
    file_manifest = read_csv("file-manifest.csv")
    perl_deps = read_csv("perl-dependencies.csv")
    cron_schedule = read_csv("cron-schedule.csv")
    report_status = read_csv("report-status.csv")
    git_summaries = read_csv("git-summaries.csv")
    credential_locs = read_csv("credential-locations.csv")
    out_path = os.path.join(OUT_DIR, "summary-report.txt")
    with open(out_path, "w") as f:
        # Small writer helper: one call per output line.
        def w(line=""):
            f.write(line + "\n")
        w("=" * 72)
        w("ILS AUXILIARY SERVER (ilsaux) DOCUMENTATION SUMMARY")
        w(f"Generated: {datetime.now().isoformat(timespec='seconds')}")
        w("=" * 72)
        # 1. File Statistics
        w()
        w("1. FILE STATISTICS")
        w("-" * 50)
        total_files = len(file_manifest)
        total_size = sum(int(r["size_bytes"]) for r in file_manifest)
        w(f" Total files: {total_files:,}")
        w(f" Total size: {human_size(total_size)}")
        # Extension breakdown
        ext_counts = Counter()
        ext_sizes = Counter()
        for r in file_manifest:
            ext = r["extension"] or "(none)"
            ext_counts[ext] += 1
            ext_sizes[ext] += int(r["size_bytes"])
        w()
        w(" Top extensions by count:")
        for ext, count in ext_counts.most_common(15):
            size = ext_sizes[ext]
            w(f" {ext:<15} {count:>6} files {human_size(size):>10}")
        # 2. Report Status
        w()
        w("2. REPORT STATUS")
        w("-" * 50)
        status_counts = Counter(r["status"] for r in report_status)
        # Fixed ordering of status buckets; unknown statuses are skipped.
        for status in ["active", "inactive-recent", "obsolete", "unknown"]:
            if status in status_counts:
                reports = [r for r in report_status if r["status"] == status]
                w(f"\n {status.upper()} ({status_counts[status]}):")
                for r in sorted(reports, key=lambda x: x["report_name"]):
                    last = r["last_run_date"] or "never"
                    fullname = r["fullname"]
                    name = r["report_name"]
                    # "sl*" = shelf-list reports, flagged as high priority.
                    sl = " [HIGH PRIORITY]" if name.startswith("sl") else ""
                    w(f" {name:<30} last run: {last:<12} {fullname}{sl}")
        # 3. Perl Dependency Frequency
        w()
        w("3. PERL DEPENDENCY FREQUENCY")
        w("-" * 50)
        dep_class = Counter()
        sierra_usage = Counter()
        for r in perl_deps:
            dep_class[r["classification"]] += 1
            # "local" classification marks the custom Sierra:: modules.
            if r["classification"] == "local":
                sierra_usage[r["module"]] += 1
        w(" By classification:")
        for cls, count in dep_class.most_common():
            w(f" {cls:<15} {count:>4} imports")
        w()
        w(" Sierra:: module usage (most to least):")
        for mod, count in sierra_usage.most_common():
            w(f" {mod:<30} used by {count} files")
        # 4. Cron Timeline
        w()
        w("4. CRON SCHEDULE (latest crontab)")
        w("-" * 50)
        # NOTE(review): the crontab snapshot date is hardcoded; update this
        # filter if a newer crontab capture is added to cron-schedule.csv.
        latest_cron = [e for e in cron_schedule if e.get("crontab_date") == "20190709"]
        active_cron = [e for e in latest_cron if e["is_commented"] == "False"]
        commented_cron = [e for e in latest_cron if e["is_commented"] == "True"]
        w(f" Active entries: {len(active_cron)}")
        w(f" Commented out: {len(commented_cron)}")
        w()
        w(" Active schedule:")
        # NOTE(review): hour/minute come from CSV as strings, so this sort is
        # lexicographic ("10" sorts before "2"); confirm whether numeric
        # ordering was intended (cron fields like "*" would break int()).
        for e in sorted(active_cron, key=lambda x: (x["hour"], x["minute"])):
            name = e["report_name"] or e["command"][:40]
            w(f" {e['human_schedule']:<30} {name}")
        if commented_cron:
            w()
            w(" Commented out (historical):")
            for e in sorted(commented_cron, key=lambda x: x.get("report_name", "")):
                name = e["report_name"] or e["command"][:40]
                notes = f" -- {e['notes']}" if e.get("notes") else ""
                w(f" {name}{notes}")
        # 5. Git Repositories
        w()
        w("5. GIT REPOSITORIES")
        w("-" * 50)
        w(f" Total repos: {len(git_summaries)}")
        w()
        # ISO date strings sort correctly lexicographically; newest first.
        for g in sorted(git_summaries, key=lambda x: x.get("last_commit_date", ""), reverse=True):
            commits = g["total_commits"]
            last = g["last_commit_date"][:10] if g["last_commit_date"] else "unknown"
            w(f" {g['repo_path']:<40} {commits:>4} commits last: {last}")
        # 6. Credential Exposure
        w()
        w("6. CREDENTIAL EXPOSURE SUMMARY")
        w("-" * 50)
        w(f" Total credential references found: {len(credential_locs)}")
        w(" (File + line number only -- NO values stored)")
        w()
        cred_types = Counter(c["credential_type"] for c in credential_locs)
        for ctype, count in cred_types.most_common():
            w(f" {ctype:<20} {count:>4} occurrences")
        cred_files = Counter(c["file"] for c in credential_locs)
        w()
        w(" Files with most credential references:")
        for cfile, count in cred_files.most_common(10):
            w(f" {cfile:<50} {count:>3}")
        # 7. Size Breakdown
        w()
        w("7. SIZE BREAKDOWN BY DIRECTORY")
        w("-" * 50)
        dir_sizes = defaultdict(int)
        for r in file_manifest:
            # Get top-level parent
            parts = r["parent_dir"].split("/")
            top = parts[0] if parts[0] != "." else "(root)"
            dir_sizes[top] += int(r["size_bytes"])
        for d, size in sorted(dir_sizes.items(), key=lambda x: x[1], reverse=True):
            w(f" {d:<40} {human_size(size):>10}")
        w()
        w("=" * 72)
        w(f"Report based on manifests in {OUT_DIR}")
        w("=" * 72)
    print(f" Wrote {out_path}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)
# Entry point when executed as a script; importing stays side-effect free.
if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Walk ilsaux directory tree and produce directory-tree.txt + file-manifest.csv.
Skips .git/ and .cpan/ internals but notes their presence.
Output: docs/ilsaux/manifests/directory-tree.txt, file-manifest.csv
"""
import csv
import os
import sys
import time
from datetime import datetime
# Root of the copied ilsaux server tree to inventory.
BASE_DIR = "/home/ray/Documents/ilsaux"
# Destination directory for the generated manifest files.
OUT_DIR = "/home/ray/claude/docs/ilsaux/manifests"
# Directory names whose contents are skipped (their presence is still noted).
SKIP_DIRS = {".git", ".cpan"}
def human_size(nbytes):
    """Render a byte count with a binary-scaled unit suffix, one decimal."""
    labels = ("B", "KB", "MB", "GB", "TB")
    idx = 0
    while idx < len(labels) and abs(nbytes) >= 1024:
        nbytes /= 1024
        idx += 1
    if idx < len(labels):
        return f"{nbytes:.1f} {labels[idx]}"
    # Fell off the end of the table: report in petabytes.
    return f"{nbytes:.1f} PB"
def main():
    """Walk BASE_DIR and emit file-manifest.csv plus directory-tree.txt.

    The contents of .git/.cpan directories are skipped entirely, but the
    directories that contain them are recorded so the tree output can
    annotate them with [git]/[cpan].
    """
    if not os.path.isdir(BASE_DIR):
        print(f"Error: {BASE_DIR} not found", file=sys.stderr)
        sys.exit(1)
    t_start = time.monotonic()
    print(f"[manifest-tree] Walking {BASE_DIR} ...", file=sys.stderr)
    # Collect all files
    all_files = []
    dir_sizes = {}  # dir_path -> total size of files DIRECTLY in that dir (no rollup)
    dir_has_git = set()
    dir_has_cpan = set()
    errors = 0
    for root, dirs, files in os.walk(BASE_DIR):
        rel_root = os.path.relpath(root, BASE_DIR)
        # Note and skip .git/.cpan
        skip = []
        for d in dirs:
            if d in SKIP_DIRS:
                skip.append(d)
                # NOTE(review): 'full' is computed but never used.
                full = os.path.join(root, d)
                if d == ".git":
                    dir_has_git.add(rel_root)
                elif d == ".cpan":
                    dir_has_cpan.add(rel_root)
        # Prune in place so os.walk never descends into skipped dirs.
        for s in skip:
            dirs.remove(s)
        for name in files:
            path = os.path.join(root, name)
            try:
                st = os.stat(path)
                size = st.st_size
                mtime = st.st_mtime
            except OSError as e:
                # Unstat-able file (permissions, broken symlink, ...): count and move on.
                print(f" SKIP: {path}: {e}", file=sys.stderr)
                errors += 1
                continue
            rel_path = os.path.relpath(path, BASE_DIR)
            _, ext = os.path.splitext(name)
            parent = os.path.relpath(root, BASE_DIR)
            all_files.append({
                "path": rel_path,
                "size_bytes": size,
                "size_human": human_size(size),
                "mtime_iso": datetime.fromtimestamp(mtime).isoformat(timespec="seconds"),
                "extension": ext,
                "parent_dir": parent,
            })
            # Accumulate dir sizes
            dir_sizes[parent] = dir_sizes.get(parent, 0) + size
    print(f" Found {len(all_files):,} files, {errors} errors", file=sys.stderr)
    # Write file-manifest.csv
    csv_path = os.path.join(OUT_DIR, "file-manifest.csv")
    all_files.sort(key=lambda f: f["path"])
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "path", "size_bytes", "size_human", "mtime_iso", "extension", "parent_dir"
        ])
        writer.writeheader()
        writer.writerows(all_files)
    print(f" Wrote {csv_path} ({len(all_files):,} rows)", file=sys.stderr)
    # Write directory-tree.txt
    tree_path = os.path.join(OUT_DIR, "directory-tree.txt")
    # Collect unique directories with their total sizes
    # NOTE(review): all_dirs is built but not read afterward in this function.
    all_dirs = set()
    for f in all_files:
        p = f["parent_dir"]
        while p and p != ".":
            all_dirs.add(p)
            p = os.path.dirname(p)
    all_dirs.add(".")
    with open(tree_path, "w") as f:
        f.write(f"Directory tree of {BASE_DIR}\n")
        f.write(f"Generated: {datetime.now().isoformat(timespec='seconds')}\n")
        f.write(f"Total files: {len(all_files):,}\n")
        total_size = sum(fi["size_bytes"] for fi in all_files)
        f.write(f"Total size: {human_size(total_size)}\n")
        f.write("=" * 72 + "\n\n")
        # Second walk renders the tree; a fresh walk is needed because the
        # first one consumed its generators.
        for root, dirs, files in os.walk(BASE_DIR):
            rel = os.path.relpath(root, BASE_DIR)
            depth = 0 if rel == "." else rel.count(os.sep) + 1
            indent = " " * depth
            dirname = os.path.basename(root) if rel != "." else BASE_DIR
            # Direct-file size only (not recursive), per dir_sizes above.
            size = dir_sizes.get(rel, 0)
            # Skip internals of .git/.cpan
            # (defensive: pruning below already prevents descending into them)
            parts = rel.split(os.sep)
            if any(p in SKIP_DIRS for p in parts):
                continue
            annotations = []
            if rel in dir_has_git:
                annotations.append("[git]")
            if rel in dir_has_cpan:
                annotations.append("[cpan]")
            ann = " ".join(annotations)
            if ann:
                ann = " " + ann
            f.write(f"{indent}{dirname}/ ({human_size(size)}){ann}\n")
            # Remove skip dirs from walk; sorting makes traversal order
            # (and thus the rendered tree) deterministic/alphabetical.
            dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
            dirs.sort()
        f.write("\n")
        f.write("Per-directory size rollup (top 30):\n")
        f.write("-" * 60 + "\n")
        sorted_dirs = sorted(dir_sizes.items(), key=lambda x: x[1], reverse=True)
        for dpath, dsize in sorted_dirs[:30]:
            f.write(f" {human_size(dsize):>10} {dpath}\n")
    print(f" Wrote {tree_path}", file=sys.stderr)
    elapsed = time.monotonic() - t_start
    print(f" Done in {elapsed:.1f}s", file=sys.stderr)
# Entry point when executed as a script; importing stays side-effect free.
if __name__ == "__main__":
    main()
#!/bin/bash
set -e
# Publish sanitized ilsaux documentation to a GitHub gist.
#
# Pipeline:
#   1. sanitize-for-gist.py copies the docs into a staging directory with
#      credentials/hostnames redacted.
#   2. A final grep re-checks the staging copy for sensitive patterns.
#   3. Files are flattened (gists have no subdirectories) and uploaded
#      with `gh gist create`.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
WORKSPACE="$(cd "$SCRIPT_DIR/../.." && pwd)"
echo "=== ilsaux Documentation Gist Publisher ==="
echo ""
# Step 1: Run sanitizer. It prints the staging dir on stdout and its log
# on stderr; `|| true` keeps `set -e` from aborting so we can report the
# failure ourselves with the captured log.
echo "[1/3] Sanitizing documentation ..."
SANITIZE_LOG=$(mktemp)
STAGING_DIR=$(python3 "$SCRIPT_DIR/sanitize-for-gist.py" 2>"$SANITIZE_LOG") || true
if [ -z "$STAGING_DIR" ] || [ ! -d "$STAGING_DIR" ]; then
    echo "ERROR: Sanitization failed. Details:"
    cat "$SANITIZE_LOG"
    rm -f "$SANITIZE_LOG"
    exit 1
fi
echo " Staging directory: $STAGING_DIR"
echo " $(grep 'Processed\|CLEAN' "$SANITIZE_LOG" || true)"
rm -f "$SANITIZE_LOG"
# Step 2: Belt-and-suspenders verification.
# NOTE(review): this pattern list is itself sanitized in the published copy
# ([REDACTED-...] is a literal bracket expression here, not the real
# pattern); confirm the original pattern list is intact when running from
# the private workspace.
echo ""
echo "[2/3] Final verification grep ..."
GREP_HITS=$(grep -ri '[REDACTED-PASSWORD]\|sqlaccess\|sqllabels\|sqldataentryerrors\|[REDACTED-PASSWORD]\|eS3cuRe\|svc_vmsp\|cinci-db\|sierra-db\|\.plch\.net\|\.iii\.com\|cincinnatilibrary\.org' "$STAGING_DIR" \
  --include='*.md' --include='*.csv' --include='*.txt' --include='*.json' \
  2>/dev/null | grep -v 'sanitize-for-gist\.py' | grep -v 'manifest-script-content\.py' | grep -v 'generate-report-docs\.py' || true)
if [ -n "$GREP_HITS" ]; then
    echo "ERROR: Sensitive patterns still found in staging directory!"
    echo "$GREP_HITS"
    echo ""
    echo "Aborting. Fix sanitization rules before publishing."
    exit 1
fi
echo " CLEAN: No sensitive patterns found."
# Step 3: Flatten files with prefixed names for gist (gists have no subdirs)
echo ""
echo "[3/3] Creating gist ..."
FLAT_DIR="/tmp/ilsaux-gist-flat"
rm -rf "$FLAT_DIR"
mkdir -p "$FLAT_DIR"
# Copy every regular file in $1 into FLAT_DIR, prefixing its name with $2.
flatten_dir() {
    local src_dir="$1"
    local prefix="$2"
    local base
    [ -d "$src_dir" ] || return
    for file in "$src_dir"/*; do
        [ -f "$file" ] || continue
        # Declaration split from assignment so a basename failure is not
        # masked by `local` (shellcheck SC2155).
        base=$(basename "$file")
        cp "$file" "$FLAT_DIR/${prefix}${base}"
    done
}
flatten_dir "$STAGING_DIR/docs/manifests" "manifests--"
flatten_dir "$STAGING_DIR/docs/reports" "reports--"
flatten_dir "$STAGING_DIR/docs/modules" "modules--"
flatten_dir "$STAGING_DIR/docs/framework" "framework--"
flatten_dir "$STAGING_DIR/scripts" "scripts--"
# Top-level docs
[ -f "$STAGING_DIR/docs/00-INDEX.md" ] && cp "$STAGING_DIR/docs/00-INDEX.md" "$FLAT_DIR/00-INDEX.md"
[ -f "$STAGING_DIR/docs/archive-plan.md" ] && cp "$STAGING_DIR/docs/archive-plan.md" "$FLAT_DIR/archive-plan.md"
[ -f "$STAGING_DIR/ilsaux-documentation.md" ] && cp "$STAGING_DIR/ilsaux-documentation.md" "$FLAT_DIR/ilsaux-documentation.md"
FILE_COUNT=$(find "$FLAT_DIR" -type f | wc -l)
echo " Prepared $FILE_COUNT files in flat layout ..."
if [ "$FILE_COUNT" -eq 0 ]; then
    echo "ERROR: No files found to publish."
    exit 1
fi
# Exclude very large files that have low sharing value
# file-manifest.csv (~2MB, 17K rows of paths/sizes) is borderline
# script-content.json (~3MB) has high value -- keep it
# Create gist with all flattened files
# gh gist create uses basename, so the prefixed names become the gist filenames
GIST_FILES=()
for file in "$FLAT_DIR"/*; do
    [ -f "$file" ] || continue
    GIST_FILES+=("$file")
done
echo " Uploading to GitHub gist ..."
# BUGFIX: under `set -e`, a failing command substitution in a plain
# assignment aborts the script immediately, so the error-recovery branch
# below was unreachable. Capturing the status on the `||` branch keeps the
# script alive on failure (commands in a `||` list are exempt from errexit).
GH_EXIT=0
GIST_OUTPUT=$(gh gist create --public \
  --desc "ilsaux ILS Auxiliary System - Documentation & Analysis Scripts (CHPL)" \
  "${GIST_FILES[@]}" 2>&1) || GH_EXIT=$?
if [ $GH_EXIT -ne 0 ]; then
    echo "ERROR: gh gist create failed:"
    echo "$GIST_OUTPUT"
    echo ""
    echo "Staging directory preserved at: $STAGING_DIR"
    echo "Flat directory preserved at: $FLAT_DIR"
    echo "You can inspect or manually publish from there."
    exit 1
fi
# Extract URL from output (last line typically)
GIST_URL=$(echo "$GIST_OUTPUT" | grep -o 'https://gist.github.com/[^ ]*' | tail -1)
echo ""
echo "=== SUCCESS ==="
echo "Gist URL: $GIST_URL"
echo "Files published: $FILE_COUNT"
echo ""
echo "Staging directory: $STAGING_DIR"
echo "Flat directory: $FLAT_DIR"
echo "(Clean up with: rm -rf $STAGING_DIR $FLAT_DIR)"
#!/usr/bin/env python3
"""Publish sanitized ilsaux documentation as multiple GitHub gists.
Breaks the documentation into category-based gists to avoid GitHub's
rendering limits, then creates a master TOC gist linking them all.
Requires: gh CLI authenticated with gist scope.
Usage:
python3 scripts/ilsaux/publish-ilsaux-gists.py [--dry-run] [--delete-old]
"""
import json
import os
import re
import subprocess
import sys
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
WORKSPACE = os.path.dirname(os.path.dirname(SCRIPT_DIR))
STAGING_DIR = "/tmp/ilsaux-gist"
MANIFEST_PATH = os.path.join(SCRIPT_DIR, "gist-manifest.json")
# GitHub username for constructing gist URLs
GITHUB_USER = "rayvoelker"
# Old single gist to optionally delete
OLD_SINGLE_GIST_ID = "cce2e74ff232c461e6c6b0e9a620a24f"
# Gist group definitions: (group_name, description, source_subdir, file_filter)
# source_subdir is relative to STAGING_DIR
# file_filter: None means all files, or a callable(filename) -> bool
# Each group becomes one published gist; see collect_group_files() for how
# source_dirs/prefix/only/exclude/extra_files are interpreted.
GIST_GROUPS = [
    {
        "name": "reports",
        "description": "ilsaux Report Documentation (52 reports) - CHPL Sierra ILS",
        "source_dirs": ["docs/reports"],
        "prefix": "reports--",
    },
    {
        "name": "modules",
        "description": "ilsaux Sierra:: Perl Module Documentation (16 modules) - CHPL",
        "source_dirs": ["docs/modules"],
        "prefix": "modules--",
    },
    {
        "name": "framework",
        "description": "ilsaux Framework & Archive Plan - CHPL Sierra ILS",
        "source_dirs": ["docs/framework"],
        "extra_files": ["docs/archive-plan.md"],
        "prefix": "framework--",
    },
    {
        "name": "manifests-small",
        "description": "ilsaux Manifests (CSVs & Text) - CHPL Sierra ILS",
        "source_dirs": ["docs/manifests"],
        "prefix": "manifests--",
        # The two biggest artifacts are split into their own gist below.
        "exclude": {"file-manifest.csv", "script-content.json"},
    },
    {
        "name": "manifests-large",
        "description": "ilsaux Large Manifests (file listing & script content) - CHPL",
        "source_dirs": ["docs/manifests"],
        "prefix": "manifests--",
        "only": {"file-manifest.csv", "script-content.json"},
    },
    {
        "name": "scripts",
        "description": "ilsaux Documentation Generator Scripts - CHPL Sierra ILS",
        "source_dirs": ["scripts"],
        "prefix": "scripts--",
    },
]
# Used in per-gist READMEs when the master TOC gist URL is not yet known.
MASTER_TOC_PLACEHOLDER = "{{MASTER_TOC_URL}}"
def run_sanitizer():
    """Invoke sanitize-for-gist.py and return its staging directory path.

    Exits the process if the sanitizer fails or the reported staging
    directory does not exist.
    """
    script_path = os.path.join(SCRIPT_DIR, "sanitize-for-gist.py")
    proc = subprocess.run(
        [sys.executable, script_path],
        capture_output=True,
        text=True,
    )
    # The sanitizer logs progress on stderr and emits the staging path on stdout.
    sys.stderr.write(proc.stderr)
    if proc.returncode != 0:
        print("ERROR: Sanitization failed.", file=sys.stderr)
        sys.exit(1)
    staging_path = proc.stdout.strip()
    if not os.path.isdir(staging_path):
        print(f"ERROR: Staging directory not found: {staging_path}", file=sys.stderr)
        sys.exit(1)
    return staging_path
def collect_group_files(group, staging_dir):
    """Gather the files belonging to one gist group.

    Returns a mapping of prefixed gist filename -> local path, honoring
    the group's optional 'only' whitelist and 'exclude' set, plus any
    individually listed 'extra_files'.
    """
    prefix = group.get("prefix", "")
    allow = group.get("only", None)
    deny = group.get("exclude", set())
    collected = {}
    for rel_dir in group.get("source_dirs", []):
        abs_dir = os.path.join(staging_dir, rel_dir)
        if not os.path.isdir(abs_dir):
            print(f" WARNING: {abs_dir} not found", file=sys.stderr)
            continue
        for entry in sorted(os.listdir(abs_dir)):
            entry_path = os.path.join(abs_dir, entry)
            if not os.path.isfile(entry_path):
                continue
            if allow is not None and entry not in allow:
                continue
            if entry in deny:
                continue
            collected[f"{prefix}{entry}"] = entry_path
    # Individually listed extras living outside the source dirs.
    for rel_file in group.get("extra_files", []):
        extra_path = os.path.join(staging_dir, rel_file)
        if os.path.isfile(extra_path):
            collected[f"{prefix}{os.path.basename(extra_path)}"] = extra_path
    return collected
def filename_to_anchor(filename):
    """Return GitHub's gist anchor slug for *filename* (format: #file-{slug}).

    Gist anchors are lowercased, dots become hyphens, and runs of hyphens
    collapse to a single one.
    """
    slug = re.sub(r"-+", "-", filename.lower().replace(".", "-")).strip("-")
    return f"#file-{slug}"
def gist_file_url(gist_url, filename):
    """Return a deep-link URL pointing at *filename* inside the gist."""
    return gist_url + filename_to_anchor(filename)
def generate_group_readme(group, files, master_toc_url=None):
    """Build the 00-README.md markdown body for one sub-gist.

    Lists every file with an in-gist anchor link and points back at the
    master TOC (or a placeholder when its URL is not yet known).
    """
    toc = master_toc_url if master_toc_url else MASTER_TOC_PLACEHOLDER
    header = [
        f"# {group['description']}",
        "",
        f"**Category:** {group['name']}",
        f"**Files:** {len(files)}",
        f"**Master Index:** [{toc}]({toc})",
        "",
        "---",
        "",
        "## Files in This Gist",
        "",
    ]
    entries = [
        f"- [{fname}]({filename_to_anchor(fname)})" for fname in sorted(files)
    ]
    # Trailing "" yields a final newline-terminated line after the join.
    return "\n".join(header + entries + [""])
def load_manifest():
    """Return the persisted gist manifest, or a fresh skeleton if absent."""
    if not os.path.exists(MANIFEST_PATH):
        return {"version": 1, "groups": {}, "old_single_gist_id": OLD_SINGLE_GIST_ID}
    with open(MANIFEST_PATH) as handle:
        return json.load(handle)
def save_manifest(manifest):
    """Persist *manifest* as pretty-printed JSON with a trailing newline."""
    with open(MANIFEST_PATH, "w") as handle:
        handle.write(json.dumps(manifest, indent=2) + "\n")
def gist_exists(gist_id):
    """Return True iff the GitHub API says gist *gist_id* exists."""
    probe = ["gh", "api", f"/gists/{gist_id}", "--silent"]
    return subprocess.run(probe, capture_output=True).returncode == 0
def create_gist(files_dict, description, dry_run=False):
    """Create a new public gist and return (gist_id, gist_url).

    Values in *files_dict* may be paths to existing files (copied as-is)
    or raw content strings (written out). Returns (None, None) when the
    gh CLI call fails; dry-run short-circuits with placeholder values.
    """
    if dry_run:
        print(f" [DRY RUN] Would create gist: {description}", file=sys.stderr)
        print(f" [DRY RUN] Files: {len(files_dict)}", file=sys.stderr)
        return "dry-run-id", "https://gist.github.com/dry-run"
    # Materialize everything in a temp dir so gh can pick up the filenames.
    import shutil
    import tempfile
    with tempfile.TemporaryDirectory() as workdir:
        staged = []
        for gist_name, value in files_dict.items():
            target = os.path.join(workdir, gist_name)
            if isinstance(value, str) and os.path.isfile(value):
                shutil.copy2(value, target)  # value is a path on disk
            else:
                with open(target, "w") as out:
                    out.write(value)  # value is literal content
            staged.append(target)
        proc = subprocess.run(
            ["gh", "gist", "create", "--public", "--desc", description] + staged,
            capture_output=True,
            text=True,
        )
    if proc.returncode != 0:
        print(f"ERROR: gh gist create failed: {proc.stderr}", file=sys.stderr)
        return None, None
    url = proc.stdout.strip()
    # The gist ID is the final path component of the returned URL.
    return url.rstrip("/").split("/")[-1], url
def update_gist(gist_id, files_dict, description=None, dry_run=False):
    """PATCH an existing gist's files (and optionally its description).

    Returns True on success, False when the API call fails. Values in
    *files_dict* may be paths (their contents are read) or raw strings.
    """
    if dry_run:
        print(f" [DRY RUN] Would update gist {gist_id}", file=sys.stderr)
        print(f" [DRY RUN] Files: {len(files_dict)}", file=sys.stderr)
        return True
    body = {"files": {}}
    if description:
        body["description"] = description
    for gist_name, value in files_dict.items():
        if isinstance(value, str) and os.path.isfile(value):
            # Path on disk: inline its contents, tolerating bad bytes.
            with open(value, "r", errors="replace") as handle:
                body["files"][gist_name] = {"content": handle.read()}
        else:
            body["files"][gist_name] = {"content": value}
    proc = subprocess.run(
        ["gh", "api", "--method", "PATCH", f"/gists/{gist_id}", "--input", "-"],
        input=json.dumps(body),
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        print(f"ERROR: Failed to update gist {gist_id}: {proc.stderr}", file=sys.stderr)
        return False
    return True
def create_or_update_gist(gist_id, files_dict, description, dry_run=False):
    """Update gist *gist_id* if it exists, otherwise create a new one.

    Returns (gist_id, gist_url, created_new). A failed update falls
    through to creating a replacement gist.
    """
    if gist_id and gist_exists(gist_id):
        if update_gist(gist_id, files_dict, description, dry_run):
            return gist_id, f"https://gist.github.com/{GITHUB_USER}/{gist_id}", False
        print(f" Update failed for {gist_id}, will recreate", file=sys.stderr)
    # Create new
    new_id, new_url = create_gist(files_dict, description, dry_run)
    return new_id, new_url, True
def generate_master_toc(group_data, staging_dir):
    """Generate the master 00-INDEX.md with real gist URLs and deep links.

    *group_data* maps group name -> {"gist_id", "gist_url", "file_count"};
    *staging_dir* must contain the sanitized docs/00-INDEX.md, whose tables
    are parsed by the _rewrite_* helpers and re-emitted with deep links
    into the published sub-gists. Returns the full markdown text.
    """
    # Read the original 00-INDEX.md as a base for content
    original_index = os.path.join(staging_dir, "docs/00-INDEX.md")
    if not os.path.exists(original_index):
        print("ERROR: docs/00-INDEX.md not found in staging", file=sys.stderr)
        sys.exit(1)
    with open(original_index) as f:
        original_content = f.read()
    # Now rewrite the index with multi-gist links
    lines = []
    lines.append("# ilsaux -- ILS Auxiliary Server Documentation")
    lines.append("")
    # NOTE(review): the counts in the next two lines are hardcoded and will
    # drift if the documentation set changes; confirm before publishing.
    lines.append("**System:** Sierra ILS report automation server at Cincinnati & Hamilton County Public Library (CHPL)")
    lines.append("**Contents:** 94 documentation files covering 51 reports, 16 Perl modules, cron framework, and migration plan")
    lines.append("**Credentials:** All sensitive values replaced with `[REDACTED-*]` markers (see bottom of this file)")
    lines.append("")
    lines.append("---")
    lines.append("")
    lines.append("## Documentation Gists")
    lines.append("")
    lines.append("This documentation is split across multiple gists to stay within GitHub's rendering limits.")
    lines.append("")
    lines.append("| Category | Files | Description | Link |")
    lines.append("|----------|-------|-------------|------|")
    group_order = ["reports", "modules", "framework", "manifests-small", "manifests-large", "scripts"]
    group_labels = {
        "reports": ("Reports", "52 report docs (51 reports + template)"),
        "modules": ("Modules", "16 Sierra:: Perl module docs"),
        "framework": ("Framework", "Cron framework, config format, archive plan"),
        "manifests-small": ("Manifests (Small)", "7 renderable CSVs and text files"),
        "manifests-large": ("Manifests (Large)", "Full file listing + script content JSON"),
        "scripts": ("Scripts", "13 Python/bash generator scripts"),
    }
    for gname in group_order:
        gd = group_data[gname]
        label, desc = group_labels[gname]
        url = gd["gist_url"]
        count = gd["file_count"]
        lines.append(f"| **{label}** | {count} | {desc} | [View gist]({url}) |")
    lines.append("")
    lines.append("---")
    lines.append("")
    # Extract and rewrite the report tables with deep links into the reports gist
    reports_url = group_data["reports"]["gist_url"]
    modules_url = group_data["modules"]["gist_url"]
    framework_url = group_data["framework"]["gist_url"]
    manifests_small_url = group_data["manifests-small"]["gist_url"]
    manifests_large_url = group_data["manifests-large"]["gist_url"]
    scripts_url = group_data["scripts"]["gist_url"]
    # Rewrite the Active Reports section with deep links
    lines.append("## Active Reports -- Quick Reference")
    lines.append("")
    lines.append("### Shelf-List Reports (6 -- HIGH PRIORITY)")
    lines.append("")
    lines.append("These are the highest-value reports, actively used for collection management.")
    lines.append("")
    lines.append("| Report | Full Name | Schedule | Last Run | Doc File |")
    lines.append("|--------|-----------|----------|----------|----------|")
    # Parse original content for report tables
    # We'll extract from the original and rewrite links
    _rewrite_report_tables(original_content, lines, reports_url)
    lines.append("")
    lines.append("---")
    lines.append("")
    # Sierra:: Modules section
    lines.append("## Sierra:: Modules (16)")
    lines.append("")
    lines.append("Custom Perl modules in `Modules/Sierra/` providing database access, location mapping, and ILS integration.")
    lines.append("")
    lines.append("| Module | Purpose | Used By | Doc File |")
    lines.append("|--------|---------|---------|----------|")
    _rewrite_module_table(original_content, lines, modules_url)
    lines.append("")
    lines.append("---")
    lines.append("")
    # Manifest files section
    lines.append("## Manifest Files (9)")
    lines.append("")
    lines.append("Machine-readable data files generated by the analysis scripts.")
    lines.append("")
    lines.append("| File | Format | Contents | Gist |")
    lines.append("|------|--------|----------|------|")
    _rewrite_manifest_table(original_content, lines, manifests_small_url, manifests_large_url)
    lines.append("")
    lines.append("---")
    lines.append("")
    # Scripts section
    lines.append("## Scripts (13)")
    lines.append("")
    lines.append("Python scripts (stdlib-only, rerunnable) that generated this documentation from the live ilsaux server.")
    lines.append("")
    _rewrite_scripts_section(original_content, lines, scripts_url)
    lines.append("")
    lines.append("---")
    lines.append("")
    # Framework section
    lines.append("## Framework & Architecture")
    lines.append("")
    lines.append("All reports follow the same execution pattern through `generic-cron.sh`:")
    lines.append("")
    lines.append("```")
    lines.append("cron schedule -> <report>-cron.sh -> generic-cron.sh -> perl ./$SOURCEFILE -> Sierra::DB -> PostgreSQL")
    lines.append("```")
    lines.append("")
    fw_cron = gist_file_url(framework_url, "framework--generic-cron-framework.md")
    fw_cfg = gist_file_url(framework_url, "framework--config-file-format.md")
    lines.append(f"See [generic-cron-framework.md]({fw_cron}) for the full execution flow.")
    lines.append(f"See [config-file-format.md]({fw_cfg}) for the Config::Simple `.cfg` credential format.")
    lines.append("")
    lines.append("---")
    lines.append("")
    # Migration section
    lines.append("## Migration & Archive Plan")
    lines.append("")
    archive_link = gist_file_url(framework_url, "framework--archive-plan.md")
    lines.append(f"See [archive-plan.md]({archive_link}) for the full classification and migration priorities.")
    lines.append("")
    lines.append("**Key numbers:**")
    lines.append("- **6** active-critical shelf-list reports (migrate first)")
    lines.append("- **16** other active reports (evaluate for migration)")
    lines.append("- **2** inactive-recent (review with stakeholders)")
    lines.append("- **27** obsolete (archive as historical record)")
    lines.append("- **49.6 GB** total server size; 65.9% is active report data")
    lines.append("- **316** credential references that need rotation before any migration")
    lines.append("")
    lines.append("---")
    lines.append("")
    # Credential safety note
    lines.append("## Credential Safety Note")
    lines.append("")
    sanitize_link = gist_file_url(scripts_url, "scripts--sanitize-for-gist.py")
    lines.append(f"All sensitive values in these gists have been replaced by the sanitizer ([sanitize-for-gist.py]({sanitize_link})):")
    lines.append("")
    lines.append("| Marker | Meaning |")
    lines.append("|--------|---------|")
    lines.append("| `[REDACTED-PASSWORD]` | Database or service password |")
    lines.append("| `[REDACTED-USER]` | Database username or service account |")
    lines.append("| `[REDACTED-HOST]` | Internal hostname or domain (*.plch.net, *.iii.com, etc.) |")
    lines.append("| `[REDACTED-EMAIL]` | Internal email address |")
    lines.append("| `[INTERNAL-HOST]` | Short internal hostname reference |")
    lines.append("")
    cred_link = gist_file_url(manifests_small_url, "manifests--credential-locations.csv")
    lines.append(f"The [credential-locations.csv]({cred_link}) file lists where credentials appear (file + line + type) but contains **no actual credential values**.")
    return "\n".join(lines) + "\n"
def _rewrite_report_tables(original, lines, reports_url):
    """Extract report table rows from *original* and append rewritten rows
    to *lines* with deep links into the published reports gist.

    Rows of the form "| name | full | schedule | last | [reports--file](...) |"
    keep their data columns but have the doc-file link rebuilt. When the
    scan crosses the "Other Active Reports" heading, the second table's
    header is emitted before its first row.
    """
    # Match table rows like: | slitemdata | Item Data ... | ... | [reports--slitemdata.md](...) |
    pattern = re.compile(
        r"^\| (\S+) \| (.+?) \| (.+?) \| (.+?) \| \[reports--(\S+?)\]\([^)]*\) \|$",
        re.MULTILINE,
    )
    found_other = False
    for m in pattern.finditer(original):
        report, full_name, schedule, last_run, filename = m.groups()
        # BUGFIX: use the captured filename instead of a literal "(unknown)"
        # placeholder so link text and anchors resolve correctly.
        gist_fname = f"reports--{filename}"
        deep_link = gist_file_url(reports_url, gist_fname)
        # Detect transition from shelf-list to other reports by peeking at
        # the text immediately preceding this row in the original index.
        preceding = original[max(0, m.start() - 200):m.start()]
        if "Other Active Reports" in preceding and not found_other:
            found_other = True
            lines.append("")
            lines.append("### Other Active Reports (16)")
            lines.append("")
            lines.append("| Report | Full Name | Schedule | Last Run | Doc File |")
            lines.append("|--------|-----------|----------|----------|----------|")
        lines.append(f"| {report} | {full_name} | {schedule} | {last_run} | [{gist_fname}]({deep_link}) |")
    # Add note about inactive reports. (A dead, self-admittedly broken
    # archive-link computation and an unused in_shelf flag were removed.)
    lines.append("")
    lines.append("**Inactive and obsolete reports** (2 inactive-recent + 27 obsolete) are classified in the Framework gist.")
def _rewrite_module_table(original, lines, modules_url):
    """Extract Sierra:: module table rows from *original* and append them to
    *lines* with doc-file links rewritten as deep links into the modules gist.
    """
    pattern = re.compile(
        r"^\| (Sierra::\S+) \| (.+?) \| (.+?) \| \[modules--(\S+?)\]\([^)]*\) \|$",
        re.MULTILINE,
    )
    for m in pattern.finditer(original):
        module, purpose, used_by, filename = m.groups()
        # BUGFIX: use the captured filename instead of a literal "(unknown)"
        # placeholder so link text and anchors resolve correctly.
        gist_fname = f"modules--{filename}"
        deep_link = gist_file_url(modules_url, gist_fname)
        lines.append(f"| {module} | {purpose} | {used_by} | [{gist_fname}]({deep_link}) |")
def _rewrite_manifest_table(original, lines, small_url, large_url):
    """Extract manifest table rows from *original* and append them to *lines*,
    routing each file's deep link to the small or large manifests gist.
    """
    pattern = re.compile(
        r"^\| \[manifests--(\S+?)\]\([^)]*\) \| (\S+) \| (.+?) \|$",
        re.MULTILINE,
    )
    # These two are split into their own gist to dodge rendering limits;
    # keep in sync with the GIST_GROUPS only/exclude sets.
    large_files = {"file-manifest.csv", "script-content.json"}
    for m in pattern.finditer(original):
        filename, fmt, contents = m.groups()
        # BUGFIX: use the captured filename instead of a literal "(unknown)"
        # placeholder so link text and anchors resolve correctly.
        gist_fname = f"manifests--{filename}"
        url = large_url if filename in large_files else small_url
        deep_link = gist_file_url(url, gist_fname)
        gist_label = "Large" if filename in large_files else "Small"
        lines.append(f"| [{gist_fname}]({deep_link}) | {fmt} | {contents} | {gist_label} |")
def _rewrite_scripts_section(original, lines, scripts_url):
    """Append the three script tables (manifest generators, documentation
    generators, utilities) to *lines*, each row deep-linking into the
    scripts gist.

    *original* is accepted for signature parity with the other _rewrite_*
    helpers but is not consulted here — the script lists are hardcoded
    below and must be kept in sync with the files under scripts/.
    """
    # Manifest generators
    lines.append("### Manifest Generators (7)")
    lines.append("")
    lines.append("| Script | Output |")
    lines.append("|--------|--------|")
    script_manifest = [
        ("manifest-tree.py", "`file-manifest.csv`, `directory-tree.txt`"),
        ("manifest-perl-deps.py", "`perl-dependencies.csv`"),
        ("manifest-script-content.py", "`script-content.json`"),
        ("manifest-cron.py", "`cron-schedule.csv`"),
        ("manifest-report-status.py", "`report-status.csv`"),
        ("manifest-git.py", "`git-summaries.csv`"),
        ("manifest-summary.py", "`summary-report.txt`"),
    ]
    for script, output in script_manifest:
        gist_fname = f"scripts--{script}"
        deep_link = gist_file_url(scripts_url, gist_fname)
        lines.append(f"| [{gist_fname}]({deep_link}) | {output} |")
    lines.append("")
    lines.append("### Documentation Generators (3)")
    lines.append("")
    lines.append("| Script | Output |")
    lines.append("|--------|--------|")
    doc_generators = [
        ("generate-report-docs.py", "51 report docs in `reports--*.md`"),
        ("generate-module-docs.py", "16 module docs in `modules--*.md`"),
        ("generate-framework-doc.py", "Framework docs"),
    ]
    for script, output in doc_generators:
        gist_fname = f"scripts--{script}"
        deep_link = gist_file_url(scripts_url, gist_fname)
        lines.append(f"| [{gist_fname}]({deep_link}) | {output} |")
    lines.append("")
    lines.append("### Utilities (3)")
    lines.append("")
    lines.append("| Script | Purpose |")
    lines.append("|--------|---------|")
    utilities = [
        ("generate-archive-plan.py", "Generates archive-plan.md (migration classification and priorities)"),
        ("sanitize-for-gist.py", "Redacts credentials and internal hostnames for safe publishing"),
        ("publish-ilsaux-gist.sh", "Original single-gist publisher (kept for reference)"),
    ]
    for script, purpose in utilities:
        gist_fname = f"scripts--{script}"
        deep_link = gist_file_url(scripts_url, gist_fname)
        lines.append(f"| [{gist_fname}]({deep_link}) | {purpose} |")
def delete_gist(gist_id, dry_run=False):
    """Delete gist *gist_id* via the gh CLI; return True on success."""
    if dry_run:
        print(f" [DRY RUN] Would delete gist {gist_id}", file=sys.stderr)
        return True
    proc = subprocess.run(
        ["gh", "gist", "delete", gist_id],
        capture_output=True,
        text=True,
    )
    if proc.returncode == 0:
        return True
    print(f"ERROR: Failed to delete gist {gist_id}: {proc.stderr}", file=sys.stderr)
    return False
def main():
    """Publish the ilsaux documentation set as a family of GitHub gists.

    Pipeline (8 phases):
      1. sanitize docs into a staging directory,
      2. partition staged files into the configured gist groups,
      3. generate a per-gist 00-README.md (master-TOC link patched later),
      4. create/update one sub-gist per group (reusing IDs from the manifest),
      5. build the master table-of-contents document,
      6. create/update the master TOC gist,
      7. back-patch each sub-gist README with the master TOC URL
         (the URL is only known after phase 6),
      8. persist gist IDs/URLs to the manifest so future runs update
         instead of creating.

    Flags: --dry-run skips all gh API calls; --delete-old removes the
    legacy single-gist publication after a successful publish.
    Exits 1 if any gist create/update fails.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Publish ilsaux docs as multiple gists")
    parser.add_argument("--dry-run", action="store_true", help="Show what would happen without API calls")
    parser.add_argument("--delete-old", action="store_true", help="Delete the old single gist after publish")
    args = parser.parse_args()
    dry_run = args.dry_run
    print("=== ilsaux Multi-Gist Publisher ===\n")
    # Phase 1: Sanitize
    print("[1/8] Sanitizing documentation ...")
    staging_dir = run_sanitizer()
    print(f"  Staging: {staging_dir}\n")
    # Phase 2: Group files
    print("[2/8] Grouping files ...")
    grouped = {}
    for group in GIST_GROUPS:
        name = group["name"]
        files = collect_group_files(group, staging_dir)
        grouped[name] = {"group": group, "files": files}
        print(f"  {name}: {len(files)} files")
    print()
    # Phase 3: Generate per-gist 00-README.md (with placeholder; the real
    # master TOC URL does not exist until phase 6).
    print("[3/8] Generating per-gist README files ...")
    for name, data in grouped.items():
        readme_content = generate_group_readme(data["group"], data["files"])
        data["files"]["00-README.md"] = readme_content
        print(f"  {name}: 00-README.md added ({len(data['files'])} total)")
    print()
    # Phase 4: Create or update sub-gists
    print("[4/8] Creating/updating sub-gists ...")
    manifest = load_manifest()
    group_data = {}
    for name, data in grouped.items():
        existing_id = manifest.get("groups", {}).get(name, {}).get("gist_id")
        desc = data["group"]["description"]
        gist_id, gist_url, created = create_or_update_gist(
            existing_id, data["files"], desc, dry_run
        )
        if not gist_id:
            print(f"ERROR: Failed to create/update gist for {name}", file=sys.stderr)
            sys.exit(1)
        action = "Created" if created else "Updated"
        print(f"  {action} {name}: {gist_url}")
        group_data[name] = {
            "gist_id": gist_id,
            "gist_url": gist_url,
            "file_count": len(data["files"]),
        }
    print()
    # Phase 5: Generate master TOC
    print("[5/8] Generating master TOC ...")
    master_toc_content = generate_master_toc(group_data, staging_dir)
    if dry_run:
        print(f"  [DRY RUN] Master TOC: {len(master_toc_content)} bytes")
    print()
    # Phase 6: Create or update master TOC gist
    print("[6/8] Creating/updating master TOC gist ...")
    master_id = manifest.get("master_toc_gist_id")
    master_files = {"00-INDEX.md": master_toc_content}
    master_desc = "ilsaux ILS Auxiliary Server - Master Documentation Index (CHPL)"
    master_gist_id, master_gist_url, master_created = create_or_update_gist(
        master_id, master_files, master_desc, dry_run
    )
    if not master_gist_id:
        print("ERROR: Failed to create/update master TOC gist", file=sys.stderr)
        sys.exit(1)
    action = "Created" if master_created else "Updated"
    print(f"  {action} master TOC: {master_gist_url}\n")
    # Phase 7: Back-patch sub-gist READMEs now that the TOC URL is known.
    print("[7/8] Back-patching sub-gist READMEs with master TOC URL ...")
    for name, data in grouped.items():
        readme = generate_group_readme(
            data["group"], data["files"], master_toc_url=master_gist_url
        )
        gist_id = group_data[name]["gist_id"]
        if dry_run:
            print(f"  [DRY RUN] Would patch {name} 00-README.md")
        else:
            ok = update_gist(gist_id, {"00-README.md": readme}, dry_run=False)
            if ok:
                print(f"  Patched {name}")
            else:
                print(f"  WARNING: Failed to patch {name} README", file=sys.stderr)
    print()
    # Phase 8: Save manifest
    print("[8/8] Saving manifest ...")
    manifest["version"] = 1
    manifest["master_toc_gist_id"] = master_gist_id
    manifest["master_toc_gist_url"] = master_gist_url
    manifest["old_single_gist_id"] = OLD_SINGLE_GIST_ID
    manifest["groups"] = group_data
    if dry_run:
        print(f"  [DRY RUN] Would save manifest to {MANIFEST_PATH}")
        print("  [DRY RUN] Manifest content:")
        print(json.dumps(manifest, indent=2))
    else:
        save_manifest(manifest)
        print(f"  Saved: {MANIFEST_PATH}")
    print()
    # Optional: delete old gist
    if args.delete_old:
        print(f"Deleting old single gist {OLD_SINGLE_GIST_ID} ...")
        if delete_gist(OLD_SINGLE_GIST_ID, dry_run):
            print("  Deleted.")
        else:
            print("  WARNING: Could not delete old gist.", file=sys.stderr)
        print()
    # Summary. FIX: iterate group_data itself (insertion order follows
    # GIST_GROUPS) instead of a hard-coded name list, which could drift
    # from the configuration and raise KeyError after a full publish.
    print("=== DONE ===")
    print(f"Master TOC: {master_gist_url}")
    for name, gd in group_data.items():
        print(f"  {name} ({gd['file_count']} files): {gd['gist_url']}")
    if not dry_run:
        print(f"\nManifest saved to: {MANIFEST_PATH}")
        print("Commit it with: git add scripts/ilsaux/gist-manifest.json")
# Script entry point: run the multi-gist publisher.
if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Sanitize ilsaux documentation for gist publishing.
Copies docs/ilsaux/ and scripts/ilsaux/ to a staging directory, then scans
and redacts credentials, internal hostnames, and other sensitive data.
Output: /tmp/ilsaux-gist/ (staging directory ready for gist publish)
"""
import os
import re
import shutil
import sys
# Root of the local workspace containing the docs and scripts to publish.
WORKSPACE = "/home/ray/claude"
# Staging area for the sanitized copy; wiped and rebuilt on every run.
STAGING_DIR = "/tmp/ilsaux-gist"
# Source directories to copy:
# (path relative to WORKSPACE, destination name under STAGING_DIR)
SOURCES = [
    ("docs/ilsaux", "docs"),
    ("scripts/ilsaux", "scripts"),
]
# --- Redaction rules ---
#
# NOTE(review): this published copy of the script has itself been run
# through its own sanitizer, so the real secret literals below now read
# as their redaction placeholders (hence the duplicate-looking entries).
# Consult the un-sanitized original before editing these rules.

# Literal strings to redact (case-insensitive matching, replaced entirely)
LITERAL_PASSWORDS = [
    "[REDACTED-PASSWORD]",
    "[REDACTED-PASSWORD]",
    "[REDACTED-PASSWORD]",
]

# DB usernames used in DBI->connect strings (not generic words)
DB_USERNAMES = [
    "[REDACTED-USER]",
    "[REDACTED-USER]",
    "[REDACTED-USER]",
    "[REDACTED-USER]",
    "[REDACTED-USER]",
    "[REDACTED-USER]",
]

# Internal hostnames / domains to redact.
# Order matters: more specific patterns first, catch-alls last.
HOST_PATTERNS = [
    (re.compile(r'sierra-train\.cincinnatilibrary\.org', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'sierra-train\.plch\.net', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'sierra-db\.plch\.net', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'cinci-db\.iii\.com', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'cinci\.iii\.com', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'partner\.iii\.com', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'host-ilsaux\.plch\.net', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'webtools\.plch\.net', re.IGNORECASE), "[REDACTED-HOST]"),
    # NOTE(review): "[INTERNAL-HOST]" here is a sanitization artifact --
    # inside a regex the brackets form a character class, not a literal
    # hostname. Restore the real host name from the un-sanitized original.
    (re.compile(r'[INTERNAL-HOST]\.plch\.net', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'www2\.plch\.net', re.IGNORECASE), "[REDACTED-HOST]"),
    # Catch-all for any remaining *.plch.net or *.iii.com subdomains
    (re.compile(r'\b[\w.-]+\.plch\.net\b', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'\b[\w.-]+\.iii\.com\b', re.IGNORECASE), "[REDACTED-HOST]"),
    (re.compile(r'\b[\w.-]+\.cincinnatilibrary\.org\b', re.IGNORECASE), "[REDACTED-HOST]"),
]

# Regex patterns for credential constructs.
# Each entry is (compiled pattern, replacement template); group \1 keeps
# the surrounding call syntax intact while the user/password arguments
# are replaced.
CREDENTIAL_REGEXES = [
    # DBI->connect with inline credentials: redact user and password args
    # Matches: DBI->connect("dsn","[REDACTED-USER]","[REDACTED-PASSWORD]", ...)
    (
        re.compile(
            r'(DBI->connect\(\s*"[^"]*"\s*,\s*)"([^"]*?)"\s*,\s*"([^"]*?)"',
            re.IGNORECASE,
        ),
        r'\1"[REDACTED-USER]","[REDACTED-PASSWORD]"',
    ),
    # $ua->credentials('host:port', 'realm', '[REDACTED-USER]', '[REDACTED-PASSWORD]')
    (
        re.compile(
            r"(\$ua->credentials\(\s*'[^']*'\s*,\s*'[^']*'\s*,\s*)'([^']*)'\s*,\s*'([^']*)'",
            re.IGNORECASE,
        ),
        r"\1'[REDACTED-USER]', '[REDACTED-PASSWORD]'",
    ),
    # my $password = "[REDACTED-PASSWORD]";
    (
        re.compile(r'(my\s+\$password\s*=\s*)"[^"]*"', re.IGNORECASE),
        r'\1"[REDACTED-PASSWORD]"',
    ),
    # my $username = "[REDACTED-USER]"; (only in credential context, not generic)
    (
        re.compile(r'(my\s+\$username\s*=\s*)"[^"]*"', re.IGNORECASE),
        r'\1"[REDACTED-USER]"',
    ),
]

# Email addresses at internal domains
EMAIL_PATTERNS = [
    (re.compile(r'\b[\w.+-]+@cincinnatilibrary\.org\b', re.IGNORECASE), "[REDACTED-EMAIL]"),
    (re.compile(r'\b[\w.+-]+@plch\.net\b', re.IGNORECASE), "[REDACTED-EMAIL]"),
]

# Hostname references that appear as just the short name (e.g., "[INTERNAL-HOST]" in print stmts)
# These are less sensitive but still internal infrastructure names
SHORT_HOST_REFS = [
    (re.compile(r'\bMain12\b', re.IGNORECASE), "[INTERNAL-HOST]"),
]

# Escaped variants for JSON content (backslash-escaped quotes).
# NOTE(review): not referenced by redact_line() in this file -- confirm
# whether it is applied elsewhere or is dead configuration.
JSON_CREDENTIAL_REGEXES = [
    # DBI->connect with escaped quotes in JSON
    (
        re.compile(
            r'(DBI->connect\(\\?"[^"\\]*(?:\\.[^"\\]*)*\\?"\s*,\s*\\?)"([^"\\]*)"(\\?\s*,\s*\\?)"([^"\\]*)"',
            re.IGNORECASE,
        ),
        r'\1"[REDACTED-USER]"\3"[REDACTED-PASSWORD]"',
    ),
]
def redact_line(line, filepath):
    """Run every redaction rule over one line of text.

    Returns ``(possibly-rewritten line, list of redaction tags)``. The
    tag list is non-empty only when the line text actually changed.
    ``filepath`` is accepted for symmetry with the caller but is not
    used here.

    Rule order is significant: emails are redacted before hostnames
    because an email address contains a domain the host rules would
    also match.
    """
    hits = []
    before = line
    # 1. Whole-string password literals (case-insensitive).
    for pw in LITERAL_PASSWORDS:
        pat = re.compile(re.escape(pw), re.IGNORECASE)
        if pat.search(line):
            line = pat.sub("[REDACTED-PASSWORD]", line)
            hits.append(f"literal-password: {pw}")
    # 2. DB usernames, matched only as quoted tokens so ordinary words
    #    are never clobbered.
    for uname in DB_USERNAMES:
        for quote in ('"', "'"):
            token = quote + uname + quote
            if token in line:
                line = line.replace(token, f'{quote}[REDACTED-USER]{quote}')
                hits.append(f"db-username: {uname}")
        # JSON-embedded variant with backslash-escaped quotes: \"user\"
        escaped_token = f'\\"{uname}\\"'
        if escaped_token in line:
            line = line.replace(escaped_token, '\\"[REDACTED-USER]\\"')
            hits.append(f"db-username-json: {uname}")
        # Windows-style service account reference (DOMAIN\\user).
        escaped_backslash = f"plchnet\\\\{uname}" if uname == "[REDACTED-USER]" else None
        if escaped_backslash and escaped_backslash in line:
            line = line.replace(escaped_backslash, "[REDACTED-USER]")
            hits.append(f"service-account: plchnet\\{uname}")
    # 3. Email addresses (must run before host patterns; see docstring).
    for pat, repl in EMAIL_PATTERNS:
        if pat.search(line):
            line = pat.sub(repl, line)
            hits.append(f"email: {pat.pattern}")
    # 4. Internal hostnames / domains.
    for pat, repl in HOST_PATTERNS:
        if pat.search(line):
            line = pat.sub(repl, line)
            hits.append(f"hostname: {pat.pattern}")
    # 5. Structured credential constructs (DBI->connect, $ua->credentials, ...).
    for pat, repl in CREDENTIAL_REGEXES:
        if pat.search(line):
            line = pat.sub(repl, line)
            hits.append(f"credential-pattern: {pat.pattern[:60]}")
    # 6. Bare internal machine names.
    for pat, repl in SHORT_HOST_REFS:
        if pat.search(line):
            line = pat.sub(repl, line)
            hits.append(f"short-host: {pat.pattern}")
    # Only report redactions when the text actually changed: placeholder
    # self-replacements above can register hits without altering the line.
    if line != before:
        return line, hits
    return line, []
def process_file(filepath):
    """Redact one file in place.

    Reads the file, applies redact_line() to every line, logs each
    redaction to stderr, and rewrites the file only when at least one
    line changed. Returns the number of *lines* that were redacted
    (not the number of individual redactions). Unreadable files are
    skipped with a warning and count as 0.
    """
    try:
        with open(filepath, "r", errors="replace") as fh:
            source_lines = fh.readlines()
    except OSError as err:
        print(f"  SKIP: {filepath}: {err}", file=sys.stderr)
        return 0
    rel_path = os.path.relpath(filepath, STAGING_DIR)
    cleaned = []
    changed_count = 0
    for lineno, text in enumerate(source_lines, 1):
        new_text, tags = redact_line(text, filepath)
        cleaned.append(new_text)
        if not tags:
            continue
        changed_count += 1
        for tag in tags:
            print(f"  REDACT {rel_path}:{lineno} [{tag}]", file=sys.stderr)
    # Rewrite only when something changed, leaving untouched files as-is.
    if changed_count > 0:
        with open(filepath, "w") as fh:
            fh.writelines(cleaned)
    return changed_count
def verify_clean(staging_dir):
    """Post-sanitization safety net: re-scan *staging_dir* for leftovers.

    Walks every file (except ``.py``/``.sh``, whose redaction-rule
    definitions and grep strings would self-match) and returns a list of
    ``"path:line: match (pattern: ...)"`` strings for anything that
    still looks sensitive. An empty list means the tree is publishable.

    NOTE(review): this published copy is itself sanitized, so several
    patterns below now read as their redaction placeholders; inside a
    regex, a bracketed placeholder like [REDACTED-PASSWORD] behaves as a
    character class -- confirm against the un-sanitized original.
    """
    # Patterns that should NOT appear in clean output
    verify_patterns = [
        re.compile(r'[REDACTED-PASSWORD]', re.IGNORECASE),
        re.compile(r'[REDACTED-PASSWORD]', re.IGNORECASE),
        re.compile(r'eS3cuRe', re.IGNORECASE),
        re.compile(r'[REDACTED-USER]', re.IGNORECASE),
        re.compile(r'sqllabels\d*', re.IGNORECASE),
        re.compile(r'[REDACTED-USER]', re.IGNORECASE),
        re.compile(r'svc_vmsp', re.IGNORECASE),
        re.compile(r'cinci-db\.iii\.com', re.IGNORECASE),
        re.compile(r'sierra-db\.plch\.net', re.IGNORECASE),
        re.compile(r'sierra-train', re.IGNORECASE),
        re.compile(r'host-ilsaux', re.IGNORECASE),
        re.compile(r'webtools\.plch\.net', re.IGNORECASE),
        re.compile(r'partner\.iii\.com', re.IGNORECASE),
        re.compile(r'cincinnatilibrary\.org', re.IGNORECASE),
        # Catch-all domain patterns
        re.compile(r'\b\w+\.plch\.net\b', re.IGNORECASE),
        re.compile(r'\b\w+\.iii\.com\b', re.IGNORECASE),
    ]
    # Skip Python/bash scripts that contain pattern definitions or grep
    # strings; residual fragments in their regex/grep literals would
    # trigger false positives here.
    skip_extensions = {".py", ".sh"}
    findings = []
    for root, _dirs, filenames in os.walk(staging_dir):
        for filename in filenames:
            if os.path.splitext(filename)[1] in skip_extensions:
                continue
            full_path = os.path.join(root, filename)
            rel = os.path.relpath(full_path, staging_dir)
            try:
                with open(full_path, "r", errors="replace") as fh:
                    for lineno, text in enumerate(fh, 1):
                        for pat in verify_patterns:
                            hit = pat.search(text)
                            if hit:
                                findings.append(
                                    f"{rel}:{lineno}: {hit.group()} (pattern: {pat.pattern})"
                                )
            except OSError:
                # Unreadable file: nothing we can verify, move on.
                continue
    return findings
def main():
    """Build a sanitized copy of the ilsaux docs under STAGING_DIR.

    Steps: recreate the staging directory from scratch, copy the source
    trees (plus the top-level overview doc when present), redact every
    staged file in place, then independently re-scan for anything the
    rules missed. Exits 1 if the verification scan still finds
    sensitive patterns; on success prints the staging path on stdout
    for consumption by calling scripts (all logging goes to stderr).
    """
    print("[sanitize-for-gist] Starting sanitization ...", file=sys.stderr)
    # Start from an empty staging tree on every run.
    if os.path.exists(STAGING_DIR):
        shutil.rmtree(STAGING_DIR)
    os.makedirs(STAGING_DIR)
    # Copy each configured source tree into staging.
    for src_rel, dst_name in SOURCES:
        src_path = os.path.join(WORKSPACE, src_rel)
        dst_path = os.path.join(STAGING_DIR, dst_name)
        if not os.path.isdir(src_path):
            print(f"  WARNING: {src_path} not found, skipping", file=sys.stderr)
            continue
        shutil.copytree(src_path, dst_path)
        print(f"  Copied {src_rel} -> {dst_name}/", file=sys.stderr)
    # The top-level overview document is optional.
    ilsaux_doc = os.path.join(WORKSPACE, "llore/ilsaux-documentation.md")
    if os.path.exists(ilsaux_doc):
        shutil.copy2(ilsaux_doc, os.path.join(STAGING_DIR, "ilsaux-documentation.md"))
        print("  Copied llore/ilsaux-documentation.md", file=sys.stderr)
    # Redact every staged file in place.
    total_files = 0
    total_redacted_lines = 0
    for root, _dirs, names in os.walk(STAGING_DIR):
        for name in names:
            total_files += 1
            total_redacted_lines += process_file(os.path.join(root, name))
    print(f"\n  Processed {total_files} files, redacted {total_redacted_lines} lines", file=sys.stderr)
    # Independent re-scan: the publish step must never see a dirty tree.
    print("\n[sanitize-for-gist] Verification scan ...", file=sys.stderr)
    leftovers = verify_clean(STAGING_DIR)
    if leftovers:
        print(f"\n  WARNING: {len(leftovers)} remaining sensitive patterns found!", file=sys.stderr)
        for item in leftovers:
            print(f"    {item}", file=sys.stderr)
        print("\n  Staging directory NOT clean. Review and update redaction rules.", file=sys.stderr)
        sys.exit(1)
    else:
        print("  CLEAN: No sensitive patterns found in staging directory.", file=sys.stderr)
    print(f"\n  Staging directory: {STAGING_DIR}", file=sys.stderr)
    # Machine-readable output for the caller: the staging path, stdout only.
    print(STAGING_DIR)
# Script entry point: run the sanitizer.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment