Created
March 11, 2026 01:11
-
-
Save marpe/3fe8a8d9d2780a803658611aa3f4af69 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Forensic Files supercut generator. | |
| Step 1a: Extract embedded English SRT subtitles from MKV files (ffmpeg) | |
| Step 1b: Download missing English SRT subtitles via subliminal | |
| Step 2: Find timestamps for target phrases | |
| Step 3: Cut clips with ffmpeg and concatenate into a supercut | |
| Usage: | |
| python forensic_supercut.py # full pipeline | |
| python forensic_supercut.py --skip-download # skip subliminal, search existing SRTs only | |
| python forensic_supercut.py --search-only # only search + cut (no subtitle fetching) | |
| """ | |
| import sys | |
| import re | |
| import subprocess | |
| import json | |
| import argparse | |
| from pathlib import Path | |
# Force unbuffered output for real-time progress
# line_buffering=True flushes after every newline, so progress lines show up
# immediately even when output is piped/redirected (TextIOWrapper.reconfigure
# is Python 3.7+).
sys.stdout.reconfigure(line_buffering=True)
sys.stderr.reconfigure(line_buffering=True)
# ── Configuration ─────────────────────────────────────────────────────────────
# Phrases to hunt for in subtitle text. Matching is done against the
# lowercased caption (see find_occurrences), so keep these lowercase.
SEARCH_TERMS = [
    "mitochondrial dna"
]
PAD_BEFORE = 0.5 # seconds before phrase start
PAD_AFTER = 0.5 # seconds after phrase end
# All output lands under ./forensic_supercut/
OUTPUT_DIR = Path("forensic_supercut")
OUTPUT_FILE = OUTPUT_DIR / "supercut.mp4"
CLIPS_DIR = OUTPUT_DIR / "clips"
# Path to the subliminal CLI installed via per-user pip on Windows.
# NOTE(review): hard-coded to Python 3.11 on Windows — adjust for other machines.
SUBLIMINAL = str(Path.home() / "AppData/Roaming/Python/Python311/Scripts/subliminal.exe")
# Season directories to scan, relative to the current working directory.
FORENSIC_DIRS = [
    "Forensic Files - Season 01",
    "Forensic Files - Season 02",
    "Forensic Files - Season 03",
    "Forensic Files - Season 04",
    "Forensic Files - Season 05",
    "Forensic Files - Season 06",
    "Forensic Files - Season 07",
    "Forensic Files - Season 08",
    "Forensic Files - Season 09",
    "Forensic Files - Season 10",
    "Forensic Files - Season 11",
    "Forensic Files - Season 12",
    "Forensic Files - Season 13",
    "Forensic Files - Season 14",
    "Forensic Files - Specials",
]
# Container extensions treated as episode videos (compared lowercased).
VIDEO_EXTS = {".avi", ".mp4", ".mkv", ".m4v"}
| # ── Helpers ─────────────────────────────────────────────────────────────────── | |
def srt_time_to_seconds(t: str) -> float:
    """Convert an SRT timestamp to float seconds.

    Accepts ``HH:MM:SS,mmm`` (SRT) or ``HH:MM:SS.mmm`` (WebVTT-style), and
    tolerates fractional parts that are not exactly three digits — e.g.
    ``"00:00:01.5"`` now yields 1.5s, where the previous int-based parse
    produced 1.005s.

    Args:
        t: Timestamp string such as ``"01:02:03,450"``.

    Returns:
        Total seconds as a float (e.g. ``3723.45``).

    Raises:
        ValueError: If the string does not have three colon-separated fields
            or the fields are not numeric.
    """
    h, m, s = t.replace(",", ".").split(":")
    return int(h) * 3600 + int(m) * 60 + float(s)
# Matches "HH:MM:SS,mmm --> HH:MM:SS,mmm" (comma or dot before the millis).
_SRT_TIMECODE_RE = re.compile(
    r"(\d{2}:\d{2}:\d{2}[,\.]\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}[,\.]\d{3})"
)


def parse_srt(srt_path: Path):
    """Yield ``(start_sec, end_sec, caption)`` for each cue in an SRT file.

    Improvements over the fixed-layout parser: the timecode line is located
    by pattern on the first or second line of each cue, so cues that omit
    the numeric index line (common in extracted/converted subtitles) are no
    longer silently dropped. Undecodable bytes are replaced rather than
    raising.

    Args:
        srt_path: Path to the subtitle file.

    Yields:
        Tuples of (start seconds, end seconds, caption text with the cue's
        lines joined by single spaces). Cues with no caption text are skipped.
    """
    text = srt_path.read_text(encoding="utf-8", errors="replace")
    # Cues are separated by one or more blank lines.
    for block in re.split(r"\n\s*\n", text.strip()):
        lines = block.strip().splitlines()
        # The timecode is normally on line 1 (after the index number), but
        # falls on line 0 when the index is missing.
        for tc_idx, line in enumerate(lines[:2]):
            m = _SRT_TIMECODE_RE.match(line)
            if m:
                break
        else:
            continue  # malformed cue: no timecode found
        caption = " ".join(lines[tc_idx + 1:]).strip()
        if not caption:
            continue  # timecode with no text — nothing to search
        start = srt_time_to_seconds(m.group(1))
        end = srt_time_to_seconds(m.group(2))
        yield start, end, caption
def find_video_files() -> list[Path]:
    """Collect every video file under the configured season directories.

    Directories that do not exist are reported with a warning and skipped.
    Each directory is walked recursively; files whose lowercased extension
    is in VIDEO_EXTS are returned in per-directory sorted order.
    """
    root = Path(".")
    found: list[Path] = []
    for dir_name in FORENSIC_DIRS:
        season_dir = root / dir_name
        if not season_dir.exists():
            print(f" [WARN] Directory not found: {dir_name}")
            continue
        found.extend(
            entry for entry in sorted(season_dir.rglob("*"))
            if entry.suffix.lower() in VIDEO_EXTS
        )
    return found
| def srt_for_video(video: Path) -> Path | None: | |
| """Return the .en.srt or .srt path next to a video if it exists.""" | |
| for suffix in [".en.srt", ".srt"]: | |
| candidate = video.with_suffix(suffix) | |
| if candidate.exists(): | |
| return candidate | |
| return None | |
def has_embedded_english_sub(video: Path) -> bool:
    """Check whether an MKV has an embedded English subtitle stream.

    Only ``.mkv`` containers are probed (via ffprobe's JSON stream dump).
    A subtitle stream tagged ``eng``/``en`` — or carrying no language tag
    at all — counts as English. Any probe or parse failure yields False.
    """
    if video.suffix.lower() != ".mkv":
        return False
    probe = subprocess.run(
        ["ffprobe", "-v", "quiet", "-print_format", "json", "-show_streams", str(video)],
        capture_output=True, text=True,
    )
    if probe.returncode != 0:
        return False
    try:
        stream_list = json.loads(probe.stdout).get("streams", [])
        return any(
            entry.get("codec_type") == "subtitle"
            and entry.get("tags", {}).get("language", "") in ("eng", "en", "")
            for entry in stream_list
        )
    except Exception:
        return False
| # ── Step 1a: Extract embedded subtitles ─────────────────────────────────────── | |
| def extract_embedded_subtitles(videos: list[Path]) -> int: | |
| """Extract embedded English subs from MKV files that lack an external SRT.""" | |
| mkv_missing = [ | |
| v for v in videos | |
| if v.suffix.lower() == ".mkv" and srt_for_video(v) is None | |
| ] | |
| if not mkv_missing: | |
| return 0 | |
| print(f"\n Extracting embedded subtitles from {len(mkv_missing)} MKV(s)...") | |
| extracted = 0 | |
| for video in mkv_missing: | |
| out_srt = video.with_suffix(".en.srt") | |
| # Find first subtitle stream index | |
| probe = subprocess.run( | |
| ["ffprobe", "-v", "quiet", "-print_format", "json", "-show_streams", str(video)], | |
| capture_output=True, text=True, | |
| ) | |
| if probe.returncode != 0: | |
| continue | |
| try: | |
| streams = json.loads(probe.stdout).get("streams", []) | |
| except Exception: | |
| continue | |
| sub_idx = None | |
| for s in streams: | |
| if s.get("codec_type") == "subtitle": | |
| lang = s.get("tags", {}).get("language", "") | |
| if lang in ("eng", "en", ""): | |
| sub_idx = s["index"] | |
| break | |
| if sub_idx is None: | |
| continue | |
| result = subprocess.run( | |
| ["ffmpeg", "-y", "-i", str(video), | |
| "-map", f"0:{sub_idx}", | |
| "-c:s", "srt", | |
| str(out_srt)], | |
| capture_output=True, text=True, | |
| ) | |
| if result.returncode == 0 and out_srt.exists(): | |
| print(f" Extracted: {video.name}") | |
| extracted += 1 | |
| else: | |
| print(f" [WARN] Could not extract from {video.name}") | |
| print(f" Extracted {extracted}/{len(mkv_missing)} embedded subtitle(s)") | |
| return extracted | |
| # ── Step 1b: Download subtitles via subliminal ──────────────────────────────── | |
def download_subtitles(videos: list[Path]):
    """Fetch missing English subtitles with the subliminal CLI.

    Episodes that still lack a sidecar SRT are grouped by parent directory
    so subliminal runs once per season folder; after each run the number of
    newly present SRTs is reported.
    """
    missing = [v for v in videos if srt_for_video(v) is None]
    print(f" {len(videos) - len(missing)} already have subtitles, {len(missing)} need downloading\n")
    if not missing:
        return
    # Group by parent directory so subliminal is invoked per folder.
    by_dir: dict[Path, list[Path]] = {}
    for video in missing:
        by_dir.setdefault(video.parent, []).append(video)
    total_dirs = len(by_dir)
    for idx, (directory, episodes) in enumerate(sorted(by_dir.items()), 1):
        print(f" [{idx}/{total_dirs}] {directory.name} ({len(episodes)} without sub)")
        subprocess.run(
            [
                SUBLIMINAL, "download",
                "-l", "en",
                "-p", "podnapisi",
                "-p", "opensubtitles",
                "-p", "tvsubtitles",
                "--min-score", "25",
                "-w", "4",
                str(directory),
            ],
            text=True,
        )
        fetched = sum(1 for v in episodes if srt_for_video(v) is not None)
        print(f" -> Downloaded {fetched}/{len(episodes)} subtitles")
| # ── Step 2: Find occurrences ────────────────────────────────────────────────── | |
def find_occurrences(videos: list[Path]) -> list[dict]:
    """Scan every available subtitle for the configured search phrases.

    Each matching cue becomes a hit dict carrying the padded cut window
    (start/end with PAD_BEFORE/PAD_AFTER applied, start clamped at 0) plus
    the raw cue times, term, and caption. The raw hit list is saved to
    occurrences.json before overlapping hits are merged via
    deduplicate_hits; the merged clip list is returned.
    """
    print(f"\n{'='*60}")
    print("STEP 2: Searching subtitles for target phrases")
    print(f"{'='*60}")
    print(f" Terms: {SEARCH_TERMS}")
    hits: list[dict] = []
    without_sub = 0
    episodes_searched = 0
    for video in videos:
        srt = srt_for_video(video)
        if srt is None:
            without_sub += 1
            continue
        episodes_searched += 1
        for start, end, caption in parse_srt(srt):
            caption_lower = caption.lower()
            for term in SEARCH_TERMS:
                if term not in caption_lower:
                    continue
                hits.append({
                    "video": str(video),
                    "term": term,
                    "start": max(0.0, start - PAD_BEFORE),
                    "end": end + PAD_AFTER,
                    "caption": caption,
                    "raw_start": start,
                    "raw_end": end,
                })
                print(f" FOUND [{term}] {video.name}")
                print(f" {start:.1f}–{end:.1f}s: {caption!r}")
    print(f"\n Searched {episodes_searched} episodes, {without_sub} had no subtitle")
    print(f" Found {len(hits)} occurrence(s) total\n")
    OUTPUT_DIR.mkdir(exist_ok=True)
    manifest = OUTPUT_DIR / "occurrences.json"
    manifest.write_text(json.dumps(hits, indent=2))
    print(f" Manifest saved -> {manifest}")
    # Merge overlapping/nearby hits from the same episode before cutting.
    deduped = deduplicate_hits(hits)
    print(f" After dedup: {len(deduped)} clip(s) to cut")
    return deduped
def deduplicate_hits(hits: list[dict]) -> list[dict]:
    """Collapse hits on the same video whose spans overlap or nearly touch.

    Hits from one video are sorted by start time; whenever a hit begins
    within one second of the previous span's end, the two are folded into a
    single clip — the span is extended, the term appended (unless already
    present in the running term string), and the captions joined with
    ``" | "``. Input dicts are not mutated; the result is sorted by
    (video, start).
    """
    grouped: dict[str, list[dict]] = {}
    for hit in hits:
        grouped.setdefault(hit["video"], []).append(hit)
    collapsed: list[dict] = []
    for clips in grouped.values():
        ordered = sorted(clips, key=lambda c: c["start"])
        current = ordered[0].copy()
        for clip in ordered[1:]:
            if clip["start"] <= current["end"] + 1.0:
                # Close enough: fold this hit into the running clip.
                current["end"] = max(current["end"], clip["end"])
                current["raw_end"] = max(current["raw_end"], clip["raw_end"])
                if clip["term"] not in current["term"]:
                    current["term"] = f"{current['term']} + {clip['term']}"
                current["caption"] = current["caption"] + " | " + clip["caption"]
            else:
                collapsed.append(current)
                current = clip.copy()
        collapsed.append(current)
    collapsed.sort(key=lambda c: (c["video"], c["start"]))
    return collapsed
| # ── Step 3: Cut clips ───────────────────────────────────────────────────────── | |
def cut_clips(hits: list[dict]) -> list[Path]:
    """Re-encode one normalized clip per hit and return the clip paths.

    Clips that already exist on disk are kept as-is (resume support). Each
    cut is re-encoded to 1280x720 H.264 with loudness-normalized stereo AAC
    so every clip has identical stream parameters for the concat step.
    Failed ffmpeg runs are reported and omitted from the returned list.
    """
    print(f"\n{'='*60}")
    print(f"STEP 3: Cutting {len(hits)} clips with ffmpeg")
    print(f"{'='*60}")
    CLIPS_DIR.mkdir(parents=True, exist_ok=True)
    produced: list[Path] = []
    total = len(hits)
    for idx, hit in enumerate(hits):
        source = Path(hit["video"])
        start = hit["start"]
        duration = hit["end"] - hit["start"]
        slug = hit["term"].replace(" ", "_")
        clip_path = CLIPS_DIR / f"{idx:04d}_{slug}_{source.stem[:40]}.mp4"
        if clip_path.exists():
            print(f" [{idx+1}/{total}] Skip (exists): {clip_path.name}")
            produced.append(clip_path)
            continue
        print(f" [{idx+1}/{total}] {source.name} @{start:.1f}s dur={duration:.1f}s")
        print(f" -> {clip_path.name}")
        result = subprocess.run(
            [
                "ffmpeg", "-y",
                "-ss", str(start),
                "-i", str(source),
                "-t", str(duration),
                # Normalize to 1280×720 H.264/AAC so concat is lossless
                "-c:v", "libx264", "-preset", "fast", "-crf", "20",
                "-vf", "scale=1280:720:force_original_aspect_ratio=decrease,"
                       "pad=1280:720:(ow-iw)/2:(oh-ih)/2,setsar=1",
                "-af", "loudnorm=I=-16:TP=-1.5:LRA=11",
                "-c:a", "aac", "-b:a", "128k", "-ar", "44100", "-ac", "2",
                str(clip_path),
            ],
            capture_output=True, text=True,
        )
        if result.returncode == 0:
            produced.append(clip_path)
        else:
            print(f" [ERROR] ffmpeg:\n{result.stderr[-400:]}")
    return produced
| # ── Step 4: Concatenate ─────────────────────────────────────────────────────── | |
def concatenate_clips(clip_paths: list[Path]):
    """Join the normalized clips into a single supercut file.

    Uses ffmpeg's concat *filter* (not the concat demuxer) because it
    properly resets timestamps across all input streams. A concat-demuxer
    style list is also written next to the output purely for reference.
    On success the final file size is reported; on failure the tail of
    ffmpeg's stderr is printed.
    """
    print(f"\n{'='*60}")
    print(f"STEP 4: Concatenating {len(clip_paths)} clips -> {OUTPUT_FILE}")
    print(f"{'='*60}")
    count = len(clip_paths)
    input_args: list[str] = []
    for clip in clip_paths:
        input_args.extend(["-i", str(clip)])
    # Build "[0:v][0:a][1:v][1:a]...concat=n=N:v=1:a=1[vout][aout]".
    pair_labels = "".join(f"[{i}:v][{i}:a]" for i in range(count))
    graph = f"{pair_labels}concat=n={count}:v=1:a=1[vout][aout]"
    cmd = [
        "ffmpeg", "-y",
        *input_args,
        "-filter_complex", graph,
        "-map", "[vout]", "-map", "[aout]",
        "-c:v", "libx264", "-preset", "fast", "-crf", "20",
        "-c:a", "aac", "-b:a", "128k",
        "-movflags", "+faststart",
        str(OUTPUT_FILE),
    ]
    # Write concat list for reference (not used by ffmpeg here)
    concat_list = OUTPUT_DIR / "concat.txt"
    concat_list.write_text("".join(f"file '{clip.resolve()}'\n" for clip in clip_paths))
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        print(f" [ERROR] concat failed:\n{result.stderr[-500:]}")
    else:
        size_mb = OUTPUT_FILE.stat().st_size / 1_048_576
        print(f" Done! {OUTPUT_FILE} ({size_mb:.1f} MB)")
| # ── Main ────────────────────────────────────────────────────────────────────── | |
def main():
    """Command-line entry point: parse flags and run the selected stages."""
    parser = argparse.ArgumentParser(description="Forensic Files supercut generator")
    parser.add_argument("--skip-download", action="store_true",
                        help="Skip subliminal download step, only search existing SRTs")
    parser.add_argument("--search-only", action="store_true",
                        help="Only search + cut; skip all subtitle fetching")
    parser.add_argument("--no-merge", action="store_true",
                        help="Search and cut clips but stop before merging into supercut")
    parser.add_argument("--merge-only", action="store_true",
                        help="Merge whatever clips are currently in the clips folder, skipping search/cut")
    args = parser.parse_args()

    print("Forensic Files Supercut Generator")
    print(f"Terms: {SEARCH_TERMS}")
    print(f"Padding: -{PAD_BEFORE}s / +{PAD_AFTER}s\n")

    # --merge-only bypasses subtitle fetching, searching, and cutting.
    if args.merge_only:
        existing_clips = sorted(CLIPS_DIR.glob("*.mp4"))
        if not existing_clips:
            print(f"No clips found in {CLIPS_DIR}")
            sys.exit(1)
        print(f"Merging {len(existing_clips)} clips from {CLIPS_DIR}")
        concatenate_clips(existing_clips)
        print(f"\nAll done! {len(existing_clips)} clips -> {OUTPUT_FILE}")
        return

    videos = find_video_files()
    print(f"Found {len(videos)} video files across all Forensic Files directories")

    if not args.search_only:
        print(f"\n{'='*60}")
        print("STEP 1: Obtaining subtitles")
        print(f"{'='*60}")
        # Embedded MKV subtitles are free and fast, so harvest those first.
        extract_embedded_subtitles(videos)
        if args.skip_download:
            still_missing = sum(1 for v in videos if srt_for_video(v) is None)
            print(f" [skip-download] {still_missing} episodes still have no subtitle")
        else:
            # Fall back to subliminal for anything still missing.
            download_subtitles(videos)

    hits = find_occurrences(videos)
    if not hits:
        print("No occurrences found — check that subtitles were downloaded.")
        sys.exit(0)

    clips = cut_clips(hits)
    if not clips:
        print("No clips were produced.")
        sys.exit(1)

    if args.no_merge:
        print(f"\n{len(clips)} clips saved to {CLIPS_DIR}")
        print("Review and remove unwanted clips, then run with --merge-only to merge.")
        return

    concatenate_clips(clips)
    print(f"\nAll done! {len(clips)} clips -> {OUTPUT_FILE}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment