@nibzard · Last active May 31, 2025 20:31
Bash script to convert audio/video (including existing .wav) into 16 kHz mono WAV (suffixing the intermediate with `_converted` so the original is never overwritten), then transcribe with whisper.cpp and optionally generate a Markdown description via the llm CLI.
#!/usr/bin/env bash
set -euo pipefail
#########################################################################
# scribe
#
#   1) Converts any audio/video file (mp4, m4a, wav, etc.) → 16 kHz mono WAV
#   2) Uses whisper.cpp (via whisper-cli) to produce a text transcript
#   3) If a prompt is found (either a user-passed prompt file OR a “prompt.md”
#      in the same folder), pipes that transcript into llm to generate a
#      Markdown description.
#   4) Cleans up the intermediate WAV, leaving only:
#        • original media file
#        • <basename>.md              (the transcript)
#        • <basename>_description.md  (only if a prompt is used)
#
# ────────────────────────────────────────────────────────────────────────
# PREREQUISITES:
#   • ffmpeg      (e.g. brew install ffmpeg, or download from https://ffmpeg.org)
#   • whisper-cli (build from https://github.com/ggerganov/whisper.cpp)
#   • llm CLI     (https://github.com/simonw/llm; e.g. “pip install llm” or “brew install llm”)
#     – After installing llm, set an API key and a default model. For example:
#         llm keys set openai
#         llm models default gpt-4o-mini
#     – If you want to force a specific model for the description, edit generate_description()
#       (see the “-m <model-name>” example below).
#
# CONFIG (edit these to match your machine):
#   FFMPEG_PATH  — path to ffmpeg binary (e.g. /opt/homebrew/bin/ffmpeg)
#   WHISPER_PATH — path to whisper-cli binary (e.g. /Users/nikola/dev/whisper.cpp/build/bin/whisper-cli)
#   MODELS_DIR   — directory containing whisper.cpp model files (e.g. “…/whisper.cpp/models”)
#
# USAGE:
#   1) Single file, no explicit prompt given → looks for “prompt.md” next to that file:
#        ./scribe lecture.mp4
#      → Produces:
#        lecture.wav             (intermediate, then deleted)
#        lecture.md              (transcript)
#        lecture_description.md  (only if “prompt.md” existed in lecture.mp4’s folder)
#
#   2) Single file + custom prompt:
#        ./scribe podcast.wav youtube-description.md
#      → Produces:
#        podcast_converted.wav   (intermediate, then deleted)
#        podcast.md              (transcript)
#        podcast_description.md  (based on youtube-description.md)
#
#   3) Directory of files:
#        ./scribe "/Users/you/AudioFolder"                 (uses prompt.md if present in that folder)
#      OR:
#        ./scribe "/Users/you/AudioFolder" blog-prompt.md
#      → Loops through all top-level *.mp4, *.m4a, *.wav in that folder.
#        For each file:
#          • Converts    → <basename>.wav (or <basename>_converted.wav for .wav inputs)
#          • Transcribes → <basename>.md
#          • If “prompt.md” (or user-passed prompt) exists → <basename>_description.md
#          • Deletes the intermediate WAV
#
# EXAMPLES:
#   • Transcribe & (if prompt.md exists) describe one file:
#       ./scribe Andrija.WAV
#   • Transcribe & describe using a different prompt:
#       ./scribe Andrija.WAV youtube-description.md
#   • Batch mode (uses prompt.md if present in folder):
#       ./scribe "/Users/nikola/Documents/AudioNotes"
#
# NOTES:
#   – Whisper always needs a 16 kHz mono WAV → when the input is already a .wav,
#     we avoid clobbering it by writing the conversion to <basename>_converted.wav.
#     After transcription, we clean that up.
#   – The transcript is written as Markdown (.md). You can rename or edit as needed.
#   – Description files always end in “_description.md”.
#   – If you want a specific LLM model for the description, edit generate_description() to:
#       llm -m "<your-model-here>" -s "$prompt"
# ────────────────────────────────────────────────────────────────────────
# ────────────────────────────── CONFIG ──────────────────────────────
FFMPEG_PATH="/opt/homebrew/bin/ffmpeg" # ← adjust if needed
WHISPER_PATH="/Users/nikola/dev/whisper.cpp/build/bin/whisper-cli" # ← adjust
MODELS_DIR="/Users/nikola/dev/whisper.cpp/models" # ← adjust
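# Optional fail-fast guard (a minimal sketch; adjust or remove to taste):
# verifies the CONFIG paths above before any work starts, instead of failing
# mid-batch. Call check_prereqs near the top of MAIN if you want it enforced.
check_prereqs() {
  local ok=0
  for tool in "$FFMPEG_PATH" "$WHISPER_PATH"; do
    if [[ ! -x "$tool" ]]; then
      echo "Error: required binary not found or not executable: $tool" >&2
      ok=1
    fi
  done
  if [[ ! -f "${MODELS_DIR}/ggml-medium.bin" ]]; then
    echo "Error: whisper model not found: ${MODELS_DIR}/ggml-medium.bin" >&2
    ok=1
  fi
  return "$ok"
}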
# ────────────────────────────── FUNCTIONS ──────────────────────────────

# convert():
#   • Input: any media file (mp4, m4a, wav, etc.)
#   • Output (echoed): path to a 16 kHz mono WAV
#       – If input extension ≠ .wav → “<basename>.wav”
#       – If input extension = .wav → “<basename>_converted.wav” (to avoid overwriting)
#   • Caller is responsible for deleting the returned WAV.
convert() {
  local input="$1"
  local ext="${input##*.}"
  local base="${input%.*}"
  local output_wav
  # ${ext,,} needs bash ≥ 4; use tr so the stock macOS bash 3.2 also works.
  ext="$(printf '%s' "$ext" | tr '[:upper:]' '[:lower:]')"
  if [[ "$ext" == "wav" ]]; then
    output_wav="${base}_converted.wav"
  else
    output_wav="${base}.wav"
  fi
  echo "Converting → ${output_wav} (16 kHz mono) ..." >&2
  "$FFMPEG_PATH" -y -i "$input" -ar 16000 -ac 1 -c:a pcm_s16le "$output_wav"
  echo "$output_wav"
}
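# Example of convert()'s naming rule (hypothetical filenames), runnable on its
# own without ffmpeg — a .wav input gets a “_converted” suffix, anything else
# keeps its basename:
#
#   for f in talk.m4a notes.WAV; do
#     ext="$(printf '%s' "${f##*.}" | tr '[:upper:]' '[:lower:]')"
#     base="${f%.*}"
#     if [[ "$ext" == "wav" ]]; then
#       echo "$f -> ${base}_converted.wav"
#     else
#       echo "$f -> ${base}.wav"
#     fi
#   done
#
#   # talk.m4a  -> talk.wav
#   # notes.WAV -> notes_converted.wav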
# transcribe():
#   • Inputs:
#       1) wav_path  → the 16 kHz mono WAV from convert()
#       2) orig_file → the original media path (so we name the transcript correctly)
#   • Output: writes “<orig-basename>.txt” via whisper-cli, then renames it to “.md”
#   • Returns: the final transcript path (“<orig-basename>.md”)
transcribe() {
  local wav_path="$1"
  local orig_file="$2"
  local orig_base="${orig_file%.*}"
  local txt_path="${orig_base}.txt"
  local md_path="${orig_base}.md"
  echo "Transcribing → ${txt_path} ..." >&2
  "$WHISPER_PATH" \
    -m "${MODELS_DIR}/ggml-medium.bin" \
    -l en \
    -f "$wav_path" \
    -otxt \
    -of "$orig_base"
  if [[ ! -f "$txt_path" ]]; then
    echo "Error: Whisper did not write ${txt_path}" >&2
    exit 1
  fi
  mv "$txt_path" "$md_path"
  echo "Renamed transcript → ${md_path}" >&2
  echo "$md_path"
}
# generate_description():
#   • Inputs:
#       1) prompt_file → a text file containing the LLM prompt
#       2) orig_file   → the original media path (so the transcript is “<orig-basename>.md”)
#   • Output: "<orig-basename>_description.md"
#   • Uses: the llm CLI, feeding the transcript on stdin with the prompt as system prompt (-s)
generate_description() {
  local prompt_file="$1"
  local orig_file="$2"
  local transcript_md="${orig_file%.*}.md"
  local output_md="${orig_file%.*}_description.md"
  if [[ ! -f "$prompt_file" ]]; then
    echo "Warning: Prompt file “$prompt_file” not found. Skipping description." >&2
    return 0
  fi
  if [[ ! -f "$transcript_md" ]]; then
    echo "Error: Transcript “${transcript_md}” not found; cannot generate description." >&2
    return 1
  fi
  echo "Generating description → ${output_md} ..." >&2
  local prompt
  prompt=$(<"$prompt_file")
  llm -s "$prompt" < "$transcript_md" > "$output_md"
  echo "Wrote description → ${output_md}" >&2
}
# find_default_prompt():
#   • Given an orig_file (absolute path), looks in the same folder for “prompt.md”.
#   • If found, echoes that path; otherwise, echoes an empty string.
find_default_prompt() {
  local dir
  dir="$(dirname "$1")"
  if [[ -f "${dir}/prompt.md" ]]; then
    echo "${dir}/prompt.md"
  else
    echo ""
  fi
}
# process_file():
#   • Inputs:
#       1) file_path       → absolute path to media (mp4/m4a/wav)
#       2) optional_prompt → explicit prompt file (can be empty string)
#   • Workflow:
#       a) convert()              → get 16 kHz WAV
#       b) transcribe()           → write "<basename>.md"
#       c) decide prompt          → explicit or default “prompt.md”
#       d) generate_description() if a prompt was chosen
#       e) delete the intermediate WAV
process_file() {
  local abs_file
  abs_file="$(realpath "$1")"
  local explicit_prompt="$2"
  # Step a: convert
  local wav_path
  wav_path="$(convert "$abs_file")" || exit 1
  # Step b: transcribe
  transcribe "$wav_path" "$abs_file"
  # Step c: choose prompt
  local prompt_to_use=""
  if [[ -n "$explicit_prompt" && -f "$explicit_prompt" ]]; then
    prompt_to_use="$explicit_prompt"
  fi
  if [[ -z "$prompt_to_use" ]]; then
    prompt_to_use="$(find_default_prompt "$abs_file")"
  fi
  # Step d: generate description (if we have a prompt)
  if [[ -n "$prompt_to_use" ]]; then
    generate_description "$prompt_to_use" "$abs_file"
  else
    echo "No prompt found for “$abs_file” → skipping description." >&2
  fi
  # Step e: clean up WAV
  if [[ -f "$wav_path" ]]; then
    rm "$wav_path"
    echo "Deleted intermediate WAV: $wav_path" >&2
  fi
}
# ─────────────────────────────── MAIN ───────────────────────────────
if (( $# < 1 )); then
  cat <<EOF
Usage: $(basename "$0") <file-or-directory> [prompt_file]

  <file-or-directory> :
    • If you supply a single file (mp4/m4a/wav), scribe processes that one file.
    • If you supply a directory, scribe finds all top-level *.mp4, *.m4a, *.wav
      (case-insensitive) in it and processes each in turn.

  [prompt_file] :
    • (Optional) Path to a custom prompt (e.g. “youtube-description.md”).
      If given, that file is always used for description.
    • Otherwise, scribe looks for “prompt.md” in the same folder as each input.

Examples:
  1) Transcribe “lecture.mp4” and, if “prompt.md” exists next to it, also make a description:
       ./scribe lecture.mp4
  2) Transcribe “podcast.wav” and use “youtube-description.md” for the description:
       ./scribe podcast.wav youtube-description.md
  3) Batch mode on a directory (uses prompt.md if present):
       ./scribe "/Users/you/AudioFolder"
  4) Batch mode on a directory but force “blog-prompt.md” for every file:
       ./scribe "/Users/you/AudioFolder" blog-prompt.md
EOF
  exit 1
fi
input="$1"
prompt_arg="${2:-}"
if [[ -d "$input" ]]; then
  # Directory mode: loop through top-level mp4/m4a/wav only
  while IFS= read -r -d '' media; do
    echo "===============================" >&2
    echo "Processing: $media" >&2
    process_file "$media" "$prompt_arg"
    echo "===============================" >&2
  done < <(find "$input" -maxdepth 1 -type f \( -iname '*.mp4' -o -iname '*.m4a' -o -iname '*.wav' \) -print0)
elif [[ -f "$input" ]]; then
  echo "===============================" >&2
  echo "Processing single file: $input" >&2
  process_file "$input" "$prompt_arg"
  echo "===============================" >&2
else
  echo "Error: '$input' is not a valid file or directory." >&2
  exit 1
fi
echo "All done!" >&2