Skip to content

Instantly share code, notes, and snippets.

@hexawulf
Last active March 1, 2026 06:20
Show Gist options
  • Select an option

  • Save hexawulf/7ad3b9ad7dc04e93f4767e55c2663287 to your computer and use it in GitHub Desktop.

Select an option

Save hexawulf/7ad3b9ad7dc04e93f4767e55c2663287 to your computer and use it in GitHub Desktop.
Interactive MarkItDown batch converter (PDF/DOCX/etc -> Markdown)
#!/bin/bash
# Author: 0xWulf
# Email: 0xwulf@proton.me
# Desc.: Interactive batch converter (MarkItDown) — converts a selected file type in a directory to Markdown.
# Create Date: 2026-03-01
# Modified Date: 2026-03-01
set -euo pipefail
# ---------- helpers ----------
# Print all arguments as a single line on stdout.
# Arguments are joined with the first character of IFS (a space by default).
say() {
  local line="$*"
  /usr/bin/printf "%s\n" "$line"
}
# Report a fatal error and stop.
# Writes "ERROR: <arguments>" to stderr, then exits the current shell with status 1.
die() {
  local reason="$*"
  /usr/bin/printf "ERROR: %s\n" "$reason" >&2
  exit 1
}
# Abort (via die) unless the given command is available.
# Arguments: $1 - a command name to resolve through PATH, or an absolute path
#                 to an executable file.
# Returns:   0 when the command is usable; otherwise die terminates the shell.
need_cmd() {
  local cmd="$1"
  if [[ "$cmd" == /* ]]; then
    # Absolute path: it must be an executable regular file (a directory also
    # passes a bare -x test, so check -f too).
    [[ -f "$cmd" && -x "$cmd" ]] || die "Missing required command: $cmd"
    return 0
  fi
  # `command` is a shell builtin. /usr/bin/command is not shipped on every
  # system, and calling it by path makes every lookup fail where it is absent.
  command -v "$cmd" >/dev/null 2>&1 || die "Missing required command: $cmd"
}
# Print the absolute, symlink-resolved form of a directory path.
# Usage:     absdir "/path/or/relative"
# Arguments: $1 - path to an existing directory
# Outputs:   the physical absolute path on stdout
# Returns:   status of the cd/pwd subshell; calls die when $1 is not a directory.
absdir() {
  local p="$1"
  [[ -d "$p" ]] || die "Not a directory: $p"
  # Runs in a subshell so the caller's working directory is untouched.
  # CDPATH is dropped because with it set, `cd relative` may enter a different
  # directory and echo its name, corrupting the captured output.
  # `--` keeps a directory name that starts with "-" from being read as an option.
  (
    unset CDPATH
    cd -- "$p" && pwd -P
  )
}
# Ask a question and store the typed line in a caller-named variable.
# Usage:     prompt "Question" varname
# Arguments: $1 - text printed (without a trailing newline) on stdout
#            $2 - name of the variable that receives the answer
# Returns:   0 when a line was read; 1 when input ended with nothing read.
#            The destination variable is left untouched in that case, so
#            callers looping on the answer cannot spin forever on a closed stdin.
prompt() {
  # Locals carry a prefix so they cannot shadow the caller's destination
  # variable (a caller may legitimately pass a name such as "ans" or "q").
  local __prompt_q="$1"
  local __prompt_var="$2"
  local __prompt_ans=""
  /usr/bin/printf "%s" "$__prompt_q"
  # read returns non-zero at end of input. A final line that lacks its newline
  # is still accepted; only a completely empty read counts as failure.
  if ! IFS= read -r __prompt_ans && [[ -z "$__prompt_ans" ]]; then
    return 1
  fi
  # printf -v assigns by name without re-parsing anything as shell code.
  printf -v "$__prompt_var" '%s' "$__prompt_ans"
}
# Keep asking until the user names an existing directory, then store its
# absolute path (as printed by absdir) in a caller-named variable.
# Arguments: $1 - question text passed to prompt
#            $2 - name of the variable that receives the resolved path
# Returns:   0 once a usable directory has been stored.
prompt_nonempty_dir() {
  local q="$1"
  local var="$2"
  local val=""
  local resolved=""
  while true; do
    # Deliberately a plain command (no `||`): under errexit a failed read at
    # end of input must stop the script rather than loop forever.
    prompt "$q" val
    [ -n "$val" ] || { say "Please enter a path."; continue; }
    if [ ! -d "$val" ]; then
      say "That directory does not exist: $val"
      continue
    fi
    # A directory can exist and still be impossible to enter (no search
    # permission); ask again instead of letting the failed substitution abort.
    if resolved="$(absdir "$val")" && [ -n "$resolved" ]; then
      # Assign by name without eval.
      printf -v "$var" '%s' "$resolved"
      return 0
    fi
    say "Cannot access directory: $val"
  done
}
# Keep asking for an output directory until one is usable, offering to create
# it when missing, then store its absolute path in a caller-named variable.
# Arguments: $1 - question text passed to prompt
#            $2 - name of the variable that receives the resolved path
# Returns:   0 once a usable directory has been stored.
prompt_out_dir() {
  local q="$1"
  local var="$2"
  local val=""
  local yn=""
  local resolved=""
  while true; do
    # Plain command on purpose: under errexit a failed read at end of input
    # must stop the script rather than loop forever.
    prompt "$q" val
    [ -n "$val" ] || { say "Please enter a path."; continue; }
    # create if missing
    if [ ! -d "$val" ]; then
      # Asked through prompt so the cursor stays on the question line, like the
      # script's other y/n questions.
      yn=""
      prompt "Output dir does not exist. Create it? (y/n): " yn
      case "$yn" in
        y|Y)
          # mkdir can fail (permissions, or a path component that is a regular
          # file); report it and ask again instead of aborting the script.
          # `--` protects names that start with "-".
          if ! /bin/mkdir -p -- "$val"; then
            say "Could not create directory: $val"
            continue
          fi
          ;;
        *)
          say "Not creating. Try again."
          continue
          ;;
      esac
    fi
    if resolved="$(absdir "$val")" && [ -n "$resolved" ]; then
      # Assign by name without eval.
      printf -v "$var" '%s' "$resolved"
      return 0
    fi
    say "Cannot access directory: $val"
  done
}
# Ask for a per-file timeout in whole seconds and store it in a caller-named
# variable. An empty answer selects the default of 120.
# Arguments: $1 - question text passed to prompt
#            $2 - name of the variable that receives the number
# Returns:   0 once a valid value (>= 1) has been stored.
prompt_timeout() {
  local q="$1"
  local var="$2"
  local val=""
  while true; do
    # Plain command on purpose: under errexit a failed read at end of input
    # must stop the script rather than loop forever.
    prompt "$q" val
    [ -n "$val" ] || val="120"
    # At most 9 digits keeps the value inside shell integer range, so the
    # comparison cannot raise an "integer expression expected" error.
    # The 10# prefix forces base 10, so input such as "090" is not read as octal.
    if [[ "$val" =~ ^[0-9]{1,9}$ ]] && (( 10#$val >= 1 )); then
      # Stored without leading zeros; assigned by name without eval.
      printf -v "$var" '%d' "$(( 10#$val ))"
      return 0
    fi
    say "Please enter a positive integer number of seconds (e.g., 120)."
  done
}
# Ask for a maximum directory depth and store it in a caller-named variable.
# An empty answer selects 0, which the prompt text describes as "unlimited".
# Arguments: $1 - question text passed to prompt
#            $2 - name of the variable that receives the number
# Returns:   0 once a valid non-negative integer has been stored.
prompt_depth() {
  local q="$1"
  local var="$2"
  local val=""
  while true; do
    # Plain command on purpose: under errexit a failed read at end of input
    # must stop the script rather than loop forever.
    prompt "$q" val
    [ -n "$val" ] || val="0"
    # At most 9 digits: a longer number would pass a bare ^[0-9]+$ check but
    # overflow later integer tests on the stored value, which then fail and
    # silently drop the depth limit.
    if [[ "$val" =~ ^[0-9]{1,9}$ ]]; then
      # 10# forces base 10 and strips leading zeros ("007" -> 7);
      # assigned by name without eval.
      printf -v "$var" '%d' "$(( 10#$val ))"
      return 0
    fi
    say "Please enter 0 (unlimited) or a positive integer."
  done
}
# ---------- prereqs ----------
# Check each external tool (given by absolute path) up front, in a fixed order;
# need_cmd stops the script at the first one that is missing.
for required_tool in \
  /usr/bin/find \
  /usr/bin/basename \
  /usr/bin/dirname \
  /usr/bin/printf \
  /bin/mkdir \
  /usr/bin/timeout \
  /bin/rm \
  /usr/bin/date
do
  need_cmd "$required_tool"
done
unset required_tool
# ---------- choose type ----------
# Show the menu of source formats.
say ""
say "=== MarkItDown Batch Convert (to Markdown) ==="
say "Select the file type to convert:"
say " 1) Word (DOC/DOCX)"
say " 2) PDF"
say " 3) OTHER (you specify extension)"
say ""

# Re-ask until the answer is exactly 1, 2 or 3.
choice=""
until [[ "$choice" == [123] ]]; do
  prompt "Enter choice (1/2/3): " choice
  [[ "$choice" == [123] ]] || say "Invalid choice. Enter 1, 2, or 3."
done

# Translate the menu choice into a display label and the list of extensions
# (lowercase, without the dot) to search for.
exts=()
type_label=""
if [[ "$choice" == 1 ]]; then
  type_label="Word"
  exts=("doc" "docx")
elif [[ "$choice" == 2 ]]; then
  type_label="PDF"
  exts=("pdf")
else
  type_label="OTHER"
  other_ext=""
  while true; do
    prompt "Enter file extension (examples: txt, md, rtf) — no dot: " other_ext
    # Tolerate one leading dot and any letter case in what the user typed.
    other_ext="${other_ext#.}"
    other_ext="${other_ext,,}"
    # Only plain letters/digits are accepted; anything else is asked again.
    [[ "$other_ext" =~ ^[a-z0-9]+$ ]] && break
    say "Please enter a simple extension (letters/numbers only)."
  done
  exts=("$other_ext")
fi
# ---------- dirs / timeout / depth ----------
# Start every answer empty so the variables exist before the prompt helpers
# fill them in by name.
in_dir=""
out_dir=""
timeout_s=""
depth=""

# Questions are asked in this order: input dir, output dir, timeout, depth.
prompt_nonempty_dir "Input directory (absolute path preferred): " in_dir
prompt_out_dir "Output directory (will be created if missing): " out_dir
prompt_timeout "Per-file timeout seconds (Enter = 120): " timeout_s
prompt_depth "Max directory depth (0 = unlimited, Enter = 0): " depth
# ---------- venv / markitdown ----------
# Find (or offer to create) the Python virtual environment, activate it, and
# make sure the markitdown command is available inside it.

# Default lives under the current user's home; the original fixed location is
# kept only as a fallback for when HOME is unset.
default_venv="${HOME:-/home/zk}/.venvs/markitdown"

say ""
say "MarkItDown setup:"
say "Default venv path: $default_venv"
say "Press Enter to accept, or paste a different venv path."
venv_path=""
prompt "Venv path: " venv_path
if [ -z "$venv_path" ]; then
  venv_path="$default_venv"
fi

# Only absolute venv paths are accepted.
case "$venv_path" in
  /*) ;;
  *) die "Please provide an absolute venv path (example: $default_venv)" ;;
esac
activate="$venv_path/bin/activate"

# Create venv if missing
if [ ! -f "$activate" ]; then
  say "Venv not found at: $venv_path"
  yn_venv=""
  /usr/bin/printf "Create it now? (y/n): "
  IFS= read -r yn_venv
  case "$yn_venv" in
    y|Y)
      # `command` is a shell builtin; use it directly rather than through a
      # /usr/bin/command path that does not exist on every system.
      command -v python3 >/dev/null 2>&1 || die "Missing required command: python3"
      python3 -m venv "$venv_path" || die "Failed to create venv at: $venv_path"
      ;;
    *)
      die "Venv required. Create one with: python3 -m venv $venv_path"
      ;;
  esac
fi

# shellcheck disable=SC1090
source "$activate"

# Install markitdown if not present in the venv.
# The lookup must use the builtin `command` so it honours the PATH that the
# activate script just changed.
if ! command -v markitdown >/dev/null 2>&1; then
  say "markitdown not found in venv."
  yn_mid=""
  /usr/bin/printf "Install it now via pip? (y/n): "
  IFS= read -r yn_mid
  case "$yn_mid" in
    y|Y)
      # The [all] extra pulls in the optional converter dependencies (PDF,
      # DOCX, ...); per the MarkItDown README the bare package omits them.
      pip install 'markitdown[all]' || die "pip install 'markitdown[all]' failed"
      ;;
    *)
      die "markitdown required. Activate the venv and run: pip install 'markitdown[all]'"
      ;;
  esac
fi
# ---------- find + convert ----------
say ""

# A depth of 0 is displayed as "unlimited".
if [ "$depth" -gt 0 ]; then
  depth_display="$depth"
else
  depth_display="unlimited"
fi

# Echo the chosen settings back before any work starts.
say "Converting type: $type_label"
say "Extensions: ${exts[*]}"
say "Input dir: $in_dir"
say "Output dir: $out_dir"
say "Timeout: ${timeout_s}s per file"
say "Depth: $depth_display"
say ""

# Name tests for find, case-insensitive and joined with -o, e.g.
#   -iname "*.doc" -o -iname "*.docx"
# The caller wraps them in \( ... \).
find_expr=()
for ext_name in "${exts[@]}"; do
  if [ "${#find_expr[@]}" -gt 0 ]; then
    find_expr+=(-o)
  fi
  find_expr+=(-iname "*.${ext_name}")
done

# One timestamped log per run, created empty inside the output directory.
log_stamp="$(/usr/bin/date +%Y%m%d-%H%M%S)"
log_file="$out_dir/markitdown-batch-${log_stamp}.log"
: > "$log_file"
say "Log file: $log_file"
say ""

# Leading find arguments: the start directory, then -maxdepth only when a
# limit was requested, so it comes before the tests appended later.
find_args=("$in_dir")
if [ "$depth" -gt 0 ]; then
  find_args+=(-maxdepth "$depth")
fi
# Convert every matching file, mirroring the input folder structure under
# out_dir. The loop reads NUL-delimited paths from a process substitution
# (< <(...)), which keeps it in the current shell so the counters survive.
cnt_ok=0; cnt_fail=0; cnt_skip=0
while IFS= read -r -d '' f; do
  # Path relative to the input dir, with the final extension swapped for .md.
  # NOTE: files that differ only in extension (a.doc / a.docx) map to the same
  # output; the one seen second is reported as SKIP.
  rel="${f#"$in_dir"/}"
  out_rel="${rel%.*}.md"
  out_path="$out_dir/$out_rel"

  # Skip if already converted (non-empty output exists)
  if [ -s "$out_path" ]; then
    say "↷ SKIP (exists): $rel"
    cnt_skip=$(( cnt_skip + 1 ))
    continue
  fi

  say "→ $rel"
  # Header line so log output can be attributed to a file.
  /usr/bin/printf '=== %s ===\n' "$f" >>"$log_file"

  # One failed directory creation should not abort the whole batch.
  if ! /bin/mkdir -p -- "$(/usr/bin/dirname -- "$out_path")" 2>>"$log_file"; then
    say "!! FAILED/TIMEOUT: $rel (see log)"
    cnt_fail=$(( cnt_fail + 1 ))
    continue
  fi

  # Convert with timeout; on failure/timeout remove partial output.
  # stdin is /dev/null so the converter cannot consume the file list that
  # feeds this loop.
  rc=0
  /usr/bin/timeout "$timeout_s" markitdown "$f" \
    </dev/null >"$out_path" 2>>"$log_file" || rc=$?
  if [ "$rc" -ne 0 ]; then
    # GNU timeout reports 124 when the time limit was hit.
    /usr/bin/printf '    exit status: %s\n' "$rc" >>"$log_file"
    say "!! FAILED/TIMEOUT: $rel (see log)"
    /bin/rm -f -- "$out_path"
    cnt_fail=$(( cnt_fail + 1 ))
  else
    cnt_ok=$(( cnt_ok + 1 ))
  fi
done < <(/usr/bin/find "${find_args[@]}" -type f \( "${find_expr[@]}" \) -print0)
# Final report: the three counters, where the output and the log are, and a
# ready-to-paste command for counting the generated .md files.
# One printf call emits each argument on its own line.
/usr/bin/printf "%s\n" \
  "" \
  "Done. Summary: $cnt_ok converted, $cnt_skip skipped, $cnt_fail failed." \
  "Output written under: $out_dir" \
  "Log: $log_file" \
  "" \
  "Tip: Count outputs with:" \
  " /usr/bin/find \"$out_dir\" -type f -name \"*.md\" | /usr/bin/wc -l"
@hexawulf
Copy link
Author

hexawulf commented Mar 1, 2026

Interactive MarkItDown batch converter: choose PDF/DOCX/other, input dir, output dir. Skips already-converted files and uses a per-file timeout. Output mirrors folder structure + writes a log.

Usage: save as markitdown-batch.sh, chmod +x markitdown-batch.sh, run ./markitdown-batch.sh. Requires markitdown installed in a venv (default: /home/zk/.venvs/markitdown). Skips existing .md and times out stuck files.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment