Last active
March 1, 2026 06:20
-
-
Save hexawulf/7ad3b9ad7dc04e93f4767e55c2663287 to your computer and use it in GitHub Desktop.
Interactive MarkItDown batch converter (PDF/DOCX/etc -> Markdown)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # Author: 0xWulf | |
| # Email: 0xwulf@proton.me | |
| # Desc.: Interactive batch converter (MarkItDown) — converts a selected file type in a directory to Markdown. | |
| # Create Date: 2026-03-01 | |
| # Modified Date: 2026-03-01 | |
| set -euo pipefail | |
| # ---------- helpers ---------- | |
# Print all arguments, joined by single spaces, as one line on stdout.
say() {
  /usr/bin/printf '%s\n' "$*"
}
# Report a fatal error on stderr, prefixed with "ERROR: ", then terminate
# the current shell with exit status 1.
die() {
  /usr/bin/printf 'ERROR: %s\n' "$*" 1>&2
  exit 1
}
# Abort via die() unless the given command is available.
# Arguments:
#   $1 - a bare command name (looked up in PATH) or an absolute path.
need_cmd() {
  local cmd="$1"
  if [[ "$cmd" == /* ]]; then
    # Absolute path: it only has to exist and be executable.
    [ -x "$cmd" ] || die "Missing required command: $cmd"
    return 0
  fi
  # Bare name: resolve with the `command` builtin. There is no guarantee that
  # a /usr/bin/command binary exists (many distros do not ship one), and
  # calling it would make every lookup fail.
  command -v "$cmd" >/dev/null 2>&1 || die "Missing required command: $cmd"
}
# Print the absolute, symlink-resolved path of a directory on stdout.
# Usage: absdir "/path/or/relative"
# Calls die() when the argument is not an existing directory.
absdir() {
  local p="$1"
  [ -d "$p" ] || die "Not a directory: $p"
  # Runs in a subshell so the caller's working directory is untouched.
  # CDPATH is cleared because an exported CDPATH can redirect a relative `cd`
  # to a different directory and make it print that path on stdout; `--`
  # keeps names starting with '-' from being parsed as options.
  # The builtin `pwd -P` is used: it cannot be missing from the filesystem.
  (CDPATH= cd -- "$p" >/dev/null && pwd -P)
}
# Print a question (no trailing newline) and read one line from stdin into
# the named variable.
# Usage: prompt "Question" varname
# Arguments:
#   $1 - text to display
#   $2 - name of the variable that receives the answer; it must be a plain
#        identifier, otherwise an error is printed and 2 is returned
# The answer is stored verbatim (no backslash processing, no trimming).
prompt() {
  # Locals carry a prefix so they are unlikely to shadow the caller's target.
  local __prompt_q="$1"
  local __prompt_var="$2"
  local __prompt_ans=""
  if [[ ! "$__prompt_var" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
    /usr/bin/printf 'ERROR: prompt: invalid variable name: %s\n' "$__prompt_var" >&2
    return 2
  fi
  /usr/bin/printf '%s' "$__prompt_q"
  IFS= read -r __prompt_ans
  # printf -v assigns by name without eval, so neither the name nor the
  # answer is ever re-parsed as shell code.
  printf -v "$__prompt_var" '%s' "$__prompt_ans"
}
# Keep asking until the user names an existing directory, then store its
# absolute path (as printed by absdir) in the named variable.
# Arguments:
#   $1 - question to display
#   $2 - name of the variable that receives the path (must not be one of
#        this function's locals: q, var, val)
prompt_nonempty_dir() {
  local q="$1"
  local var="$2"
  local val=""
  while true; do
    prompt "$q" val
    if [ -z "$val" ]; then
      say "Please enter a path."
      continue
    fi
    # Typed input never goes through the shell's tilde expansion, so a
    # leading "~" or "~/" is mapped to $HOME by hand.
    if [ -n "${HOME:-}" ]; then
      case "$val" in
        "~" | "~/"*) val="${HOME}${val:1}" ;;
      esac
    fi
    if [ -d "$val" ]; then
      val="$(absdir "$val")"
      # Assign by name without eval.
      printf -v "$var" '%s' "$val"
      return 0
    fi
    say "That directory does not exist: $val"
  done
}
# Ask for an output directory, offering to create it when it is missing,
# and store its absolute path (as printed by absdir) in the named variable.
# Arguments:
#   $1 - question to display
#   $2 - name of the variable that receives the path (must not be one of
#        this function's locals: q, var, val, yn)
prompt_out_dir() {
  local q="$1"
  local var="$2"
  local val=""
  local yn=""
  while true; do
    prompt "$q" val
    if [ -z "$val" ]; then
      say "Please enter a path."
      continue
    fi
    # Typed input is not tilde-expanded by the shell; without this, "~/out"
    # would create a directory literally named "~" in the working directory.
    if [ -n "${HOME:-}" ]; then
      case "$val" in
        "~" | "~/"*) val="${HOME}${val:1}" ;;
      esac
    fi
    # create if missing
    if [ ! -d "$val" ]; then
      # No trailing newline, so the answer is typed on the same line.
      /usr/bin/printf '%s' "Output dir does not exist. Create it? (y/n): "
      yn=""
      IFS= read -r yn
      case "$yn" in
        y|Y)
          # A failed mkdir (path is a regular file, no permission, ...) asks
          # again instead of aborting the whole script.
          if ! /bin/mkdir -p -- "$val"; then
            say "Could not create directory: $val"
            continue
          fi
          ;;
        *)
          say "Not creating. Try again."
          continue
          ;;
      esac
    fi
    val="$(absdir "$val")"
    # Assign by name without eval.
    printf -v "$var" '%s' "$val"
    return 0
  done
}
# Ask for a per-file timeout in whole seconds and store it in the named
# variable. An empty answer selects 120; anything that is not an integer
# >= 1 is rejected and the question is repeated.
# Arguments:
#   $1 - question to display
#   $2 - name of the variable that receives the value (must not be one of
#        this function's locals: q, var, val)
prompt_timeout() {
  local q="$1"
  local var="$2"
  local val=""
  while true; do
    prompt "$q" val
    [ -n "$val" ] || val="120"
    # 2>/dev/null: digit strings too large for the shell's integers make `[`
    # fail with a diagnostic; treat them as invalid without the noise.
    if [[ "$val" =~ ^[0-9]+$ ]] && [ "$val" -ge 1 ] 2>/dev/null; then
      # Assign by name without eval.
      printf -v "$var" '%s' "$val"
      return 0
    fi
    say "Please enter a positive integer number of seconds (e.g., 120)."
  done
}
# Ask for a maximum directory depth and store it in the named variable.
# An empty answer selects 0, which the prompt text describes as unlimited.
# Only non-negative integers that fit the shell's integer range are accepted.
# Arguments:
#   $1 - question to display
#   $2 - name of the variable that receives the value (must not be one of
#        this function's locals: q, var, val)
prompt_depth() {
  local q="$1"
  local var="$2"
  local val=""
  while true; do
    prompt "$q" val
    [ -n "$val" ] || val="0"
    # The `-ge 0` test is always true for digits that fit an integer; it only
    # fails (quietly) on overflow, which would otherwise break the later
    # `[ "$depth" -gt 0 ]` comparisons and silently mean "unlimited".
    if [[ "$val" =~ ^[0-9]+$ ]] && [ "$val" -ge 0 ] 2>/dev/null; then
      # Assign by name without eval.
      printf -v "$var" '%s' "$val"
      return 0
    fi
    say "Please enter 0 (unlimited) or a positive integer."
  done
}
# ---------- prereqs ----------
# Check each absolute-path tool in turn; need_cmd aborts on the first one
# that is missing or not executable.
for required_tool in \
  /usr/bin/find \
  /usr/bin/basename \
  /usr/bin/dirname \
  /usr/bin/printf \
  /bin/mkdir \
  /usr/bin/timeout \
  /bin/rm \
  /usr/bin/date
do
  need_cmd "$required_tool"
done
# ---------- choose type ----------
# Show the menu once, then keep asking until the answer is exactly 1, 2 or 3.
say ""
say "=== MarkItDown Batch Convert (to Markdown) ==="
say "Select the file type to convert:"
say " 1) Word (DOC/DOCX)"
say " 2) PDF"
say " 3) OTHER (you specify extension)"
say ""
choice=""
while true; do
  prompt "Enter choice (1/2/3): " choice
  if [[ "$choice" == [123] ]]; then
    break
  fi
  say "Invalid choice. Enter 1, 2, or 3."
done
# Translate the menu choice into a display label and the list of file
# extensions to search for.
exts=()
type_label=""
if [[ "$choice" == 1 ]]; then
  type_label="Word"
  exts=("doc" "docx")
elif [[ "$choice" == 2 ]]; then
  type_label="PDF"
  exts=("pdf")
elif [[ "$choice" == 3 ]]; then
  type_label="OTHER"
  other_ext=""
  # Repeat until the answer, minus one leading dot and lowercased, consists
  # only of letters and digits.
  while true; do
    prompt "Enter file extension (examples: txt, md, rtf) — no dot: " other_ext
    other_ext="${other_ext#.}"
    other_ext="${other_ext,,}"
    if [[ "$other_ext" =~ ^[a-z0-9]+$ ]]; then
      exts=("$other_ext")
      break
    fi
    say "Please enter a simple extension (letters/numbers only)."
  done
fi
# ---------- dirs / timeout / depth ----------
# Start every setting empty, then let each prompt helper fill its variable
# in the order the questions are shown to the user.
in_dir=""
out_dir=""
timeout_s=""
depth=""

# Source tree to scan and destination tree for the Markdown files.
prompt_nonempty_dir "Input directory (absolute path preferred): " in_dir
prompt_out_dir "Output directory (will be created if missing): " out_dir

# Per-file time limit in seconds.
prompt_timeout "Per-file timeout seconds (Enter = 120): " timeout_s

# How many directory levels to descend.
prompt_depth "Max directory depth (0 = unlimited, Enter = 0): " depth
# ---------- venv / markitdown ----------
# Default venv location in one place; MARKITDOWN_VENV overrides it when set.
default_venv="${MARKITDOWN_VENV:-/home/zk/.venvs/markitdown}"

say ""
say "MarkItDown setup:"
say "Default venv path: $default_venv"
say "Press Enter to accept, or paste a different venv path."
venv_path=""
prompt "Venv path: " venv_path
if [ -z "$venv_path" ]; then
  venv_path="$default_venv"
fi

# Only absolute venv paths are accepted.
case "$venv_path" in
  /*) ;;
  *) die "Please provide an absolute venv path (example: $default_venv)" ;;
esac

activate="$venv_path/bin/activate"

# Create venv if missing
if [ ! -f "$activate" ]; then
  say "Venv not found at: $venv_path"
  yn_venv=""
  /usr/bin/printf "Create it now? (y/n): "
  IFS= read -r yn_venv
  case "$yn_venv" in
    y|Y)
      # `command` is a shell builtin; do not rely on a /usr/bin/command binary.
      command -v python3 >/dev/null 2>&1 || die "Missing required command: python3"
      python3 -m venv "$venv_path" || die "Failed to create venv at: $venv_path"
      ;;
    *)
      die "Venv required. Create one with: python3 -m venv $venv_path"
      ;;
  esac
fi

# shellcheck disable=SC1090
source "$activate"

# Install markitdown if not present in the venv
if ! command -v markitdown >/dev/null 2>&1; then
  say "markitdown not found in venv."
  yn_mid=""
  /usr/bin/printf "Install it now via pip? (y/n): "
  IFS= read -r yn_mid
  case "$yn_mid" in
    y|Y)
      # The [all] extra pulls in the optional converters (PDF, DOCX, ...);
      # a bare "markitdown" install does not include them.
      pip install 'markitdown[all]' || die "pip install 'markitdown[all]' failed"
      command -v markitdown >/dev/null 2>&1 \
        || die "markitdown is still not on PATH after installation"
      ;;
    *)
      die "markitdown required. Activate the venv and run: pip install 'markitdown[all]'"
      ;;
  esac
fi
# ---------- find + convert ----------
say ""

# Human-readable depth: any value above zero is shown as-is.
if [ "$depth" -gt 0 ]; then
  depth_display="$depth"
else
  depth_display="unlimited"
fi

say "Converting type: $type_label"
say "Extensions: ${exts[*]}"
say "Input dir: $in_dir"
say "Output dir: $out_dir"
say "Timeout: ${timeout_s}s per file"
say "Depth: $depth_display"
say ""

# Name tests for find, e.g.: -iname "*.doc" -o -iname "*.docx"
# Each extension is added with a leading -o; the very first -o is dropped.
find_expr=()
for ext in "${exts[@]}"; do
  find_expr+=(-o -iname "*.${ext}")
done
find_expr=("${find_expr[@]:1}")

# Start a fresh, timestamped log file inside the output directory.
log_stamp="$(/usr/bin/date +%Y%m%d-%H%M%S)"
log_file="$out_dir/markitdown-batch-${log_stamp}.log"
: > "$log_file"
say "Log file: $log_file"
say ""

# Leading find arguments: start directory, plus -maxdepth (placed before any
# tests) when a depth limit was requested.
if [ "$depth" -gt 0 ]; then
  find_args=("$in_dir" -maxdepth "$depth")
else
  find_args=("$in_dir")
fi
# Preserve folder structure under OUT_DIR
# Process substitution (< <(...)) keeps the while loop in the current shell
# so counters survive after the loop.
need_cmd /bin/mv
cnt_ok=0; cnt_fail=0; cnt_skip=0
while IFS= read -r -d '' f; do
  rel="${f#"$in_dir"/}"
  # Strip final extension
  out_rel="${rel%.*}.md"
  out_path="$out_dir/$out_rel"
  # Conversion output goes to a scratch file first and is renamed only on
  # success, so an interrupted run never leaves a non-empty partial .md that
  # the skip check below would treat as already converted.
  tmp_path="$out_path.partial"
  /bin/mkdir -p "$(/usr/bin/dirname "$out_path")"

  # Skip if already converted (non-empty output exists)
  if [ -s "$out_path" ]; then
    say "↷ SKIP (exists): $rel"
    cnt_skip=$(( cnt_skip + 1 ))
    continue
  fi

  say "→ $rel"
  # Label the log so stderr output can be matched to its source file.
  /usr/bin/printf '=== %s ===\n' "$f" >>"$log_file"

  # Convert with timeout. stdin is /dev/null so the converter can never
  # consume the NUL-separated file list this loop is reading.
  rc=0
  /usr/bin/timeout "$timeout_s" markitdown "$f" </dev/null >"$tmp_path" 2>>"$log_file" || rc=$?

  if [ "$rc" -eq 0 ] && /bin/mv -f -- "$tmp_path" "$out_path" 2>>"$log_file"; then
    cnt_ok=$(( cnt_ok + 1 ))
  else
    # On failure/timeout remove the partial output and record the status
    # (timeout reports 124 when the time limit was hit).
    /usr/bin/printf '!! exit status %s: %s\n' "$rc" "$f" >>"$log_file"
    say "!! FAILED/TIMEOUT: $rel (see log)"
    /bin/rm -f -- "$tmp_path"
    cnt_fail=$(( cnt_fail + 1 ))
  fi
done < <(/usr/bin/find "${find_args[@]}" -type f \( "${find_expr[@]}" \) -print0)

say ""
say "Done. Summary: $cnt_ok converted, $cnt_skip skipped, $cnt_fail failed."
say "Output written under: $out_dir"
say "Log: $log_file"
say ""
say "Tip: Count outputs with:"
say " /usr/bin/find \"$out_dir\" -type f -name \"*.md\" | /usr/bin/wc -l"
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Interactive MarkItDown batch converter: choose PDF/DOCX/other, input dir, output dir. Skips already-converted files and uses a per-file timeout. Output mirrors folder structure + writes a log.
Usage: save as markitdown-batch.sh, chmod +x markitdown-batch.sh, run ./markitdown-batch.sh. Requires markitdown installed in a venv (default: /home/zk/.venvs/markitdown). Skips existing .md and times out stuck files.