@oneryalcin · Created January 22, 2026 11:41
Crawling Claude Developer Page
#!/bin/bash
# Crawl and download Claude platform docs as markdown
set -e
BASE_URL="https://platform.claude.com/docs/en"
SITEMAP_URL="https://platform.claude.com/sitemap.xml"
WORK_DIR="/tmp/claude-docs"
OUT_DIR="$WORK_DIR/docs"
URLS_FILE="$WORK_DIR/urls.txt"
usage() {
    cat <<EOF
Usage: $(basename "$0") [--all | <filter>]

Download Claude platform docs as markdown files.

Options:
  --all        Download all docs (~530 files)
  --help, -h   Show this help
  <filter>     Filter by path prefix

Examples:
  $(basename "$0") --all                                  # all docs
  $(basename "$0") agent-sdk                              # agent-sdk docs only
  $(basename "$0") api/python                             # python API docs
  $(basename "$0") build-with-claude/prompt-engineering

Output: $OUT_DIR
EOF
    exit 0
}
# No args or help flag
if [[ $# -eq 0 || "$1" == "--help" || "$1" == "-h" ]]; then
    usage
fi

FILTER=""
if [[ "$1" == "--all" ]]; then
    FILTER=""
else
    FILTER="$1"
fi
mkdir -p "$WORK_DIR" "$OUT_DIR"
# Step 1: Fetch sitemap and extract doc URLs (cached between runs)
if [[ ! -f "$URLS_FILE" ]]; then
    echo "Fetching sitemap..."
    curl -s "$SITEMAP_URL" | grep -oE 'https://platform\.claude\.com/docs/en/[^<]+' > "$URLS_FILE"
fi
# Step 2: De-duplicate and filter URLs
CLEAN_URLS=$(sort -u "$URLS_FILE")
if [[ -n "$FILTER" ]]; then
    CLEAN_URLS=$(echo "$CLEAN_URLS" | grep "/docs/en/${FILTER}" || true)
    if [[ -z "$CLEAN_URLS" ]]; then
        echo "No URLs found matching filter: $FILTER"
        exit 1
    fi
    echo "Filter: $FILTER"
fi
URL_COUNT=$(echo "$CLEAN_URLS" | wc -l | tr -d ' ')
echo "Found $URL_COUNT URLs"
# Step 3: Download each page. Appending ".md" to a docs URL returns that
# page's markdown source.
DOWNLOADED=0
SKIPPED=0
while read -r url; do
    [[ -z "$url" ]] && continue
    path="${url#https://platform.claude.com/docs/en/}"
    dir="$OUT_DIR/$(dirname "$path")"
    file="$OUT_DIR/${path}.md"
    mkdir -p "$dir"
    # Skip files from a previous run so the script is resumable.
    if [[ -f "$file" ]]; then
        # Plain assignment: ((SKIPPED++)) returns nonzero when the old
        # value is 0, which would abort the script under set -e.
        SKIPPED=$((SKIPPED + 1))
        continue
    fi
    echo "GET: $path"
    # --fail keeps HTTP error pages from being saved as .md files.
    if curl -sf "${url}.md" -o "$file"; then
        DOWNLOADED=$((DOWNLOADED + 1))
    else
        echo "WARN: failed to fetch $url"
        rm -f "$file"
    fi
    sleep 0.1   # small delay to avoid hammering the server
done <<< "$CLEAN_URLS"
echo ""
echo "Downloaded: $DOWNLOADED, Skipped: $SKIPPED"
echo "Saved to: $OUT_DIR"