@oneryalcin · Created January 22, 2026 11:41
Crawling Claude Developer Page
#!/bin/bash
# Crawl and download Claude platform docs as markdown
set -e
BASE_URL="https://platform.claude.com/docs/en"
SITEMAP_URL="https://platform.claude.com/sitemap.xml"
WORK_DIR="/tmp/claude-docs"
OUT_DIR="$WORK_DIR/docs"
URLS_FILE="$WORK_DIR/urls.txt"
usage() {
    cat <<EOF
Usage: $(basename "$0") [--all | <filter>]

Download Claude platform docs as markdown files.

Options:
  --all        Download all docs (~530 files)
  --help, -h   Show this help
  <filter>     Filter by path prefix

Examples:
  $(basename "$0") --all                                  # all docs
  $(basename "$0") agent-sdk                              # agent-sdk docs only
  $(basename "$0") api/python                             # python API docs
  $(basename "$0") build-with-claude/prompt-engineering

Output: $OUT_DIR
EOF
    exit 0
}
# No args or help flag
if [[ $# -eq 0 || "$1" == "--help" || "$1" == "-h" ]]; then
    usage
fi

FILTER=""
if [[ "$1" == "--all" ]]; then
    FILTER=""
else
    FILTER="$1"
fi
mkdir -p "$WORK_DIR" "$OUT_DIR"
# Step 1: Fetch sitemap and extract doc URLs (cached between runs)
if [[ ! -f "$URLS_FILE" ]]; then
    echo "Fetching sitemap..."
    curl -s "$SITEMAP_URL" | grep -oE 'https://platform\.claude\.com/docs/en/[^<]+' > "$URLS_FILE"
fi
# Step 2: De-duplicate and filter URLs
CLEAN_URLS=$(sort -u "$URLS_FILE")
if [[ -n "$FILTER" ]]; then
    CLEAN_URLS=$(echo "$CLEAN_URLS" | grep "/docs/en/${FILTER}" || true)
    if [[ -z "$CLEAN_URLS" ]]; then
        echo "No URLs found matching filter: $FILTER"
        exit 1
    fi
    echo "Filter: $FILTER"
fi
URL_COUNT=$(echo "$CLEAN_URLS" | wc -l | tr -d ' ')
echo "Found $URL_COUNT URLs"
# Step 3: Download each page. Appending ".md" to a docs URL returns that
# page's markdown source.
DOWNLOADED=0
SKIPPED=0
while read -r url; do
    [[ -z "$url" ]] && continue
    path="${url#https://platform.claude.com/docs/en/}"
    dir="$OUT_DIR/$(dirname "$path")"
    file="$OUT_DIR/${path}.md"
    mkdir -p "$dir"
    # Skip files from a previous run so the script is resumable.
    if [[ -f "$file" ]]; then
        # Plain assignment: ((SKIPPED++)) returns nonzero when the old
        # value is 0, which would abort the script under set -e.
        SKIPPED=$((SKIPPED + 1))
        continue
    fi
    echo "GET: $path"
    # --fail keeps HTTP error pages from being saved as .md files.
    if curl -sf "${url}.md" -o "$file"; then
        DOWNLOADED=$((DOWNLOADED + 1))
    else
        echo "WARN: failed to fetch $url"
        rm -f "$file"
    fi
    sleep 0.1   # small delay to avoid hammering the server
done <<< "$CLEAN_URLS"
echo ""
echo "Downloaded: $DOWNLOADED, Skipped: $SKIPPED"
echo "Saved to: $OUT_DIR"