Skip to content

Instantly share code, notes, and snippets.

@aarora79
Created October 29, 2025 01:43
Show Gist options
  • Select an option

  • Save aarora79/8b7ff820a456bd84752d8ac67f218acc to your computer and use it in GitHub Desktop.
Get SageMaker Processing Job logs
#!/bin/bash
# Retrieve complete logs for a SageMaker Processing Job from CloudWatch Logs.
# Outputs to simple text format (no JSON merging issues) or CSV.
# Usage: ./get-sagemaker-logs-simple.sh <processing-job-name> [region] [format]
# Requires: aws CLI (with credentials configured), jq.
set -euo pipefail

# Parameters
PROCESSING_JOB_NAME="${1:-}"
REGION="${2:-us-east-1}"
FORMAT="${3:-text}" # text or csv

# Validation (usage/diagnostics go to stderr so stdout stays clean)
if [ -z "$PROCESSING_JOB_NAME" ]; then
  echo "Error: Processing job name is required" >&2
  echo "Usage: $0 <processing-job-name> [region] [format]" >&2
  echo "Formats: text (default), csv" >&2
  echo "Example: $0 sm-spark-project-2024-10-06-18-56-30-716 us-east-1 text" >&2
  exit 1
fi

# Fail fast if required external tools are missing — the script parses JSON
# with jq and calls the AWS CLI throughout.
for tool in aws jq; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    echo "Error: required tool '$tool' is not installed or not on PATH" >&2
    exit 1
  fi
done

# Pick the output file extension to match the requested format
if [ "$FORMAT" = "csv" ]; then
  OUTPUT_FILE="sagemaker-logs-${PROCESSING_JOB_NAME}.csv"
else
  OUTPUT_FILE="sagemaker-logs-${PROCESSING_JOB_NAME}.txt"
fi

echo "================================================"
echo "SageMaker Processing Job Log Retrieval"
echo "================================================"
echo "Processing Job: $PROCESSING_JOB_NAME"
echo "Region: $REGION"
echo "Output Format: $FORMAT"
echo "Output File: $OUTPUT_FILE"
echo ""
# Step 1: Find the CloudWatch log streams that belong to this processing job.
# SageMaker writes to the fixed log group /aws/sagemaker/ProcessingJobs with
# one stream per container, each named after the job.
echo "Step 1/3: Retrieving log streams..."
LOG_STREAMS=$(aws logs describe-log-streams \
  --log-group-name "/aws/sagemaker/ProcessingJobs" \
  --order-by LastEventTime \
  --descending \
  --region "$REGION" \
  --query "logStreams[?contains(logStreamName, '$PROCESSING_JOB_NAME')].logStreamName" \
  --output text)

if [ -z "$LOG_STREAMS" ]; then
  echo "❌ Error: No log streams found for processing job: $PROCESSING_JOB_NAME"
  echo "Available log streams in the region:"
  aws logs describe-log-streams \
    --log-group-name "/aws/sagemaker/ProcessingJobs" \
    --order-by LastEventTime \
    --descending \
    --max-items 10 \
    --region "$REGION" \
    --query "logStreams[].logStreamName" \
    --output text
  exit 1
fi

echo "✅ Found log streams:"
# NB: `aws --output text` joins list results with TABs (and may wrap onto
# newlines), so split on both spaces and tabs — `tr ' ' '\n'` alone misses
# the tab separators and printed all names on a single line.
echo "$LOG_STREAMS" | tr ' \t' '\n' | sed 's/^/ - /'
echo ""
# Step 2: Pull the events for every stream and append them to the output file.
echo "Step 2/3: Retrieving log events..."

# Start from an empty output file
: > "$OUTPUT_FILE"

# CSV output gets a header row
if [ "$FORMAT" = "csv" ]; then
  echo "timestamp,log_stream,message" >> "$OUTPUT_FILE"
fi

LOG_STREAM_COUNT=0
TOTAL_EVENTS=0

# $LOG_STREAMS is whitespace-delimited (aws --output text); stream names
# contain no spaces, so the unquoted expansion is deliberate word-splitting.
for LOG_STREAM in $LOG_STREAMS; do
  LOG_STREAM_COUNT=$((LOG_STREAM_COUNT + 1))
  echo " Processing log stream [$LOG_STREAM_COUNT]: $LOG_STREAM"

  # First/last event timestamps (ms since epoch) bound the filter window below
  STREAM_INFO=$(aws logs describe-log-streams \
    --log-group-name "/aws/sagemaker/ProcessingJobs" \
    --log-stream-name-prefix "$LOG_STREAM" \
    --region "$REGION" \
    --query "logStreams[0].[firstEventTimestamp,lastEventTimestamp]" \
    --output text)

  if [ -z "$STREAM_INFO" ] || [ "$STREAM_INFO" = "None None" ]; then
    echo " (empty stream, skipping)"
    continue
  fi

  FIRST_TIMESTAMP=$(echo "$STREAM_INFO" | awk '{print $1}')
  LAST_TIMESTAMP=$(echo "$STREAM_INFO" | awk '{print $2}')

  # filter-log-events treats --end-time as exclusive, so add 1 ms to avoid
  # silently dropping events that carry the stream's last timestamp.
  STREAM_LOGS=$(aws logs filter-log-events \
    --log-group-name "/aws/sagemaker/ProcessingJobs" \
    --log-stream-names "$LOG_STREAM" \
    --start-time "$FIRST_TIMESTAMP" \
    --end-time "$((LAST_TIMESTAMP + 1))" \
    --output json \
    --region "$REGION")

  # Count events in this stream
  STREAM_EVENT_COUNT=$(echo "$STREAM_LOGS" | jq '.events | length')
  TOTAL_EVENTS=$((TOTAL_EVENTS + STREAM_EVENT_COUNT))
  echo " Found $STREAM_EVENT_COUNT log events"

  # Process events and append to file
  if [ "$FORMAT" = "csv" ]; then
    # Build each row with jq's @csv so commas/quotes/newlines in messages are
    # escaped correctly. The stream name is passed via --arg instead of being
    # spliced into the jq program (the original broke on special characters
    # and piped a bare string to @csv, which only accepts arrays).
    echo "$STREAM_LOGS" | jq -r --arg stream "$LOG_STREAM" \
      '.events[] | [.timestamp, $stream, .message] | @csv' >> "$OUTPUT_FILE"
  else
    # Text format: per-stream header, then one "[timestamp] message" per event
    {
      echo ""
      echo "===== LOG STREAM: $LOG_STREAM ====="
      echo "$STREAM_LOGS" | jq -r '.events[] | "[\(.timestamp)] \(.message)"'
    } >> "$OUTPUT_FILE"
  fi
done

echo "✅ Retrieved $TOTAL_EVENTS log events from $LOG_STREAM_COUNT log streams"
echo ""
# Step 3: Print the closing summary and format-specific viewing hints.
printf '%s\n' "Step 3/3: Saving logs to file..."
printf '%s\n' "✅ Successfully saved logs to: $OUTPUT_FILE" ""
printf '%s\n' "================================================" "Summary:"
printf '%s\n' " Processing Job: $PROCESSING_JOB_NAME"
printf '%s\n' " Log Streams: $LOG_STREAM_COUNT"
printf '%s\n' " Total Events: $TOTAL_EVENTS"
printf '%s\n' " Output File: $OUTPUT_FILE"
printf '%s\n' " File Size: $(du -h "$OUTPUT_FILE" | cut -f1)"
printf '%s\n' "================================================" ""

# The first two hints are common to both formats; the rest differ.
printf '%s\n' "View logs with:"
printf '%s\n' " cat $OUTPUT_FILE"
case "$FORMAT" in
  text)
    printf '%s\n' " grep 'ERROR' $OUTPUT_FILE"
    printf '%s\n' " tail -100 $OUTPUT_FILE"
    ;;
  *)
    printf '%s\n' " head -20 $OUTPUT_FILE"
    printf '%s\n' " grep 'ERROR' $OUTPUT_FILE"
    printf '%s\n' " column -t -s',' $OUTPUT_FILE | less"
    ;;
esac
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment