Created
October 29, 2025 01:43
-
-
Save aarora79/8b7ff820a456bd84752d8ac67f218acc to your computer and use it in GitHub Desktop.
Get SageMaker Processing Job logs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Retrieve the complete CloudWatch logs for a SageMaker Processing Job and
# save them to a plain text or CSV file (no JSON merging issues).
#
# Usage:   ./get-sagemaker-logs-simple.sh <processing-job-name> [region] [format]
# Formats: text (default), csv
# Requires: aws CLI with valid credentials, jq
set -euo pipefail

# Parameters
PROCESSING_JOB_NAME="${1:-}"
REGION="${2:-us-east-1}"
FORMAT="${3:-text}"  # text or csv
readonly LOG_GROUP="/aws/sagemaker/ProcessingJobs"

# Validation
if [ -z "$PROCESSING_JOB_NAME" ]; then
  echo "Error: Processing job name is required" >&2
  echo "Usage: $0 <processing-job-name> [region] [format]" >&2
  echo "Formats: text (default), csv" >&2
  echo "Example: $0 sm-spark-project-2024-10-06-18-56-30-716 us-east-1 text" >&2
  exit 1
fi

# jq is required to parse the filter-log-events JSON below; fail fast if missing.
if ! command -v jq >/dev/null 2>&1; then
  echo "Error: jq is required but not installed" >&2
  exit 1
fi

# Set output file extension
if [ "$FORMAT" = "csv" ]; then
  OUTPUT_FILE="sagemaker-logs-${PROCESSING_JOB_NAME}.csv"
else
  OUTPUT_FILE="sagemaker-logs-${PROCESSING_JOB_NAME}.txt"
fi

echo "================================================"
echo "SageMaker Processing Job Log Retrieval"
echo "================================================"
echo "Processing Job: $PROCESSING_JOB_NAME"
echo "Region: $REGION"
echo "Output Format: $FORMAT"
echo "Output File: $OUTPUT_FILE"
echo ""

# Step 1: Find all log streams belonging to this processing job.
echo "Step 1/3: Retrieving log streams..."
LOG_STREAMS=$(aws logs describe-log-streams \
  --log-group-name "$LOG_GROUP" \
  --order-by LastEventTime \
  --descending \
  --region "$REGION" \
  --query "logStreams[?contains(logStreamName, '$PROCESSING_JOB_NAME')].logStreamName" \
  --output text)

if [ -z "$LOG_STREAMS" ]; then
  echo "❌ Error: No log streams found for processing job: $PROCESSING_JOB_NAME" >&2
  echo "Available log streams in the region:" >&2
  aws logs describe-log-streams \
    --log-group-name "$LOG_GROUP" \
    --order-by LastEventTime \
    --descending \
    --max-items 10 \
    --region "$REGION" \
    --query "logStreams[].logStreamName" \
    --output text >&2
  exit 1
fi

echo "✅ Found log streams:"
echo "$LOG_STREAMS" | tr ' ' '\n' | sed 's/^/  - /'
echo ""

# Step 2: Fetch events from each stream and append them to the output file.
echo "Step 2/3: Retrieving log events..."
# Clear output file
> "$OUTPUT_FILE"
# Add header for CSV
if [ "$FORMAT" = "csv" ]; then
  echo "timestamp,log_stream,message" >> "$OUTPUT_FILE"
fi

LOG_STREAM_COUNT=0
TOTAL_EVENTS=0
# NOTE: stream names from --output text are whitespace-delimited; CloudWatch
# stream names cannot contain spaces, so plain word-splitting is safe here.
for LOG_STREAM in $LOG_STREAMS; do
  LOG_STREAM_COUNT=$((LOG_STREAM_COUNT + 1))
  echo "  Processing log stream [$LOG_STREAM_COUNT]: $LOG_STREAM"

  # Get the stream's first/last event timestamps so we can bound the query.
  STREAM_INFO=$(aws logs describe-log-streams \
    --log-group-name "$LOG_GROUP" \
    --log-stream-name-prefix "$LOG_STREAM" \
    --region "$REGION" \
    --query "logStreams[0].[firstEventTimestamp,lastEventTimestamp]" \
    --output text)

  if [ -z "$STREAM_INFO" ] || [ "$STREAM_INFO" = "None None" ]; then
    echo "    (empty stream, skipping)"
    continue
  fi

  FIRST_TIMESTAMP=$(echo "$STREAM_INFO" | awk '{print $1}')
  LAST_TIMESTAMP=$(echo "$STREAM_INFO" | awk '{print $2}')

  # Fetch the events. --end-time is exclusive (events with a timestamp equal
  # to or later than it are not returned), so add 1 ms to include the final
  # event of the stream.
  STREAM_LOGS=$(aws logs filter-log-events \
    --log-group-name "$LOG_GROUP" \
    --log-stream-names "$LOG_STREAM" \
    --start-time "$FIRST_TIMESTAMP" \
    --end-time "$((LAST_TIMESTAMP + 1))" \
    --output json \
    --region "$REGION")

  # Count events in this stream
  STREAM_EVENT_COUNT=$(echo "$STREAM_LOGS" | jq '.events | length')
  TOTAL_EVENTS=$((TOTAL_EVENTS + STREAM_EVENT_COUNT))
  echo "    Found $STREAM_EVENT_COUNT log events"

  # Process events and append to file
  if [ "$FORMAT" = "csv" ]; then
    # CSV format. Build an array and let @csv do the quoting/escaping; the
    # stream name is passed via --arg so it is never spliced into the jq
    # program text. (The previous '.message | @csv' form was invalid: @csv
    # requires an array input, so every CSV run failed.)
    echo "$STREAM_LOGS" | jq -r --arg stream "$LOG_STREAM" \
      '.events[] | [.timestamp, $stream, .message] | @csv' >> "$OUTPUT_FILE"
  else
    # Text format
    echo "" >> "$OUTPUT_FILE"
    echo "===== LOG STREAM: $LOG_STREAM =====" >> "$OUTPUT_FILE"
    echo "$STREAM_LOGS" | jq -r '.events[] | "[\(.timestamp)] \(.message)"' >> "$OUTPUT_FILE"
  fi
done

echo "✅ Retrieved $TOTAL_EVENTS log events from $LOG_STREAM_COUNT log streams"
echo ""

# Step 3: Summary
echo "Step 3/3: Saving logs to file..."
echo "✅ Successfully saved logs to: $OUTPUT_FILE"
echo ""
echo "================================================"
echo "Summary:"
echo "  Processing Job: $PROCESSING_JOB_NAME"
echo "  Log Streams: $LOG_STREAM_COUNT"
echo "  Total Events: $TOTAL_EVENTS"
echo "  Output File: $OUTPUT_FILE"
echo "  File Size: $(du -h "$OUTPUT_FILE" | cut -f1)"
echo "================================================"
echo ""

if [ "$FORMAT" = "text" ]; then
  echo "View logs with:"
  echo "  cat $OUTPUT_FILE"
  echo "  grep 'ERROR' $OUTPUT_FILE"
  echo "  tail -100 $OUTPUT_FILE"
else
  echo "View logs with:"
  echo "  cat $OUTPUT_FILE"
  echo "  head -20 $OUTPUT_FILE"
  echo "  grep 'ERROR' $OUTPUT_FILE"
  echo "  column -t -s',' $OUTPUT_FILE | less"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment