Mazyod/gpu-utilization.py

## gpu-utilization.py
#!/usr/bin/env -S uv run
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "pyyaml>=6.0",
#     "rich>=13.0.0",
# ]
# ///

"""
Docker Compose GPU Utilization Analyzer

Parses a docker-compose.yaml file and shows GPU allocation summary
with utilization percentages from --gpu-memory-utilization flags.
"""

import argparse
import re
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set

import yaml
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich.progress import Progress, BarColumn, TextColumn
from rich.text import Text
from rich.layout import Layout
from rich import box


def parse_gpu_memory_utilization(command) -> float:
    """
    Extract the GPU memory utilization fraction from a service command.

    Args:
        command: The service ``command`` value, either a single string or a
            list of arguments (list items are stringified and joined with
            spaces before searching).

    Returns:
        float: Utilization as a fraction. Values greater than 1 are treated
        as percentages and divided by 100 (e.g. 90 -> 0.9). Returns 0.0 when
        the flag is absent or its value is not a valid number.
    """
    if isinstance(command, list):
        command_str = " ".join(str(arg) for arg in command)
    else:
        command_str = str(command)

    # Accepts both "--gpu-memory-utilization 0.9" and "--gpu-memory-utilization=0.9".
    match = re.search(r'--gpu-memory-utilization[=\s]+([0-9.]+)', command_str)
    if match is None:
        return 0.0

    try:
        value = float(match.group(1))
    except ValueError:
        # The character class also matches malformed text such as "1.2.3"
        # or "."; treat that the same as "flag not found" instead of crashing.
        return 0.0

    # A value above 1 is assumed to be a percentage (e.g. 90 means 90%).
    return value / 100.0 if value > 1 else value


def parse_gpu_devices(service_config: dict) -> List[str]:
    """
    Extract GPU device IDs from a service configuration.

    Reads ``deploy.resources.reservations.devices`` and collects the
    ``device_ids`` of every entry whose ``driver`` is ``'nvidia'``.

    Args:
        service_config: Service configuration dictionary

    Returns:
        List of GPU device IDs as strings, in declaration order. Empty when
        any level of the path is missing, null, or not of the expected type.
    """
    # Walk down the nested mapping; a missing or null level means "no GPUs".
    node = service_config
    for key in ('deploy', 'resources', 'reservations'):
        if not isinstance(node, dict):
            return []
        node = node.get(key)
    if not isinstance(node, dict):
        return []

    devices = node.get('devices')
    if not isinstance(devices, list):
        return []

    gpu_ids: List[str] = []
    for device in devices:
        if not isinstance(device, dict) or device.get('driver') != 'nvidia':
            continue

        device_ids = device.get('device_ids')
        if device_ids is None:
            # Key absent or explicitly null: nothing to record (and in
            # particular do not record the string "None" as a GPU ID).
            continue

        # device_ids can be a list or a single scalar
        if isinstance(device_ids, list):
            gpu_ids.extend(str(did) for did in device_ids)
        else:
            gpu_ids.append(str(device_ids))

    return gpu_ids


def analyze_docker_compose(file_path: Path) -> Dict:
    """
    Analyze a docker-compose file for GPU memory allocation.

    A service contributes to a GPU only when it both reserves that GPU
    (see parse_gpu_devices) and its command carries a positive
    --gpu-memory-utilization value (see parse_gpu_memory_utilization).

    Args:
        file_path: Path to docker-compose.yaml file

    Returns:
        Dictionary keyed by GPU ID (sorted as strings). Each value holds
        'allocations' (list of {'service', 'utilization'} dicts),
        'total_utilization' (sum of the utilizations) and 'remaining'
        (1.0 minus the total, floored at 0). Empty when the file is empty
        or declares no matching services.

    Raises:
        ValueError: If the document or its 'services' entry is not a mapping.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        compose_data = yaml.safe_load(f)

    # An empty YAML document parses to None.
    if compose_data is None:
        return {}
    if not isinstance(compose_data, dict):
        raise ValueError("docker-compose file must contain a mapping at the top level")

    # "services:" with no value parses to None; treat it as no services.
    services = compose_data.get('services') or {}
    if not isinstance(services, dict):
        raise ValueError("'services' must be a mapping of service name to configuration")

    # Track GPU allocations: {gpu_id: [{'service': ..., 'utilization': ...}]}
    gpu_allocations = defaultdict(list)

    for service_name, service_config in services.items():
        if not isinstance(service_config, dict):
            continue

        command = service_config.get('command')
        if not command:
            continue

        utilization = parse_gpu_memory_utilization(command)
        if utilization <= 0:
            continue

        for gpu_id in parse_gpu_devices(service_config):
            gpu_allocations[gpu_id].append({
                'service': service_name,
                'utilization': utilization
            })

    # Calculate totals per GPU
    gpu_summary = {}
    for gpu_id in sorted(gpu_allocations):
        allocations = gpu_allocations[gpu_id]
        # Round away binary floating-point noise so that fractions adding up
        # to exactly 100% are not reported as marginally above 1.0.
        total_utilization = round(
            sum(alloc['utilization'] for alloc in allocations), 9
        )
        gpu_summary[gpu_id] = {
            'allocations': allocations,
            'total_utilization': total_utilization,
            'remaining': max(0.0, 1.0 - total_utilization)
        }

    return gpu_summary


def create_gpu_table(gpu_summary: Dict, console: Console) -> Table:
    """
    Build the rich table listing every GPU and the services allocated to it.

    Args:
        gpu_summary: Mapping of GPU ID to a dict with 'allocations' (list of
            {'service', 'utilization'} dicts), 'total_utilization' and
            'remaining', as produced by analyze_docker_compose.
        console: Not used by this function; kept so existing callers that
            pass a console keep working.

    Returns:
        Table with one row per (GPU, service) pair. The GPU ID, total,
        remaining capacity and status appear only on the first row of each
        GPU; a section separator is placed between consecutive GPUs.
    """
    table = Table(
        title="🎮 GPU Allocation Summary",
        box=box.ROUNDED,
        title_style="bold magenta",
        show_header=True,
        header_style="bold cyan"
    )

    table.add_column("GPU ID", style="cyan", justify="center", width=10)
    table.add_column("Service", style="yellow")
    table.add_column("Utilization", justify="right", style="green")
    table.add_column("Total Used", justify="right", style="bold blue")
    table.add_column("Remaining", justify="right", style="magenta")
    table.add_column("Status", justify="center")

    if not gpu_summary:
        table.add_row("—", "No GPU allocations found", "—", "—", "—", "❌")
        return table

    # Sort once so the "is this the last GPU" check below refers to the same
    # order the rows are emitted in, regardless of the dict's insertion order.
    ordered = sorted(gpu_summary.items())
    last_index = len(ordered) - 1

    for index, (gpu_id, data) in enumerate(ordered):
        total_util = data['total_utilization']
        remaining = data['remaining']

        # Status thresholds: >100% over, >90% high, >50% ok, otherwise low
        if total_util > 1.0:
            status, status_style = "⚠️  OVER", "bold red"
        elif total_util > 0.9:
            status, status_style = "⚡ HIGH", "bold yellow"
        elif total_util > 0.5:
            status, status_style = "✓ OK", "bold green"
        else:
            status, status_style = "✓ LOW", "bold green"

        for row, alloc in enumerate(data['allocations']):
            service_util = f"{alloc['utilization']*100:.1f}%"
            if row == 0:
                # First row includes GPU ID and totals
                table.add_row(
                    f"[bold]{gpu_id}[/bold]",
                    alloc['service'],
                    service_util,
                    f"[bold]{total_util*100:.1f}%[/bold]",
                    f"{remaining*100:.1f}%",
                    f"[{status_style}]{status}[/{status_style}]"
                )
            else:
                # Subsequent rows for the same GPU only name the service
                table.add_row("", alloc['service'], service_util, "", "", "")

        # Add separator between GPUs if not last
        if index != last_index:
            table.add_section()

    return table


def create_utilization_bars(gpu_summary: Dict, console: Console):
    """
    Print one text progress bar per GPU showing its total utilization.

    Args:
        gpu_summary: Mapping of GPU ID to a dict containing at least
            'total_utilization' (a fraction, where 1.0 means 100%).
        console: Rich console the heading and bars are printed to.

    Returns:
        None. Nothing is printed when gpu_summary is empty.
    """
    if not gpu_summary:
        return

    console.print("\n[bold cyan]📊 GPU Memory Utilization Bars[/bold cyan]\n")

    bar_width = 40

    for gpu_id, data in sorted(gpu_summary.items()):
        total_util = data['total_utilization']

        # Determine color based on utilization
        if total_util > 1.0:
            color = "red"
        elif total_util > 0.9:
            color = "yellow"
        elif total_util > 0.5:
            color = "green"
        else:
            color = "blue"

        # The bar is capped at full width even when the GPU is overallocated.
        filled = int(min(total_util, 1.0) * bar_width)
        bar = "█" * filled + "░" * (bar_width - filled)

        text = Text(f"GPU {gpu_id}: ", style="bold cyan")
        # Text.append() inserts its argument literally and does not parse
        # console markup, so the color must be passed through `style`.
        text.append(bar, style=color)
        text.append(" ")
        text.append(f"{total_util*100:.1f}%", style=f"bold {color}")

        if total_util > 1.0:
            text.append(" ⚠️  OVERALLOCATED!", style="bold red")

        console.print(text)


def create_statistics_panel(gpu_summary: Dict) -> Panel:
    """
    Summarize the whole analysis in a single rich panel.

    Args:
        gpu_summary: Mapping of GPU ID to a dict with 'allocations' and
            'total_utilization'.

    Returns:
        Panel showing the GPU count, the number of allocation entries, the
        mean total utilization, and how many GPUs fall into the
        well / highly (>90%) / over (>100%) utilized buckets. A yellow
        placeholder panel is returned when gpu_summary is empty.
    """
    if not gpu_summary:
        return Panel(
            "[yellow]No GPU allocations found in docker-compose.yaml[/yellow]",
            title="📈 Statistics",
            border_style="yellow",
        )

    entries = list(gpu_summary.values())
    totals = [entry['total_utilization'] for entry in entries]

    gpu_count = len(gpu_summary)
    over_count = len([t for t in totals if t > 1.0])
    high_count = len([t for t in totals if 0.9 < t <= 1.0])
    mean_util = sum(totals) / gpu_count
    service_count = sum(len(entry['allocations']) for entry in entries)

    # Everything that is neither over nor highly utilized counts as "well".
    well_count = gpu_count - over_count - high_count

    lines = [
        f"[bold cyan]Total GPUs:[/bold cyan] {gpu_count}",
        f"[bold cyan]Total Services:[/bold cyan] {service_count}",
        f"[bold cyan]Average Utilization:[/bold cyan] {mean_util*100:.1f}%",
        "",
        f"[bold green]Well Utilized:[/bold green] {well_count}",
        f"[bold yellow]Highly Utilized (>90%):[/bold yellow] {high_count}",
        f"[bold red]Overallocated (>100%):[/bold red] {over_count}",
    ]
    body = "\n".join(lines) + "\n"

    return Panel(body, title="📈 Statistics", border_style="cyan", box=box.ROUNDED)


def main():
    """
    Command-line entry point.

    Parses the optional compose file path (default: docker-compose.yaml),
    analyzes it, and prints the statistics panel, the allocation table, the
    utilization bars and, if any GPU exceeds 100%, an overallocation alert.

    Exits with status 1 when the file is missing, cannot be read, or is not
    a valid compose document.
    """
    parser = argparse.ArgumentParser(
        description="Analyze GPU utilization from docker-compose.yaml",
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        'file',
        type=Path,
        nargs='?',
        default=Path('docker-compose.yaml'),
        help='Path to docker-compose.yaml file (default: docker-compose.yaml)'
    )

    args = parser.parse_args()

    console = Console()

    # Check if file exists
    if not args.file.exists():
        console.print(f"[bold red]Error:[/bold red] File '{args.file}' not found!")
        sys.exit(1)

    # Show loading message
    try:
        with console.status("[bold green]Analyzing docker-compose.yaml...", spinner="dots"):
            gpu_summary = analyze_docker_compose(args.file)
    except (OSError, yaml.YAMLError, ValueError) as exc:
        # Covers unreadable paths (e.g. a directory), undecodable or invalid
        # YAML, and documents that are not shaped like a compose file.
        console.print(f"[bold red]Error:[/bold red] Could not analyze '{args.file}':")
        # Parser messages may contain square brackets; print them verbatim.
        console.print(str(exc), markup=False, highlight=False)
        sys.exit(1)

    # Display results
    console.print()
    console.print(Panel.fit(
        f"[bold cyan]Docker Compose GPU Analyzer[/bold cyan]\n"
        f"File: [yellow]{args.file}[/yellow]",
        border_style="blue"
    ))
    console.print()

    # Statistics panel
    console.print(create_statistics_panel(gpu_summary))
    console.print()

    # Main table
    table = create_gpu_table(gpu_summary, console)
    console.print(table)

    # Utilization bars
    if gpu_summary:
        create_utilization_bars(gpu_summary, console)

    console.print()

    # Warnings for overallocation
    overallocated = {gpu_id: data for gpu_id, data in gpu_summary.items()
                     if data['total_utilization'] > 1.0}

    if overallocated:
        console.print(Panel(
            "[bold red]⚠️  Warning: Some GPUs are overallocated![/bold red]\n\n"
            + "\n".join(
                f"GPU {gpu_id}: {data['total_utilization']*100:.1f}% allocated"
                for gpu_id, data in overallocated.items()
            ),
            title="⚠️  Overallocation Alert",
            border_style="red",
            box=box.DOUBLE
        ))


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
	#!/usr/bin/env -S uv run
	# /// script
	# requires-python = ">=3.10"
	# dependencies = [
	# "pyyaml>=6.0",
	# "rich>=13.0.0",
	# ]
	# ///

	"""
	Docker Compose GPU Utilization Analyzer

	Parses a docker-compose.yaml file and shows GPU allocation summary
	with utilization percentages from --gpu-memory-utilization flags.
	"""

	import argparse
	import re
	import sys
	from collections import defaultdict
	from pathlib import Path
	from typing import Dict, List, Set

	import yaml
	from rich.console import Console
	from rich.panel import Panel
	from rich.table import Table
	from rich.progress import Progress, BarColumn, TextColumn
	from rich.text import Text
	from rich.layout import Layout
	from rich import box


	def parse_gpu_memory_utilization(command) -> float:
	    """
	    Extract the GPU memory utilization fraction from a service command.

	    Args:
	        command: The service ``command`` value, either a single string or a
	            list of arguments (list items are stringified and joined with
	            spaces before searching).

	    Returns:
	        float: Utilization as a fraction. Values greater than 1 are treated
	        as percentages and divided by 100 (e.g. 90 -> 0.9). Returns 0.0 when
	        the flag is absent or its value is not a valid number.
	    """
	    if isinstance(command, list):
	        command_str = " ".join(str(arg) for arg in command)
	    else:
	        command_str = str(command)

	    # Accepts both "--gpu-memory-utilization 0.9" and "--gpu-memory-utilization=0.9".
	    match = re.search(r'--gpu-memory-utilization[=\s]+([0-9.]+)', command_str)
	    if match is None:
	        return 0.0

	    try:
	        value = float(match.group(1))
	    except ValueError:
	        # The character class also matches malformed text such as "1.2.3"
	        # or "."; treat that the same as "flag not found" instead of crashing.
	        return 0.0

	    # A value above 1 is assumed to be a percentage (e.g. 90 means 90%).
	    return value / 100.0 if value > 1 else value


	def parse_gpu_devices(service_config: dict) -> List[str]:
	    """
	    Extract GPU device IDs from a service configuration.

	    Reads ``deploy.resources.reservations.devices`` and collects the
	    ``device_ids`` of every entry whose ``driver`` is ``'nvidia'``.

	    Args:
	        service_config: Service configuration dictionary

	    Returns:
	        List of GPU device IDs as strings, in declaration order. Empty when
	        any level of the path is missing, null, or not of the expected type.
	    """
	    # Walk down the nested mapping; a missing or null level means "no GPUs".
	    node = service_config
	    for key in ('deploy', 'resources', 'reservations'):
	        if not isinstance(node, dict):
	            return []
	        node = node.get(key)
	    if not isinstance(node, dict):
	        return []

	    devices = node.get('devices')
	    if not isinstance(devices, list):
	        return []

	    gpu_ids: List[str] = []
	    for device in devices:
	        if not isinstance(device, dict) or device.get('driver') != 'nvidia':
	            continue

	        device_ids = device.get('device_ids')
	        if device_ids is None:
	            # Key absent or explicitly null: nothing to record (and in
	            # particular do not record the string "None" as a GPU ID).
	            continue

	        # device_ids can be a list or a single scalar
	        if isinstance(device_ids, list):
	            gpu_ids.extend(str(did) for did in device_ids)
	        else:
	            gpu_ids.append(str(device_ids))

	    return gpu_ids


	def analyze_docker_compose(file_path: Path) -> Dict:
	    """
	    Analyze a docker-compose file for GPU memory allocation.

	    A service contributes to a GPU only when it both reserves that GPU
	    (see parse_gpu_devices) and its command carries a positive
	    --gpu-memory-utilization value (see parse_gpu_memory_utilization).

	    Args:
	        file_path: Path to docker-compose.yaml file

	    Returns:
	        Dictionary keyed by GPU ID (sorted as strings). Each value holds
	        'allocations' (list of {'service', 'utilization'} dicts),
	        'total_utilization' (sum of the utilizations) and 'remaining'
	        (1.0 minus the total, floored at 0). Empty when the file is empty
	        or declares no matching services.

	    Raises:
	        ValueError: If the document or its 'services' entry is not a mapping.
	    """
	    with open(file_path, 'r', encoding='utf-8') as f:
	        compose_data = yaml.safe_load(f)

	    # An empty YAML document parses to None.
	    if compose_data is None:
	        return {}
	    if not isinstance(compose_data, dict):
	        raise ValueError("docker-compose file must contain a mapping at the top level")

	    # "services:" with no value parses to None; treat it as no services.
	    services = compose_data.get('services') or {}
	    if not isinstance(services, dict):
	        raise ValueError("'services' must be a mapping of service name to configuration")

	    # Track GPU allocations: {gpu_id: [{'service': ..., 'utilization': ...}]}
	    gpu_allocations = defaultdict(list)

	    for service_name, service_config in services.items():
	        if not isinstance(service_config, dict):
	            continue

	        command = service_config.get('command')
	        if not command:
	            continue

	        utilization = parse_gpu_memory_utilization(command)
	        if utilization <= 0:
	            continue

	        for gpu_id in parse_gpu_devices(service_config):
	            gpu_allocations[gpu_id].append({
	                'service': service_name,
	                'utilization': utilization
	            })

	    # Calculate totals per GPU
	    gpu_summary = {}
	    for gpu_id in sorted(gpu_allocations):
	        allocations = gpu_allocations[gpu_id]
	        # Round away binary floating-point noise so that fractions adding up
	        # to exactly 100% are not reported as marginally above 1.0.
	        total_utilization = round(
	            sum(alloc['utilization'] for alloc in allocations), 9
	        )
	        gpu_summary[gpu_id] = {
	            'allocations': allocations,
	            'total_utilization': total_utilization,
	            'remaining': max(0.0, 1.0 - total_utilization)
	        }

	    return gpu_summary


	def create_gpu_table(gpu_summary: Dict, console: Console) -> Table:
	    """
	    Build the rich table listing every GPU and the services allocated to it.

	    Args:
	        gpu_summary: Mapping of GPU ID to a dict with 'allocations' (list of
	            {'service', 'utilization'} dicts), 'total_utilization' and
	            'remaining', as produced by analyze_docker_compose.
	        console: Not used by this function; kept so existing callers that
	            pass a console keep working.

	    Returns:
	        Table with one row per (GPU, service) pair. The GPU ID, total,
	        remaining capacity and status appear only on the first row of each
	        GPU; a section separator is placed between consecutive GPUs.
	    """
	    table = Table(
	        title="🎮 GPU Allocation Summary",
	        box=box.ROUNDED,
	        title_style="bold magenta",
	        show_header=True,
	        header_style="bold cyan"
	    )

	    table.add_column("GPU ID", style="cyan", justify="center", width=10)
	    table.add_column("Service", style="yellow")
	    table.add_column("Utilization", justify="right", style="green")
	    table.add_column("Total Used", justify="right", style="bold blue")
	    table.add_column("Remaining", justify="right", style="magenta")
	    table.add_column("Status", justify="center")

	    if not gpu_summary:
	        table.add_row("—", "No GPU allocations found", "—", "—", "—", "❌")
	        return table

	    # Sort once so the "is this the last GPU" check below refers to the same
	    # order the rows are emitted in, regardless of the dict's insertion order.
	    ordered = sorted(gpu_summary.items())
	    last_index = len(ordered) - 1

	    for index, (gpu_id, data) in enumerate(ordered):
	        total_util = data['total_utilization']
	        remaining = data['remaining']

	        # Status thresholds: >100% over, >90% high, >50% ok, otherwise low
	        if total_util > 1.0:
	            status, status_style = "⚠️  OVER", "bold red"
	        elif total_util > 0.9:
	            status, status_style = "⚡ HIGH", "bold yellow"
	        elif total_util > 0.5:
	            status, status_style = "✓ OK", "bold green"
	        else:
	            status, status_style = "✓ LOW", "bold green"

	        for row, alloc in enumerate(data['allocations']):
	            service_util = f"{alloc['utilization']*100:.1f}%"
	            if row == 0:
	                # First row includes GPU ID and totals
	                table.add_row(
	                    f"[bold]{gpu_id}[/bold]",
	                    alloc['service'],
	                    service_util,
	                    f"[bold]{total_util*100:.1f}%[/bold]",
	                    f"{remaining*100:.1f}%",
	                    f"[{status_style}]{status}[/{status_style}]"
	                )
	            else:
	                # Subsequent rows for the same GPU only name the service
	                table.add_row("", alloc['service'], service_util, "", "", "")

	        # Add separator between GPUs if not last
	        if index != last_index:
	            table.add_section()

	    return table


	def create_utilization_bars(gpu_summary: Dict, console: Console):
	    """
	    Print one text progress bar per GPU showing its total utilization.

	    Args:
	        gpu_summary: Mapping of GPU ID to a dict containing at least
	            'total_utilization' (a fraction, where 1.0 means 100%).
	        console: Rich console the heading and bars are printed to.

	    Returns:
	        None. Nothing is printed when gpu_summary is empty.
	    """
	    if not gpu_summary:
	        return

	    console.print("\n[bold cyan]📊 GPU Memory Utilization Bars[/bold cyan]\n")

	    bar_width = 40

	    for gpu_id, data in sorted(gpu_summary.items()):
	        total_util = data['total_utilization']

	        # Determine color based on utilization
	        if total_util > 1.0:
	            color = "red"
	        elif total_util > 0.9:
	            color = "yellow"
	        elif total_util > 0.5:
	            color = "green"
	        else:
	            color = "blue"

	        # The bar is capped at full width even when the GPU is overallocated.
	        filled = int(min(total_util, 1.0) * bar_width)
	        bar = "█" * filled + "░" * (bar_width - filled)

	        text = Text(f"GPU {gpu_id}: ", style="bold cyan")
	        # Text.append() inserts its argument literally and does not parse
	        # console markup, so the color must be passed through `style`.
	        text.append(bar, style=color)
	        text.append(" ")
	        text.append(f"{total_util*100:.1f}%", style=f"bold {color}")

	        if total_util > 1.0:
	            text.append(" ⚠️  OVERALLOCATED!", style="bold red")

	        console.print(text)


	def create_statistics_panel(gpu_summary: Dict) -> Panel:
	    """
	    Summarize the whole analysis in a single rich panel.

	    Args:
	        gpu_summary: Mapping of GPU ID to a dict with 'allocations' and
	            'total_utilization'.

	    Returns:
	        Panel showing the GPU count, the number of allocation entries, the
	        mean total utilization, and how many GPUs fall into the
	        well / highly (>90%) / over (>100%) utilized buckets. A yellow
	        placeholder panel is returned when gpu_summary is empty.
	    """
	    if not gpu_summary:
	        return Panel(
	            "[yellow]No GPU allocations found in docker-compose.yaml[/yellow]",
	            title="📈 Statistics",
	            border_style="yellow",
	        )

	    entries = list(gpu_summary.values())
	    totals = [entry['total_utilization'] for entry in entries]

	    gpu_count = len(gpu_summary)
	    over_count = len([t for t in totals if t > 1.0])
	    high_count = len([t for t in totals if 0.9 < t <= 1.0])
	    mean_util = sum(totals) / gpu_count
	    service_count = sum(len(entry['allocations']) for entry in entries)

	    # Everything that is neither over nor highly utilized counts as "well".
	    well_count = gpu_count - over_count - high_count

	    # Built from a list so source indentation can never leak into the text.
	    lines = [
	        f"[bold cyan]Total GPUs:[/bold cyan] {gpu_count}",
	        f"[bold cyan]Total Services:[/bold cyan] {service_count}",
	        f"[bold cyan]Average Utilization:[/bold cyan] {mean_util*100:.1f}%",
	        "",
	        f"[bold green]Well Utilized:[/bold green] {well_count}",
	        f"[bold yellow]Highly Utilized (>90%):[/bold yellow] {high_count}",
	        f"[bold red]Overallocated (>100%):[/bold red] {over_count}",
	    ]
	    body = "\n".join(lines) + "\n"

	    return Panel(body, title="📈 Statistics", border_style="cyan", box=box.ROUNDED)


	def main():
	    """
	    Command-line entry point.

	    Parses the optional compose file path (default: docker-compose.yaml),
	    analyzes it, and prints the statistics panel, the allocation table, the
	    utilization bars and, if any GPU exceeds 100%, an overallocation alert.

	    Exits with status 1 when the file is missing, cannot be read, or is not
	    a valid compose document.
	    """
	    parser = argparse.ArgumentParser(
	        description="Analyze GPU utilization from docker-compose.yaml",
	        formatter_class=argparse.RawDescriptionHelpFormatter
	    )
	    parser.add_argument(
	        'file',
	        type=Path,
	        nargs='?',
	        default=Path('docker-compose.yaml'),
	        help='Path to docker-compose.yaml file (default: docker-compose.yaml)'
	    )

	    args = parser.parse_args()

	    console = Console()

	    # Check if file exists
	    if not args.file.exists():
	        console.print(f"[bold red]Error:[/bold red] File '{args.file}' not found!")
	        sys.exit(1)

	    # Show loading message
	    try:
	        with console.status("[bold green]Analyzing docker-compose.yaml...", spinner="dots"):
	            gpu_summary = analyze_docker_compose(args.file)
	    except (OSError, yaml.YAMLError, ValueError) as exc:
	        # Covers unreadable paths (e.g. a directory), undecodable or invalid
	        # YAML, and documents that are not shaped like a compose file.
	        console.print(f"[bold red]Error:[/bold red] Could not analyze '{args.file}':")
	        # Parser messages may contain square brackets; print them verbatim.
	        console.print(str(exc), markup=False, highlight=False)
	        sys.exit(1)

	    # Display results
	    console.print()
	    console.print(Panel.fit(
	        f"[bold cyan]Docker Compose GPU Analyzer[/bold cyan]\n"
	        f"File: [yellow]{args.file}[/yellow]",
	        border_style="blue"
	    ))
	    console.print()

	    # Statistics panel
	    console.print(create_statistics_panel(gpu_summary))
	    console.print()

	    # Main table
	    table = create_gpu_table(gpu_summary, console)
	    console.print(table)

	    # Utilization bars
	    if gpu_summary:
	        create_utilization_bars(gpu_summary, console)

	    console.print()

	    # Warnings for overallocation
	    overallocated = {gpu_id: data for gpu_id, data in gpu_summary.items()
	                     if data['total_utilization'] > 1.0}

	    if overallocated:
	        console.print(Panel(
	            "[bold red]⚠️  Warning: Some GPUs are overallocated![/bold red]\n\n"
	            + "\n".join(
	                f"GPU {gpu_id}: {data['total_utilization']*100:.1f}% allocated"
	                for gpu_id, data in overallocated.items()
	            ),
	            title="⚠️  Overallocation Alert",
	            border_style="red",
	            box=box.DOUBLE
	        ))


	# Run the CLI only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()
No results found