Created
September 23, 2025 15:33
-
-
Save bwasti/e94ee02e6c55dd7e48f08b33cf21eea2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Real-time GPU Process Monitor with TensorCore Inference | |
| Monitors all GPU processes and infers TensorCore usage based on workload patterns | |
| """ | |
import json
import shutil
import signal
import subprocess
import sys
import time
from collections import defaultdict
from datetime import datetime

import psutil
class GPUMonitor:
    """Poll ``nvidia-smi`` for GPU and process statistics, heuristically flag
    processes that look like TensorCore (ML) workloads, and periodically launch
    ``nsys`` profiling sessions for high-confidence candidates.
    """

    def __init__(self):
        # Per-PID history buckets (reserved for trend tracking).
        self.process_history = defaultdict(list)
        # Loop flag; cleared by the signal handler for a graceful shutdown.
        self.running = True
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGTERM, self._signal_handler)

    def _signal_handler(self, signum, frame):
        """Request a clean shutdown of the monitoring loop."""
        print("\nShutting down monitor...")
        self.running = False

    def get_gpu_info(self):
        """Query per-GPU utilization, temperature and power.

        Returns:
            list[dict]: one dict per GPU with keys ``index``,
            ``gpu_utilization``, ``memory_utilization``, ``temperature`` and
            ``power_draw``; empty list if the query fails.
        """
        try:
            # BUGFIX: the NVIDIA tool is `nvidia-smi`, not `nvidia-sml`.
            result = subprocess.run([
                'nvidia-smi', '--query-gpu=index,utilization.gpu,utilization.memory,temperature.gpu,power.draw',
                '--format=csv,noheader,nounits'
            ], capture_output=True, text=True, check=True)
            gpus = []
            for line in result.stdout.strip().split('\n'):
                if line:
                    idx, gpu_util, mem_util, temp, power = line.split(', ')
                    gpus.append({
                        'index': int(idx),
                        'gpu_utilization': int(gpu_util),
                        'memory_utilization': int(mem_util),
                        'temperature': int(temp),
                        'power_draw': float(power)
                    })
            return gpus
        # BUGFIX: also catch FileNotFoundError so a missing binary degrades
        # to "no GPUs" instead of crashing the caller.
        except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
            return []

    def get_gpu_processes(self):
        """Query all compute processes currently using a GPU.

        Returns:
            list[dict]: one dict per process with keys ``pid``, ``name``,
            ``gpu_memory_mb`` and ``gpu_id``; empty list if the query fails.
        """
        try:
            # BUGFIX: `nvidia-sml` -> `nvidia-smi`.
            result = subprocess.run([
                'nvidia-smi', '--query-compute-apps=pid,process_name,used_gpu_memory,gpu_instance_id',
                '--format=csv,noheader,nounits'
            ], capture_output=True, text=True, check=True)
            processes = []
            for line in result.stdout.strip().split('\n'):
                if line and line != 'No running processes found':
                    parts = line.split(', ')
                    if len(parts) >= 3:
                        pid = int(parts[0])
                        name = parts[1]
                        gpu_memory = int(parts[2])
                        # gpu_instance_id is 'N/A' on non-MIG setups.
                        gpu_id = int(parts[3]) if len(parts) > 3 and parts[3] != 'N/A' else 0
                        processes.append({
                            'pid': pid,
                            'name': name,
                            'gpu_memory_mb': gpu_memory,
                            'gpu_id': gpu_id
                        })
            return processes
        except (subprocess.CalledProcessError, FileNotFoundError, ValueError, IndexError):
            return []

    def get_process_details(self, pid):
        """Return CPU/RSS/cmdline details for *pid* via psutil, or None if the
        process vanished or access is denied."""
        try:
            proc = psutil.Process(pid)
            return {
                'cpu_percent': proc.cpu_percent(),
                'memory_mb': proc.memory_info().rss / 1024 / 1024,
                'command': ' '.join(proc.cmdline()[:3]),  # First 3 command parts
                'create_time': proc.create_time()
            }
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            return None

    def infer_tensorcore_usage(self, process_info, gpu_info):
        """Heuristically score how likely a process uses TensorCores.

        Signals combined:
        - ML framework names in the process name / command line,
        - ML keywords (train/inference/model/...) in the command line,
        - GPU utilization level,
        - GPU memory footprint (> 1 GB suggests an ML workload).

        Args:
            process_info: dict with at least ``name`` and ``gpu_memory_mb``
                (``command`` optional).
            gpu_info: dict with ``gpu_utilization``.

        Returns:
            tuple[dict, int]: the indicator breakdown (with a ``confidence``
            label of 'High'/'Medium'/'Low') and the total score.
        """
        tensorcore_indicators = {
            'framework_score': 0,
            'workload_score': 0,
            'utilization_score': 0,
            'confidence': 'Low'
        }
        process_name = process_info['name'].lower()
        command = process_info.get('command', '').lower()
        # Framework detection
        ml_frameworks = ['python', 'pytorch', 'tensorflow', 'triton', 'tensorrt']
        if any(fw in process_name or fw in command for fw in ml_frameworks):
            tensorcore_indicators['framework_score'] = 30
        # Model inference/training patterns
        ml_keywords = ['train', 'inference', 'model', 'bert', 'gpt', 'transformer', 'resnet']
        if any(kw in command for kw in ml_keywords):
            tensorcore_indicators['workload_score'] = 25
        # GPU utilization patterns (TensorCore workloads often have high utilization)
        gpu_util = gpu_info['gpu_utilization']
        if gpu_util > 80:
            tensorcore_indicators['utilization_score'] = 30
        elif gpu_util > 50:
            tensorcore_indicators['utilization_score'] = 20
        elif gpu_util > 20:
            tensorcore_indicators['utilization_score'] = 10
        # Memory usage patterns
        gpu_memory = process_info['gpu_memory_mb']
        if gpu_memory > 1000:  # > 1GB suggests ML workload
            tensorcore_indicators['workload_score'] += 15
        # BUGFIX: the original `sum(d.values() if k != ... for k in d.keys())`
        # was a SyntaxError; sum the numeric scores, skipping the label.
        total_score = sum(v for k, v in tensorcore_indicators.items()
                          if k != 'confidence')
        if total_score > 70:
            tensorcore_indicators['confidence'] = 'High'
        elif total_score > 40:
            tensorcore_indicators['confidence'] = 'Medium'
        else:
            tensorcore_indicators['confidence'] = 'Low'
        return tensorcore_indicators, total_score

    def profile_with_nsys(self, pid, duration=5):
        """Launch a detached ``nsys profile`` session attached to *pid*.

        Returns the report file name, or None if nsys could not be started.
        """
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = f"profile_pid_{pid}_{timestamp}.nsys-rep"
            cmd = [
                'nsys', 'profile',
                '--trace=cuda,nvtx',
                '--force-overwrite=true',
                f'--output={output_file}',
                f'--target-processes={pid}',
                f'--duration={duration}',
                '--capture-range=cudaProfilerApi',
                '--stop-on-exit=false'
            ]
            print(f" Launching nsys profile for PID {pid} -> {output_file}")
            # Fire-and-forget: output is discarded, the report lands on disk.
            subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            return output_file
        except Exception as e:
            print(f" Failed to launch nsys for PID {pid}: {e}")
            return None

    def display_status(self, gpu_infos, processes_with_details):
        """Print a formatted snapshot of GPU state and per-process scores."""
        print(f"\n{'='*80}")
        print(f"GPU TensorCore Monitor - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"{'='*80}")
        # GPU Summary
        print("\nGPU Status:")
        for gpu in gpu_infos:
            print(f"  GPU {gpu['index']}: {gpu['gpu_utilization']:3d}% util, "
                  f"{gpu['memory_utilization']:3d}% mem, {gpu['temperature']:2d}°C, "
                  f"{gpu['power_draw']:5.1f}W")
        # Process Details
        print(f"\nRunning GPU Processes ({len(processes_with_details)}):")
        print(f"{'PID':<8} {'Process':<20} {'GPU':<3} {'GPU Mem':<8} {'CPU%':<6} {'TC Conf':<8} {'Score':<5} {'Command':<30}")
        print("-" * 100)
        for proc in processes_with_details:
            # Fall back to GPU 0 if the reported gpu_id is out of range.
            tc_info, score = self.infer_tensorcore_usage(proc,
                gpu_infos[proc['gpu_id']] if proc['gpu_id'] < len(gpu_infos) else gpu_infos[0])
            print(f"{proc['pid']:<8} {proc['name'][:19]:<20} {proc['gpu_id']:<3} "
                  f"{proc['gpu_memory_mb']:<8} {proc['cpu_percent']:<6.1f} "
                  f"{tc_info['confidence']:<8} {score:<5} {proc['command'][:29]:<30}")

    def run(self, profile_interval=30):
        """Main monitoring loop: poll every 2 s, display status, and launch
        nsys profiling of high-confidence processes every *profile_interval*
        seconds (when nsys is installed)."""
        print("Starting GPU TensorCore Monitor...")
        print("Press Ctrl+C to stop")
        last_profile_time = 0
        profile_candidates = set()
        while self.running:
            try:
                # Get current GPU and process info
                gpu_infos = self.get_gpu_info()
                gpu_processes = self.get_gpu_processes()
                if not gpu_infos:
                    # BUGFIX: message referenced the misspelled binary name.
                    print("No GPUs found or nvidia-smi not available")
                    time.sleep(5)
                    continue
                # Enrich process info
                processes_with_details = []
                for proc in gpu_processes:
                    details = self.get_process_details(proc['pid'])
                    if details:
                        proc.update(details)
                        processes_with_details.append(proc)
                        # Track high-confidence TensorCore processes for profiling
                        gpu = (gpu_infos[proc['gpu_id']]
                               if proc['gpu_id'] < len(gpu_infos) else gpu_infos[0])
                        tc_info, score = self.infer_tensorcore_usage(proc, gpu)
                        if tc_info['confidence'] in ['High', 'Medium'] and score > 60:
                            profile_candidates.add(proc['pid'])
                # Display status
                self.display_status(gpu_infos, processes_with_details)
                # Periodic detailed profiling of high-confidence processes
                current_time = time.time()
                if (current_time - last_profile_time > profile_interval and
                        profile_candidates and
                        shutil.which('nsys') is not None):
                    print(f"\nLaunching detailed profiling for {len(profile_candidates)} processes...")
                    for pid in list(profile_candidates):
                        self.profile_with_nsys(pid, duration=10)
                    last_profile_time = current_time
                    profile_candidates.clear()
                time.sleep(2)
            except KeyboardInterrupt:
                break
            except Exception as e:
                # Keep the monitor alive across transient per-iteration errors.
                print(f"Error in monitoring loop: {e}")
                time.sleep(5)
        print("Monitor stopped.")
def main():
    """CLI entry point: parse arguments, verify nvidia-smi is available,
    then run the monitor until interrupted."""
    import argparse
    parser = argparse.ArgumentParser(description='GPU TensorCore Process Monitor')
    parser.add_argument('--profile-interval', type=int, default=60,
                        help='Interval between nsys profiling sessions (seconds)')
    args = parser.parse_args()
    # Fail fast if the NVIDIA driver tooling is missing.
    # BUGFIX: the binary is `nvidia-smi`, not `nvidia-sml`.
    try:
        subprocess.run(['nvidia-smi', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Error: nvidia-smi not found. Please install NVIDIA drivers.")
        sys.exit(1)
    monitor = GPUMonitor()
    monitor.run(profile_interval=args.profile_interval)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment