Skip to content

Instantly share code, notes, and snippets.

@bwasti
Created September 23, 2025 15:33
Show Gist options
  • Select an option

  • Save bwasti/e94ee02e6c55dd7e48f08b33cf21eea2 to your computer and use it in GitHub Desktop.

Select an option

Save bwasti/e94ee02e6c55dd7e48f08b33cf21eea2 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Real-time GPU Process Monitor with TensorCore Inference
Monitors all GPU processes and infers TensorCore usage based on workload patterns
"""
import subprocess
import json
import time
import psutil
import signal
import sys
from datetime import datetime
from collections import defaultdict
class GPUMonitor:
    """Poll nvidia-smi for GPU and per-process stats, heuristically score how
    likely each process is to be exercising TensorCores, and periodically
    launch Nsight Systems (nsys) profiling for high-confidence candidates.
    """

    def __init__(self):
        # Per-PID history buckets (collected for future use; not read yet).
        self.process_history = defaultdict(list)
        self.running = True
        # Allow a clean shutdown of the polling loop on Ctrl+C or kill.
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGTERM, self._signal_handler)

    def _signal_handler(self, signum, frame):
        """Flip the run flag so the main loop exits at its next iteration."""
        print("\nShutting down monitor...")
        self.running = False

    def get_gpu_info(self):
        """Return a list of per-GPU stat dicts (index, utilization, temp, power).

        Returns:
            list[dict]: one entry per GPU, or an empty list when nvidia-smi
            is missing, fails, or emits unparseable output.
        """
        try:
            # BUG FIX: the NVIDIA management tool is 'nvidia-smi', not 'nvidia-sml'.
            result = subprocess.run([
                'nvidia-smi',
                '--query-gpu=index,utilization.gpu,utilization.memory,temperature.gpu,power.draw',
                '--format=csv,noheader,nounits'
            ], capture_output=True, text=True, check=True)
            gpus = []
            for line in result.stdout.strip().split('\n'):
                if line:
                    idx, gpu_util, mem_util, temp, power = line.split(', ')
                    gpus.append({
                        'index': int(idx),
                        'gpu_utilization': int(gpu_util),
                        'memory_utilization': int(mem_util),
                        'temperature': int(temp),
                        'power_draw': float(power)
                    })
            return gpus
        except (subprocess.CalledProcessError, ValueError, FileNotFoundError):
            # Missing binary, non-zero exit, or unexpected CSV shape:
            # treat all as "no GPUs visible" rather than crashing the loop.
            return []

    def get_gpu_processes(self):
        """Return a list of compute processes currently using any GPU.

        Returns:
            list[dict]: entries with pid, name, gpu_memory_mb, gpu_id;
            empty list on any query/parse failure.
        """
        try:
            # BUG FIX: 'nvidia-smi', not 'nvidia-sml'.
            result = subprocess.run([
                'nvidia-smi',
                '--query-compute-apps=pid,process_name,used_gpu_memory,gpu_instance_id',
                '--format=csv,noheader,nounits'
            ], capture_output=True, text=True, check=True)
            processes = []
            for line in result.stdout.strip().split('\n'):
                if line and line != 'No running processes found':
                    parts = line.split(', ')
                    if len(parts) >= 3:
                        pid = int(parts[0])
                        name = parts[1]
                        gpu_memory = int(parts[2])
                        # gpu_instance_id is 'N/A' outside MIG mode; map to 0.
                        gpu_id = int(parts[3]) if len(parts) > 3 and parts[3] != 'N/A' else 0
                        processes.append({
                            'pid': pid,
                            'name': name,
                            'gpu_memory_mb': gpu_memory,
                            'gpu_id': gpu_id
                        })
            return processes
        except (subprocess.CalledProcessError, ValueError, IndexError, FileNotFoundError):
            return []

    def get_process_details(self, pid):
        """Return CPU/memory/command details for *pid* via psutil.

        Returns:
            dict | None: None when the process vanished or access is denied.
        """
        try:
            proc = psutil.Process(pid)
            return {
                'cpu_percent': proc.cpu_percent(),
                'memory_mb': proc.memory_info().rss / 1024 / 1024,
                'command': ' '.join(proc.cmdline()[:3]),  # first 3 argv parts
                'create_time': proc.create_time()
            }
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            return None

    def infer_tensorcore_usage(self, process_info, gpu_info):
        """Heuristically score how likely a process uses TensorCores.

        Signals combined: ML framework names in the process/command, ML
        workload keywords, GPU utilization level, and GPU memory footprint.
        NOTE(review): this is pattern inference only — no hardware counters
        are read, so it cannot confirm actual TensorCore activity.

        Args:
            process_info: dict with 'name', 'gpu_memory_mb' and optionally
                'command' (as produced by get_gpu_processes + details).
            gpu_info: dict with 'gpu_utilization' for the process's GPU.

        Returns:
            tuple(dict, int): the indicator breakdown (with a 'confidence'
            label of 'Low'/'Medium'/'High') and the total numeric score.
        """
        tensorcore_indicators = {
            'framework_score': 0,
            'workload_score': 0,
            'utilization_score': 0,
            'confidence': 'Low'
        }
        process_name = process_info['name'].lower()
        command = process_info.get('command', '').lower()

        # Framework detection: common ML runtimes in the name or command line.
        ml_frameworks = ['python', 'pytorch', 'tensorflow', 'triton', 'tensorrt']
        if any(fw in process_name or fw in command for fw in ml_frameworks):
            tensorcore_indicators['framework_score'] = 30

        # Model training/inference keywords in the command line.
        ml_keywords = ['train', 'inference', 'model', 'bert', 'gpt', 'transformer', 'resnet']
        if any(kw in command for kw in ml_keywords):
            tensorcore_indicators['workload_score'] = 25

        # TensorCore-heavy workloads tend to sustain high GPU utilization.
        gpu_util = gpu_info['gpu_utilization']
        if gpu_util > 80:
            tensorcore_indicators['utilization_score'] = 30
        elif gpu_util > 50:
            tensorcore_indicators['utilization_score'] = 20
        elif gpu_util > 20:
            tensorcore_indicators['utilization_score'] = 10

        # Large GPU memory footprint (> 1 GB) suggests an ML workload.
        gpu_memory = process_info['gpu_memory_mb']
        if gpu_memory > 1000:
            tensorcore_indicators['workload_score'] += 15

        # BUG FIX: the original was a SyntaxError (invalid generator of the
        # form `sum(d.values() if k != ... for k in d)`), and would also have
        # tried to sum the 'confidence' string. Sum only the numeric scores.
        total_score = sum(v for k, v in tensorcore_indicators.items()
                          if k != 'confidence')

        if total_score > 70:
            tensorcore_indicators['confidence'] = 'High'
        elif total_score > 40:
            tensorcore_indicators['confidence'] = 'Medium'
        else:
            tensorcore_indicators['confidence'] = 'Low'
        return tensorcore_indicators, total_score

    def profile_with_nsys(self, pid, duration=5):
        """Launch a detached nsys profiling session attached to *pid*.

        Args:
            pid: target process id.
            duration: capture length in seconds.

        Returns:
            str | None: the .nsys-rep output filename, or None on failure.
        """
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = f"profile_pid_{pid}_{timestamp}.nsys-rep"
            cmd = [
                'nsys', 'profile',
                '--trace=cuda,nvtx',
                '--force-overwrite=true',
                f'--output={output_file}',
                f'--target-processes={pid}',
                f'--duration={duration}',
                '--capture-range=cudaProfilerApi',
                '--stop-on-exit=false'
            ]
            print(f" Launching nsys profile for PID {pid} -> {output_file}")
            # Fire-and-forget: the monitor loop must not block on profiling.
            subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            return output_file
        except Exception as e:
            print(f" Failed to launch nsys for PID {pid}: {e}")
            return None

    def display_status(self, gpu_infos, processes_with_details):
        """Print a formatted snapshot of GPU state and per-process scores."""
        print(f"\n{'='*80}")
        print(f"GPU TensorCore Monitor - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"{'='*80}")
        # GPU summary lines.
        print("\nGPU Status:")
        for gpu in gpu_infos:
            print(f" GPU {gpu['index']}: {gpu['gpu_utilization']:3d}% util, "
                  f"{gpu['memory_utilization']:3d}% mem, {gpu['temperature']:2d}°C, "
                  f"{gpu['power_draw']:5.1f}W")
        # Per-process table.
        print(f"\nRunning GPU Processes ({len(processes_with_details)}):")
        print(f"{'PID':<8} {'Process':<20} {'GPU':<3} {'GPU Mem':<8} {'CPU%':<6} {'TC Conf':<8} {'Score':<5} {'Command':<30}")
        print("-" * 100)
        for proc in processes_with_details:
            # Fall back to GPU 0 stats when the reported gpu_id is out of range.
            tc_info, score = self.infer_tensorcore_usage(
                proc,
                gpu_infos[proc['gpu_id']] if proc['gpu_id'] < len(gpu_infos) else gpu_infos[0])
            print(f"{proc['pid']:<8} {proc['name'][:19]:<20} {proc['gpu_id']:<3} "
                  f"{proc['gpu_memory_mb']:<8} {proc['cpu_percent']:<6.1f} "
                  f"{tc_info['confidence']:<8} {score:<5} {proc['command'][:29]:<30}")

    def run(self, profile_interval=30):
        """Main monitoring loop: poll, score, display, and profile.

        Args:
            profile_interval: minimum seconds between nsys profiling rounds.
        """
        print("Starting GPU TensorCore Monitor...")
        print("Press Ctrl+C to stop")
        last_profile_time = 0
        profile_candidates = set()
        while self.running:
            try:
                # Snapshot GPU and compute-process state.
                gpu_infos = self.get_gpu_info()
                gpu_processes = self.get_gpu_processes()
                if not gpu_infos:
                    # BUG FIX: message referenced 'nvidia-sml'; the tool is nvidia-smi.
                    print("No GPUs found or nvidia-smi not available")
                    time.sleep(5)
                    continue
                # Enrich each GPU process with CPU-side details.
                processes_with_details = []
                for proc in gpu_processes:
                    details = self.get_process_details(proc['pid'])
                    if details:
                        proc.update(details)
                        processes_with_details.append(proc)
                        # Track confident TensorCore users for profiling.
                        tc_info, score = self.infer_tensorcore_usage(
                            proc,
                            gpu_infos[proc['gpu_id']] if proc['gpu_id'] < len(gpu_infos) else gpu_infos[0])
                        if tc_info['confidence'] in ['High', 'Medium'] and score > 60:
                            profile_candidates.add(proc['pid'])
                self.display_status(gpu_infos, processes_with_details)
                # Periodic nsys profiling of accumulated candidates, gated on
                # nsys being installed (checked via `which`).
                current_time = time.time()
                if (current_time - last_profile_time > profile_interval and
                        profile_candidates and
                        subprocess.run(['which', 'nsys'], capture_output=True).returncode == 0):
                    print(f"\nLaunching detailed profiling for {len(profile_candidates)} processes...")
                    for pid in list(profile_candidates):
                        self.profile_with_nsys(pid, duration=10)
                    last_profile_time = current_time
                    profile_candidates.clear()
                time.sleep(2)
            except KeyboardInterrupt:
                break
            except Exception as e:
                # Keep the monitor alive across transient query failures.
                print(f"Error in monitoring loop: {e}")
                time.sleep(5)
        print("Monitor stopped.")
def main():
    """CLI entry point: parse args, verify nvidia-smi exists, run the monitor."""
    import argparse
    parser = argparse.ArgumentParser(description='GPU TensorCore Process Monitor')
    parser.add_argument('--profile-interval', type=int, default=60,
                        help='Interval between nsys profiling sessions (seconds)')
    args = parser.parse_args()
    # Fail fast when the NVIDIA management tool is not installed.
    try:
        # BUG FIX: the binary is 'nvidia-smi', not 'nvidia-sml'.
        subprocess.run(['nvidia-smi', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Error: nvidia-smi not found. Please install NVIDIA drivers.")
        sys.exit(1)
    monitor = GPUMonitor()
    monitor.run(profile_interval=args.profile_interval)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment