Created
September 23, 2025 15:33
-
-
Save bwasti/e94ee02e6c55dd7e48f08b33cf21eea2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Real-time GPU Process Monitor with TensorCore Inference | |
| Monitors all GPU processes and infers TensorCore usage based on workload patterns | |
| """ | |
import json
import shutil
import signal
import subprocess
import sys
import time
from collections import defaultdict
from datetime import datetime

import psutil
class GPUMonitor:
    """Poll ``nvidia-smi`` for GPU and process statistics, heuristically flag
    processes that look like TensorCore (ML) workloads, and periodically launch
    ``nsys`` profiling sessions for high-confidence candidates.
    """

    def __init__(self):
        # Per-PID history buckets (reserved for trend tracking).
        self.process_history = defaultdict(list)
        # Loop flag; cleared by the signal handler for a graceful shutdown.
        self.running = True
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGTERM, self._signal_handler)

    def _signal_handler(self, signum, frame):
        """Request a clean shutdown of the monitoring loop."""
        print("\nShutting down monitor...")
        self.running = False

    def get_gpu_info(self):
        """Query per-GPU utilization, temperature and power.

        Returns:
            list[dict]: one dict per GPU with keys ``index``,
            ``gpu_utilization``, ``memory_utilization``, ``temperature`` and
            ``power_draw``; empty list if the query fails.
        """
        try:
            # BUGFIX: the NVIDIA tool is `nvidia-smi`, not `nvidia-sml`.
            result = subprocess.run([
                'nvidia-smi', '--query-gpu=index,utilization.gpu,utilization.memory,temperature.gpu,power.draw',
                '--format=csv,noheader,nounits'
            ], capture_output=True, text=True, check=True)
            gpus = []
            for line in result.stdout.strip().split('\n'):
                if line:
                    idx, gpu_util, mem_util, temp, power = line.split(', ')
                    gpus.append({
                        'index': int(idx),
                        'gpu_utilization': int(gpu_util),
                        'memory_utilization': int(mem_util),
                        'temperature': int(temp),
                        'power_draw': float(power)
                    })
            return gpus
        # BUGFIX: also catch FileNotFoundError so a missing binary degrades
        # to "no GPUs" instead of crashing the caller.
        except (subprocess.CalledProcessError, FileNotFoundError, ValueError):
            return []

    def get_gpu_processes(self):
        """Query all compute processes currently using a GPU.

        Returns:
            list[dict]: one dict per process with keys ``pid``, ``name``,
            ``gpu_memory_mb`` and ``gpu_id``; empty list if the query fails.
        """
        try:
            # BUGFIX: `nvidia-sml` -> `nvidia-smi`.
            result = subprocess.run([
                'nvidia-smi', '--query-compute-apps=pid,process_name,used_gpu_memory,gpu_instance_id',
                '--format=csv,noheader,nounits'
            ], capture_output=True, text=True, check=True)
            processes = []
            for line in result.stdout.strip().split('\n'):
                if line and line != 'No running processes found':
                    parts = line.split(', ')
                    if len(parts) >= 3:
                        pid = int(parts[0])
                        name = parts[1]
                        gpu_memory = int(parts[2])
                        # gpu_instance_id is 'N/A' on non-MIG setups.
                        gpu_id = int(parts[3]) if len(parts) > 3 and parts[3] != 'N/A' else 0
                        processes.append({
                            'pid': pid,
                            'name': name,
                            'gpu_memory_mb': gpu_memory,
                            'gpu_id': gpu_id
                        })
            return processes
        except (subprocess.CalledProcessError, FileNotFoundError, ValueError, IndexError):
            return []

    def get_process_details(self, pid):
        """Return CPU/RSS/cmdline details for *pid* via psutil, or None if the
        process vanished or access is denied."""
        try:
            proc = psutil.Process(pid)
            return {
                'cpu_percent': proc.cpu_percent(),
                'memory_mb': proc.memory_info().rss / 1024 / 1024,
                'command': ' '.join(proc.cmdline()[:3]),  # First 3 command parts
                'create_time': proc.create_time()
            }
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            return None

    def infer_tensorcore_usage(self, process_info, gpu_info):
        """Heuristically score how likely a process uses TensorCores.

        Signals combined:
        - ML framework names in the process name / command line,
        - ML keywords (train/inference/model/...) in the command line,
        - GPU utilization level,
        - GPU memory footprint (> 1 GB suggests an ML workload).

        Args:
            process_info: dict with at least ``name`` and ``gpu_memory_mb``
                (``command`` optional).
            gpu_info: dict with ``gpu_utilization``.

        Returns:
            tuple[dict, int]: the indicator breakdown (with a ``confidence``
            label of 'High'/'Medium'/'Low') and the total score.
        """
        tensorcore_indicators = {
            'framework_score': 0,
            'workload_score': 0,
            'utilization_score': 0,
            'confidence': 'Low'
        }
        process_name = process_info['name'].lower()
        command = process_info.get('command', '').lower()
        # Framework detection
        ml_frameworks = ['python', 'pytorch', 'tensorflow', 'triton', 'tensorrt']
        if any(fw in process_name or fw in command for fw in ml_frameworks):
            tensorcore_indicators['framework_score'] = 30
        # Model inference/training patterns
        ml_keywords = ['train', 'inference', 'model', 'bert', 'gpt', 'transformer', 'resnet']
        if any(kw in command for kw in ml_keywords):
            tensorcore_indicators['workload_score'] = 25
        # GPU utilization patterns (TensorCore workloads often have high utilization)
        gpu_util = gpu_info['gpu_utilization']
        if gpu_util > 80:
            tensorcore_indicators['utilization_score'] = 30
        elif gpu_util > 50:
            tensorcore_indicators['utilization_score'] = 20
        elif gpu_util > 20:
            tensorcore_indicators['utilization_score'] = 10
        # Memory usage patterns
        gpu_memory = process_info['gpu_memory_mb']
        if gpu_memory > 1000:  # > 1GB suggests ML workload
            tensorcore_indicators['workload_score'] += 15
        # BUGFIX: the original `sum(d.values() if k != ... for k in d.keys())`
        # was a SyntaxError; sum the numeric scores, skipping the label.
        total_score = sum(v for k, v in tensorcore_indicators.items()
                          if k != 'confidence')
        if total_score > 70:
            tensorcore_indicators['confidence'] = 'High'
        elif total_score > 40:
            tensorcore_indicators['confidence'] = 'Medium'
        else:
            tensorcore_indicators['confidence'] = 'Low'
        return tensorcore_indicators, total_score

    def profile_with_nsys(self, pid, duration=5):
        """Launch a detached ``nsys profile`` session attached to *pid*.

        Returns the report file name, or None if nsys could not be started.
        """
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = f"profile_pid_{pid}_{timestamp}.nsys-rep"
            cmd = [
                'nsys', 'profile',
                '--trace=cuda,nvtx',
                '--force-overwrite=true',
                f'--output={output_file}',
                f'--target-processes={pid}',
                f'--duration={duration}',
                '--capture-range=cudaProfilerApi',
                '--stop-on-exit=false'
            ]
            print(f" Launching nsys profile for PID {pid} -> {output_file}")
            # Fire-and-forget: output is discarded, the report lands on disk.
            subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
            return output_file
        except Exception as e:
            print(f" Failed to launch nsys for PID {pid}: {e}")
            return None

    def display_status(self, gpu_infos, processes_with_details):
        """Print a formatted snapshot of GPU state and per-process scores."""
        print(f"\n{'='*80}")
        print(f"GPU TensorCore Monitor - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"{'='*80}")
        # GPU Summary
        print("\nGPU Status:")
        for gpu in gpu_infos:
            print(f"  GPU {gpu['index']}: {gpu['gpu_utilization']:3d}% util, "
                  f"{gpu['memory_utilization']:3d}% mem, {gpu['temperature']:2d}°C, "
                  f"{gpu['power_draw']:5.1f}W")
        # Process Details
        print(f"\nRunning GPU Processes ({len(processes_with_details)}):")
        print(f"{'PID':<8} {'Process':<20} {'GPU':<3} {'GPU Mem':<8} {'CPU%':<6} {'TC Conf':<8} {'Score':<5} {'Command':<30}")
        print("-" * 100)
        for proc in processes_with_details:
            # Fall back to GPU 0 if the reported gpu_id is out of range.
            tc_info, score = self.infer_tensorcore_usage(proc,
                gpu_infos[proc['gpu_id']] if proc['gpu_id'] < len(gpu_infos) else gpu_infos[0])
            print(f"{proc['pid']:<8} {proc['name'][:19]:<20} {proc['gpu_id']:<3} "
                  f"{proc['gpu_memory_mb']:<8} {proc['cpu_percent']:<6.1f} "
                  f"{tc_info['confidence']:<8} {score:<5} {proc['command'][:29]:<30}")

    def run(self, profile_interval=30):
        """Main monitoring loop: poll every 2 s, display status, and launch
        nsys profiling of high-confidence processes every *profile_interval*
        seconds (when nsys is installed)."""
        print("Starting GPU TensorCore Monitor...")
        print("Press Ctrl+C to stop")
        last_profile_time = 0
        profile_candidates = set()
        while self.running:
            try:
                # Get current GPU and process info
                gpu_infos = self.get_gpu_info()
                gpu_processes = self.get_gpu_processes()
                if not gpu_infos:
                    # BUGFIX: message referenced the misspelled binary name.
                    print("No GPUs found or nvidia-smi not available")
                    time.sleep(5)
                    continue
                # Enrich process info
                processes_with_details = []
                for proc in gpu_processes:
                    details = self.get_process_details(proc['pid'])
                    if details:
                        proc.update(details)
                        processes_with_details.append(proc)
                        # Track high-confidence TensorCore processes for profiling
                        gpu = (gpu_infos[proc['gpu_id']]
                               if proc['gpu_id'] < len(gpu_infos) else gpu_infos[0])
                        tc_info, score = self.infer_tensorcore_usage(proc, gpu)
                        if tc_info['confidence'] in ['High', 'Medium'] and score > 60:
                            profile_candidates.add(proc['pid'])
                # Display status
                self.display_status(gpu_infos, processes_with_details)
                # Periodic detailed profiling of high-confidence processes
                current_time = time.time()
                if (current_time - last_profile_time > profile_interval and
                        profile_candidates and
                        shutil.which('nsys') is not None):
                    print(f"\nLaunching detailed profiling for {len(profile_candidates)} processes...")
                    for pid in list(profile_candidates):
                        self.profile_with_nsys(pid, duration=10)
                    last_profile_time = current_time
                    profile_candidates.clear()
                time.sleep(2)
            except KeyboardInterrupt:
                break
            except Exception as e:
                # Keep the monitor alive across transient per-iteration errors.
                print(f"Error in monitoring loop: {e}")
                time.sleep(5)
        print("Monitor stopped.")
def main():
    """CLI entry point: parse arguments, verify nvidia-smi is available,
    then run the monitor until interrupted."""
    import argparse
    parser = argparse.ArgumentParser(description='GPU TensorCore Process Monitor')
    parser.add_argument('--profile-interval', type=int, default=60,
                        help='Interval between nsys profiling sessions (seconds)')
    args = parser.parse_args()
    # Fail fast if the NVIDIA driver tooling is missing.
    # BUGFIX: the binary is `nvidia-smi`, not `nvidia-sml`.
    try:
        subprocess.run(['nvidia-smi', '--version'], capture_output=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        print("Error: nvidia-smi not found. Please install NVIDIA drivers.")
        sys.exit(1)
    monitor = GPUMonitor()
    monitor.run(profile_interval=args.profile_interval)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment