Created August 19, 2025 15:21
Count number of times each RISC-V instruction is emitted
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Description:
# This script recursively finds all RISC-V ELF files in a specified directory
# and categorizes them into "purecap" (CHERI-enabled) and "integer" buckets
# based on the EF_RISCV_CHERIABI flag in the ELF header. It only includes
# executables and shared libraries, ignoring object files. It counts every
# assembly instruction mnemonic and provides grand totals and optional CSV
# reports for each bucket. It correctly handles symbolic links to avoid
# double-counting files.
#
# Dependencies:
#   pip install pyelftools
#   A RISC-V llvm-objdump binary accessible on your system.
#
# Usage:
#   python riscv_instruction_counter.py \
#       --folder /path/to/your/binaries \
#       --objdump /path/to/your/llvm-objdump \
#       --csv-purecap purecap_stats.csv \
#       --csv-integer integer_stats.csv
#
import os
import sys
import argparse
import subprocess
import csv
from pathlib import Path

from elftools.elf.elffile import ELFFile
from elftools.common.exceptions import ELFError

# The e_flags bit that identifies a CHERI pure-capability ABI file.
EF_RISCV_CHERIABI = 0x00010000

def get_riscv_bucket(filepath):
    """
    Checks if a file is a RISC-V executable or shared library and categorizes it.

    Args:
        filepath (str): The path to the file.

    Returns:
        str or None: "purecap" if it's a CHERI-enabled ELF, "integer" if it's a
                     standard RISC-V ELF, or None if it's not a valid target.
    """
    try:
        with open(filepath, 'rb') as f:
            if f.read(4) != b'\x7fELF':
                return None
            f.seek(0)
            elffile = ELFFile(f)
            # Check for RISC-V architecture
            if elffile.get_machine_arch() != 'RISC-V':
                return None
            # Filter to include only executables (ET_EXEC) and shared libraries (ET_DYN)
            file_type = elffile.header['e_type']
            if file_type not in ('ET_EXEC', 'ET_DYN'):
                return None
            # Check the e_flags for the CHERIABI bit to determine the bucket
            if elffile.header['e_flags'] & EF_RISCV_CHERIABI:
                return "purecap"
            else:
                return "integer"
    except (IOError, ELFError):
        return None

def count_instructions_in_file(filepath, objdump_path):
    """
    Disassembles a RISC-V ELF file by streaming from a specified objdump
    binary and counts the occurrences of every instruction.

    Args:
        filepath (str): The full path to the ELF file.
        objdump_path (str): The path to the llvm-objdump binary.

    Returns:
        dict: A dictionary mapping each instruction mnemonic to its count.
    """
    instruction_counts = {}
    try:
        # -M no-aliases: Expands pseudo-instructions to their base instructions.
        # --no-show-raw-insn: Omits the hex machine code.
        # The leading address is kept to provide a reliable parsing anchor.
        command = [objdump_path, "-d", "-M", "no-aliases", "--no-show-raw-insn", filepath]
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8',
            errors='ignore'
        )
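        # Each counted line is expected to look roughly like the following
        # (format assumed from typical llvm-objdump output; exact spacing is
        # irrelevant to the parsing below):
        #     101b4: addi    sp, sp, -32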
        for line in process.stdout:
            stripped_line = line.strip()
            # An instruction line must contain a colon (from the address)
            if ':' not in stripped_line:
                continue
            # Split the line by the colon first to separate the address
            # from the instruction part.
            parts = stripped_line.split(':', 1)
            # Check if the part before the colon is a valid hex address.
            try:
                int(parts[0], 16)
            except (ValueError, IndexError):
                continue
            # The rest of the line contains the instruction.
            instruction_part = parts[1].strip()
            if not instruction_part:
                continue
            potential_mnemonic = instruction_part.split(None, 1)[0].lower()
            # A valid mnemonic must start with a letter.
            if potential_mnemonic and potential_mnemonic[0].isalpha():
                instruction_counts[potential_mnemonic] = instruction_counts.get(potential_mnemonic, 0) + 1
            else:
                # This case should be rare but is kept as a safeguard.
                print(f"\nWarning: Invalid mnemonic '{potential_mnemonic}' detected in file {filepath}.")
                print(f"  -> Offending line: {line.strip()}")
        process.wait()
        if process.returncode != 0:
            # A non-zero exit status is ignored: any instructions disassembled
            # before the failure have already been counted.
            pass
    except FileNotFoundError:
        print(f"Error: objdump binary not found at '{objdump_path}'")
        sys.exit(1)
    except Exception as e:
        print(f"An unexpected error occurred while processing {filepath}: {e}")
    return instruction_counts

def write_stats_to_csv(per_file_stats, csv_path):
    """
    Writes the collected instruction statistics to a CSV file.

    Args:
        per_file_stats (dict): A dictionary where keys are filepaths and values
                               are dictionaries of instruction counts.
        csv_path (str): The path to the output CSV file.
    """
    if not per_file_stats:
        print(f"No data to write to {csv_path}.")
        return
    all_mnemonics = set()
    for counts in per_file_stats.values():
        all_mnemonics.update(counts.keys())
    sorted_mnemonics = sorted(all_mnemonics)
    header = ['filepath'] + sorted_mnemonics
    try:
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            for filepath, counts in per_file_stats.items():
                row = [filepath] + [counts.get(m, 0) for m in sorted_mnemonics]
                writer.writerow(row)
        print(f"\nSuccessfully wrote statistics to '{csv_path}'")
    except IOError as e:
        print(f"\nError writing to CSV file '{csv_path}': {e}")

def print_summary(title, grand_totals):
    """Prints a formatted summary of grand total instruction counts."""
    print(f"\n--- {title} ---")
    if not grand_totals:
        print("No instructions found for this category.")
    else:
        # Sort the instructions by mnemonic name (alphabetically)
        sorted_totals = sorted(grand_totals.items())
        for mnemonic, count in sorted_totals:
            # Print in a simple CSV format
            print(f"{mnemonic},{count}")

def main():
    """
    Main function to parse arguments and orchestrate the counting process.
    """
    parser = argparse.ArgumentParser(
        description="Count all assembly instructions in RISC-V ELF files, bucketing by ABI."
    )
    parser.add_argument("-f", "--folder", required=True, help="The folder to search recursively.")
    parser.add_argument(
        "--objdump",
        default=os.path.expanduser("~/cheri/output/sdk/bin/llvm-objdump"),
        help="Path to the custom llvm-objdump binary."
    )
    parser.add_argument("--csv-purecap", help="Optional CSV path for purecap file statistics.")
    parser.add_argument("--csv-integer", help="Optional CSV path for integer file statistics.")
    args = parser.parse_args()
    if not os.path.isdir(args.folder):
        print(f"Error: Folder not found at '{args.folder}'")
        return
    if not os.path.isfile(args.objdump):
        print(f"Error: llvm-objdump binary not found at '{args.objdump}'")
        return
    print(f"Searching for RISC-V ELF files in '{args.folder}'...")
    print(f"Using objdump at: '{args.objdump}'\n")
    stats = {
        "purecap": {"per_file": {}, "totals": {}},
        "integer": {"per_file": {}, "totals": {}}
    }
    processed_files = set()
    for root, _, files in os.walk(args.folder):
        for filename in files:
            filepath = os.path.join(root, filename)
            # Resolve symlinks to get the canonical path
            try:
                resolved_path = Path(filepath).resolve()
            except FileNotFoundError:
                # This can happen with broken symlinks, so we skip them.
                continue
            # Check if we have already processed this file
            if resolved_path in processed_files:
                continue
            processed_files.add(resolved_path)
            bucket = get_riscv_bucket(str(resolved_path))
            if bucket:
                # Print the original path for user context, but analyze the resolved path
                print(f"Analyzing ({bucket}): {filepath}")
                file_counts = count_instructions_in_file(str(resolved_path), args.objdump)
                if file_counts:
                    # Use the resolved path as the key in the stats dictionary
                    stats[bucket]["per_file"][str(resolved_path)] = file_counts
                    for mnemonic, count in file_counts.items():
                        stats[bucket]["totals"][mnemonic] = stats[bucket]["totals"].get(mnemonic, 0) + count

    # --- Output Console Summaries ---
    print_summary("Instruction Frequency Summary (Purecap)", stats["purecap"]["totals"])
    print_summary("Instruction Frequency Summary (Integer)", stats["integer"]["totals"])

    # --- Write CSV Outputs if requested ---
    if args.csv_purecap:
        write_stats_to_csv(stats["purecap"]["per_file"], args.csv_purecap)
    if args.csv_integer:
        write_stats_to_csv(stats["integer"]["per_file"], args.csv_integer)


if __name__ == '__main__':
    main()
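For reference, the grand totals printed on the console can also be recomputed from a per-file CSV with the standard library. The snippet below is a minimal sketch and not part of the script itself; it assumes the purecap_stats.csv path from the usage example above.

import csv

def csv_grand_totals(csv_path):
    """Sum the per-file instruction counts in a CSV written by write_stats_to_csv()."""
    totals = {}
    with open(csv_path, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            for mnemonic, count in row.items():
                if mnemonic == 'filepath':
                    continue
                totals[mnemonic] = totals.get(mnemonic, 0) + int(count)
    return totals

for mnemonic, count in sorted(csv_grand_totals('purecap_stats.csv').items()):
    print(f"{mnemonic},{count}")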