Skip to content

Instantly share code, notes, and snippets.

@arichardson
Created August 19, 2025 15:21
Show Gist options
  • Select an option

  • Save arichardson/6716208c027174710447d5f253608fb6 to your computer and use it in GitHub Desktop.

Select an option

Save arichardson/6716208c027174710447d5f253608fb6 to your computer and use it in GitHub Desktop.
Count the number of times each RISC-V instruction is emitted
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Description:
# This script recursively finds all RISC-V ELF files in a specified directory
# and categorizes them into "purecap" (CHERI-enabled) and "integer" buckets
# based on the EF_RISCV_CHERIABI flag in the ELF header. It only includes
# executables and shared libraries, ignoring object files. It counts every
# assembly instruction mnemonic and provides grand totals and optional CSV
# reports for each bucket. It correctly handles symbolic links to avoid
# double-counting files.
#
# Dependencies:
# pip install pyelftools
# A RISC-V llvm-objdump binary accessible on your system.
#
# Usage:
# python riscv_instruction_counter.py \
# --folder /path/to/your/binaries \
# --objdump /path/to/your/llvm-objdump \
# --csv-purecap purecap_stats.csv \
# --csv-integer integer_stats.csv
#
import os
import argparse
import subprocess
import csv
from pathlib import Path
from elftools.elf.elffile import ELFFile
from elftools.common.exceptions import ELFError
# The e_flags bit that identifies a CHERI pure-capability ABI file.
EF_RISCV_CHERIABI = 0x00010000
def get_riscv_bucket(filepath):
    """
    Categorize a file as a RISC-V executable/shared-library bucket.

    Args:
        filepath (str): Path to the candidate file.

    Returns:
        str or None: "purecap" for a CHERI pure-capability ELF, "integer"
        for a standard RISC-V ELF, or None when the file is not a RISC-V
        executable or shared library.
    """
    try:
        with open(filepath, 'rb') as stream:
            # Cheap magic-number check before handing the file to pyelftools.
            if stream.read(4) != b'\x7fELF':
                return None
            stream.seek(0)
            elf = ELFFile(stream)
            if elf.get_machine_arch() != 'RISC-V':
                return None
            # Only executables and shared objects count; relocatable
            # object files (ET_REL) and everything else are skipped.
            if elf.header['e_type'] not in ('ET_EXEC', 'ET_DYN'):
                return None
            # The CHERIABI bit in e_flags distinguishes the two buckets.
            is_purecap = bool(elf.header['e_flags'] & EF_RISCV_CHERIABI)
            return "purecap" if is_purecap else "integer"
    except (IOError, ELFError):
        return None
def count_instructions_in_file(filepath, objdump_path):
    """
    Disassemble a RISC-V ELF file with objdump and count instruction mnemonics.

    The objdump output is streamed line by line, so arbitrarily large
    binaries are processed without buffering the full disassembly in memory.

    Args:
        filepath (str): Full path to the ELF file to disassemble.
        objdump_path (str): Path to the llvm-objdump binary.

    Returns:
        dict: Mapping of each (lower-cased) instruction mnemonic to its count.

    Raises:
        SystemExit: If the objdump binary does not exist.
    """
    instruction_counts = {}
    try:
        # -M no-aliases: expand pseudo-instructions to their base forms.
        # --no-show-raw-insn: omit the hex machine-code column.
        # The leading address provides a reliable parsing anchor.
        command = [objdump_path, "-d", "-M", "no-aliases",
                   "--no-show-raw-insn", filepath]
        # BUGFIX: stderr is discarded instead of piped. The original piped
        # stderr but never read it, so a noisy disassembly could fill the
        # OS pipe buffer and deadlock the child process.
        with subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            encoding='utf-8',
            errors='ignore',
        ) as process:
            for line in process.stdout:
                stripped_line = line.strip()
                # Instruction lines look like "<hex addr>: mnemonic operands".
                if ':' not in stripped_line:
                    continue
                address, _, rest = stripped_line.partition(':')
                # Only lines whose prefix parses as a hex address are
                # instructions; symbol headers like "1000 <main>:" fail here.
                try:
                    int(address, 16)
                except ValueError:
                    continue
                instruction_part = rest.strip()
                if not instruction_part:
                    continue
                mnemonic = instruction_part.split(None, 1)[0].lower()
                # A valid mnemonic must start with a letter.
                if mnemonic[0].isalpha():
                    instruction_counts[mnemonic] = instruction_counts.get(mnemonic, 0) + 1
                else:
                    # Should be rare; kept as a safeguard against parse drift.
                    print(f"\nWarning: Invalid mnemonic '{mnemonic}' detected in file {filepath}.")
                    print(f" -> Offending line: {stripped_line}")
        # (The 'with' block waits for the child; a nonzero exit status is
        # deliberately tolerated, matching the original best-effort behavior.)
    except FileNotFoundError:
        print(f"Error: objdump binary not found at '{objdump_path}'")
        # Raise SystemExit directly: the 'exit' builtin is injected by the
        # 'site' module and is not guaranteed to exist (e.g. under python -S).
        raise SystemExit(1)
    except Exception as e:
        print(f"An unexpected error occurred while processing {filepath}: {e}")
    return instruction_counts
def write_stats_to_csv(per_file_stats, csv_path):
    """
    Dump per-file instruction statistics as a CSV table.

    One row per analyzed file; one column per mnemonic seen in any file,
    sorted alphabetically. Mnemonics absent from a file are written as 0.

    Args:
        per_file_stats (dict): Maps filepaths to {mnemonic: count} dicts.
        csv_path (str): Destination path for the CSV report.
    """
    if not per_file_stats:
        print(f"No data to write to {csv_path}.")
        return
    # Union of every mnemonic across all files defines the column set.
    mnemonic_columns = sorted({m for counts in per_file_stats.values() for m in counts})
    try:
        with open(csv_path, 'w', newline='', encoding='utf-8') as out:
            writer = csv.writer(out)
            writer.writerow(['filepath'] + mnemonic_columns)
            for path, counts in per_file_stats.items():
                writer.writerow([path] + [counts.get(col, 0) for col in mnemonic_columns])
            print(f"\nSuccessfully wrote statistics to '{csv_path}'")
    except IOError as e:
        print(f"\nError writing to CSV file '{csv_path}': {e}")
def print_summary(title, grand_totals):
    """Print one "mnemonic,count" line per instruction, sorted by mnemonic."""
    print(f"\n--- {title} ---")
    if not grand_totals:
        print("No instructions found for this category.")
        return
    # Alphabetical order keeps the output stable and diff-friendly.
    for mnemonic in sorted(grand_totals):
        print(f"{mnemonic},{grand_totals[mnemonic]}")
def main():
    """Parse command-line arguments and run the bucketed instruction count."""
    parser = argparse.ArgumentParser(
        description="Count all assembly instructions in RISC-V ELF files, bucketing by ABI."
    )
    parser.add_argument("-f", "--folder", required=True, help="The folder to search recursively.")
    parser.add_argument(
        "--objdump",
        default=os.path.expanduser("~/cheri/output/sdk/bin/llvm-objdump"),
        help="Path to the custom llvm-objdump binary."
    )
    parser.add_argument("--csv-purecap", help="Optional CSV path for purecap file statistics.")
    parser.add_argument("--csv-integer", help="Optional CSV path for integer file statistics.")
    args = parser.parse_args()

    # Validate inputs up front so failures are reported before any work.
    if not os.path.isdir(args.folder):
        print(f"Error: Folder not found at '{args.folder}'")
        return
    if not os.path.isfile(args.objdump):
        print(f"Error: llvm-objdump binary not found at '{args.objdump}'")
        return

    print(f"Searching for RISC-V ELF files in '{args.folder}'...")
    print(f"Using objdump at: '{args.objdump}'\n")

    stats = {
        "purecap": {"per_file": {}, "totals": {}},
        "integer": {"per_file": {}, "totals": {}},
    }
    seen_paths = set()
    for root, _, names in os.walk(args.folder):
        for name in names:
            candidate = os.path.join(root, name)
            # Canonicalize so symlinked duplicates are counted only once.
            try:
                canonical = Path(candidate).resolve()
            except FileNotFoundError:
                # Broken symlink — nothing to analyze.
                continue
            if canonical in seen_paths:
                continue
            seen_paths.add(canonical)

            bucket = get_riscv_bucket(str(canonical))
            if not bucket:
                continue
            # Show the path as discovered, but analyze the resolved target.
            print(f"Analyzing ({bucket}): {candidate}")
            per_file = count_instructions_in_file(str(canonical), args.objdump)
            if per_file:
                stats[bucket]["per_file"][str(canonical)] = per_file
                totals = stats[bucket]["totals"]
                for mnemonic, count in per_file.items():
                    totals[mnemonic] = totals.get(mnemonic, 0) + count

    # Console summaries for both buckets.
    print_summary("Instruction Frequency Summary (Purecap)", stats["purecap"]["totals"])
    print_summary("Instruction Frequency Summary (Integer)", stats["integer"]["totals"])

    # Optional CSV reports.
    if args.csv_purecap:
        write_stats_to_csv(stats["purecap"]["per_file"], args.csv_purecap)
    if args.csv_integer:
        write_stats_to_csv(stats["integer"]["per_file"], args.csv_integer)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment