Created August 19, 2025 15:21
Count number of times each RISC-V instruction is emitted
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Description:
# This script recursively finds all RISC-V ELF files in a specified directory
# and categorizes them into "purecap" (CHERI-enabled) and "integer" buckets
# based on the EF_RISCV_CHERIABI flag in the ELF header. It only includes
# executables and shared libraries, ignoring object files. It counts every
# assembly instruction mnemonic and provides grand totals and optional CSV
# reports for each bucket. It correctly handles symbolic links to avoid
# double-counting files.
#
# Dependencies:
#   pip install pyelftools
#   A RISC-V llvm-objdump binary accessible on your system.
#
# Usage:
#   python riscv_instruction_counter.py \
#       --folder /path/to/your/binaries \
#       --objdump /path/to/your/llvm-objdump \
#       --csv-purecap purecap_stats.csv \
#       --csv-integer integer_stats.csv
#
import os
import sys
import argparse
import subprocess
import csv
from pathlib import Path

from elftools.elf.elffile import ELFFile
from elftools.common.exceptions import ELFError

# The e_flags bit that identifies a CHERI pure-capability ABI file.
EF_RISCV_CHERIABI = 0x00010000

def get_riscv_bucket(filepath):
    """
    Checks if a file is a RISC-V executable or shared library and categorizes it.

    Args:
        filepath (str): The path to the file.

    Returns:
        str or None: "purecap" if it's a CHERI-enabled ELF, "integer" if it's a
                     standard RISC-V ELF, or None if it's not a valid target.
    """
    try:
        with open(filepath, 'rb') as f:
            if f.read(4) != b'\x7fELF':
                return None
            f.seek(0)
            elffile = ELFFile(f)
            # Check for RISC-V architecture
            if elffile.get_machine_arch() != 'RISC-V':
                return None
            # Filter to include only executables (ET_EXEC) and shared libraries (ET_DYN)
            file_type = elffile.header['e_type']
            if file_type not in ('ET_EXEC', 'ET_DYN'):
                return None
            # Check the e_flags for the CHERIABI bit to determine the bucket
            if elffile.header['e_flags'] & EF_RISCV_CHERIABI:
                return "purecap"
            else:
                return "integer"
    except (IOError, ELFError):
        return None

def count_instructions_in_file(filepath, objdump_path):
    """
    Disassembles a RISC-V ELF file by streaming from a specified objdump
    binary and counts the occurrences of every instruction.

    Args:
        filepath (str): The full path to the ELF file.
        objdump_path (str): The path to the llvm-objdump binary.

    Returns:
        dict: A dictionary mapping each instruction mnemonic to its count.
    """
    instruction_counts = {}
    try:
        # -M no-aliases: Expands pseudo-instructions to their base instructions.
        # --no-show-raw-insn: Omits the hex machine code.
        # The leading address is kept to provide a reliable parsing anchor.
        command = [objdump_path, "-d", "-M", "no-aliases", "--no-show-raw-insn", filepath]
        process = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8',
            errors='ignore'
        )
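        # Each counted line is expected to look roughly like the following
        # (format assumed from typical llvm-objdump output; exact spacing is
        # irrelevant to the parsing below):
        #     101b4: addi    sp, sp, -32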
        for line in process.stdout:
            stripped_line = line.strip()
            # An instruction line must contain a colon (from the address)
            if ':' not in stripped_line:
                continue
            # Split the line by the colon first to separate the address
            # from the instruction part.
            parts = stripped_line.split(':', 1)
            # Check if the part before the colon is a valid hex address.
            try:
                int(parts[0], 16)
            except (ValueError, IndexError):
                continue
            # The rest of the line contains the instruction.
            instruction_part = parts[1].strip()
            if not instruction_part:
                continue
            potential_mnemonic = instruction_part.split(None, 1)[0].lower()
            # A valid mnemonic must start with a letter.
            if potential_mnemonic and potential_mnemonic[0].isalpha():
                instruction_counts[potential_mnemonic] = instruction_counts.get(potential_mnemonic, 0) + 1
            else:
                # This case should be rare but is kept as a safeguard.
                print(f"\nWarning: Invalid mnemonic '{potential_mnemonic}' detected in file {filepath}.")
                print(f"  -> Offending line: {line.strip()}")
        process.wait()
        if process.returncode != 0:
            # A non-zero exit status is ignored: any instructions disassembled
            # before the failure have already been counted.
            pass
    except FileNotFoundError:
        print(f"Error: objdump binary not found at '{objdump_path}'")
        sys.exit(1)
    except Exception as e:
        print(f"An unexpected error occurred while processing {filepath}: {e}")
    return instruction_counts

def write_stats_to_csv(per_file_stats, csv_path):
    """
    Writes the collected instruction statistics to a CSV file.

    Args:
        per_file_stats (dict): A dictionary where keys are filepaths and values
                               are dictionaries of instruction counts.
        csv_path (str): The path to the output CSV file.
    """
    if not per_file_stats:
        print(f"No data to write to {csv_path}.")
        return
    all_mnemonics = set()
    for counts in per_file_stats.values():
        all_mnemonics.update(counts.keys())
    sorted_mnemonics = sorted(all_mnemonics)
    header = ['filepath'] + sorted_mnemonics
    try:
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            for filepath, counts in per_file_stats.items():
                row = [filepath] + [counts.get(m, 0) for m in sorted_mnemonics]
                writer.writerow(row)
        print(f"\nSuccessfully wrote statistics to '{csv_path}'")
    except IOError as e:
        print(f"\nError writing to CSV file '{csv_path}': {e}")

def print_summary(title, grand_totals):
    """Prints a formatted summary of grand total instruction counts."""
    print(f"\n--- {title} ---")
    if not grand_totals:
        print("No instructions found for this category.")
    else:
        # Sort the instructions by mnemonic name (alphabetically)
        sorted_totals = sorted(grand_totals.items())
        for mnemonic, count in sorted_totals:
            # Print in a simple CSV format
            print(f"{mnemonic},{count}")

def main():
    """
    Main function to parse arguments and orchestrate the counting process.
    """
    parser = argparse.ArgumentParser(
        description="Count all assembly instructions in RISC-V ELF files, bucketing by ABI."
    )
    parser.add_argument("-f", "--folder", required=True, help="The folder to search recursively.")
    parser.add_argument(
        "--objdump",
        default=os.path.expanduser("~/cheri/output/sdk/bin/llvm-objdump"),
        help="Path to the custom llvm-objdump binary."
    )
    parser.add_argument("--csv-purecap", help="Optional CSV path for purecap file statistics.")
    parser.add_argument("--csv-integer", help="Optional CSV path for integer file statistics.")
    args = parser.parse_args()
    if not os.path.isdir(args.folder):
        print(f"Error: Folder not found at '{args.folder}'")
        return
    if not os.path.isfile(args.objdump):
        print(f"Error: llvm-objdump binary not found at '{args.objdump}'")
        return
    print(f"Searching for RISC-V ELF files in '{args.folder}'...")
    print(f"Using objdump at: '{args.objdump}'\n")
    stats = {
        "purecap": {"per_file": {}, "totals": {}},
        "integer": {"per_file": {}, "totals": {}}
    }
    processed_files = set()
    for root, _, files in os.walk(args.folder):
        for filename in files:
            filepath = os.path.join(root, filename)
            # Resolve symlinks to get the canonical path
            try:
                resolved_path = Path(filepath).resolve()
            except FileNotFoundError:
                # This can happen with broken symlinks, so we skip them.
                continue
            # Check if we have already processed this file
            if resolved_path in processed_files:
                continue
            processed_files.add(resolved_path)
            bucket = get_riscv_bucket(str(resolved_path))
            if bucket:
                # Print the original path for user context, but analyze the resolved path
                print(f"Analyzing ({bucket}): {filepath}")
                file_counts = count_instructions_in_file(str(resolved_path), args.objdump)
                if file_counts:
                    # Use the resolved path as the key in the stats dictionary
                    stats[bucket]["per_file"][str(resolved_path)] = file_counts
                    for mnemonic, count in file_counts.items():
                        stats[bucket]["totals"][mnemonic] = stats[bucket]["totals"].get(mnemonic, 0) + count

    # --- Output Console Summaries ---
    print_summary("Instruction Frequency Summary (Purecap)", stats["purecap"]["totals"])
    print_summary("Instruction Frequency Summary (Integer)", stats["integer"]["totals"])

    # --- Write CSV Outputs if requested ---
    if args.csv_purecap:
        write_stats_to_csv(stats["purecap"]["per_file"], args.csv_purecap)
    if args.csv_integer:
        write_stats_to_csv(stats["integer"]["per_file"], args.csv_integer)


if __name__ == '__main__':
    main()
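For reference, the grand totals printed on the console can also be recomputed from a per-file CSV with the standard library. The snippet below is a minimal sketch and not part of the script itself; it assumes the purecap_stats.csv path from the usage example above.

import csv

def csv_grand_totals(csv_path):
    """Sum the per-file instruction counts in a CSV written by write_stats_to_csv()."""
    totals = {}
    with open(csv_path, newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            for mnemonic, count in row.items():
                if mnemonic == 'filepath':
                    continue
                totals[mnemonic] = totals.get(mnemonic, 0) + int(count)
    return totals

for mnemonic, count in sorted(csv_grand_totals('purecap_stats.csv').items()):
    print(f"{mnemonic},{count}")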