Steboss Steboss

## iteration_4.rs
use rayon::prelude::*;
use std::arch::aarch64::*;
use std::env;
use std::fs;

fn main() {
    let args: Vec<String> = env::args().collect();
    if args.len() != 4 {
        eprintln!("Usage: {} M K N", args[0]);
        return;

## iteration_0.rs
use rayon::prelude::*;
use std::env;

const N: usize = 2; // Fixed block size

/// Copies `slice` into an owned, fixed-size `[f64; N * N]` array.
///
/// Intended as a debugging aid: the length check happens once here instead of
/// implicitly at every later index.
///
/// # Panics
///
/// Panics with "Slice has incorrect length" when `slice.len() != N * N`.
fn to_array(slice: &[f64]) -> [f64; N * N] {
    let block: Result<[f64; N * N], _> = slice.try_into();
    block.expect("Slice has incorrect length")
}

## matmulmanager.py
import logging
import re
from typing import List

from autogen_core import (
    DefaultTopicId,
    MessageContext,
    RoutedAgent,
    message_handler,
)

## basematmulagent.py
import logging
import json
from typing import List

from autogen_core import (
    DefaultTopicId,
    MessageContext,
    RoutedAgent,
    message_handler,
)

## semantic_chunker.py
import os
import re
import glob
import pymupdf4llm
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker

## baseagent.py
class BaseMatMulAgent(RoutedAgent):
    """
    A generic agent that can handle RequestToSpeak, call tools, and publish messages.
    """
    def __init__(
        self,
        description: str,
        group_chat_topic_type: str,
        model_client: ChatCompletionClient,
        system_message: str,

## submit2.sh
#!/bin/bash
# Slurm batch-script header. The #SBATCH lines are scheduler directives
# (account, partition, node count, time limit, job name); the account,
# partition, and job-name values are placeholders to replace before submitting.
#SBATCH -A your_account
#SBATCH -p your_partition
#SBATCH -N 4 # example with 4 nodes
#SBATCH -t 04:00:00 # max run time
#SBATCH -J "something_in_line_with_your_system"


# Model configuration name and container image, exported to the environment;
# presumably consumed by launch commands that are not visible in this excerpt.
export CONFIG="fuji-70B-v2-flash" # here you can insert, for example,  fuji-7B-v2-flash
export CONTAINER="ghcr.io/nvidia/jax:axlearn" # this is our public jax-axlearn container

## submit.sh
#!/bin/bash
# Slurm batch-script template. The #SBATCH lines are scheduler directives
# (account, partition, node count, time limit, job name).
# NOTE(review): `-t` and `-J` carry no value and `-p` is followed by two
# words; these look like unfilled placeholders that must be completed before
# the script can be submitted -- confirm against the target cluster.
#SBATCH -A something
#SBATCH -p some partition
#SBATCH -N 2 # number of nodes to use
#SBATCH -t
#SBATCH -J

# Model configuration name and container image, exported to the environment;
# presumably consumed by launch commands that are not visible in this excerpt.
export CONFIG="fuji-70B-v2-flash"
export CONTAINER="my-container"

## matmul_forward_results.csv

          
|            | matmul forward | matmul backward |
|------------|----------------|-----------------|
| C OpenMP   | 7.82 +/- 0.12  | 2.90 +/- 0.03   |
| Rust base  | 28.52 +/- 0.32 | 12.25 +/- 0.16  |
| Rust Rayon | 4.14 +/- 0.01  | 4.24 +/- 0.02   |
| Rust Blas  | 0.05 +/- 0.01  | 0.19 +/- 0.01   |

## average_benchmark.csv

          
| # of lines | average time (s) |
|------------|------------------|
| 1000       | 0.27 ± 0.01      |
| 10000      | 0.32 ± 0.01      |
| 100000     | 0.97 ± 0.02      |
| 1000000    | 7.27 ± 0.06      |
| 10000000   | 64.29 ± 0.10     |
	use rayon::prelude::*;
	use std::arch::aarch64::*;
	use std::env;
	use std::fs;

	fn main() {
	let args: Vec<String> = env::args().collect();
	if args.len() != 4 {
	eprintln!("Usage: {} M K N", args[0]);
	return;
	use rayon::prelude::*;
	use std::env;

	const N: usize = 2; // Fixed block size

	/// Copies `slice` into an owned, fixed-size `[f64; N * N]` array.
	///
	/// Intended as a debugging aid: the length check happens once here instead
	/// of implicitly at every later index.
	///
	/// # Panics
	///
	/// Panics with "Slice has incorrect length" when `slice.len() != N * N`.
	fn to_array(slice: &[f64]) -> [f64; N * N] {
	    let block: Result<[f64; N * N], _> = slice.try_into();
	    block.expect("Slice has incorrect length")
	}
	import logging
	import re
	from typing import List

	from autogen_core import (
	DefaultTopicId,
	MessageContext,
	RoutedAgent,
	message_handler,
	)
	import logging
	import json
	from typing import List

	from autogen_core import (
	DefaultTopicId,
	MessageContext,
	RoutedAgent,
	message_handler,
	)
	import os
	import re
	import glob
	import pymupdf4llm
	from marker.converters.pdf import PdfConverter
	from marker.models import create_model_dict
	from marker.output import text_from_rendered
	from langchain_core.documents import Document
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_experimental.text_splitter import SemanticChunker
	class BaseMatMulAgent(RoutedAgent):
	"""
	A generic agent that can handle RequestToSpeak, call tools, and publish messages.
	"""
	def __init__(
	self,
	description: str,
	group_chat_topic_type: str,
	model_client: ChatCompletionClient,
	system_message: str,
	#!/bin/bash
	# Slurm batch-script header. The #SBATCH lines are scheduler directives
	# (account, partition, node count, time limit, job name); the account,
	# partition, and job-name values are placeholders to replace before submitting.
	#SBATCH -A your_account
	#SBATCH -p your_partition
	#SBATCH -N 4 # example with 4 nodes
	#SBATCH -t 04:00:00 # max run time
	#SBATCH -J "something_in_line_with_your_system"


	# Model configuration name and container image, exported to the environment;
	# presumably consumed by launch commands that are not visible in this excerpt.
	export CONFIG="fuji-70B-v2-flash" # here you can insert, for example, fuji-7B-v2-flash
	export CONTAINER="ghcr.io/nvidia/jax:axlearn" # this is our public jax-axlearn container
	#!/bin/bash
	# Slurm batch-script template. The #SBATCH lines are scheduler directives
	# (account, partition, node count, time limit, job name).
	# NOTE(review): `-t` and `-J` carry no value and `-p` is followed by two
	# words; these look like unfilled placeholders that must be completed before
	# the script can be submitted -- confirm against the target cluster.
	#SBATCH -A something
	#SBATCH -p some partition
	#SBATCH -N 2 # number of nodes to use
	#SBATCH -t
	#SBATCH -J

	# Model configuration name and container image, exported to the environment;
	# presumably consumed by launch commands that are not visible in this excerpt.
	export CONFIG="fuji-70B-v2-flash"
	export CONTAINER="my-container"
	matmul forward	matmul backward
C OpenMP	7.82 +/- 0.12	2.90 +/- 0.03
Rust base	28.52 +/- 0.32	12.25 +/- 0.16
Rust Rayon	4.14 +/- 0.01	4.24 +/- 0.02
Rust Blas	0.05 +/- 0.01	0.19 +/- 0.01
	# of lines	average time (s)
	1000	0.27 ± 0.01
	10000	0.32 ± 0.01
	100000	0.97 ± 0.02
	1000000	7.27 ± 0.06
	10000000	64.29 ± 0.10