
Baber Abbasi baberabb

@baberabb
baberabb / convert_to_parquet.py
Created November 19, 2025 03:21
convert_to_parquet
# /// script
# dependencies = [
# "datasets==3.6.0",
# ]
# ///
# copied from datasets-cli convert_to_parquet
# run with
# uv run convert_to_parquet.py marcob/lambada_multilingual EleutherAI/lambada_multilingual
# adapted from https://github.com/huggingface/datasets
# Licensed under Apache 2.0
@baberabb
baberabb / comparison_metrics.py
Created October 29, 2025 18:37
compare_weights
def compare_weights(tensor1, tensor2, epsilon=None):
    """
    Compare two tensors and provide detailed statistics about their differences.
    """
    import torch
    if tensor1.shape != tensor2.shape:
        raise ValueError(f"Shape mismatch: {tensor1.shape} vs {tensor2.shape}")
    if epsilon is None:
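The preview cuts off at the `epsilon` default. A minimal sketch of how such a comparison helper could continue — the epsilon default and the specific statistics returned are assumptions, not the gist's actual continuation:

```python
import torch

def compare_weights(tensor1, tensor2, epsilon=None):
    """Compare two tensors and report summary statistics of their differences.

    Hypothetical completion of the truncated preview above: the epsilon
    default and the exact statistics are assumptions.
    """
    if tensor1.shape != tensor2.shape:
        raise ValueError(f"Shape mismatch: {tensor1.shape} vs {tensor2.shape}")
    if epsilon is None:
        # Fall back to the dtype's machine precision as the tolerance.
        epsilon = torch.finfo(tensor1.dtype).eps
    diff = (tensor1 - tensor2).abs()
    return {
        "max_diff": diff.max().item(),
        "mean_diff": diff.mean().item(),
        "num_mismatched": int((diff > epsilon).sum().item()),
        "frac_mismatched": float((diff > epsilon).float().mean().item()),
    }
```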
@baberabb
baberabb / 01_scores.md
Last active October 13, 2025 12:59
Comma
| Model | ARC-C | ARC-E | MMLU | BoolQ | HS | OBQA | CSQA | PIQA | SIQA | HEval | MBPP | Avg |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| togethercomputer/RedPajama-INCITE-7B-Base | 44.7±1.45 | 66.8±0.97 | 26.4±0.37 | 70.9±0.79 | 70.3±0.46 | 50.2±2.24 | 57.7±1.41 | 77.0±0.98 | 46.4±1.13 | 10.3±2.2 | 17.16±1.48 | 52.2 |
| huggyllama/llama-7b | 48.2±1.46 | 68.2±0.96 | 32.2±0.39 | 75.0±0.76 | 76.2±0.42 | 53.6±2.23 | 61.8±1.39 | 79.3±0.95 | 49.0±1.13 | 17.07±2.66 | 28.09±1.82 | 56.9 |
| stabilityai/stablelm-base-alpha-7b | 43.69±1.45 | 65.78±0.97 | 40.04±0.41 | 70.31±0.80 | 74.27±0.44 | 52.0±2.24 | 57.25±1.42 | 79.0±0.95 | 48.36±1.13 | 24.76±3.11 | 36.54±1.95 | 53.8 |
| mosaicml/mpt-7b | 45.8±1.46 | 67.1±0.96 | 29.4±0.38 | 73.7±0.77 | 76.3±0.42 | 54.0±2.23 | 64.2±1.37 | 80.5±0.92 | 48.9±1.13 | 31.07±3.28 | 35.35±1.94 | 57.2 |
| openlm-research/open_llama_7b_v2 | 46.8±1.46 | 67.3±0.96 | 33.1±0.39 | 72.3±0.78 | 74.5±0.43 | 50.8±2.24 | 62.9±1.38 | 79.8±0.94 | 49 | | | |
dataset_path: Idavidrein/gpqa
tag: gpqa
output_type: generate_until
process_docs: !function utils.process_docs
test_split: train
fewshot_split: null
description: "Here are some example questions from experts. An explanation is given before the final answer. Answer the final question yourself, giving your reasoning beforehand.\n"
doc_to_text: "Question: {{Question|trim}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nGive step by step reasoning before you answer, and when you're ready to answer, please use the format \"The correct answer is (insert answer here)\":"
doc_to_target: answer
fewshot_delimiter: ""
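The `doc_to_text` template above fills per-document fields into the prompt. A rough Python illustration of the substitution (the harness actually renders this with Jinja2; the sample document here is invented):

```python
# Plain-Python stand-in for the Jinja2 rendering of doc_to_text above.
def render_prompt(doc):
    return (
        f"Question: {doc['Question'].strip()}\n"  # {{Question|trim}}
        "Choices:\n"
        f"(A) {doc['choice1']}\n"
        f"(B) {doc['choice2']}\n"
        f"(C) {doc['choice3']}\n"
        f"(D) {doc['choice4']}\n"
        "Give step by step reasoning before you answer, and when you're ready "
        "to answer, please use the format \"The correct answer is "
        "(insert answer here)\":"
    )

# Invented sample document for illustration only.
doc = {
    "Question": " What is 2 + 2? ",
    "choice1": "3", "choice2": "4", "choice3": "5", "choice4": "6",
}
print(render_prompt(doc))
```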

Mathvista 100 samples

llama: 51.5 hf-multimodal (pretrained=meta-llama/Llama-3.2-11B-Vision-Instruct), gen_kwargs: (None), limit: 50.0, num_fewshot: None, batch_size: 8

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|---|---|---|---|---|---|---|
| mathvista | 1 | extract_answer | 0 | acc | 0.46 | ± 0.0712 |
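The reported standard error is consistent with the usual sample standard error of a proportion, sqrt(p(1-p)/(n-1)). A quick check against the mathvista row, taking n = 50 from the `limit` setting:

```python
import math

# Sample standard error of a proportion (ddof=1), the form that matches
# the stderr columns in these tables.
def acc_stderr(p, n):
    return math.sqrt(p * (1 - p) / (n - 1))

print(round(acc_stderr(0.46, 50), 4))  # reproduces the ± 0.0712 reported for mathvista
```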

hf-multimodal (pretrained=llava-hf/llava-onevision-qwen2-7b-ov-chat-hf), gen_kwargs: (None), limit: 50.0, num_fewshot: None, batch_size: 8

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|---|---|---|---|---|---|---|
@baberabb
baberabb / spo_loss.py
Created June 10, 2024 15:38 — forked from crowsonkb/spo_loss.py
Scalar Preference Optimization
"""Scalar Preference Optimization."""

import torch
from torch.nn import functional as F


def logp_completion(logits, tokens, mask):
    """Compute the log probabilities of completions given their prompts.

    Args:
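The preview stops inside the docstring. A common shape for this kind of helper — gather the per-token log-probs and sum them over the masked (completion) positions — sketched here as an assumption, not necessarily the gist's exact implementation:

```python
import torch
from torch.nn import functional as F

def logp_completion(logits, tokens, mask):
    """Sum log-probs of `tokens` under `logits` over masked positions.

    Sketch of the truncated helper above; shapes and the mask convention
    (1 = completion token, 0 = prompt/padding) are assumptions.
    """
    logprobs = F.log_softmax(logits, dim=-1)
    # Pick out the log-prob of each realized token.
    per_token = torch.gather(logprobs, -1, tokens.unsqueeze(-1)).squeeze(-1)
    # Zero out prompt/padding positions, then sum over the sequence.
    return (per_token * mask).sum(dim=-1)
```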
import asyncio
import os
import random
from collections import deque
import ssl
from urllib.parse import urlparse
import aiohttp
import polars as pl
import aiofiles
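The imports above suggest an async downloader with bounded concurrency. A minimal sketch of that pattern using only asyncio — the `fetch` function and URLs are hypothetical stand-ins for the gist's aiohttp requests:

```python
import asyncio

async def fetch(url: str) -> str:
    # Stand-in for an aiohttp request; just simulates I/O latency.
    await asyncio.sleep(0.01)
    return f"body of {url}"

async def crawl(urls, max_concurrency=4):
    # Bound in-flight requests with a semaphore, a common aiohttp pattern.
    sem = asyncio.Semaphore(max_concurrency)

    async def bounded(url):
        async with sem:
            return await fetch(url)

    return await asyncio.gather(*(bounded(u) for u in urls))

results = asyncio.run(crawl([f"https://example.com/{i}" for i in range(8)]))
print(results[0])
```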
@baberabb
baberabb / grouped_results.md
Last active February 12, 2024 23:23
mistral MMLU

hf (pretrained=mistralai/Mistral-7B-v0.1), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 8

| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
|---|---|---|---|---|---|---|
| mmlu | N/A | none | 0 | acc | 0.6273 | ± 0.0294 |
| - humanities | N/A | none | None | acc | 0.6520 | ± 0.0233 |
| - formal_logic | 0 | none | None | acc | 0.3571 | ± 0.0429 |
| - high_school_european_history | 0 | none | None | acc | 0.7455 | ± 0.0340 |
| - high_school_us_history | 0 | none | None | acc | 0.7549 | ± 0.0302 |
| - high_school_world_history | 0 | none | None | acc | 0.7764 | ± 0.0271 |
| - international_law | 0 | none | None | acc | 0.7521 | ± 0.0394 |