Skip to content

Instantly share code, notes, and snippets.

@baberabb
Last active October 13, 2025 12:59
Show Gist options
  • Select an option

  • Save baberabb/cffd7cd69362fe6d42ef5a0218347cfd to your computer and use it in GitHub Desktop.

Select an option

Save baberabb/cffd7cd69362fe6d42ef5a0218347cfd to your computer and use it in GitHub Desktop.
Comma
Model ARC-C ARC-E MMLU BoolQ HS OBQA CSQA PIQA SIQA HEval MBPP Avg
togethercomputer/RedPajama-INCITE-7B-Base 44.7±1.45 66.8±0.97 26.4±0.37 70.9±0.79 70.3±0.46 50.2±2.24 57.7±1.41 77.0±0.98 46.4±1.13 10.3±2.2 17.16±1.48 52.2
huggyllama/llama-7b 48.2±1.46 68.2±0.96 32.2±0.39 75.0±0.76 76.2±0.42 53.6±2.23 61.8±1.39 79.3±0.95 49.0±1.13 17.07±2.66 28.09±1.82 56.9
stabilityai/stablelm-base-alpha-7b 43.69±1.45 65.78±0.97 40.04±0.41 70.31±0.80 74.27±0.44 52.0±2.24 57.25±1.42 79.0±0.95 48.36±1.13 24.76±3.11 36.54±1.95 53.8
mosaicml/mpt-7b 45.8±1.46 67.1±0.96 29.4±0.38 73.7±0.77 76.3±0.42 54.0±2.23 64.2±1.37 80.5±0.92 48.9±1.13 31.07±3.28 35.35±1.94 57.2
openlm-research/open_llama_7b_v2 46.8±1.46 67.3±0.96 33.1±0.39 72.3±0.78 74.5±0.43 50.8±2.24 62.9±1.38 79.8±0.94 49.5±1.13 26.46±3.08 33.18±1.88 54.2
common-pile/comma-v0.1-1t 50.1±1.46 68.4±0.95 36.0±0.40 74.6±0.76 64.3±0.48 49.8±2.24 59.8±1.40 72.7±1.04 49.3±1.13 35.64±3.34 34.35±1.95 55.9
Qwen/Qwen3-8B 59.7±1.43 80.6±0.81 73.0±0.35 86.6±0.60 74.9±0.43 52.0±2.24 68.1±1.33 77.8±0.97 51.1±1.13 94.26±1.73 61.35±2.10 68.6
Model ARC-C ARC-E MMLU BoolQ HS OBQA CSQA PIQA SIQA HEval MBPP Avg
allenai/OLMo-7B-Twin-2T-hf 46.3±1.46 65.3±0.98 26.0±0.37 69.8±0.80 74.1±0.44 53.4±2.23 61.8±1.39 79.3±0.94 48.5±1.13 19.9±2.9 23.9±1.71 51.7
meta-llama/Llama-2-7b-hf 50.0±1.46 69.3±0.95 41.8±0.41 77.7±0.73 76.0±0.43 57.2±2.21 62.7±1.38 78.8±0.95 49.5±1.13 26.4±3.11 31.1±1.86 56.4
common-pile/comma-v0.1-2t 51.0±1.46 70.6±0.94 46.1±0.41 79.0±0.71 67.8±0.47 56.0±2.22 64.2±1.37 73.0±1.04 51.0±1.13 44.9±3.43 40.7±2.0 58.6
deepseek-ai/deepseek-llm-7b-base 49.6±1.46 68.8±0.95 44.2±0.41 72.4±0.78 76.2±0.43 58.0±2.21 66.8±1.35 79.7±0.94 50.9±1.13 46.9±3.46 43.2±2.01 59.7
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Patch

# -------------------------------------------------------------
# Data
#
# Benchmark scores per model (accuracy / pass@1, in percent) and the
# reported standard errors used for the error bars. Values are copied
# from the evaluation table above (1T-token comparison set).
# -------------------------------------------------------------
benchmarks = [
    "ARC-C",
    "ARC-E",
    "MMLU",
    "BoolQ",
    "HSwag",
    "OBQA",
    "CSQA",
    "PIQA",
    "SIQA",  # Knowledge / Reasoning
    "HumEval",
    "MBPP",  # Coding
]
models = ["Comma", "LLaMA", "MPT", "RPJ-INCITE", "Qwen3"]
scores = {
    "Comma": [50.1, 68.4, 36.0, 74.6, 64.3, 49.8, 59.8, 72.7, 49.3, 35.64, 34.35],
    "LLaMA": [48.2, 68.2, 32.2, 75.0, 76.2, 53.6, 61.8, 79.3, 49.0, 17.07, 28.09],
    "MPT": [45.8, 67.1, 29.4, 73.7, 76.3, 54.0, 64.2, 80.5, 48.9, 31.07, 35.35],
    "RPJ-INCITE": [44.7, 66.8, 26.4, 70.9, 70.3, 50.2, 57.7, 77.0, 46.4, 10.3, 17.16],
    "Qwen3": [59.7, 80.6, 73.0, 86.6, 74.9, 52.0, 68.1, 77.8, 51.1, 94.26, 61.35],
}
df = pd.DataFrame(scores, index=benchmarks)

# Standard errors, keyed by model name (same layout as `scores`).
yerr_data = {
    "Comma": [1.46, 0.95, 0.40, 0.76, 0.48, 2.24, 1.40, 1.04, 1.13, 3.34, 1.95],
    "LLaMA": [1.46, 0.96, 0.39, 0.76, 0.42, 2.23, 1.39, 0.95, 1.13, 2.66, 1.82],
    "MPT": [1.46, 0.96, 0.38, 0.77, 0.42, 2.23, 1.37, 0.92, 1.13, 3.28, 1.94],
    "RPJ-INCITE": [1.45, 0.97, 0.37, 0.79, 0.46, 2.24, 1.41, 0.98, 1.13, 2.2, 1.48],
    "Qwen3": [1.43, 0.81, 0.35, 0.60, 0.43, 2.24, 1.33, 0.97, 1.13, 1.73, 2.1],
}
yerr_df = pd.DataFrame(yerr_data, index=benchmarks)

# -------------------------------------------------------------
# X positions with a one-unit gap between SIQA and HumEval to
# visually separate the Knowledge/Reasoning and Coding sections.
# -------------------------------------------------------------
base_x = np.arange(len(benchmarks), dtype=float)
gap_start = 9  # index of first coding benchmark (HumEval)
base_x[gap_start:] += 1.0

# -------------------------------------------------------------
# Plot: one group of 5 bars per benchmark.
# -------------------------------------------------------------
fig, ax = plt.subplots(figsize=(11, 4))
bar_w = 0.15
colours = {
    "Comma": "gold",
    "LLaMA": "hotpink",
    "MPT": "mediumpurple",
    "RPJ-INCITE": "cornflowerblue",
    "Qwen3": "none",
}
hatch_for_qwen = "///"  # Qwen3 is drawn as an unfilled, hatched reference bar

for i, model in enumerate(models):
    # (i - 2) centers the 5-bar group on each benchmark's base_x position.
    offsets = base_x + (i - 2) * bar_w
    values = df[model].to_numpy()
    # Look errors up by column name (like the scores) instead of a
    # positional index into a transposed array: positional lookup would
    # silently pair the wrong error bars if the dict insertion order ever
    # diverged from the `models` list.
    errors = yerr_df[model].to_numpy()
    # Handle missing values - replace with 0 for plotting
    plot_values = np.where(pd.isna(values), 0, values)
    plot_errors = np.where(pd.isna(errors), 0, errors)
    ax.bar(
        offsets,
        plot_values,
        width=bar_w,
        label=model,
        color=colours[model],
        edgecolor="dimgray",
        linewidth=1.0,
        yerr=plot_errors,
        capsize=3,
        hatch=hatch_for_qwen if model == "Qwen3" else None,
        error_kw=dict(elinewidth=1, capthick=1, ecolor="dimgray"),
    )

# -------------------------------------------------------------
# Axis and labels
# -------------------------------------------------------------
ax.set_ylabel("Performance")
ax.set_ylim(0, 100)
ax.set_xticks(base_x)
ax.set_xticklabels(benchmarks, rotation=45, ha="right")

# Faint horizontal dashed guide lines for easy comparison.
for y in [20, 40, 60, 80]:
    ax.axhline(
        y=y, color="lightgrey", linestyle="--", linewidth=0.8, alpha=0.3, zorder=0
    )

# Gold star above Comma's bar wherever it is top scoring among the
# comparable base models (Qwen3 excluded as an out-of-class reference).
for i in range(len(benchmarks)):
    comma_score = df["Comma"].iloc[i]
    if pd.isna(comma_score) or comma_score == 0:
        continue
    other_scores = []
    for model in ["LLaMA", "MPT", "RPJ-INCITE"]:
        score = df[model].iloc[i]
        if not pd.isna(score) and score > 0:
            other_scores.append(score)
    if other_scores and comma_score >= max(other_scores):
        x_pos = base_x[i] + (0 - 2) * bar_w  # Comma is index 0 in `models`
        comma_error = yerr_df["Comma"].iloc[i]
        if not pd.isna(comma_error):
            # Place the star just above the error-bar cap.
            star_y_pos = comma_score + comma_error + 4
            ax.scatter(
                x_pos,
                star_y_pos,
                marker="*",
                color="gold",
                s=80,
                edgecolor="darkgoldenrod",
                linewidth=0.5,
                zorder=10,
            )

# Section labels below the x-axis (negative y in data coordinates).
ax.text(base_x[4], -27, "Knowledge / Reasoning", ha="center", va="top", fontsize=11)
mid_coding = (base_x[9] + base_x[10]) / 2
ax.text(mid_coding, -27, "Coding", ha="center", va="top", fontsize=11)

# -------------------------------------------------------------
# Legend (explicit patches so Qwen3's hatched/unfilled style is shown).
# -------------------------------------------------------------
legend_patches = [
    Patch(facecolor="gold", edgecolor="dimgray", label="Comma"),
    Patch(facecolor="hotpink", edgecolor="dimgray", label="LLaMA"),
    Patch(facecolor="mediumpurple", edgecolor="dimgray", label="MPT"),
    Patch(facecolor="cornflowerblue", edgecolor="dimgray", label="RPJ-INCITE"),
    Patch(facecolor="none", edgecolor="dimgray", hatch=hatch_for_qwen, label="Qwen3"),
]
ax.legend(
    handles=legend_patches,
    ncol=5,
    frameon=False,
    loc="upper center",
    bbox_to_anchor=(0.5, 1.12),
)

# -------------------------------------------------------------
# Clean up
# -------------------------------------------------------------
ax.spines[["right", "top"]].set_visible(False)
fig.tight_layout()
fig.subplots_adjust(bottom=0.32)  # room for rotated tick + section labels

# Save the plot
plt.savefig("model_comparison_plot.png", dpi=300, bbox_inches="tight")
plt.savefig("model_comparison_plot.pdf", bbox_inches="tight")
print("Plot saved as 'model_comparison_plot.png' and 'model_comparison_plot.pdf'")
plt.show()
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Patch

# -------------------------------------------------------------
# Data
#
# Benchmark scores per model (accuracy / pass@1, in percent) and the
# reported standard errors used for the error bars. Values are copied
# from the evaluation table above (2T-token comparison set).
# -------------------------------------------------------------
benchmarks = [
    "ARC-C",
    "ARC-E",
    "MMLU",
    "BoolQ",
    "HSwag",
    "OBQA",
    "CSQA",
    "PIQA",
    "SIQA",  # Knowledge / Reasoning
    "HumEval",
    "MBPP",  # Coding
]
models = ["Comma-2T", "OLMo-Twin", "LLaMA-2", "DeepSeek", "Qwen3"]
scores = {
    "Comma-2T": [51.0, 70.6, 46.1, 79.0, 67.8, 56.0, 64.2, 73.0, 51.0, 44.92, 40.7],
    "LLaMA-2": [50.0, 69.3, 41.8, 77.7, 76.0, 57.2, 62.7, 78.8, 49.5, 26.4, 31.11],
    "DeepSeek": [49.6, 68.8, 44.2, 72.4, 76.2, 58.0, 66.8, 79.7, 50.9, 46.86, 43.21],
    "OLMo-Twin": [46.3, 65.3, 26.0, 69.8, 74.1, 53.4, 61.8, 79.3, 48.5, 19.92, 23.9],
    "Qwen3": [59.7, 80.6, 73.0, 86.6, 74.9, 52.0, 68.1, 77.8, 51.1, 94.26, 61.35],
}
df = pd.DataFrame(scores, index=benchmarks)

# Standard errors, keyed by model name (same layout as `scores`).
yerr_data = {
    "Comma-2T": [1.46, 0.94, 0.41, 0.71, 0.47, 2.22, 1.37, 1.04, 1.13, 3.43, 2.0],
    "LLaMA-2": [1.46, 0.95, 0.41, 0.73, 0.43, 2.21, 1.38, 0.95, 1.13, 3.11, 1.86],
    "DeepSeek": [1.46, 0.95, 0.41, 0.78, 0.43, 2.21, 1.35, 0.94, 1.13, 3.46, 2.01],
    "OLMo-Twin": [1.46, 0.98, 0.37, 0.80, 0.44, 2.23, 1.39, 0.94, 1.13, 2.9, 1.71],
    "Qwen3": [1.43, 0.81, 0.35, 0.60, 0.43, 2.24, 1.33, 0.97, 1.13, 1.73, 2.1],
}
yerr_df = pd.DataFrame(yerr_data, index=benchmarks)

# -------------------------------------------------------------
# X positions with a one-unit gap between SIQA and HumEval to
# visually separate the Knowledge/Reasoning and Coding sections.
# -------------------------------------------------------------
base_x = np.arange(len(benchmarks), dtype=float)
gap_start = 9  # index of first coding benchmark (HumEval)
base_x[gap_start:] += 1.0

# -------------------------------------------------------------
# Plot: one group of 5 bars per benchmark.
# -------------------------------------------------------------
fig, ax = plt.subplots(figsize=(11, 4))
bar_w = 0.15
colours = {
    "Comma-2T": "#E6B800",  # golden/mustard yellow (like in screenshot)
    "OLMo-Twin": "#CC5869",  # muted red/pink (like in screenshot)
    "LLaMA-2": "#C4A5C9",  # light purple/mauve (like in screenshot)
    "DeepSeek": "#6B9080",  # sage green (like in screenshot)
    "Qwen3": "none",
}
hatch_for_qwen = "///"  # Qwen3 is drawn as an unfilled, hatched reference bar

for i, model in enumerate(models):
    # (i - 2) centers the 5-bar group on each benchmark's base_x position.
    offsets = base_x + (i - 2) * bar_w
    values = df[model].to_numpy()
    # BUG FIX: the previous version indexed errors positionally via
    # `yerr_df.values.T[i]`, but `yerr_data`'s insertion order
    # (Comma-2T, LLaMA-2, DeepSeek, OLMo-Twin, Qwen3) differs from the
    # `models` list (Comma-2T, OLMo-Twin, LLaMA-2, DeepSeek, Qwen3), so
    # OLMo-Twin, LLaMA-2 and DeepSeek were drawn with each other's error
    # bars. Looking up by column name matches how scores are fetched.
    errors = yerr_df[model].to_numpy()
    # Handle missing values - replace with 0 for plotting
    plot_values = np.where(pd.isna(values), 0, values)
    plot_errors = np.where(pd.isna(errors), 0, errors)
    ax.bar(
        offsets,
        plot_values,
        width=bar_w,
        label=model,
        color=colours[model],
        edgecolor="dimgray",
        linewidth=1.0,
        yerr=plot_errors,
        capsize=3,
        hatch=hatch_for_qwen if model == "Qwen3" else None,
        error_kw=dict(elinewidth=1, capthick=1, ecolor="dimgray"),
    )

# -------------------------------------------------------------
# Axis and labels
# -------------------------------------------------------------
ax.set_ylabel("Performance")
ax.set_ylim(0, 100)
ax.set_xticks(base_x)
ax.set_xticklabels(benchmarks, rotation=45, ha="right")

# Faint horizontal dashed guide lines for easy comparison.
for y in [20, 40, 60, 80]:
    ax.axhline(
        y=y, color="lightgrey", linestyle="--", linewidth=0.8, alpha=0.3, zorder=0
    )

# Gold star above Comma-2T's bar wherever it is top scoring among the
# comparable base models (Qwen3 excluded as an out-of-class reference).
for i in range(len(benchmarks)):
    comma_score = df["Comma-2T"].iloc[i]
    if pd.isna(comma_score) or comma_score == 0:
        continue
    other_scores = []
    for model in ["OLMo-Twin", "LLaMA-2", "DeepSeek"]:
        score = df[model].iloc[i]
        if not pd.isna(score) and score > 0:
            other_scores.append(score)
    if other_scores and comma_score >= max(other_scores):
        x_pos = base_x[i] + (0 - 2) * bar_w  # Comma-2T is index 0 in `models`
        comma_error = yerr_df["Comma-2T"].iloc[i]
        if not pd.isna(comma_error):
            # Place the star just above the error-bar cap.
            star_y_pos = comma_score + comma_error + 4
            ax.scatter(
                x_pos,
                star_y_pos,
                marker="*",
                color="gold",
                s=80,
                edgecolor="darkgoldenrod",
                linewidth=0.5,
                zorder=10,
            )

# Section labels below the x-axis (negative y in data coordinates).
ax.text(base_x[4], -27, "Knowledge / Reasoning", ha="center", va="top", fontsize=11)
mid_coding = (base_x[9] + base_x[10]) / 2
ax.text(mid_coding, -27, "Coding", ha="center", va="top", fontsize=11)

# -------------------------------------------------------------
# Legend (explicit patches so Qwen3's hatched/unfilled style is shown).
# -------------------------------------------------------------
legend_patches = [
    Patch(facecolor="#E6B800", edgecolor="dimgray", label="Comma v0.1-2T"),
    Patch(facecolor="#CC5869", edgecolor="dimgray", label="OLMo Twin"),
    Patch(facecolor="#C4A5C9", edgecolor="dimgray", label="LLaMA 2"),
    Patch(facecolor="#6B9080", edgecolor="dimgray", label="DeepSeekLLM"),
    Patch(facecolor="none", edgecolor="dimgray", hatch=hatch_for_qwen, label="Qwen3"),
]
ax.legend(
    handles=legend_patches,
    ncol=5,
    frameon=False,
    loc="upper center",
    bbox_to_anchor=(0.5, 1.12),
)

# -------------------------------------------------------------
# Clean up
# -------------------------------------------------------------
ax.spines[["right", "top"]].set_visible(False)
fig.tight_layout()
fig.subplots_adjust(bottom=0.32)  # room for rotated tick + section labels

# Save the plot
plt.savefig('model_comparison_plot_v2.png', dpi=300, bbox_inches='tight')
plt.savefig('model_comparison_plot_v2.pdf', bbox_inches='tight')
print("Plot saved as 'model_comparison_plot_v2.png' and 'model_comparison_plot_v2.pdf'")
plt.show()
@baberabb
Copy link
Author

Figure 4 (2T) (paper below)
fig4

Screenshot 2025-09-16 at 10 20 01 PM

@baberabb
Copy link
Author

fig4_harness

(harness defaults)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment