Skip to content

Instantly share code, notes, and snippets.

@baberabb
Last active October 13, 2025 12:59
Show Gist options
  • Select an option

  • Save baberabb/cffd7cd69362fe6d42ef5a0218347cfd to your computer and use it in GitHub Desktop.

Select an option

Save baberabb/cffd7cd69362fe6d42ef5a0218347cfd to your computer and use it in GitHub Desktop.
Comma
Model ARC-C ARC-E MMLU BoolQ HS OBQA CSQA PIQA SIQA HEval MBPP Avg
togethercomputer/RedPajama-INCITE-7B-Base 44.7±1.45 66.8±0.97 26.4±0.37 70.9±0.79 70.3±0.46 50.2±2.24 57.7±1.41 77.0±0.98 46.4±1.13 10.3±2.2 17.16±1.48 52.2
huggyllama/llama-7b 48.2±1.46 68.2±0.96 32.2±0.39 75.0±0.76 76.2±0.42 53.6±2.23 61.8±1.39 79.3±0.95 49.0±1.13 17.07±2.66 28.09±1.82 56.9
stabilityai/stablelm-base-alpha-7b 43.69±1.45 65.78±0.97 40.04±0.41 70.31±0.80 74.27±0.44 52.0±2.24 57.25±1.42 79.0±0.95 48.36±1.13 24.76±3.11 36.54±1.95 53.8
mosaicml/mpt-7b 45.8±1.46 67.1±0.96 29.4±0.38 73.7±0.77 76.3±0.42 54.0±2.23 64.2±1.37 80.5±0.92 48.9±1.13 31.07±3.28 35.35±1.94 57.2
openlm-research/open_llama_7b_v2 46.8±1.46 67.3±0.96 33.1±0.39 72.3±0.78 74.5±0.43 50.8±2.24 62.9±1.38 79.8±0.94 49.5±1.13 26.46±3.08 33.18±1.88 54.2
common-pile/comma-v0.1-1t 50.1±1.46 68.4±0.95 36.0±0.40 74.6±0.76 64.3±0.48 49.8±2.24 59.8±1.40 72.7±1.04 49.3±1.13 35.64±3.34 34.35±1.95 55.9
Qwen/Qwen3-8B 59.7±1.43 80.6±0.81 73.0±0.35 86.6±0.60 74.9±0.43 52.0±2.24 68.1±1.33 77.8±0.97 51.1±1.13 94.26±1.73 61.35±2.10 68.6
Model ARC-C ARC-E MMLU BoolQ HS OBQA CSQA PIQA SIQA HEval MBPP Avg
allenai/OLMo-7B-Twin-2T-hf 46.3±1.46 65.3±0.98 26.0±0.37 69.8±0.80 74.1±0.44 53.4±2.23 61.8±1.39 79.3±0.94 48.5±1.13 19.9±2.9 23.9±1.71 51.7
meta-llama/Llama-2-7b-hf 50.0±1.46 69.3±0.95 41.8±0.41 77.7±0.73 76.0±0.43 57.2±2.21 62.7±1.38 78.8±0.95 49.5±1.13 26.4±3.11 31.1±1.86 56.4
common-pile/comma-v0.1-2t 51.0±1.46 70.6±0.94 46.1±0.41 79.0±0.71 67.8±0.47 56.0±2.22 64.2±1.37 73.0±1.04 51.0±1.13 44.9±3.43 40.7±2.0 58.6
deepseek-ai/deepseek-llm-7b-base 49.6±1.46 68.8±0.95 44.2±0.41 72.4±0.78 76.2±0.43 58.0±2.21 66.8±1.35 79.7±0.94 50.9±1.13 46.9±3.46 43.2±2.01 59.7
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Patch

# -------------------------------------------------------------
# Data
#
# Benchmark scores per model (accuracy / pass@1, in percent) and the
# reported standard errors used for the error bars. Values are copied
# from the evaluation table above (1T-token comparison set).
# -------------------------------------------------------------
benchmarks = [
    "ARC-C",
    "ARC-E",
    "MMLU",
    "BoolQ",
    "HSwag",
    "OBQA",
    "CSQA",
    "PIQA",
    "SIQA",  # Knowledge / Reasoning
    "HumEval",
    "MBPP",  # Coding
]
models = ["Comma", "LLaMA", "MPT", "RPJ-INCITE", "Qwen3"]
scores = {
    "Comma": [50.1, 68.4, 36.0, 74.6, 64.3, 49.8, 59.8, 72.7, 49.3, 35.64, 34.35],
    "LLaMA": [48.2, 68.2, 32.2, 75.0, 76.2, 53.6, 61.8, 79.3, 49.0, 17.07, 28.09],
    "MPT": [45.8, 67.1, 29.4, 73.7, 76.3, 54.0, 64.2, 80.5, 48.9, 31.07, 35.35],
    "RPJ-INCITE": [44.7, 66.8, 26.4, 70.9, 70.3, 50.2, 57.7, 77.0, 46.4, 10.3, 17.16],
    "Qwen3": [59.7, 80.6, 73.0, 86.6, 74.9, 52.0, 68.1, 77.8, 51.1, 94.26, 61.35],
}
df = pd.DataFrame(scores, index=benchmarks)

# Standard errors, keyed by model name (same layout as `scores`).
yerr_data = {
    "Comma": [1.46, 0.95, 0.40, 0.76, 0.48, 2.24, 1.40, 1.04, 1.13, 3.34, 1.95],
    "LLaMA": [1.46, 0.96, 0.39, 0.76, 0.42, 2.23, 1.39, 0.95, 1.13, 2.66, 1.82],
    "MPT": [1.46, 0.96, 0.38, 0.77, 0.42, 2.23, 1.37, 0.92, 1.13, 3.28, 1.94],
    "RPJ-INCITE": [1.45, 0.97, 0.37, 0.79, 0.46, 2.24, 1.41, 0.98, 1.13, 2.2, 1.48],
    "Qwen3": [1.43, 0.81, 0.35, 0.60, 0.43, 2.24, 1.33, 0.97, 1.13, 1.73, 2.1],
}
yerr_df = pd.DataFrame(yerr_data, index=benchmarks)

# -------------------------------------------------------------
# X positions with a one-unit gap between SIQA and HumEval to
# visually separate the Knowledge/Reasoning and Coding sections.
# -------------------------------------------------------------
base_x = np.arange(len(benchmarks), dtype=float)
gap_start = 9  # index of first coding benchmark (HumEval)
base_x[gap_start:] += 1.0

# -------------------------------------------------------------
# Plot: one group of 5 bars per benchmark.
# -------------------------------------------------------------
fig, ax = plt.subplots(figsize=(11, 4))
bar_w = 0.15
colours = {
    "Comma": "gold",
    "LLaMA": "hotpink",
    "MPT": "mediumpurple",
    "RPJ-INCITE": "cornflowerblue",
    "Qwen3": "none",
}
hatch_for_qwen = "///"  # Qwen3 is drawn as an unfilled, hatched reference bar

for i, model in enumerate(models):
    # (i - 2) centers the 5-bar group on each benchmark's base_x position.
    offsets = base_x + (i - 2) * bar_w
    values = df[model].to_numpy()
    # Look errors up by column name (like the scores) instead of a
    # positional index into a transposed array: positional lookup would
    # silently pair the wrong error bars if the dict insertion order ever
    # diverged from the `models` list.
    errors = yerr_df[model].to_numpy()
    # Handle missing values - replace with 0 for plotting
    plot_values = np.where(pd.isna(values), 0, values)
    plot_errors = np.where(pd.isna(errors), 0, errors)
    ax.bar(
        offsets,
        plot_values,
        width=bar_w,
        label=model,
        color=colours[model],
        edgecolor="dimgray",
        linewidth=1.0,
        yerr=plot_errors,
        capsize=3,
        hatch=hatch_for_qwen if model == "Qwen3" else None,
        error_kw=dict(elinewidth=1, capthick=1, ecolor="dimgray"),
    )

# -------------------------------------------------------------
# Axis and labels
# -------------------------------------------------------------
ax.set_ylabel("Performance")
ax.set_ylim(0, 100)
ax.set_xticks(base_x)
ax.set_xticklabels(benchmarks, rotation=45, ha="right")

# Faint horizontal dashed guide lines for easy comparison.
for y in [20, 40, 60, 80]:
    ax.axhline(
        y=y, color="lightgrey", linestyle="--", linewidth=0.8, alpha=0.3, zorder=0
    )

# Gold star above Comma's bar wherever it is top scoring among the
# comparable base models (Qwen3 excluded as an out-of-class reference).
for i in range(len(benchmarks)):
    comma_score = df["Comma"].iloc[i]
    if pd.isna(comma_score) or comma_score == 0:
        continue
    other_scores = []
    for model in ["LLaMA", "MPT", "RPJ-INCITE"]:
        score = df[model].iloc[i]
        if not pd.isna(score) and score > 0:
            other_scores.append(score)
    if other_scores and comma_score >= max(other_scores):
        x_pos = base_x[i] + (0 - 2) * bar_w  # Comma is index 0 in `models`
        comma_error = yerr_df["Comma"].iloc[i]
        if not pd.isna(comma_error):
            # Place the star just above the error-bar cap.
            star_y_pos = comma_score + comma_error + 4
            ax.scatter(
                x_pos,
                star_y_pos,
                marker="*",
                color="gold",
                s=80,
                edgecolor="darkgoldenrod",
                linewidth=0.5,
                zorder=10,
            )

# Section labels below the x-axis (negative y in data coordinates).
ax.text(base_x[4], -27, "Knowledge / Reasoning", ha="center", va="top", fontsize=11)
mid_coding = (base_x[9] + base_x[10]) / 2
ax.text(mid_coding, -27, "Coding", ha="center", va="top", fontsize=11)

# -------------------------------------------------------------
# Legend (explicit patches so Qwen3's hatched/unfilled style is shown).
# -------------------------------------------------------------
legend_patches = [
    Patch(facecolor="gold", edgecolor="dimgray", label="Comma"),
    Patch(facecolor="hotpink", edgecolor="dimgray", label="LLaMA"),
    Patch(facecolor="mediumpurple", edgecolor="dimgray", label="MPT"),
    Patch(facecolor="cornflowerblue", edgecolor="dimgray", label="RPJ-INCITE"),
    Patch(facecolor="none", edgecolor="dimgray", hatch=hatch_for_qwen, label="Qwen3"),
]
ax.legend(
    handles=legend_patches,
    ncol=5,
    frameon=False,
    loc="upper center",
    bbox_to_anchor=(0.5, 1.12),
)

# -------------------------------------------------------------
# Clean up
# -------------------------------------------------------------
ax.spines[["right", "top"]].set_visible(False)
fig.tight_layout()
fig.subplots_adjust(bottom=0.32)  # room for rotated tick + section labels

# Save the plot
plt.savefig("model_comparison_plot.png", dpi=300, bbox_inches="tight")
plt.savefig("model_comparison_plot.pdf", bbox_inches="tight")
print("Plot saved as 'model_comparison_plot.png' and 'model_comparison_plot.pdf'")
plt.show()
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Patch

# -------------------------------------------------------------
# Data
#
# Benchmark scores per model (accuracy / pass@1, in percent) and the
# reported standard errors used for the error bars. Values are copied
# from the evaluation table above (2T-token comparison set).
# -------------------------------------------------------------
benchmarks = [
    "ARC-C",
    "ARC-E",
    "MMLU",
    "BoolQ",
    "HSwag",
    "OBQA",
    "CSQA",
    "PIQA",
    "SIQA",  # Knowledge / Reasoning
    "HumEval",
    "MBPP",  # Coding
]
models = ["Comma-2T", "OLMo-Twin", "LLaMA-2", "DeepSeek", "Qwen3"]
scores = {
    "Comma-2T": [51.0, 70.6, 46.1, 79.0, 67.8, 56.0, 64.2, 73.0, 51.0, 44.92, 40.7],
    "LLaMA-2": [50.0, 69.3, 41.8, 77.7, 76.0, 57.2, 62.7, 78.8, 49.5, 26.4, 31.11],
    "DeepSeek": [49.6, 68.8, 44.2, 72.4, 76.2, 58.0, 66.8, 79.7, 50.9, 46.86, 43.21],
    "OLMo-Twin": [46.3, 65.3, 26.0, 69.8, 74.1, 53.4, 61.8, 79.3, 48.5, 19.92, 23.9],
    "Qwen3": [59.7, 80.6, 73.0, 86.6, 74.9, 52.0, 68.1, 77.8, 51.1, 94.26, 61.35],
}
df = pd.DataFrame(scores, index=benchmarks)

# Standard errors, keyed by model name (same layout as `scores`).
yerr_data = {
    "Comma-2T": [1.46, 0.94, 0.41, 0.71, 0.47, 2.22, 1.37, 1.04, 1.13, 3.43, 2.0],
    "LLaMA-2": [1.46, 0.95, 0.41, 0.73, 0.43, 2.21, 1.38, 0.95, 1.13, 3.11, 1.86],
    "DeepSeek": [1.46, 0.95, 0.41, 0.78, 0.43, 2.21, 1.35, 0.94, 1.13, 3.46, 2.01],
    "OLMo-Twin": [1.46, 0.98, 0.37, 0.80, 0.44, 2.23, 1.39, 0.94, 1.13, 2.9, 1.71],
    "Qwen3": [1.43, 0.81, 0.35, 0.60, 0.43, 2.24, 1.33, 0.97, 1.13, 1.73, 2.1],
}
yerr_df = pd.DataFrame(yerr_data, index=benchmarks)

# -------------------------------------------------------------
# X positions with a one-unit gap between SIQA and HumEval to
# visually separate the Knowledge/Reasoning and Coding sections.
# -------------------------------------------------------------
base_x = np.arange(len(benchmarks), dtype=float)
gap_start = 9  # index of first coding benchmark (HumEval)
base_x[gap_start:] += 1.0

# -------------------------------------------------------------
# Plot: one group of 5 bars per benchmark.
# -------------------------------------------------------------
fig, ax = plt.subplots(figsize=(11, 4))
bar_w = 0.15
colours = {
    "Comma-2T": "#E6B800",  # golden/mustard yellow (like in screenshot)
    "OLMo-Twin": "#CC5869",  # muted red/pink (like in screenshot)
    "LLaMA-2": "#C4A5C9",  # light purple/mauve (like in screenshot)
    "DeepSeek": "#6B9080",  # sage green (like in screenshot)
    "Qwen3": "none",
}
hatch_for_qwen = "///"  # Qwen3 is drawn as an unfilled, hatched reference bar

for i, model in enumerate(models):
    # (i - 2) centers the 5-bar group on each benchmark's base_x position.
    offsets = base_x + (i - 2) * bar_w
    values = df[model].to_numpy()
    # BUG FIX: the previous version indexed errors positionally via
    # `yerr_df.values.T[i]`, but `yerr_data`'s insertion order
    # (Comma-2T, LLaMA-2, DeepSeek, OLMo-Twin, Qwen3) differs from the
    # `models` list (Comma-2T, OLMo-Twin, LLaMA-2, DeepSeek, Qwen3), so
    # OLMo-Twin, LLaMA-2 and DeepSeek were drawn with each other's error
    # bars. Looking up by column name matches how scores are fetched.
    errors = yerr_df[model].to_numpy()
    # Handle missing values - replace with 0 for plotting
    plot_values = np.where(pd.isna(values), 0, values)
    plot_errors = np.where(pd.isna(errors), 0, errors)
    ax.bar(
        offsets,
        plot_values,
        width=bar_w,
        label=model,
        color=colours[model],
        edgecolor="dimgray",
        linewidth=1.0,
        yerr=plot_errors,
        capsize=3,
        hatch=hatch_for_qwen if model == "Qwen3" else None,
        error_kw=dict(elinewidth=1, capthick=1, ecolor="dimgray"),
    )

# -------------------------------------------------------------
# Axis and labels
# -------------------------------------------------------------
ax.set_ylabel("Performance")
ax.set_ylim(0, 100)
ax.set_xticks(base_x)
ax.set_xticklabels(benchmarks, rotation=45, ha="right")

# Faint horizontal dashed guide lines for easy comparison.
for y in [20, 40, 60, 80]:
    ax.axhline(
        y=y, color="lightgrey", linestyle="--", linewidth=0.8, alpha=0.3, zorder=0
    )

# Gold star above Comma-2T's bar wherever it is top scoring among the
# comparable base models (Qwen3 excluded as an out-of-class reference).
for i in range(len(benchmarks)):
    comma_score = df["Comma-2T"].iloc[i]
    if pd.isna(comma_score) or comma_score == 0:
        continue
    other_scores = []
    for model in ["OLMo-Twin", "LLaMA-2", "DeepSeek"]:
        score = df[model].iloc[i]
        if not pd.isna(score) and score > 0:
            other_scores.append(score)
    if other_scores and comma_score >= max(other_scores):
        x_pos = base_x[i] + (0 - 2) * bar_w  # Comma-2T is index 0 in `models`
        comma_error = yerr_df["Comma-2T"].iloc[i]
        if not pd.isna(comma_error):
            # Place the star just above the error-bar cap.
            star_y_pos = comma_score + comma_error + 4
            ax.scatter(
                x_pos,
                star_y_pos,
                marker="*",
                color="gold",
                s=80,
                edgecolor="darkgoldenrod",
                linewidth=0.5,
                zorder=10,
            )

# Section labels below the x-axis (negative y in data coordinates).
ax.text(base_x[4], -27, "Knowledge / Reasoning", ha="center", va="top", fontsize=11)
mid_coding = (base_x[9] + base_x[10]) / 2
ax.text(mid_coding, -27, "Coding", ha="center", va="top", fontsize=11)

# -------------------------------------------------------------
# Legend (explicit patches so Qwen3's hatched/unfilled style is shown).
# -------------------------------------------------------------
legend_patches = [
    Patch(facecolor="#E6B800", edgecolor="dimgray", label="Comma v0.1-2T"),
    Patch(facecolor="#CC5869", edgecolor="dimgray", label="OLMo Twin"),
    Patch(facecolor="#C4A5C9", edgecolor="dimgray", label="LLaMA 2"),
    Patch(facecolor="#6B9080", edgecolor="dimgray", label="DeepSeekLLM"),
    Patch(facecolor="none", edgecolor="dimgray", hatch=hatch_for_qwen, label="Qwen3"),
]
ax.legend(
    handles=legend_patches,
    ncol=5,
    frameon=False,
    loc="upper center",
    bbox_to_anchor=(0.5, 1.12),
)

# -------------------------------------------------------------
# Clean up
# -------------------------------------------------------------
ax.spines[["right", "top"]].set_visible(False)
fig.tight_layout()
fig.subplots_adjust(bottom=0.32)  # room for rotated tick + section labels

# Save the plot
plt.savefig('model_comparison_plot_v2.png', dpi=300, bbox_inches='tight')
plt.savefig('model_comparison_plot_v2.pdf', bbox_inches='tight')
print("Plot saved as 'model_comparison_plot_v2.png' and 'model_comparison_plot_v2.pdf'")
plt.show()
@baberabb
Copy link
Author

Figure 4 (2T) (paper below)
fig4

Screenshot 2025-09-16 at 10 20 01 PM

@baberabb
Copy link
Author

fig4_harness

(harness defaults)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment