| Model | ARC-C | ARC-E | MMLU | BoolQ | HS | OBQA | CSQA | PIQA | SIQA | HEval | MBPP | Avg |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| togethercomputer/RedPajama-INCITE-7B-Base | 44.7±1.45 | 66.8±0.97 | 26.4±0.37 | 70.9±0.79 | 70.3±0.46 | 50.2±2.24 | 57.7±1.41 | 77.0±0.98 | 46.4±1.13 | 10.3±2.2 | 17.16±1.48 | 52.2 |
| huggyllama/llama-7b | 48.2±1.46 | 68.2±0.96 | 32.2±0.39 | 75.0±0.76 | 76.2±0.42 | 53.6±2.23 | 61.8±1.39 | 79.3±0.95 | 49.0±1.13 | 17.07±2.66 | 28.09±1.82 | 56.9 |
| stabilityai/stablelm-base-alpha-7b | 43.69±1.45 | 65.78±0.97 | 40.04±0.41 | 70.31±0.80 | 74.27±0.44 | 52.0±2.24 | 57.25±1.42 | 79.0±0.95 | 48.36±1.13 | 24.76±3.11 | 36.54±1.95 | 53.8 |
| mosaicml/mpt-7b | 45.8±1.46 | 67.1±0.96 | 29.4±0.38 | 73.7±0.77 | 76.3±0.42 | 54.0±2.23 | 64.2±1.37 | 80.5±0.92 | 48.9±1.13 | 31.07±3.28 | 35.35±1.94 | 57.2 |
| openlm-research/open_llama_7b_v2 | 46.8±1.46 | 67.3±0.96 | 33.1±0.39 | 72.3±0.78 | 74.5±0.43 | 50.8±2.24 | 62.9±1.38 | 79.8±0.94 | 49.5±1.13 | 26.46±3.08 | 33.18±1.88 | 54.2 |
| common-pile/comma-v0.1-1t | 50.1±1.46 | 68.4±0.95 | 36.0±0.40 | 74.6±0.76 | 64.3±0.48 | 49.8±2.24 | 59.8±1.40 | 72.7±1.04 | 49.3±1.13 | 35.64±3.34 | 34.35±1.95 | 55.9 |
| Qwen/Qwen3-8B | 59.7±1.43 | 80.6±0.81 | 73.0±0.35 | 86.6±0.60 | 74.9±0.43 | 52.0±2.24 | 68.1±1.33 | 77.8±0.97 | 51.1±1.13 | 94.26±1.73 | 61.35±2.10 | 68.6 |
Last active
October 13, 2025 12:59
-
-
Save baberabb/cffd7cd69362fe6d42ef5a0218347cfd to your computer and use it in GitHub Desktop.
Comma
| Model | ARC-C | ARC-E | MMLU | BoolQ | HS | OBQA | CSQA | PIQA | SIQA | HEval | MBPP | Avg |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| allenai/OLMo-7B-Twin-2T-hf | 46.3±1.46 | 65.3±0.98 | 26.0±0.37 | 69.8±0.80 | 74.1±0.44 | 53.4±2.23 | 61.8±1.39 | 79.3±0.94 | 48.5±1.13 | 19.9±2.9 | 23.9±1.71 | 51.7 |
| meta-llama/Llama-2-7b-hf | 50.0±1.46 | 69.3±0.95 | 41.8±0.41 | 77.7±0.73 | 76.0±0.43 | 57.2±2.21 | 62.7±1.38 | 78.8±0.95 | 49.5±1.13 | 26.4±3.11 | 31.1±1.86 | 56.4 |
| common-pile/comma-v0.1-2t | 51.0±1.46 | 70.6±0.94 | 46.1±0.41 | 79.0±0.71 | 67.8±0.47 | 56.0±2.22 | 64.2±1.37 | 73.0±1.04 | 51.0±1.13 | 44.9±3.43 | 40.7±2.0 | 58.6 |
| deepseek-ai/deepseek-llm-7b-base | 49.6±1.46 | 68.8±0.95 | 44.2±0.41 | 72.4±0.78 | 76.2±0.43 | 58.0±2.21 | 66.8±1.35 | 79.7±0.94 | 50.9±1.13 | 46.9±3.46 | 43.2±2.01 | 59.7 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Grouped bar chart comparing 7B-class base models across 11 benchmarks.

Scores are percentages with standard errors drawn as error bars. A gold
star marks each benchmark where Comma is at least as good as every other
open-weights baseline; Qwen3 is rendered as a hatched outline and excluded
from that comparison. Output: model_comparison_plot.{png,pdf}.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Patch

# -------------------------------------------------------------
# Data
# -------------------------------------------------------------
benchmarks = [
    "ARC-C",
    "ARC-E",
    "MMLU",
    "BoolQ",
    "HSwag",
    "OBQA",
    "CSQA",
    "PIQA",
    "SIQA",  # Knowledge / Reasoning
    "HumEval",
    "MBPP",  # Coding
]
models = ["Comma", "LLaMA", "MPT", "RPJ-INCITE", "Qwen3"]
scores = {
    "Comma": [50.1, 68.4, 36.0, 74.6, 64.3, 49.8, 59.8, 72.7, 49.3, 35.64, 34.35],
    "LLaMA": [48.2, 68.2, 32.2, 75.0, 76.2, 53.6, 61.8, 79.3, 49.0, 17.07, 28.09],
    "MPT": [45.8, 67.1, 29.4, 73.7, 76.3, 54.0, 64.2, 80.5, 48.9, 31.07, 35.35],
    "RPJ-INCITE": [44.7, 66.8, 26.4, 70.9, 70.3, 50.2, 57.7, 77.0, 46.4, 10.3, 17.16],
    "Qwen3": [59.7, 80.6, 73.0, 86.6, 74.9, 52.0, 68.1, 77.8, 51.1, 94.26, 61.35],
}
df = pd.DataFrame(scores, index=benchmarks)

# Standard errors, same benchmark order as `scores`.
yerr_data = {
    "Comma": [1.46, 0.95, 0.40, 0.76, 0.48, 2.24, 1.40, 1.04, 1.13, 3.34, 1.95],
    "LLaMA": [1.46, 0.96, 0.39, 0.76, 0.42, 2.23, 1.39, 0.95, 1.13, 2.66, 1.82],
    "MPT": [1.46, 0.96, 0.38, 0.77, 0.42, 2.23, 1.37, 0.92, 1.13, 3.28, 1.94],
    "RPJ-INCITE": [1.45, 0.97, 0.37, 0.79, 0.46, 2.24, 1.41, 0.98, 1.13, 2.2, 1.48],
    "Qwen3": [1.43, 0.81, 0.35, 0.60, 0.43, 2.24, 1.33, 0.97, 1.13, 1.73, 2.1],
}
yerr_df = pd.DataFrame(yerr_data, index=benchmarks)

# -------------------------------------------------------------
# X positions with a gap between SIQA and HumEval
# -------------------------------------------------------------
base_x = np.arange(len(benchmarks), dtype=float)
gap_start = 9  # first "Coding" benchmark (HumEval)
base_x[gap_start:] += 1.0

# -------------------------------------------------------------
# Plot
# -------------------------------------------------------------
fig, ax = plt.subplots(figsize=(11, 4))
bar_w = 0.15
colours = {
    "Comma": "gold",
    "LLaMA": "hotpink",
    "MPT": "mediumpurple",
    "RPJ-INCITE": "cornflowerblue",
    "Qwen3": "none",  # outline-only bar, distinguished by hatching
}
hatch_for_qwen = "///"

for i, model in enumerate(models):
    # Center the group of 5 bars on each tick: offsets -2..+2 times bar_w.
    offsets = base_x + (i - 2) * bar_w
    values = df[model].values
    # Look errors up by model NAME: indexing a positional transpose
    # (yerr_df.values.T[i]) would silently mis-assign error bars if the
    # dict insertion order ever diverged from `models`.
    errors = yerr_df[model].values
    # Defensive: replace any missing entries with 0 so plotting never fails.
    plot_values = np.where(pd.isna(values), 0, values)
    plot_errors = np.where(pd.isna(errors), 0, errors)
    ax.bar(
        offsets,
        plot_values,
        width=bar_w,
        label=model,
        color=colours[model],
        edgecolor="dimgray",
        linewidth=1.0,
        yerr=plot_errors,
        capsize=3,
        hatch=hatch_for_qwen if model == "Qwen3" else None,
        error_kw=dict(elinewidth=1, capthick=1, ecolor="dimgray"),
    )

# -------------------------------------------------------------
# Axis and labels
# -------------------------------------------------------------
ax.set_ylabel("Performance")
ax.set_ylim(0, 100)
ax.set_xticks(base_x)
ax.set_xticklabels(benchmarks, rotation=45, ha="right")

# Horizontal dotted guide lines for easy comparison.
for y in [20, 40, 60, 80]:
    ax.axhline(
        y=y, color="lightgrey", linestyle="--", linewidth=0.8, alpha=0.3, zorder=0
    )

# Gold star above Comma's bar wherever it is top-scoring (excluding Qwen3,
# which is only a closed-data reference model).
for i in range(len(benchmarks)):
    comma_score = df["Comma"].iloc[i]
    if pd.isna(comma_score) or comma_score == 0:
        continue
    other_scores = [
        df[model].iloc[i]
        for model in ["LLaMA", "MPT", "RPJ-INCITE"]
        if not pd.isna(df[model].iloc[i]) and df[model].iloc[i] > 0
    ]
    if other_scores and comma_score >= max(other_scores):
        x_pos = base_x[i] + (0 - 2) * bar_w  # Comma is index 0 in `models`
        comma_error = yerr_df["Comma"].iloc[i]
        if not pd.isna(comma_error):
            # Place the star just above the top of the error bar.
            star_y_pos = comma_score + comma_error + 4
            ax.scatter(
                x_pos,
                star_y_pos,
                marker="*",
                color="gold",
                s=80,
                edgecolor="darkgoldenrod",
                linewidth=0.5,
                zorder=10,
            )

# Section labels under the tick labels.
ax.text(base_x[4], -27, "Knowledge / Reasoning", ha="center", va="top", fontsize=11)
mid_coding = (base_x[9] + base_x[10]) / 2
ax.text(mid_coding, -27, "Coding", ha="center", va="top", fontsize=11)

# -------------------------------------------------------------
# Legend
# -------------------------------------------------------------
legend_patches = [
    Patch(
        facecolor=colours[model],
        edgecolor="dimgray",
        hatch=hatch_for_qwen if model == "Qwen3" else None,
        label=model,
    )
    for model in models
]
ax.legend(
    handles=legend_patches,
    ncol=5,
    frameon=False,
    loc="upper center",
    bbox_to_anchor=(0.5, 1.12),
)

# -------------------------------------------------------------
# Clean up
# -------------------------------------------------------------
ax.spines[["right", "top"]].set_visible(False)
fig.tight_layout()
fig.subplots_adjust(bottom=0.32)  # room for rotated ticks + section labels

# Save the plot
plt.savefig("model_comparison_plot.png", dpi=300, bbox_inches="tight")
plt.savefig("model_comparison_plot.pdf", bbox_inches="tight")
print("Plot saved as 'model_comparison_plot.png' and 'model_comparison_plot.pdf'")
plt.show()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Grouped bar chart comparing 2T-token-class base models across 11 benchmarks.

Scores are percentages with standard errors drawn as error bars. A gold
star marks each benchmark where Comma-2T is at least as good as every other
open-weights baseline; Qwen3 is rendered as a hatched outline and excluded
from that comparison. Output: model_comparison_plot_v2.{png,pdf}.

BUG FIX vs. the previous revision: error bars were fetched positionally
(`yerr_df.values.T[i]` indexed by the `models` list) while the `yerr_data`
dict was keyed in a different order (Comma-2T, LLaMA-2, DeepSeek,
OLMo-Twin, Qwen3). That assigned LLaMA-2's errors to OLMo-Twin's bars,
DeepSeek's to LLaMA-2's, and OLMo-Twin's to DeepSeek's. Errors are now
looked up by model name, which cannot go out of sync.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Patch

# -------------------------------------------------------------
# Data
# -------------------------------------------------------------
benchmarks = [
    "ARC-C",
    "ARC-E",
    "MMLU",
    "BoolQ",
    "HSwag",
    "OBQA",
    "CSQA",
    "PIQA",
    "SIQA",  # Knowledge / Reasoning
    "HumEval",
    "MBPP",  # Coding
]
models = ["Comma-2T", "OLMo-Twin", "LLaMA-2", "DeepSeek", "Qwen3"]
scores = {
    "Comma-2T": [51.0, 70.6, 46.1, 79.0, 67.8, 56.0, 64.2, 73.0, 51.0, 44.92, 40.7],
    "LLaMA-2": [50.0, 69.3, 41.8, 77.7, 76.0, 57.2, 62.7, 78.8, 49.5, 26.4, 31.11],
    "DeepSeek": [49.6, 68.8, 44.2, 72.4, 76.2, 58.0, 66.8, 79.7, 50.9, 46.86, 43.21],
    "OLMo-Twin": [46.3, 65.3, 26.0, 69.8, 74.1, 53.4, 61.8, 79.3, 48.5, 19.92, 23.9],
    "Qwen3": [59.7, 80.6, 73.0, 86.6, 74.9, 52.0, 68.1, 77.8, 51.1, 94.26, 61.35],
}
df = pd.DataFrame(scores, index=benchmarks)

# Standard errors, same benchmark order as `scores`.
yerr_data = {
    "Comma-2T": [1.46, 0.94, 0.41, 0.71, 0.47, 2.22, 1.37, 1.04, 1.13, 3.43, 2.0],
    "LLaMA-2": [1.46, 0.95, 0.41, 0.73, 0.43, 2.21, 1.38, 0.95, 1.13, 3.11, 1.86],
    "DeepSeek": [1.46, 0.95, 0.41, 0.78, 0.43, 2.21, 1.35, 0.94, 1.13, 3.46, 2.01],
    "OLMo-Twin": [1.46, 0.98, 0.37, 0.80, 0.44, 2.23, 1.39, 0.94, 1.13, 2.9, 1.71],
    "Qwen3": [1.43, 0.81, 0.35, 0.60, 0.43, 2.24, 1.33, 0.97, 1.13, 1.73, 2.1],
}
yerr_df = pd.DataFrame(yerr_data, index=benchmarks)

# -------------------------------------------------------------
# X positions with a gap between SIQA and HumEval
# -------------------------------------------------------------
base_x = np.arange(len(benchmarks), dtype=float)
gap_start = 9  # first "Coding" benchmark (HumEval)
base_x[gap_start:] += 1.0

# -------------------------------------------------------------
# Plot
# -------------------------------------------------------------
fig, ax = plt.subplots(figsize=(11, 4))
bar_w = 0.15
colours = {
    "Comma-2T": "#E6B800",  # golden/mustard yellow (like in screenshot)
    "OLMo-Twin": "#CC5869",  # muted red/pink (like in screenshot)
    "LLaMA-2": "#C4A5C9",  # light purple/mauve (like in screenshot)
    "DeepSeek": "#6B9080",  # sage green (like in screenshot)
    "Qwen3": "none",  # outline-only bar, distinguished by hatching
}
hatch_for_qwen = "///"

for i, model in enumerate(models):
    # Center the group of 5 bars on each tick: offsets -2..+2 times bar_w.
    offsets = base_x + (i - 2) * bar_w
    values = df[model].values
    # Name-based lookup — the positional form silently swapped error bars
    # because `models` and the `yerr_data` key order differ (see module
    # docstring).
    errors = yerr_df[model].values
    # Defensive: replace any missing entries with 0 so plotting never fails.
    plot_values = np.where(pd.isna(values), 0, values)
    plot_errors = np.where(pd.isna(errors), 0, errors)
    ax.bar(
        offsets,
        plot_values,
        width=bar_w,
        label=model,
        color=colours[model],
        edgecolor="dimgray",
        linewidth=1.0,
        yerr=plot_errors,
        capsize=3,
        hatch=hatch_for_qwen if model == "Qwen3" else None,
        error_kw=dict(elinewidth=1, capthick=1, ecolor="dimgray"),
    )

# -------------------------------------------------------------
# Axis and labels
# -------------------------------------------------------------
ax.set_ylabel("Performance")
ax.set_ylim(0, 100)
ax.set_xticks(base_x)
ax.set_xticklabels(benchmarks, rotation=45, ha="right")

# Horizontal dotted guide lines for easy comparison.
for y in [20, 40, 60, 80]:
    ax.axhline(
        y=y, color="lightgrey", linestyle="--", linewidth=0.8, alpha=0.3, zorder=0
    )

# Gold star above Comma-2T's bar wherever it is top-scoring (excluding
# Qwen3, which is only a closed-data reference model).
for i in range(len(benchmarks)):
    comma_score = df["Comma-2T"].iloc[i]
    if pd.isna(comma_score) or comma_score == 0:
        continue
    other_scores = [
        df[model].iloc[i]
        for model in ["OLMo-Twin", "LLaMA-2", "DeepSeek"]
        if not pd.isna(df[model].iloc[i]) and df[model].iloc[i] > 0
    ]
    if other_scores and comma_score >= max(other_scores):
        x_pos = base_x[i] + (0 - 2) * bar_w  # Comma-2T is index 0 in `models`
        comma_error = yerr_df["Comma-2T"].iloc[i]
        if not pd.isna(comma_error):
            # Place the star just above the top of the error bar.
            star_y_pos = comma_score + comma_error + 4
            ax.scatter(
                x_pos,
                star_y_pos,
                marker="*",
                color="gold",
                s=80,
                edgecolor="darkgoldenrod",
                linewidth=0.5,
                zorder=10,
            )

# Section labels under the tick labels.
ax.text(base_x[4], -27, "Knowledge / Reasoning", ha="center", va="top", fontsize=11)
mid_coding = (base_x[9] + base_x[10]) / 2
ax.text(mid_coding, -27, "Coding", ha="center", va="top", fontsize=11)

# -------------------------------------------------------------
# Legend (display labels differ from the dict keys, so listed explicitly)
# -------------------------------------------------------------
legend_patches = [
    Patch(facecolor="#E6B800", edgecolor="dimgray", label="Comma v0.1-2T"),
    Patch(facecolor="#CC5869", edgecolor="dimgray", label="OLMo Twin"),
    Patch(facecolor="#C4A5C9", edgecolor="dimgray", label="LLaMA 2"),
    Patch(facecolor="#6B9080", edgecolor="dimgray", label="DeepSeekLLM"),
    Patch(facecolor="none", edgecolor="dimgray", hatch=hatch_for_qwen, label="Qwen3"),
]
ax.legend(
    handles=legend_patches,
    ncol=5,
    frameon=False,
    loc="upper center",
    bbox_to_anchor=(0.5, 1.12),
)

# -------------------------------------------------------------
# Clean up
# -------------------------------------------------------------
ax.spines[["right", "top"]].set_visible(False)
fig.tight_layout()
fig.subplots_adjust(bottom=0.32)  # room for rotated ticks + section labels

# Save the plot
plt.savefig("model_comparison_plot_v2.png", dpi=300, bbox_inches="tight")
plt.savefig("model_comparison_plot_v2.pdf", bbox_inches="tight")
print("Plot saved as 'model_comparison_plot_v2.png' and 'model_comparison_plot_v2.pdf'")
plt.show()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
(harness defaults)