Benchmark suite for evaluating Ollama LLM performance

Measurement environment

  • Machine: EVO-X2
  • CPU: AMD Ryzen AI Max+ 395
  • OS: Windows 11 Pro 24H2
  • RAM: 128 GB (minus the VRAM allocation below)

Backends

  • Prompt1 / Eval1: Ollama 0.12.6 (VRAM 64GB, ROCm)
  • Prompt2 / Eval2: Ollama 0.12.8 (VRAM 96GB, Vulkan)

"---" はモデルロード不可

| Model | Parameters | Size (GB) | Prompt1 | Eval1 | Prompt2 | Eval2 |
|-------|-----------:|----------:|--------:|------:|--------:|------:|
| mixtral:8x22b | 140.6B | 74.06 | --- | --- | 136.55 | 9.02 |
| gpt-oss-120b-GGUF | 117B | 60.88 | 279.10 | 33.09 | 383.36 | 25.41 |
| gpt-oss:120b | 116.8B | 60.88 | 278.21 | 31.58 | 333.84 | 23.37 |
| llama4:scout | 108.6B | 62.81 | 35.22 | 13.31 | 154.36 | 14.72 |
| Llama-4-Scout-17B-16E-Instruct-GGUF:Q2_K_XL | 108B | 39.46 | 61.01 | 16.70 | 152.67 | 16.74 |
| command-r-plus:104b-08-2024-q4_K_M | 103.8B | 58.44 | --- | --- | --- | --- |
| Llama-3.3-70B-Instruct-GGUF:Q2_K | 70.6B | 24.56 | 23.37 | 5.48 | 74.86 | 6.61 |
| llama3.3 | 70.6B | 39.60 | 23.89 | 4.12 | 65.80 | 4.85 |
| Tanuki-8x8B-dpo-v1.0-GGUF:IQ2_XXS | 47B | 11.82 | 1033.08 | 21.94 | 361.55 | 39.42 |
| Tanuki-8x8B-dpo-v1.0-GGUF | 47B | 26.67 | 1007.25 | 20.05 | 285.75 | 24.55 |
| mixtral:8x7b | 46.7B | 24.63 | 98.94 | 24.62 | 467.39 | 25.94 |
| Qwen3-32B-GGUF:IQ3_XXS | 32.8B | 12.17 | 51.25 | 11.22 | 159.62 | 11.66 |
| qwen3:32b | 32.8B | 18.81 | 50.36 | 9.37 | 136.36 | 9.56 |
| Qwen_QwQ-32B-GGUF:IQ3_XXS | 32.8B | 11.96 | 54.83 | 10.11 | 183.05 | 12.64 |
| qwq-bakeneko-32b-gguf:IQ3_XXS | 32.8B | 11.96 | 55.03 | 10.46 | 192.39 | 12.84 |
| aya-expanse-32b-GGUF:IQ2_S | 32.3B | 10.05 | 58.27 | 11.19 | 242.39 | 15.12 |
| aya-expanse:32b | 32.3B | 18.44 | 59.06 | 8.52 | 158.10 | 10.28 |
| command-r:35b-08-2024-q4_K_M | 32.3B | 18.44 | 51.00 | 8.62 | 148.19 | 10.38 |
| Qwen3-30B-A3B-GGUF:IQ3_XXS | 30.5B | 12.01 | 307.27 | 60.07 | 808.98 | 49.94 |
| qwen3:30b | 30.5B | 17.28 | 296.41 | 54.38 | 805.41 | 53.24 |
| gemma3:27b | 27.4B | 16.20 | 62.42 | 10.39 | 174.69 | 10.85 |
| gemma-2-27b-it-gguf:IQ3_S | 27.2B | 11.33 | 64.16 | 9.42 | 225.27 | 12.96 |
| gemma2:27b-instruct-i1-iq2_S | 27.2B | 8.06 | 66.27 | 12.18 | 261.48 | 16.50 |
| gemma2:27b-instruct-iq2_S | 27.2B | 8.06 | 67.50 | 12.39 | 261.74 | 16.12 |
| gemma2:27b-instruct-q2_K | 27.2B | 9.73 | 65.36 | 13.09 | 217.94 | 14.42 |
| gemma-3-27b-it-iMat-GGUF:IQ3_S | 27B | 11.33 | 64.25 | 9.76 | 228.14 | 12.32 |
| mistral-small3.2 | 24.0B | 14.14 | 70.92 | 11.92 | --- | --- |
| Mistral-Small-24B-Instruct-2501-GGUF:IQ4_XS | 23.6B | 11.88 | 74.36 | 15.27 | 250.59 | 9.29 |
| Mistral-Small-3.1-24B-Instruct-2503-HF-gguf:IQ4_XS | 23.6B | 11.88 | 76.51 | 15.28 | 251.51 | 9.33 |
| mistral-small:22b-instruct-2409-iq3_M | 22.2B | 9.37 | 75.99 | 11.87 | 249.41 | 16.59 |
| gpt-oss-20b-GGUF | 20.9B | 10.83 | 436.95 | 65.47 | 1327.90 | 65.26 |
| gpt-oss:20b | 20.9B | 12.85 | 433.78 | 42.81 | 1026.95 | 39.98 |
| deepseek-r1:14b | 14.8B | 8.37 | 132.13 | 17.97 | 356.10 | 21.55 |
| qwen3:14b | 14.8B | 8.64 | 122.04 | 19.78 | 330.08 | 20.19 |
| AXCXEPT-phi-4-open-R1-Distill-EZOv1-gguf | 14.7B | 8.43 | 133.96 | 19.30 | 362.92 | 21.77 |
| microsoft_Phi-4-reasoning-GGUF | 14.7B | 8.43 | 132.19 | 18.08 | 335.90 | 21.23 |
| phi4 | 14.7B | 8.43 | 131.99 | 19.43 | 344.79 | 21.98 |
| gemma3:12b | 12.2B | 7.59 | 147.46 | 21.65 | 425.70 | 22.70 |
| mistral-nemo:12b-instruct-2407-q4_0 | 12.2B | 6.59 | 173.61 | 25.29 | 753.14 | 26.71 |
| mistral-nemo:12b-instruct-2407-q4_K_M | 12.2B | 6.96 | 170.87 | 21.31 | 453.40 | 25.65 |
| mistral-nemo:12b-instruct-2407-q4_K_S | 12.2B | 6.63 | 173.63 | 22.32 | 461.43 | 26.44 |
| amoral-gemma3-12B-gguf:Q4_K_M | 11.8B | 7.59 | 165.39 | 21.94 | 464.72 | 22.49 |
| llama3.2-vision | 10.7B | 7.28 | 244.99 | 30.69 | --- | --- |
| ezo-common-9b-gemma-2-it:q8_0 | 9.2B | 9.15 | 219.96 | 18.09 | 817.42 | 18.92 |
| ezo-common-gemma-2:9b-instruct-q4_K_M | 9.2B | 5.37 | 222.14 | 23.75 | 597.84 | 28.47 |
| gemma2:9b-instruct-q4_0 | 9.2B | 5.07 | 222.44 | 27.06 | 864.28 | 28.48 |
| gemma2:9b-instruct-q4_K_M | 9.2B | 5.37 | 226.01 | 23.55 | 588.58 | 28.39 |
| gemma2:9b-instruct-q4_K_S | 9.2B | 5.10 | 221.88 | 24.24 | 595.73 | 29.18 |
| Babel-9B-Chat-GGUF | 9.01B | 4.66 | 219.00 | 36.13 | 649.08 | 25.87 |
| qwen3:8b | 8.2B | 4.87 | 272.54 | 32.07 | 666.73 | 31.91 |
| aya-expanse:8b | 8.0B | 4.71 | 273.30 | 29.10 | 733.15 | 34.89 |
| command-r7b | 8.0B | 4.71 | 259.13 | 28.41 | 756.12 | 34.69 |
| llama3.1:8b-instruct-q4_0 | 8.0B | 4.34 | 268.00 | 36.35 | 1090.54 | 40.25 |
| llama3.1:8b-instruct-q4_K_M | 8.0B | 4.58 | 265.87 | 30.94 | 731.36 | 38.75 |
| llama3.1:8b-instruct-q4_K_S | 8.0B | 4.37 | 261.26 | 31.69 | 735.11 | 39.52 |
| llama3:8b-instruct-q4_K_M | 8.0B | 4.58 | 263.18 | 31.18 | 718.79 | 38.39 |
| llama-translate:8b-q4_K_M | 8.0B | 5.86 | 251.95 | 28.74 | 657.54 | 35.09 |
| tanuki-dpo-v1.0:8b-iq3_XXS | 7.5B | 2.77 | 267.49 | 37.18 | 760.86 | 46.15 |
| tanuki-dpo-v1.0:8b-q4_K_S | 7.5B | 4.03 | 283.28 | 31.32 | 662.55 | 40.35 |
| tanuki-dpo-v1.0:8b-q6_K | 7.5B | 5.74 | 268.29 | 26.78 | 596.20 | 30.45 |
| OLMo-2-1124-7B-Instruct-GGUF | 7.3B | 4.16 | 266.32 | 25.39 | 740.06 | 28.37 |
| gemma3n:e4b | 6.9B | 7.03 | 347.79 | 41.30 | --- | --- |
| gemma3n:e2b | 4.5B | 5.24 | 654.02 | 59.97 | --- | --- |
| gemma3:4b | 4.3B | 3.11 | 469.07 | 57.39 | 1576.27 | 57.41 |
| qwen3:4b | 4.0B | 2.44 | 399.86 | 54.46 | 1190.10 | 50.97 |
| amoral-gemma3-4B-gguf:q8_0 | 3.88B | 4.64 | 503.49 | 42.45 | 1885.15 | 41.71 |
| borea-phi-3.5-coding:3.8b-mini-instruct-q6_K | 3.8B | 2.92 | 412.04 | 39.43 | 989.82 | 40.24 |
| borea-phi-3.5-common:3.8b-mini-instruct-q6_K | 3.8B | 2.92 | 443.00 | 38.94 | 994.18 | 39.54 |
| borea-phi-3.5-jp:3.8b-mini-instruct-q6_K | 3.8B | 2.92 | 437.24 | 39.93 | 988.04 | 40.18 |
| llm-jp-3-ezo-humanities:3.7b-instruct-q8_0 | 3.8B | 3.75 | 486.63 | 37.96 | 1780.44 | 39.88 |
| phi3.5:3.8b-mini-instruct-q6_K | 3.8B | 2.92 | 421.08 | 39.74 | 1030.63 | 40.17 |
| phi4-mini:3.8b-q8_0 | 3.8B | 3.80 | 456.68 | 39.59 | 1905.50 | 42.35 |
| phi4-mini | 3.8B | 2.32 | 511.26 | 50.78 | 1444.25 | 60.52 |
| llm-jp-3-3.7b-instruct3-gguf | 3.78B | 2.17 | 497.32 | 50.92 | 1364.63 | 58.27 |
| sarashina2.2-3b-instruct-v0.1-gguf:q8_0 | 3.36B | 3.32 | 695.17 | 47.11 | 2134.66 | 51.20 |
| ezo-common-t2-gemma-2:2b-instruct-q8_0 | 2.6B | 2.59 | 815.97 | 56.37 | 2936.00 | 59.55 |
| ezo-gemma-2-jpn:2b-instruct-q8_0 | 2.6B | 2.59 | 827.44 | 55.60 | 2944.20 | 58.90 |
| gemma-2-2b-jpn-it:q8_0 | 2.6B | 2.60 | 778.00 | 54.23 | 2476.48 | 57.77 |
| gemma2:2b | 2.6B | 1.52 | 847.66 | 74.47 | 3185.07 | 82.20 |
| gemma2:2b-instruct-q4_0 | 2.6B | 1.52 | 846.64 | 74.55 | 1090.90 | 80.34 |
| gemma2:2b-instruct-q4_K_M | 2.6B | 1.59 | 805.38 | 68.27 | 2256.01 | 78.31 |
| gemma2:2b-instruct-q4_K_S | 2.6B | 1.53 | 831.53 | 70.11 | 2256.96 | 80.68 |
| gemma2:2b-instruct-q8_0 | 2.6B | 2.59 | 788.02 | 55.53 | 2912.84 | 58.64 |
| gemma-2-baku-2b-it:q8_0 | 2.6B | 2.60 | 795.94 | 55.23 | 2887.30 | 57.95 |
| gemma-2-jpn-translate:2b-instruct-q8_0 | 2.6B | 3.11 | 930.29 | 46.69 | 2921.84 | 50.21 |
| qwen3:1.7b | 2.0B | 1.27 | 1232.53 | 95.57 | 3029.02 | 80.89 |
| llm-jp-3-1.8b-instruct3-gguf:q8_0 | 1.87B | 1.85 | 1048.41 | 65.70 | 3500.47 | 69.67 |
| TinySwallow-1.5B-Instruct-GGUF:Q5_K_M | 1.54B | 1.05 | 1316.01 | 99.72 | 3261.63 | 127.16 |
| TinySwallow-1.5B-Instruct-GGUF:q8_0 | 1.54B | 1.53 | 1381.20 | 85.94 | 4509.65 | 97.40 |
| gemma3:1b | 999.89M | 0.76 | 2327.49 | 140.29 | 5671.39 | 133.90 |
| qwen3:0.6b | 751.63M | 0.49 | 3519.04 | 167.84 | 5299.80 | 139.84 |
| qwen2.5:0.5b | 494.03M | 0.37 | 3906.63 | 233.77 | 9064.31 | 272.81 |
| qwen2:0.5b-instruct-q8_0 | 494.03M | 0.49 | 3888.29 | 202.93 | 11316.37 | 243.64 |
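The Prompt and Eval values come from the statistics Ollama attaches to each response: token count divided by duration, in tokens/s. A minimal sketch of a single measurement (the model name and prompt here are placeholders; the full benchmark script below uses a fixed long prompt and iterates over every installed model):

import ollama

# Stream one chat response; the final chunk carries the timing statistics.
chunk = None
for chunk in ollama.chat(model="gemma2:2b",
                         messages=[{"role": "user", "content": "Explain AI ethics briefly."}],
                         stream=True):
    pass

# Ollama reports durations in nanoseconds; the rates below are tokens per second.
prompt_rate = chunk.prompt_eval_count / (chunk.prompt_eval_duration / 1e9)
eval_rate = chunk.eval_count / (chunk.eval_duration / 1e9)
print(f"Prompt: {prompt_rate:.2f} tokens/s, Eval: {eval_rate:.2f} tokens/s")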
# Merge two per-backend evaluation reports (markdown tables produced by the
# benchmark script below) into a single table with Prompt1/Eval1 and
# Prompt2/Eval2 columns.
import argparse
import os
from typing import Dict, List, Tuple


def normalize_model_name(model: str) -> str:
    """
    Normalize model name by:
    - Removing ':latest' suffix
    - Taking basename if '/' is present
    """
    # Remove :latest suffix
    if model.endswith(':latest'):
        model = model[:-7]
    # Take basename if '/' is present
    if '/' in model:
        model = os.path.basename(model)
    return model


def parse_markdown_table(filepath: str) -> Tuple[List[str], List[str], Dict[str, Dict[str, str]]]:
    """
    Parse markdown table and return headers, model order, and data indexed by original model name.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    if len(lines) < 2:
        raise ValueError(f"Invalid markdown table in {filepath}")
    # Parse header
    headers = [h.strip() for h in lines[0].strip().split('|') if h.strip()]
    # Skip separator line (line 1)
    # Parse data rows
    data = {}
    model_order = []
    for line in lines[2:]:
        line = line.strip()
        if not line or line == '|':
            continue
        cols = [c.strip() for c in line.split('|') if c.strip()]
        if len(cols) != len(headers):
            continue
        row_data = dict(zip(headers, cols))
        model_name = row_data['Model']
        data[model_name] = row_data
        model_order.append(model_name)
    return headers, model_order, data


def merge_evals(file1: str, file2: str, output: str):
    """
    Merge two evaluation markdown files.
    """
    # Parse both files
    headers1, order1, data1 = parse_markdown_table(file1)
    headers2, order2, data2 = parse_markdown_table(file2)
    # Preserve order from file1, then add models only in file2
    all_models = order1.copy()
    for model in order2:
        if model not in data1:
            all_models.append(model)
    # Build merged table
    merged_lines = []
    # Header
    header = "| Model | Parameters | Size (GB) | Prompt1 | Eval1 | Prompt2 | Eval2 |"
    merged_lines.append(header)
    # Separator
    separator = "|-------|----------:|----------:|--------:|------:|--------:|------:|"
    merged_lines.append(separator)
    # Data rows
    for model in all_models:
        row1 = data1.get(model, {})
        row2 = data2.get(model, {})
        # Normalize model name for display only
        model_display = normalize_model_name(model)
        params = row1.get('Parameters', row2.get('Parameters', ''))
        size = row1.get('Size (GB)', row2.get('Size (GB)', ''))
        # Get Prompt and Eval from both files
        prompt1 = row1.get('Prompt', '')
        eval1 = row1.get('Eval', '')
        prompt2 = row2.get('Prompt', '')
        eval2 = row2.get('Eval', '')
        row = f"| {model_display} | {params} | {size} | {prompt1} | {eval1} | {prompt2} | {eval2} |"
        merged_lines.append(row)
    # Write output
    with open(output, 'w', encoding='utf-8') as f:
        f.write('\n'.join(merged_lines) + '\n')
    print(f"Merged {len(all_models)} models to {output}")


def main():
    parser = argparse.ArgumentParser(
        description='Merge two evaluation markdown files'
    )
    parser.add_argument(
        'file1',
        help='First markdown file'
    )
    parser.add_argument(
        'file2',
        help='Second markdown file'
    )
    parser.add_argument(
        '-o', '--output',
        required=True,
        help='Output markdown file'
    )
    args = parser.parse_args()
    merge_evals(args.file1, args.file2, args.output)


if __name__ == '__main__':
    main()
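A hypothetical invocation of the merge script above, assuming it is saved as merge.py and the two per-backend reports were renamed to evals_rocm.md and evals_vulkan.md (all three file names are assumptions):

python merge.py evals_rocm.md evals_vulkan.md -o evals_merged.md

Prompt1/Eval1 are taken from the first file and Prompt2/Eval2 from the second; models present in only one file keep their row, with the missing columns left blank.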
# Benchmark every installed Ollama model with a fixed long prompt, streaming the
# response and appending the raw timing statistics to evals.jsonl, then generate
# evals.md and evals.tsv reports.
import ollama
import json
import os
from datetime import timedelta


def create_stats(last_chunk):
    stats = {}
    if last_chunk.total_duration is not None:
        stats["total_duration"] = last_chunk.total_duration / 1e9
    if last_chunk.load_duration is not None:
        stats["load_duration"] = last_chunk.load_duration / 1e9
    if last_chunk.prompt_eval_count is not None:
        stats["prompt_eval_count"] = last_chunk.prompt_eval_count
    if last_chunk.prompt_eval_duration is not None:
        stats["prompt_eval_duration"] = last_chunk.prompt_eval_duration / 1e9
    if last_chunk.eval_count is not None:
        stats["eval_count"] = last_chunk.eval_count
    if last_chunk.eval_duration is not None:
        stats["eval_duration"] = last_chunk.eval_duration / 1e9
    return stats


def show_stats(stats):
    print("--- Statistics ---")
    if "total_duration" in stats:
        print("total duration:      ", timedelta(seconds=stats["total_duration"]))
    if "load_duration" in stats:
        print("load duration:       ", timedelta(seconds=stats["load_duration"]))
    if "prompt_eval_count" in stats:
        print("prompt eval count:   ", stats["prompt_eval_count"])
    if "prompt_eval_duration" in stats:
        print("prompt eval duration:", timedelta(seconds=stats["prompt_eval_duration"]))
    if "prompt_eval_count" in stats and "prompt_eval_duration" in stats:
        prompt_eval_rate = stats["prompt_eval_count"] / stats["prompt_eval_duration"]
        print("prompt eval rate:    ", f"{prompt_eval_rate:.2f} tokens/s")
    if "eval_count" in stats:
        print("eval count:          ", stats["eval_count"])
    if "eval_duration" in stats:
        print("eval duration:       ", timedelta(seconds=stats["eval_duration"]))
    if "eval_count" in stats and "eval_duration" in stats:
        eval_rate = stats["eval_count"] / stats["eval_duration"]
        print("eval rate:           ", f"{eval_rate:.2f} tokens/s")


def exec_stream(model, prompt):
    messages = [{"role": "user", "content": prompt}]
    stream = ollama.chat(model=model, messages=messages, stream=True)
    text = ""
    for chunk in stream:
        chunk_text = chunk['message']['content']
        text += chunk_text
        print(chunk_text, end='', flush=True)
    if not text.endswith("\n"):
        print()
    return text.rstrip(), create_stats(chunk)


STATS_FILE = "evals.jsonl"
PROMPT = """
You are tasked with writing a comprehensive analysis of the ethical implications and societal impacts of artificial intelligence in the next decade.
Please address the following points in your response:
1. **Employment and Labor Markets**: Discuss how AI automation might affect job markets across different sectors. Which industries are most vulnerable to disruption? What new types of jobs might emerge? How should societies prepare for these transitions?
2. **Privacy and Surveillance**: Analyze the tension between AI-powered services and individual privacy rights. What are the risks of pervasive data collection? How can we balance innovation with privacy protection?
3. **Bias and Fairness**: Examine how biases in training data and algorithms can perpetuate or amplify existing social inequalities. What strategies can be employed to create more equitable AI systems?
4. **Decision-Making and Accountability**: When AI systems make critical decisions (healthcare, criminal justice, finance), who should be held responsible for errors or harmful outcomes? How can we ensure transparency and explainability?
5. **Global Inequality**: Consider how AI development and deployment might affect the gap between developed and developing nations. What measures could promote more equitable access to AI benefits?
6. **Regulation and Governance**: What role should governments, corporations, and international organizations play in regulating AI? What are the challenges of governing a rapidly evolving technology?
Please provide a nuanced analysis that considers multiple perspectives, potential trade-offs, and concrete examples where relevant. Your response should be approximately 500-800 words.
""".strip()
# Load existing logs
processed_models = set()
if os.path.exists(STATS_FILE):
    with open(STATS_FILE, "r") as f:
        for line in f:
            if line.strip():
                record = json.loads(line)
                # Use name if model is dict, otherwise use as-is
                if isinstance(record["model"], dict):
                    processed_models.add(record["model"]["name"])
                else:
                    processed_models.add(record["model"])

models = ollama.list().models
total_models = len(models)
processed_count = len(processed_models)
print(f"Total models: {total_models}")
print(f"Already processed: {processed_count}")
print(f"Remaining: {total_models - processed_count}")
print()

for idx, model in enumerate(models, 1):
    model_name = model.model
    # Skip if already processed
    if model_name in processed_models:
        print(f"[{idx}/{total_models}] Skipping {model_name} (already processed)")
        continue
    print(f"[{idx}/{total_models}] Processing {model_name}")
    # Collect model information
    model_info = {
        "name": model_name,
    }
    if hasattr(model, 'size') and model.size is not None:
        model_info["size"] = model.size
    if hasattr(model, 'details') and model.details is not None:
        if hasattr(model.details, 'parameter_size') and model.details.parameter_size is not None:
            model_info["parameter_size"] = model.details.parameter_size
        if hasattr(model.details, 'family') and model.details.family is not None:
            model_info["family"] = model.details.family
    try:
        _, st = exec_stream(model_name, PROMPT)
        print()
        show_stats(st)
        # Append the record to evals.jsonl
        record = {
            "model": model_info,
            "stats": st
        }
        with open(STATS_FILE, "a") as f:
            f.write(json.dumps(record) + "\n")
        processed_count += 1
        print(f"Saved to {STATS_FILE}")
        print(f"Progress: {processed_count}/{total_models} models completed")
        print()
    except Exception as e:
        print(f"\nError: {e}")
        print(f"Skipping {model_name}")
        print()
# Report generation
def normalize_name(name):
    """Normalize model name"""
    # Get basename if it's a path
    if "/" in name:
        name = name.split("/")[-1]
    # Remove hyphens and convert to lowercase
    return name.replace("-", "").lower()


def parse_parameter_size(param_size):
    """Convert parameter size to float (e.g., "7B" -> 7.0, "751.63M" -> 0.75163)"""
    if param_size is None or param_size == "":
        return 0.0
    param_size = param_size.strip()
    # Check unit suffix
    if param_size.endswith("B"):
        # Billion
        return float(param_size.rstrip("B"))
    elif param_size.endswith("M"):
        # Million
        return float(param_size.rstrip("M")) / 1000.0
    else:
        # Other cases
        try:
            return float(param_size)
        except ValueError:
            return 0.0


def generate_report():
    """Generate report from evals.jsonl"""
    if not os.path.exists(STATS_FILE):
        print(f"Error: {STATS_FILE} not found")
        return
    print(f"Generating report from {STATS_FILE}...")
    records = []
    with open(STATS_FILE, "r") as f:
        for line in f:
            if not line.strip():
                continue
            try:
                record = json.loads(line)
                records.append(record)
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {e}")
    # Format data
    report_data = []
    for record in records:
        model = record.get("model", {})
        stats = record.get("stats", {})
        # Get model information
        if isinstance(model, dict):
            name = model.get("name", "")
            parameter_size = model.get("parameter_size", "")
            size = model.get("size", 0)
        else:
            # Legacy format (string)
            name = model
            parameter_size = ""
            size = 0
        # Convert size to GB
        size_gb = size / (1024 ** 3) if size else 0
        # Get rates
        prompt_eval_rate = 0.0
        if "prompt_eval_count" in stats and "prompt_eval_duration" in stats:
            if stats["prompt_eval_duration"] > 0:
                prompt_eval_rate = stats["prompt_eval_count"] / stats["prompt_eval_duration"]
        eval_rate = 0.0
        if "eval_count" in stats and "eval_duration" in stats:
            if stats["eval_duration"] > 0:
                eval_rate = stats["eval_count"] / stats["eval_duration"]
        # Exclude records with both rates as N/A (both are 0)
        if prompt_eval_rate > 0 or eval_rate > 0:
            report_data.append({
                "name": name,
                "parameter_size": parameter_size,
                "parameter_size_num": parse_parameter_size(parameter_size),
                "normalized_name": normalize_name(name),
                "size_gb": size_gb,
                "prompt_eval_rate": prompt_eval_rate,
                "eval_rate": eval_rate,
            })
    # Sort: parameter_size descending, then normalized_name ascending
    report_data.sort(key=lambda x: (-x["parameter_size_num"], x["normalized_name"]))
    # Generate Markdown report
    report_file_md = "evals.md"
    with open(report_file_md, "w") as f:
        f.write("| Model | Parameters | Size (GB) | Prompt | Eval |\n")
        f.write("|-------|-----------:|----------:|-------:|-----:|\n")
        for data in report_data:
            name = data["name"]
            param_size = data["parameter_size"] if data["parameter_size"] else "N/A"
            size_gb = f"{data['size_gb']:.2f}" if data['size_gb'] > 0 else "N/A"
            prompt_rate = f"{data['prompt_eval_rate']:.2f}" if data['prompt_eval_rate'] > 0 else "N/A"
            eval_rate = f"{data['eval_rate']:.2f}" if data['eval_rate'] > 0 else "N/A"
            f.write(f"| {name} | {param_size} | {size_gb} | {prompt_rate} | {eval_rate} |\n")
    print(f"Report generated: {report_file_md}")
    # Generate TSV report
    report_file_tsv = "evals.tsv"
    with open(report_file_tsv, "w") as f:
        # Header row
        f.write("Model\tParameters\tSize (GB)\tPrompt\tEval\n")
        for data in report_data:
            name = data["name"]
            param_size = data["parameter_size"] if data["parameter_size"] else "N/A"
            size_gb = f"{data['size_gb']:.2f}" if data['size_gb'] > 0 else "N/A"
            prompt_rate = f"{data['prompt_eval_rate']:.2f}" if data['prompt_eval_rate'] > 0 else "N/A"
            eval_rate = f"{data['eval_rate']:.2f}" if data['eval_rate'] > 0 else "N/A"
            f.write(f"{name}\t{param_size}\t{size_gb}\t{prompt_rate}\t{eval_rate}\n")
    print(f"Report generated: {report_file_tsv}")


# Generate report after main loop
print("\nAll models processed. Generating report...")
generate_report()
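For reference, each line appended to evals.jsonl has the shape below (the field values are placeholders, not real measurements; size, parameter_size, and family are included only when Ollama reports them). Durations are stored in seconds, so generate_report() recovers tokens/s as count / duration.

{"model": {"name": "gemma2:2b", "size": 1629518495, "parameter_size": "2.6B", "family": "gemma2"},
 "stats": {"total_duration": 12.3, "load_duration": 0.8, "prompt_eval_count": 372,
           "prompt_eval_duration": 0.44, "eval_count": 650, "eval_duration": 8.7}}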