pruksmhc/gist:a316805d864324c4fa94cdb492d0b562

## gistfile1.txt
{
  "results": {
    "mmstar": {
      "alias": "mmstar",
      "coarse perception,none": 0.7502214478878857,
      "coarse perception_stderr,none": "N/A",
      "average,none": 0.6282898425011137,
      "average_stderr,none": "N/A",
      "fine-grained perception,none": 0.5771677720461803,
      "fine-grained perception_stderr,none": "N/A",
      "instance reasoning,none": 0.7059851871377787,
      "instance reasoning_stderr,none": "N/A",
      "logical reasoning,none": 0.6201974043558202,
      "logical reasoning_stderr,none": "N/A",
      "math,none": 0.6405762719415486,
      "math_stderr,none": "N/A",
      "science & technology,none": 0.47559097163746894,
      "science & technology_stderr,none": "N/A"
    }
  },
  "group_subtasks": {
    "mmstar": []
  },
  "configs": {
    "mmstar": {
      "task": "mmstar",
      "dataset_path": "Lin-Chen/MMStar",
      "dataset_kwargs": {
        "token": true
      },
      "test_split": "val",
      "full_docs": false,
      "process_results_use_image": false,
      "doc_to_visual": "<function mmstar_doc_to_visual at 0x742714d83240>",
      "doc_to_text": "<function mmstar_doc_to_text at 0x742714d837e0>",
      "doc_to_target": "answer",
      "process_results": "<function mmstar_process_results at 0x742714d83d80>",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "coarse perception",
          "aggregation": "<function mmstar_aggregate_results at 0x7427144a82c0>",
          "higher_is_better": true
        },
        {
          "metric": "fine-grained perception",
          "aggregation": "<function mmstar_aggregate_results at 0x7427144a8720>",
          "higher_is_better": true
        },
        {
          "metric": "instance reasoning",
          "aggregation": "<function mmstar_aggregate_results at 0x7427144a8b80>",
          "higher_is_better": true
        },
        {
          "metric": "logical reasoning",
          "aggregation": "<function mmstar_aggregate_results at 0x7427144a8fe0>",
          "higher_is_better": true
        },
        {
          "metric": "science & technology",
          "aggregation": "<function mmstar_aggregate_results at 0x7427144a9440>",
          "higher_is_better": true
        },
        {
          "metric": "math",
          "aggregation": "<function mmstar_aggregate_results at 0x7427144a98a0>",
          "higher_is_better": true
        },
        {
          "metric": "average",
          "aggregation": "<function mmstar_aggregate_results at 0x7427144a9d00>",
          "higher_is_better": true
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "until": [
          "\n\n"
        ],
        "do_sample": false
      },
      "repeats": 1,
      "should_decontaminate": false,
      "metadata": [
        {
          "version": 0.0
        }
      ],
      "lmms_eval_specific_kwargs": {
        "default": {
          "pre_prompt": "",
          "post_prompt": "\nAnswer with the option's letter from the given choices directly"
        },
        "pre_prompt": "",
        "post_prompt": "\nAnswer with the option's letter from the given choices directly"
      }
    }
  },
  "versions": {
    "mmstar": "Yaml"
  },
  "n-shot": {
    "mmstar": 0
  },
  "higher_is_better": {
    "mmstar": {
      "coarse perception": true,
      "fine-grained perception": true,
      "instance reasoning": true,
      "logical reasoning": true,
      "science & technology": true,
      "math": true,
      "average": true
    }
  },
  "n-samples": {
    "mmstar": {
      "original": 1500,
      "effective": 1500
    }
  },
  "config": {
    "model": "qwen2_5_vl",
    "model_args": "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_pixels=12845056,attn_implementation=flash_attention_2,interleave_visuals=False",
    "batch_size": "32",
    "batch_sizes": [],
    "device": "cuda:0",
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": "",
    "random_seed": 0,
    "numpy_seed": 1234,
    "torch_seed": 1234,
    "fewshot_seed": 1234
  },
  "git_hash": "8895505",
  "date": "20250730_091239",
  "task_hashes": {
    "mmstar": "596d11d8efa9bb058c2d9055a457420e06ff5bddea7858f7ff029c864296a3b8"
  },
  "model_source": "qwen2_5_vl",
  "model_name": "Qwen/Qwen2.5-VL-7B-Instruct",
  "model_name_sanitized": "Qwen__Qwen2.5-VL-7B-Instruct",
  "system_instruction": null,
  "system_instruction_sha": null,
  "fewshot_as_multiturn": false,
  "chat_template": null,
  "chat_template_sha": null,
  "start_time": 22916651.957306925,
  "end_time": 22917384.860846087,
  "total_evaluation_time_seconds": "732.9035391621292"
}
	{
	"results": {
	"mmstar": {
	"alias": "mmstar",
	"coarse perception,none": 0.7502214478878857,
	"coarse perception_stderr,none": "N/A",
	"average,none": 0.6282898425011137,
	"average_stderr,none": "N/A",
	"fine-grained perception,none": 0.5771677720461803,
	"fine-grained perception_stderr,none": "N/A",
	"instance reasoning,none": 0.7059851871377787,
	"instance reasoning_stderr,none": "N/A",
	"logical reasoning,none": 0.6201974043558202,
	"logical reasoning_stderr,none": "N/A",
	"math,none": 0.6405762719415486,
	"math_stderr,none": "N/A",
	"science & technology,none": 0.47559097163746894,
	"science & technology_stderr,none": "N/A"
	}
	},
	"group_subtasks": {
	"mmstar": []
	},
	"configs": {
	"mmstar": {
	"task": "mmstar",
	"dataset_path": "Lin-Chen/MMStar",
	"dataset_kwargs": {
	"token": true
	},
	"test_split": "val",
	"full_docs": false,
	"process_results_use_image": false,
	"doc_to_visual": "<function mmstar_doc_to_visual at 0x742714d83240>",
	"doc_to_text": "<function mmstar_doc_to_text at 0x742714d837e0>",
	"doc_to_target": "answer",
	"process_results": "<function mmstar_process_results at 0x742714d83d80>",
	"description": "",
	"target_delimiter": " ",
	"fewshot_delimiter": "\n\n",
	"num_fewshot": 0,
	"metric_list": [
	{
	"metric": "coarse perception",
	"aggregation": "<function mmstar_aggregate_results at 0x7427144a82c0>",
	"higher_is_better": true
	},
	{
	"metric": "fine-grained perception",
	"aggregation": "<function mmstar_aggregate_results at 0x7427144a8720>",
	"higher_is_better": true
	},
	{
	"metric": "instance reasoning",
	"aggregation": "<function mmstar_aggregate_results at 0x7427144a8b80>",
	"higher_is_better": true
	},
	{
	"metric": "logical reasoning",
	"aggregation": "<function mmstar_aggregate_results at 0x7427144a8fe0>",
	"higher_is_better": true
	},
	{
	"metric": "science & technology",
	"aggregation": "<function mmstar_aggregate_results at 0x7427144a9440>",
	"higher_is_better": true
	},
	{
	"metric": "math",
	"aggregation": "<function mmstar_aggregate_results at 0x7427144a98a0>",
	"higher_is_better": true
	},
	{
	"metric": "average",
	"aggregation": "<function mmstar_aggregate_results at 0x7427144a9d00>",
	"higher_is_better": true
	}
	],
	"output_type": "generate_until",
	"generation_kwargs": {
	"until": [
	"\n\n"
	],
	"do_sample": false
	},
	"repeats": 1,
	"should_decontaminate": false,
	"metadata": [
	{
	"version": 0.0
	}
	],
	"lmms_eval_specific_kwargs": {
	"default": {
	"pre_prompt": "",
	"post_prompt": "\nAnswer with the option's letter from the given choices directly"
	},
	"pre_prompt": "",
	"post_prompt": "\nAnswer with the option's letter from the given choices directly"
	}
	}
	},
	"versions": {
	"mmstar": "Yaml"
	},
	"n-shot": {
	"mmstar": 0
	},
	"higher_is_better": {
	"mmstar": {
	"coarse perception": true,
	"fine-grained perception": true,
	"instance reasoning": true,
	"logical reasoning": true,
	"science & technology": true,
	"math": true,
	"average": true
	}
	},
	"n-samples": {
	"mmstar": {
	"original": 1500,
	"effective": 1500
	}
	},
	"config": {
	"model": "qwen2_5_vl",
	"model_args": "pretrained=Qwen/Qwen2.5-VL-7B-Instruct,max_pixels=12845056,attn_implementation=flash_attention_2,interleave_visuals=False",
	"batch_size": "32",
	"batch_sizes": [],
	"device": "cuda:0",
	"use_cache": null,
	"limit": null,
	"bootstrap_iters": 100000,
	"gen_kwargs": "",
	"random_seed": 0,
	"numpy_seed": 1234,
	"torch_seed": 1234,
	"fewshot_seed": 1234
	},
	"git_hash": "8895505",
	"date": "20250730_091239",
	"task_hashes": {
	"mmstar": "596d11d8efa9bb058c2d9055a457420e06ff5bddea7858f7ff029c864296a3b8"
	},
	"model_source": "qwen2_5_vl",
	"model_name": "Qwen/Qwen2.5-VL-7B-Instruct",
	"model_name_sanitized": "Qwen__Qwen2.5-VL-7B-Instruct",
	"system_instruction": null,
	"system_instruction_sha": null,
	"fewshot_as_multiturn": false,
	"chat_template": null,
	"chat_template_sha": null,
	"start_time": 22916651.957306925,
	"end_time": 22917384.860846087,
	"total_evaluation_time_seconds": "732.9035391621292"
	}
No results found