open-llm-leaderboard-old/details_DrNicefellow__Mistral-4-from-Mixtral-8x7B-v0.1
数据集概述
数据集简介
该数据集是在Open LLM Leaderboard上评估模型DrNicefellow/Mistral-4-from-Mixtral-8x7B-v0.1时自动创建的。
数据集组成
- 数据集包含63个配置,每个配置对应一个评估任务。
- 数据集由1次运行创建;每次运行在各个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train"分割始终指向最新的结果。
- 额外的"results"配置存储所有运行结果的聚合,用于计算和显示Open LLM Leaderboard上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_DrNicefellow__Mistral-4-from-Mixtral-8x7B-v0.1", "harness_winogrande_5", split="train")
```
最新结果
以下是最新结果:
python { "all": { "acc": 0.24768201532826062, "acc_stderr": 0.03028171804374249, "acc_norm": 0.24936515830727307, "acc_norm_stderr": 0.03109188686084462, "mc1": 0.2423500611995104, "mc1_stderr": 0.01500067437357034, "mc2": 0.485079980098346, "mc2_stderr": 0.016210663798591776 }, "harness|arc:challenge|25": { "acc": 0.21245733788395904, "acc_stderr": 0.011953482906582954, "acc_norm": 0.28242320819112626, "acc_norm_stderr": 0.01315545688409722 }, "harness|hellaswag|10": { "acc": 0.26070503883688506, "acc_stderr": 0.004381220409641171, "acc_norm": 0.2753435570603465, "acc_norm_stderr": 0.004457743287380273 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.22, "acc_stderr": 0.04163331998932268, "acc_norm": 0.22, "acc_norm_stderr": 0.04163331998932268 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.2814814814814815, "acc_stderr": 0.03885004245800255, "acc_norm": 0.2814814814814815, "acc_norm_stderr": 0.03885004245800255 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.17763157894736842, "acc_stderr": 0.031103182383123398, "acc_norm": 0.17763157894736842, "acc_norm_stderr": 0.031103182383123398 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.28, "acc_stderr": 0.04512608598542128, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542128 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.2188679245283019, "acc_stderr": 0.02544786382510861, "acc_norm": 0.2188679245283019, "acc_norm_stderr": 0.02544786382510861 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.2152777777777778, "acc_stderr": 0.03437079344106135, "acc_norm": 0.2152777777777778, "acc_norm_stderr": 0.03437079344106135 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.21, "acc_norm_stderr": 0.040936018074033256 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.14, "acc_stderr": 0.034873508801977725, "acc_norm": 0.14, "acc_norm_stderr": 0.034873508801977725 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.2, "acc_stderr": 0.04020151261036846, "acc_norm": 0.2, "acc_norm_stderr": 0.04020151261036846 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.2138728323699422, "acc_stderr": 0.03126511206173043, "acc_norm": 0.2138728323699422, "acc_norm_stderr": 0.03126511206173043 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.21568627450980393, "acc_stderr": 0.04092563958237654, "acc_norm": 0.21568627450980393, "acc_norm_stderr": 0.04092563958237654 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.22, "acc_stderr": 0.041633319989322716, "acc_norm": 0.22, "acc_norm_stderr": 0.041633319989322716 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.19574468085106383, "acc_stderr": 0.025937853139977148, "acc_norm": 0.19574468085106383, "acc_norm_stderr": 0.025937853139977148 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.24561403508771928, "acc_stderr": 0.04049339297748141, "acc_norm": 0.24561403508771928, "acc_norm_stderr": 0.04049339297748141 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.21379310344827587, "acc_stderr": 0.034165204477475494, "acc_norm": 0.21379310344827587, "acc_norm_stderr": 0.034165204477475494 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.2566137566137566, "acc_stderr": 0.022494510767503154, "acc_norm": 0.2566137566137566, "acc_norm_stderr": 0.022494510767503154 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.14285714285714285, "acc_stderr": 0.03129843185743809, "acc_norm": 0.14285714285714285, "acc_norm_stderr": 0.03129843185743809 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.18, "acc_stderr": 0.038612291966536934, "acc_norm": 0.18, "acc_norm_stderr": 0.038612291966536934 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.3032258064516129, "acc_stderr": 0.02614868593067175, "acc_norm": 0.3032258064516129, "acc_norm_stderr": 0.02614868593067175 }, "harness|hendrycksTest-high_school_chemistry|5": { 
"acc": 0.2955665024630542, "acc_stderr": 0.032104944337514575, "acc_norm": 0.2955665024630542, "acc_norm_stderr": 0.032104944337514575 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.32, "acc_stderr": 0.04688261722621505, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621505 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.22424242424242424, "acc_stderr": 0.032568666616811015, "acc_norm": 0.22424242424242424, "acc_norm_stderr": 0.032568666616811015 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.22727272727272727, "acc_stderr": 0.02985751567338642, "acc_norm": 0.22727272727272727, "acc_norm_stderr": 0.02985751567338642 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.23316062176165803, "acc_stderr": 0.030516111371476008, "acc_norm": 0.23316062176165803, "acc_norm_stderr": 0.030516111371476008 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.2128205128205128, "acc_stderr": 0.020752423722128013, "acc_norm": 0.2128205128205128, "acc_norm_stderr": 0.020752423722128013 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.26666666666666666, "acc_stderr": 0.026962424325073828, "acc_norm":



