open-llm-leaderboard-old/details_ChavyvAkvar__habib-DPO
收藏数据集概述
该数据集是在对模型 ChavyvAkvar/habib-DPO 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集从 1 次运行中创建,每次运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_ChavyvAkvar__habib-DPO", "harness_winogrande_5", split="train")
最新结果
以下是 2024-04-08T14:15:05.108726 运行 的最新结果:
python { "all": { "acc": 0.6463086057955821, "acc_stderr": 0.03219641610397338, "acc_norm": 0.6470499429849053, "acc_norm_stderr": 0.0328482314699445, "mc1": 0.4847001223990208, "mc1_stderr": 0.017495304473187902, "mc2": 0.6519725385425954, "mc2_stderr": 0.015460535740302407 }, "harness|arc:challenge|25": { "acc": 0.6569965870307167, "acc_stderr": 0.013872423223718166, "acc_norm": 0.6868600682593856, "acc_norm_stderr": 0.01355267154362349 }, "harness|hellaswag|10": { "acc": 0.6899024098785103, "acc_stderr": 0.004615880352799734, "acc_norm": 0.8670583549093805, "acc_norm_stderr": 0.003388177893268278 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.04725815626252606, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252606 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6444444444444445, "acc_stderr": 0.04135176749720386, "acc_norm": 0.6444444444444445, "acc_norm_stderr": 0.04135176749720386 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6842105263157895, "acc_stderr": 0.0378272898086547, "acc_norm": 0.6842105263157895, "acc_norm_stderr": 0.0378272898086547 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.59, "acc_stderr": 0.049431107042371025, "acc_norm": 0.59, "acc_norm_stderr": 0.049431107042371025 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7245283018867924, "acc_stderr": 0.027495663683724064, "acc_norm": 0.7245283018867924, "acc_norm_stderr": 0.027495663683724064 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7430555555555556, "acc_stderr": 0.03653946969442099, "acc_norm": 0.7430555555555556, "acc_norm_stderr": 0.03653946969442099 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.45, "acc_stderr": 0.05, "acc_norm": 0.45, "acc_norm_stderr": 0.05 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.53, "acc_stderr": 0.05016135580465919, "acc_norm": 0.53, "acc_norm_stderr": 0.05016135580465919 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6705202312138728, "acc_stderr": 0.03583901754736412, "acc_norm": 0.6705202312138728, "acc_norm_stderr": 0.03583901754736412 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.3333333333333333, "acc_stderr": 0.04690650298201942, "acc_norm": 0.3333333333333333, "acc_norm_stderr": 0.04690650298201942 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.76, "acc_stderr": 0.042923469599092816, "acc_norm": 0.76, "acc_norm_stderr": 0.042923469599092816 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5957446808510638, "acc_stderr": 0.03208115750788684, "acc_norm": 0.5957446808510638, "acc_norm_stderr": 0.03208115750788684 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5175438596491229, "acc_stderr": 0.04700708033551038, "acc_norm": 0.5175438596491229, "acc_norm_stderr": 0.04700708033551038 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5793103448275863, "acc_stderr": 0.0411391498118926, "acc_norm": 0.5793103448275863, "acc_norm_stderr": 0.0411391498118926 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.42592592592592593, "acc_stderr": 0.02546714904546955, "acc_norm": 0.42592592592592593, "acc_norm_stderr": 0.02546714904546955 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.48412698412698413, "acc_stderr": 0.04469881854072606, "acc_norm": 0.48412698412698413, "acc_norm_stderr": 0.04469881854072606 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7741935483870968, "acc_stderr": 0.023785577884181012, "acc_norm": 0.7741935483870968, "acc_norm_stderr": 0.023785577884181012 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.49261083743842365, "acc_stderr": 0.03517603540361008, "acc_norm": 0.49261083743842365, "acc_norm_stderr": 0.03517603540361008 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.7, "acc_stderr": 0.046056618647183814, "acc_norm": 0.7, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7575757575757576, "acc_stderr": 0.03346409881055953, "acc_norm": 0.7575757575757576, "acc_norm_stderr": 0.03346409881055953 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7777777777777778, "acc_stderr": 0.029620227874790486, "acc_norm": 0.7777777777777778, "acc_norm_stderr": 0.029620227874790486 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8860103626943006, "acc_stderr": 0.022935144053919443, "acc_norm": 0.8860103626943006, "acc_norm_stderr": 0.022935144053919443 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6564102564102564, "acc_stderr": 0.024078696580635477, "acc_norm": 0.6564102564102564, "acc_norm_stderr": 0.024078696580635477 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3296296296296296, "acc_stderr": 0.028661201116524565, "acc_norm": 0.3296296296296296, "acc_norm_stderr": 0.028661201116524565 }, "harness|hendrycksTest-high_school_microeconomics|



