open-llm-leaderboard-old/details_Replete-AI__Llama-3-11.5B-Instruct-v2
收藏数据集概述
数据集简介
该数据集是在评估模型Replete-AI/Llama-3-11.5B-Instruct-v2在Open LLM Leaderboard上的运行过程中自动创建的。
数据集结构
- 数据集包含63个配置,每个配置对应一个评估任务。
- 数据集来自1次运行,每次运行在每个配置中都有一个特定的分割,分割名称使用运行的时间戳。
- "train"分割始终指向最新的结果。
- 一个额外的配置"results"存储所有运行的聚合结果,用于计算和显示Open LLM Leaderboard上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_Replete-AI__Llama-3-11.5B-Instruct-v2", "harness_winogrande_5", split="train")
最新结果
以下是2024-04-23T10:54:41.990794运行的最新结果:
python { "all": { "acc": 0.6689187661805235, "acc_stderr": 0.031740786959523176, "acc_norm": 0.6711359245269092, "acc_norm_stderr": 0.03237712234903635, "mc1": 0.35862913096695226, "mc1_stderr": 0.016789289499502025, "mc2": 0.5164839368919582, "mc2_stderr": 0.015196752099462125 }, "harness|arc:challenge|25": { "acc": 0.5767918088737202, "acc_stderr": 0.014438036220848025, "acc_norm": 0.6143344709897611, "acc_norm_stderr": 0.014224250973257184 }, "harness|hellaswag|10": { "acc": 0.5893248356901015, "acc_stderr": 0.004909509538525161, "acc_norm": 0.7858992232622983, "acc_norm_stderr": 0.004093587404303692 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6370370370370371, "acc_stderr": 0.04153948404742398, "acc_norm": 0.6370370370370371, "acc_norm_stderr": 0.04153948404742398 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6973684210526315, "acc_stderr": 0.03738520676119667, "acc_norm": 0.6973684210526315, "acc_norm_stderr": 0.03738520676119667 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.71, "acc_stderr": 0.045604802157206845, "acc_norm": 0.71, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7509433962264151, "acc_stderr": 0.026616482980501704, "acc_norm": 0.7509433962264151, "acc_norm_stderr": 0.026616482980501704 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7916666666666666, "acc_stderr": 0.033961162058453336, "acc_norm": 0.7916666666666666, "acc_norm_stderr": 0.033961162058453336 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.47, "acc_stderr": 0.050161355804659205, "acc_norm": 0.47, "acc_norm_stderr": 0.050161355804659205 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.58, "acc_stderr": 0.049604496374885836, "acc_norm": 0.58, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.37, "acc_stderr": 0.04852365870939099, "acc_norm": 0.37, "acc_norm_stderr": 0.04852365870939099 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6416184971098265, "acc_stderr": 0.03656343653353159, "acc_norm": 0.6416184971098265, "acc_norm_stderr": 0.03656343653353159 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.5196078431372549, "acc_stderr": 0.04971358884367405, "acc_norm": 0.5196078431372549, "acc_norm_stderr": 0.04971358884367405 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.77, "acc_stderr": 0.04229525846816506, "acc_norm": 0.77, "acc_norm_stderr": 0.04229525846816506 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5914893617021276, "acc_stderr": 0.032134180267015755, "acc_norm": 0.5914893617021276, "acc_norm_stderr": 0.032134180267015755 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.6052631578947368, "acc_stderr": 0.04598188057816542, "acc_norm": 0.6052631578947368, "acc_norm_stderr": 0.04598188057816542 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6344827586206897, "acc_stderr": 0.040131241954243856, "acc_norm": 0.6344827586206897, "acc_norm_stderr": 0.040131241954243856 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.4444444444444444, "acc_stderr": 0.025591857761382182, "acc_norm": 0.4444444444444444, "acc_norm_stderr": 0.025591857761382182 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.48412698412698413, "acc_stderr": 0.04469881854072606, "acc_norm": 0.48412698412698413, "acc_norm_stderr": 0.04469881854072606 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.42, "acc_stderr": 0.049604496374885836, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7870967741935484, "acc_stderr": 0.023287665127268563, "acc_norm": 0.7870967741935484, "acc_norm_stderr": 0.023287665127268563 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5024630541871922, "acc_stderr": 0.035179450386910616, "acc_norm": 0.5024630541871922, "acc_norm_stderr": 0.035179450386910616 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.75, "acc_stderr": 0.04351941398892446, "acc_norm": 0.75, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7515151515151515, "acc_stderr": 0.03374402644139404, "acc_norm": 0.7515151515151515, "acc_norm_stderr": 0.03374402644139404 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8434343434343434, "acc_stderr": 0.025890520358141454, "acc_norm": 0.8434343434343434, "acc_norm_stderr": 0.025890520358141454 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9119170984455959, "acc_stderr": 0.02045374660160103, "acc_norm": 0.9119170984455959, "acc_norm_stderr": 0.02045374660160103 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.658974358974359, "acc_stderr": 0.02403548967633507, "acc_norm": 0.658974358974359, "acc_norm_stderr": 0.02403548967633507 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3814814814814815, "acc_stderr": 0.029616718927497593, "acc_norm": 0



