open-llm-leaderboard-old/details_Salesforce__xLAM-v0.1-r
收藏数据集概述
数据集摘要
该数据集是在对模型 Salesforce/xLAM-v0.1-r 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 该数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集从 1 次运行中创建,每次运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_Salesforce__xLAM-v0.1-r", "harness_winogrande_5", split="train")
最新结果
以下是 2024-03-22T04:14:05.448120 运行的最新结果:
python { "all": { "acc": 0.6976542334140061, "acc_stderr": 0.03071018227865466, "acc_norm": 0.7016121327715498, "acc_norm_stderr": 0.03130851594704009, "mc1": 0.4222766217870257, "mc1_stderr": 0.01729073325424817, "mc2": 0.5777296714578628, "mc2_stderr": 0.015008412090662794 }, "harness|arc:challenge|25": { "acc": 0.6390784982935154, "acc_stderr": 0.014034761386175452, "acc_norm": 0.6757679180887372, "acc_norm_stderr": 0.013678810399518824 }, "harness|hellaswag|10": { "acc": 0.650866361282613, "acc_stderr": 0.0047572204492837, "acc_norm": 0.845947022505477, "acc_norm_stderr": 0.003602617446641387 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.38, "acc_stderr": 0.048783173121456316, "acc_norm": 0.38, "acc_norm_stderr": 0.048783173121456316 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.674074074074074, "acc_stderr": 0.040491220417025055, "acc_norm": 0.674074074074074, "acc_norm_stderr": 0.040491220417025055 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7763157894736842, "acc_stderr": 0.03391160934343604, "acc_norm": 0.7763157894736842, "acc_norm_stderr": 0.03391160934343604 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.66, "acc_stderr": 0.04760952285695238, "acc_norm": 0.66, "acc_norm_stderr": 0.04760952285695238 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7811320754716982, "acc_stderr": 0.02544786382510859, "acc_norm": 0.7811320754716982, "acc_norm_stderr": 0.02544786382510859 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.8194444444444444, "acc_stderr": 0.032166008088022675, "acc_norm": 0.8194444444444444, "acc_norm_stderr": 0.032166008088022675 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.62, "acc_stderr": 0.04878317312145633, "acc_norm": 0.62, "acc_norm_stderr": 0.04878317312145633 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.42, "acc_stderr": 0.049604496374885836, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.7283236994219653, "acc_stderr": 0.03391750322321659, "acc_norm": 0.7283236994219653, "acc_norm_stderr": 0.03391750322321659 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.39215686274509803, "acc_stderr": 0.04858083574266345, "acc_norm": 0.39215686274509803, "acc_norm_stderr": 0.04858083574266345 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.8, "acc_stderr": 0.04020151261036846, "acc_norm": 0.8, "acc_norm_stderr": 0.04020151261036846 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.6638297872340425, "acc_stderr": 0.030881618520676942, "acc_norm": 0.6638297872340425, "acc_norm_stderr": 0.030881618520676942 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5789473684210527, "acc_stderr": 0.046446020912223177, "acc_norm": 0.5789473684210527, "acc_norm_stderr": 0.046446020912223177 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6551724137931034, "acc_stderr": 0.03960933549451207, "acc_norm": 0.6551724137931034, "acc_norm_stderr": 0.03960933549451207 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.48148148148148145, "acc_stderr": 0.025733641991838987, "acc_norm": 0.48148148148148145, "acc_norm_stderr": 0.025733641991838987 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.49206349206349204, "acc_stderr": 0.044715725362943486, "acc_norm": 0.49206349206349204, "acc_norm_stderr": 0.044715725362943486 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.41, "acc_stderr": 0.049431107042371025, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8129032258064516, "acc_stderr": 0.02218571009225225, "acc_norm": 0.8129032258064516, "acc_norm_stderr": 0.02218571009225225 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5714285714285714, "acc_stderr": 0.03481904844438803, "acc_norm": 0.5714285714285714, "acc_norm_stderr": 0.03481904844438803 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.78, "acc_stderr": 0.04163331998932261, "acc_norm": 0.78, "acc_norm_stderr": 0.04163331998932261 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7878787878787878, "acc_stderr": 0.031922715695482995, "acc_norm": 0.7878787878787878, "acc_norm_stderr": 0.031922715695482995 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8434343434343434, "acc_stderr": 0.025890520358141454, "acc_norm": 0.8434343434343434, "acc_norm_stderr": 0.025890520358141454 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9481865284974094, "acc_stderr": 0.01599622932024412, "acc_norm": 0.9481865284974094, "acc_norm_stderr": 0.01599622932024412 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6743589743589744, "acc_stderr": 0.02375966576741229, "acc_norm": 0.6743589743589744, "acc_norm_stderr": 0.02375966576741229 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3592592592592593, "acc_stderr": 0.029252905927251976, "acc_norm": 0.3592592592592593, "acc_norm_



