open-llm-leaderboard-old/details_JunchengXie__Llama-2-13b-chat-hf-gpt-4-80k-base_lora
数据集概述
数据集摘要
该数据集是在对模型 JunchengXie/Llama-2-13b-chat-hf-gpt-4-80k-base_lora 进行评估运行时自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在各个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行结果的聚合,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_JunchengXie__Llama-2-13b-chat-hf-gpt-4-80k-base_lora",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
这些是最新的结果,来自 2024-03-14T08:10:23.851339 的运行: python { "all": { "acc": 0.5375202562562614, "acc_stderr": 0.03371995679231197, "acc_norm": 0.5438346315493492, "acc_norm_stderr": 0.03443309877077462, "mc1": 0.3317013463892289, "mc1_stderr": 0.016482148810241477, "mc2": 0.5093471255301023, "mc2_stderr": 0.016129393728182468 }, "harness|arc:challenge|25": { "acc": 0.5238907849829352, "acc_stderr": 0.014594701798071654, "acc_norm": 0.5537542662116041, "acc_norm_stderr": 0.014526705548539982 }, "harness|hellaswag|10": { "acc": 0.5534754033061143, "acc_stderr": 0.004961161589228403, "acc_norm": 0.756920932085242, "acc_norm_stderr": 0.004280658234718767 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.48148148148148145, "acc_stderr": 0.043163785995113245, "acc_norm": 0.48148148148148145, "acc_norm_stderr": 0.043163785995113245 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.5592105263157895, "acc_stderr": 0.04040311062490436, "acc_norm": 0.5592105263157895, "acc_norm_stderr": 0.04040311062490436 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.52, "acc_stderr": 0.050211673156867795, "acc_norm": 0.52, "acc_norm_stderr": 0.050211673156867795 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.5509433962264151, "acc_stderr": 0.030612730713641095, "acc_norm": 0.5509433962264151, "acc_norm_stderr": 0.030612730713641095 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.5625, "acc_stderr": 0.04148415739394154, "acc_norm": 0.5625, "acc_norm_stderr": 0.04148415739394154 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.34, "acc_stderr": 0.04760952285695236, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695236 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.47, "acc_stderr": 0.05016135580465919, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.41040462427745666, "acc_stderr": 0.03750757044895537, "acc_norm": 0.41040462427745666, "acc_norm_stderr": 0.03750757044895537 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.28431372549019607, "acc_stderr": 0.04488482852329017, "acc_norm": 0.28431372549019607, "acc_norm_stderr": 0.04488482852329017 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.67, "acc_stderr": 0.04725815626252609, "acc_norm": 0.67, "acc_norm_stderr": 0.04725815626252609 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.39574468085106385, "acc_stderr": 0.03196758697835363, "acc_norm": 0.39574468085106385, "acc_norm_stderr": 0.03196758697835363 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.2982456140350877, "acc_stderr": 0.04303684033537315, "acc_norm": 0.2982456140350877, "acc_norm_stderr": 0.04303684033537315 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5379310344827586, "acc_stderr": 0.04154659671707548, "acc_norm": 0.5379310344827586, "acc_norm_stderr": 0.04154659671707548 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.36243386243386244, "acc_stderr": 0.024757473902752056, "acc_norm": 0.36243386243386244, "acc_norm_stderr": 0.024757473902752056 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.30952380952380953, "acc_stderr": 0.04134913018303316, "acc_norm": 0.30952380952380953, "acc_norm_stderr": 0.04134913018303316 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.28, "acc_stderr": 0.045126085985421276, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421276 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.635483870967742, "acc_stderr": 0.02737987122994325, "acc_norm": 0.635483870967742, "acc_norm_stderr": 0.02737987122994325 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.43349753694581283, "acc_stderr": 0.03486731727419872, "acc_norm": 0.43349753694581283, "acc_norm_stderr": 0.03486731727419872 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.61, "acc_stderr": 0.04902071300001974, "acc_norm": 0.61, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.6666666666666666, "acc_stderr": 0.036810508691615486, "acc_norm": 0.6666666666666666, "acc_norm_stderr": 0.036810508691615486 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.6818181818181818, "acc_stderr": 0.033184773338453294, "acc_norm": 0.6818181818181818, "acc_norm_stderr": 0.033184773338453294 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.772020725388601, "acc_stderr": 0.03027690994517826, "acc_norm": 0.772020725388601, "acc_norm_stderr": 0.03027690994517826 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.4948717948717949, "acc_stderr": 0.02534967290683866, "acc_norm": 0.4948717948717949, "acc_norm_stderr": 0.02534967290683866 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3037037037037037, "acc_stderr": 0.028037929969114986, "acc_norm": 0.3037037037037037, "acc_norm_stderr": 0.028037929969114986 }, "harness|hendrycksTest



