open-llm-leaderboard-old/details_OpenBuddy__openbuddy-deepseek-67b-v18.1-4k
数据集概述
数据集简介
该数据集是在评估模型OpenBuddy/openbuddy-deepseek-67b-v18.1-4k在Open LLM Leaderboard上的运行过程中自动创建的。
数据集组成
- 数据集包含63个配置,每个配置对应一个评估任务。
- 数据集由1次运行创建。每次运行在每个配置中对应一个特定的分割,该分割以运行的时间戳命名。
- "train"分割始终指向最新的结果。
- 另有一个额外的配置"results",用于存储所有运行的聚合结果;Open LLM Leaderboard上展示的聚合指标即据此计算。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_OpenBuddy__openbuddy-deepseek-67b-v18.1-4k",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是最新结果来自运行2024-02-18T20:55:27.550442: python { "all": { "acc": 0.7058260280059325, "acc_stderr": 0.030134629260569593, "acc_norm": 0.7076734462849897, "acc_norm_stderr": 0.03073727698082304, "mc1": 0.39412484700122397, "mc1_stderr": 0.01710658814070033, "mc2": 0.5565901681593471, "mc2_stderr": 0.015389712051681206 }, "harness|arc:challenge|25": { "acc": 0.6493174061433447, "acc_stderr": 0.013944635930726096, "acc_norm": 0.6774744027303754, "acc_norm_stderr": 0.013659980894277371 }, "harness|hellaswag|10": { "acc": 0.655646285600478, "acc_stderr": 0.0047418597531784295, "acc_norm": 0.8465445130452102, "acc_norm_stderr": 0.0035968938961909126 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.04725815626252606, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252606 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.7037037037037037, "acc_stderr": 0.03944624162501116, "acc_norm": 0.7037037037037037, "acc_norm_stderr": 0.03944624162501116 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7631578947368421, "acc_stderr": 0.03459777606810537, "acc_norm": 0.7631578947368421, "acc_norm_stderr": 0.03459777606810537 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.82, "acc_stderr": 0.03861229196653694, "acc_norm": 0.82, "acc_norm_stderr": 0.03861229196653694 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7622641509433963, "acc_stderr": 0.026199808807561915, "acc_norm": 0.7622641509433963, "acc_norm_stderr": 0.026199808807561915 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.8263888888888888, "acc_stderr": 0.03167473383795718, "acc_norm": 0.8263888888888888, "acc_norm_stderr": 0.03167473383795718 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.49, "acc_stderr": 0.05024183937956911, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956911 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.58, "acc_stderr": 0.04960449637488584, "acc_norm": 0.58, "acc_norm_stderr": 0.04960449637488584 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.42, "acc_stderr": 0.049604496374885836, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6763005780346821, "acc_stderr": 0.035676037996391706, "acc_norm": 0.6763005780346821, "acc_norm_stderr": 0.035676037996391706 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.37254901960784315, "acc_stderr": 0.04810840148082635, "acc_norm": 0.37254901960784315, "acc_norm_stderr": 0.04810840148082635 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.77, "acc_stderr": 0.04229525846816506, "acc_norm": 0.77, "acc_norm_stderr": 0.04229525846816506 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.7276595744680852, "acc_stderr": 0.029101290698386715, "acc_norm": 0.7276595744680852, "acc_norm_stderr": 0.029101290698386715 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5087719298245614, "acc_stderr": 0.04702880432049615, "acc_norm": 0.5087719298245614, "acc_norm_stderr": 0.04702880432049615 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.7034482758620689, "acc_stderr": 0.03806142687309992, "acc_norm": 0.7034482758620689, "acc_norm_stderr": 0.03806142687309992 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.5238095238095238, "acc_stderr": 0.025722097064388525, "acc_norm": 0.5238095238095238, "acc_norm_stderr": 0.025722097064388525 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.5, "acc_stderr": 0.04472135954999579, "acc_norm": 0.5, "acc_norm_stderr": 0.04472135954999579 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.42, "acc_stderr": 0.049604496374885836, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8225806451612904, "acc_stderr": 0.021732540689329276, "acc_norm": 0.8225806451612904, "acc_norm_stderr": 0.021732540689329276 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5615763546798029, 
"acc_stderr": 0.03491207857486519, "acc_norm": 0.5615763546798029, "acc_norm_stderr": 0.03491207857486519 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.74, "acc_stderr": 0.044084400227680794, "acc_norm": 0.74, "acc_norm_stderr": 0.044084400227680794 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.806060606060606, "acc_stderr": 0.030874145136562097, "acc_norm": 0.806060606060606, "acc_norm_stderr": 0.030874145136562097 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8939393939393939, "acc_stderr": 0.021938047738853106, "acc_norm": 0.8939393939393939, "acc_norm_stderr": 0.021938047738853106 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9637305699481865, "acc_stderr": 0.013492659751295153, "acc_norm": 0.9637305699481865, "acc_norm_stderr": 0.013492659751295153 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.7102564102564103, "acc_stderr": 0.02300062824368797, "acc_norm": 0.7102564102564103, "acc_norm_stderr": 0.02300062824368797 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3814814814814815, "acc_stderr": 0.029616718927497593, "acc_norm": 0.3814814814814815, "acc



