open-llm-leaderboard-old/details_Nitral-AI__Lelanta-lake-7b
收藏数据集概述
数据集简介
该数据集是在对模型 Nitral-AI/Lelanta-lake-7b 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 63 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建,每次运行可以在每个配置中找到特定的分割,分割名称使用运行的时间戳。
- "train" 分割始终指向最新的结果。
- 额外的 "results" 配置存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
python from datasets import load_dataset data = load_dataset("open-llm-leaderboard/details_Nitral-AI__Lelanta-lake-7b", "harness_winogrande_5", split="train")
最新结果
以下是 2024-03-16T01:40:30.655595 运行 的最新结果:
python { "all": { "acc": 0.6494898866023416, "acc_stderr": 0.03217719105422821, "acc_norm": 0.648619500438268, "acc_norm_stderr": 0.03285354970975784, "mc1": 0.5716034271725826, "mc1_stderr": 0.017323088597314743, "mc2": 0.7304994916069856, "mc2_stderr": 0.014587862942968618 }, "harness|arc:challenge|25": { "acc": 0.7064846416382252, "acc_stderr": 0.013307250444941113, "acc_norm": 0.7226962457337884, "acc_norm_stderr": 0.013082095839059376 }, "harness|hellaswag|10": { "acc": 0.7184823740290779, "acc_stderr": 0.004488201756642578, "acc_norm": 0.8894642501493726, "acc_norm_stderr": 0.003129155503881715 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6296296296296297, "acc_stderr": 0.041716541613545426, "acc_norm": 0.6296296296296297, "acc_norm_stderr": 0.041716541613545426 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7236842105263158, "acc_stderr": 0.036390575699529276, "acc_norm": 0.7236842105263158, "acc_norm_stderr": 0.036390575699529276 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.64, "acc_stderr": 0.04824181513244218, "acc_norm": 0.64, "acc_norm_stderr": 0.04824181513244218 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7094339622641509, "acc_stderr": 0.027943219989337128, "acc_norm": 0.7094339622641509, "acc_norm_stderr": 0.027943219989337128 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.7708333333333334, "acc_stderr": 0.03514697467862388, "acc_norm": 0.7708333333333334, "acc_norm_stderr": 0.03514697467862388 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.45, "acc_stderr": 0.05, "acc_norm": 0.45, "acc_norm_stderr": 0.05 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.653179190751445, "acc_stderr": 0.036291466701596636, "acc_norm": 0.653179190751445, "acc_norm_stderr": 0.036291466701596636 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4019607843137255, "acc_stderr": 0.04878608714466996, "acc_norm": 0.4019607843137255, "acc_norm_stderr": 0.04878608714466996 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.75, "acc_stderr": 0.04351941398892446, "acc_norm": 0.75, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5617021276595745, "acc_stderr": 0.03243618636108101, "acc_norm": 0.5617021276595745, "acc_norm_stderr": 0.03243618636108101 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.5, "acc_stderr": 0.047036043419179864, "acc_norm": 0.5, "acc_norm_stderr": 0.047036043419179864 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5379310344827586, "acc_stderr": 0.04154659671707548, "acc_norm": 0.5379310344827586, "acc_norm_stderr": 0.04154659671707548 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.42063492063492064, "acc_stderr": 0.025424835086923996, "acc_norm": 0.42063492063492064, "acc_norm_stderr": 0.025424835086923996 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.5, "acc_stderr": 0.04472135954999579, "acc_norm": 0.5, "acc_norm_stderr": 0.04472135954999579 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.34, "acc_stderr": 0.04760952285695235, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7806451612903226, "acc_stderr": 0.023540799358723295, "acc_norm": 0.7806451612903226, "acc_norm_stderr": 0.023540799358723295 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5024630541871922, "acc_stderr": 0.035179450386910616, "acc_norm": 0.5024630541871922, "acc_norm_stderr": 0.035179450386910616 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.68, "acc_stderr": 0.04688261722621505, "acc_norm": 0.68, "acc_norm_stderr": 0.04688261722621505 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7696969696969697, "acc_stderr": 0.032876667586034906, "acc_norm": 0.7696969696969697, "acc_norm_stderr": 0.032876667586034906 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8131313131313131, "acc_stderr": 0.027772533334218967, "acc_norm": 0.8131313131313131, "acc_norm_stderr": 0.027772533334218967 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8963730569948186, "acc_stderr": 0.021995311963644237, "acc_norm": 0.8963730569948186, "acc_norm_stderr": 0.021995311963644237 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6692307692307692, "acc_stderr": 0.023854795680971125, "acc_norm": 0.6692307692307692, "acc_norm_stderr": 0.023854795680971125 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.337037037037037, "acc_stderr": 0.02882088466625326, "acc_norm": 0.337037037037037, "acc_norm_stderr": 0.02882088466625326 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6764705882352942, "acc_stderr": 0.0303



