open-llm-leaderboard-old/details_Menouar__saqr-7b-beta
数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上对模型 Menouar/saqr-7b-beta 进行评估运行期间自动创建的。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 创建来源:数据集由 1 次运行创建。每次运行在每个配置中对应一个特定的分割(split),分割以该次运行的时间戳命名。
- 最新结果:"train" 分割始终指向最新的结果。
- 聚合结果:一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示在 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_Menouar__saqr-7b-beta", "harness_winogrande_5", split="train")
```
最新结果
这些是最新的结果,来自运行 2024-02-18T12:49:44.046455: python { "all": { "acc": 0.27030982140899557, "acc_stderr": 0.03111036577540486, "acc_norm": 0.2704987678522067, "acc_norm_stderr": 0.031811806028838624, "mc1": 0.26193390452876375, "mc1_stderr": 0.01539211880501503, "mc2": 0.3938162400030715, "mc2_stderr": 0.014166543524460336 }, "harness|arc:challenge|25": { "acc": 0.42150170648464164, "acc_stderr": 0.014430197069326016, "acc_norm": 0.4778156996587031, "acc_norm_stderr": 0.014597001927076133 }, "harness|hellaswag|10": { "acc": 0.5774746066520613, "acc_stderr": 0.004929517011508222, "acc_norm": 0.776140211113324, "acc_norm_stderr": 0.004159773209765884 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.2074074074074074, "acc_stderr": 0.03502553170678316, "acc_norm": 0.2074074074074074, "acc_norm_stderr": 0.03502553170678316 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.23026315789473684, "acc_stderr": 0.03426059424403165, "acc_norm": 0.23026315789473684, "acc_norm_stderr": 0.03426059424403165 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.23, "acc_stderr": 0.042295258468165065, "acc_norm": 0.23, "acc_norm_stderr": 0.042295258468165065 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.23773584905660378, "acc_stderr": 0.026199808807561915, "acc_norm": 0.23773584905660378, "acc_norm_stderr": 0.026199808807561915 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.2708333333333333, "acc_stderr": 0.037161774375660185, "acc_norm": 0.2708333333333333, "acc_norm_stderr": 0.037161774375660185 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.14, "acc_stderr": 0.034873508801977725, "acc_norm": 0.14, "acc_norm_stderr": 0.034873508801977725 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 
0.04648231987117316 }, "harness|hendrycksTest-college_mathematics|5": { "acc": 0.26, "acc_stderr": 0.04408440022768078, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768078 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.21965317919075145, "acc_stderr": 0.031568093627031744, "acc_norm": 0.21965317919075145, "acc_norm_stderr": 0.031568093627031744 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.20588235294117646, "acc_stderr": 0.04023382273617746, "acc_norm": 0.20588235294117646, "acc_norm_stderr": 0.04023382273617746 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.2851063829787234, "acc_stderr": 0.029513196625539355, "acc_norm": 0.2851063829787234, "acc_norm_stderr": 0.029513196625539355 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.23684210526315788, "acc_stderr": 0.03999423879281336, "acc_norm": 0.23684210526315788, "acc_norm_stderr": 0.03999423879281336 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.32413793103448274, "acc_stderr": 0.03900432069185555, "acc_norm": 0.32413793103448274, "acc_norm_stderr": 0.03900432069185555 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.20899470899470898, "acc_stderr": 0.02094048156533486, "acc_norm": 0.20899470899470898, "acc_norm_stderr": 0.02094048156533486 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.21428571428571427, "acc_stderr": 0.03670066451047181, "acc_norm": 0.21428571428571427, "acc_norm_stderr": 0.03670066451047181 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.32, "acc_stderr": 0.04688261722621503, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621503 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.1935483870967742, "acc_stderr": 0.02247525852553606, "acc_norm": 0.1935483870967742, "acc_norm_stderr": 0.02247525852553606 }, 
"harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.1625615763546798, "acc_stderr": 0.025960300064605576, "acc_norm": 0.1625615763546798, "acc_norm_stderr": 0.025960300064605576 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.3151515151515151, "acc_stderr": 0.0362773057502241, "acc_norm": 0.3151515151515151, "acc_norm_stderr": 0.0362773057502241 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.21717171717171718, "acc_stderr": 0.029376616484945633, "acc_norm": 0.21717171717171718, "acc_norm_stderr": 0.029376616484945633 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.18134715025906736, "acc_stderr": 0.02780703236068609, "acc_norm": 0.18134715025906736, "acc_norm_stderr": 0.02780703236068609 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.23846153846153847, "acc_stderr": 0.021606294494647727, "acc_norm": 0.23846153846153847, "acc_norm_stderr": 0.021606294494647727 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.2111111111111111, "acc_stderr": 0.024882116857655078, "acc_norm": 0.2111111111111111, "acc_norm_stderr": 0.024882116857655078 }, "harness|hendrycksTest-high_school_microeconom




