open-llm-leaderboard-old/details_SCE__Mistral-7B-math-ia3-pruned10
数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上对模型 SCE/Mistral-7B-math-ia3-pruned10 进行评估运行期间自动创建的。
数据集结构
- 配置数量:63个配置,每个配置对应一个评估任务。
- 创建来源:该数据集由 1 次运行创建。每次运行在每个配置中都对应一个特定的分割,分割名称为该次运行的时间戳。
- 训练分割:"train" 分割总是指向最新的结果。
- 结果配置:一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_SCE__Mistral-7B-math-ia3-pruned10",
                    "harness_winogrande_5",
                    split="train")
```
最新结果
以下是 2024-01-29T08:02:29.802840 运行的最新结果:
python { "all": { "acc": 0.6071140736745556, "acc_stderr": 0.03313148079500382, "acc_norm": 0.6116841088566581, "acc_norm_stderr": 0.03380225533253272, "mc1": 0.5299877600979193, "mc1_stderr": 0.01747199209169754, "mc2": 0.6816125161993237, "mc2_stderr": 0.015141567513812132 }, "harness|arc:challenge|25": { "acc": 0.5861774744027304, "acc_stderr": 0.014392730009221005, "acc_norm": 0.6313993174061433, "acc_norm_stderr": 0.014097810678042203 }, "harness|hellaswag|10": { "acc": 0.6624178450507867, "acc_stderr": 0.0047191878909480685, "acc_norm": 0.8471420035849433, "acc_norm_stderr": 0.0035911513232683456 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.3, "acc_stderr": 0.04605661864718381, "acc_norm": 0.3, "acc_norm_stderr": 0.04605661864718381 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.5703703703703704, "acc_stderr": 0.04276349494376599, "acc_norm": 0.5703703703703704, "acc_norm_stderr": 0.04276349494376599 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.6381578947368421, "acc_stderr": 0.03910525752849723, "acc_norm": 0.6381578947368421, "acc_norm_stderr": 0.03910525752849723 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.58, "acc_stderr": 0.049604496374885836, "acc_norm": 0.58, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.6716981132075471, "acc_stderr": 0.02890159361241178, "acc_norm": 0.6716981132075471, "acc_norm_stderr": 0.02890159361241178 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6805555555555556, "acc_stderr": 0.038990736873573344, "acc_norm": 0.6805555555555556, "acc_norm_stderr": 0.038990736873573344 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.4, "acc_stderr": 0.04923659639173309, "acc_norm": 0.4, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.5, "acc_stderr": 0.050251890762960605, "acc_norm": 0.5, "acc_norm_stderr": 0.050251890762960605 }, "harness|hendrycksTest-college_mathematics|5": { 
"acc": 0.39, "acc_stderr": 0.04902071300001975, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001975 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5664739884393064, "acc_stderr": 0.03778621079092056, "acc_norm": 0.5664739884393064, "acc_norm_stderr": 0.03778621079092056 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.39215686274509803, "acc_stderr": 0.04858083574266344, "acc_norm": 0.39215686274509803, "acc_norm_stderr": 0.04858083574266344 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.72, "acc_stderr": 0.04512608598542128, "acc_norm": 0.72, "acc_norm_stderr": 0.04512608598542128 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5319148936170213, "acc_stderr": 0.03261936918467382, "acc_norm": 0.5319148936170213, "acc_norm_stderr": 0.03261936918467382 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.42105263157894735, "acc_stderr": 0.046446020912223177, "acc_norm": 0.42105263157894735, "acc_norm_stderr": 0.046446020912223177 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6, "acc_stderr": 0.040824829046386284, "acc_norm": 0.6, "acc_norm_stderr": 0.040824829046386284 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.37037037037037035, "acc_stderr": 0.024870815251057093, "acc_norm": 0.37037037037037035, "acc_norm_stderr": 0.024870815251057093 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.4126984126984127, "acc_stderr": 0.04403438954768177, "acc_norm": 0.4126984126984127, "acc_norm_stderr": 0.04403438954768177 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.4, "acc_stderr": 0.04923659639173309, "acc_norm": 0.4, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6870967741935484, "acc_stderr": 0.02637756702864586, "acc_norm": 0.6870967741935484, "acc_norm_stderr": 0.02637756702864586 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5073891625615764, "acc_stderr": 0.035176035403610105, "acc_norm": 
0.5073891625615764, "acc_norm_stderr": 0.035176035403610105 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.61, "acc_stderr": 0.04902071300001974, "acc_norm": 0.61, "acc_norm_stderr": 0.04902071300001974 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7333333333333333, "acc_stderr": 0.03453131801885417, "acc_norm": 0.7333333333333333, "acc_norm_stderr": 0.03453131801885417 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7575757575757576, "acc_stderr": 0.030532892233932022, "acc_norm": 0.7575757575757576, "acc_norm_stderr": 0.030532892233932022 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8497409326424871, "acc_stderr": 0.02578772318072386, "acc_norm": 0.8497409326424871, "acc_norm_stderr": 0.02578772318072386 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5461538461538461, "acc_stderr": 0.025242770987126184, "acc_norm": 0.5461538461538461, "acc_norm_stderr": 0.025242770987126184 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.2962962962962963, "acc_stderr": 0.027840811495871934, "acc_norm": 0.2962962962962963, "acc_norm_stderr": 0.0



