open-llm-leaderboard-old/details_Zangs3011__mistral_7b_2EPOCH_DolphinCoder
数据集概述
数据集摘要
该数据集是在模型Zangs3011/mistral_7b_2EPOCH_DolphinCoder在Open LLM Leaderboard上的评估运行期间自动创建的。
数据集组成
- 数据集由63个配置组成,每个配置对应一个评估任务。
- 数据集由1次运行创建。每次运行在每个配置中对应一个特定的分割,分割以该次运行的时间戳命名。
- "train"分割始终指向最新的结果。
- 一个额外的配置"results"存储所有运行的聚合结果,用于计算和显示Open LLM Leaderboard上的聚合指标。
数据加载示例
```python
from datasets import load_dataset
data = load_dataset("open-llm-leaderboard/details_Zangs3011__mistral_7b_2EPOCH_DolphinCoder", "harness_winogrande_5", split="train")
```
最新结果
这些是最新结果,来自2024-01-19T04:55:31.577709的运行: python { "all": { "acc": 0.590189563445543, "acc_stderr": 0.033213747146494416, "acc_norm": 0.5975943163476723, "acc_norm_stderr": 0.03391041523451993, "mc1": 0.2974296205630355, "mc1_stderr": 0.016002651487361005, "mc2": 0.44646084605621383, "mc2_stderr": 0.014640949505732814 }, "harness|arc:challenge|25": { "acc": 0.568259385665529, "acc_stderr": 0.014474591427196202, "acc_norm": 0.6075085324232082, "acc_norm_stderr": 0.014269634635670722 }, "harness|hellaswag|10": { "acc": 0.6229834694284008, "acc_stderr": 0.004836486437527263, "acc_norm": 0.8114917347142003, "acc_norm_stderr": 0.003903181667466359 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.3, "acc_stderr": 0.04605661864718381, "acc_norm": 0.3, "acc_norm_stderr": 0.04605661864718381 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.562962962962963, "acc_stderr": 0.04284958639753401, "acc_norm": 0.562962962962963, "acc_norm_stderr": 0.04284958639753401 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.5986842105263158, "acc_stderr": 0.039889037033362836, "acc_norm": 0.5986842105263158, "acc_norm_stderr": 0.039889037033362836 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.54, "acc_stderr": 0.05009082659620332, "acc_norm": 0.54, "acc_norm_stderr": 0.05009082659620332 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.630188679245283, "acc_stderr": 0.029711421880107936, "acc_norm": 0.630188679245283, "acc_norm_stderr": 0.029711421880107936 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.6805555555555556, "acc_stderr": 0.038990736873573344, "acc_norm": 0.6805555555555556, "acc_norm_stderr": 0.038990736873573344 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.41, "acc_stderr": 0.049431107042371025, "acc_norm": 0.41, "acc_norm_stderr": 0.049431107042371025 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.48, "acc_stderr": 0.050211673156867795, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.32, "acc_stderr": 0.046882617226215034, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.5664739884393064, "acc_stderr": 0.03778621079092056, "acc_norm": 0.5664739884393064, "acc_norm_stderr": 0.03778621079092056 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.29411764705882354, "acc_stderr": 0.04533838195929777, "acc_norm": 0.29411764705882354, "acc_norm_stderr": 0.04533838195929777 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.75, "acc_stderr": 0.04351941398892446, "acc_norm": 0.75, "acc_norm_stderr": 0.04351941398892446 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.574468085106383, "acc_stderr": 0.03232146916224469, "acc_norm": 0.574468085106383, "acc_norm_stderr": 0.03232146916224469 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.49122807017543857, "acc_stderr": 0.04702880432049615, "acc_norm": 0.49122807017543857, "acc_norm_stderr": 0.04702880432049615 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.5448275862068965, "acc_stderr": 0.04149886942192117, "acc_norm": 0.5448275862068965, "acc_norm_stderr": 0.04149886942192117 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.3968253968253968, "acc_stderr": 0.02519710107424649, "acc_norm": 0.3968253968253968, "acc_norm_stderr": 0.02519710107424649 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.40476190476190477, "acc_stderr": 0.04390259265377562, "acc_norm": 0.40476190476190477, "acc_norm_stderr": 0.04390259265377562 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.6806451612903226, "acc_stderr": 0.026522709674667765, "acc_norm": 0.6806451612903226, "acc_norm_stderr": 0.026522709674667765 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.4187192118226601, "acc_stderr": 0.03471192860518468, "acc_norm": 0.4187192118226601, "acc_norm_stderr": 0.03471192860518468 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.58, "acc_stderr": 0.049604496374885836, "acc_norm": 0.58, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.696969696969697, "acc_stderr": 0.03588624800091706, "acc_norm": 0.696969696969697, "acc_norm_stderr": 0.03588624800091706 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.7373737373737373, "acc_stderr": 0.03135305009533086, "acc_norm": 0.7373737373737373, "acc_norm_stderr": 0.03135305009533086 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.8393782383419689, "acc_stderr": 0.02649905770139746, "acc_norm": 0.8393782383419689, "acc_norm_stderr": 0.02649905770139746 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.5512820512820513, "acc_stderr": 0.025217315184846482, "acc_norm": 0.5512820512820513, "acc_norm_stderr": 0.025217315184846482 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3074074074074074, "acc_stderr": 0.02813325257881564, "acc_norm": 0.3074074074074074, "acc_norm_stderr": 0.02813325257881564 }, "harness|hendrycksTest-high_school_microeconomics|5": { "acc": 0.6302



