open-llm-leaderboard-old/details_vicgalle__Configurable-Llama-3-8B-v0.2
数据集概述
数据集摘要
该数据集是在对模型 vicgalle/Configurable-Llama-3-8B-v0.2 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集结构
- 数据集由 63 个配置组成,每个配置对应一个评估任务。
- 数据集由 1 次运行创建。每次运行在每个配置中都对应一个特定的分割，分割以该次运行的时间戳命名。
- "train" 分割始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_vicgalle__Configurable-Llama-3-8B-v0.2",
    "harness_winogrande_5",
    split="train",
)
```
最新结果
以下是 2024-04-19T03:19:20.040365 运行的最新结果:
python { "all": { "acc": 0.6694408893619185, "acc_stderr": 0.03176677993984921, "acc_norm": 0.6716394002581934, "acc_norm_stderr": 0.03240469776002056, "mc1": 0.37821297429620565, "mc1_stderr": 0.01697633590754687, "mc2": 0.5679248273257846, "mc2_stderr": 0.015218379312142727 }, "harness|arc:challenge|25": { "acc": 0.5836177474402731, "acc_stderr": 0.014405618279436169, "acc_norm": 0.6254266211604096, "acc_norm_stderr": 0.014144193471893454 }, "harness|hellaswag|10": { "acc": 0.5952997410874328, "acc_stderr": 0.004898308167211848, "acc_norm": 0.7977494523003386, "acc_norm_stderr": 0.004008571431483689 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6444444444444445, "acc_stderr": 0.04135176749720385, "acc_norm": 0.6444444444444445, "acc_norm_stderr": 0.04135176749720385 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.7302631578947368, "acc_stderr": 0.03611780560284898, "acc_norm": 0.7302631578947368, "acc_norm_stderr": 0.03611780560284898 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.71, "acc_stderr": 0.045604802157206845, "acc_norm": 0.71, "acc_norm_stderr": 0.045604802157206845 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7509433962264151, "acc_stderr": 0.026616482980501704, "acc_norm": 0.7509433962264151, "acc_norm_stderr": 0.026616482980501704 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.8055555555555556, "acc_stderr": 0.03309615177059006, "acc_norm": 0.8055555555555556, "acc_norm_stderr": 0.03309615177059006 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.44, "acc_stderr": 0.049888765156985884, "acc_norm": 0.44, "acc_norm_stderr": 0.049888765156985884 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.59, "acc_stderr": 0.04943110704237101, "acc_norm": 0.59, "acc_norm_stderr": 0.04943110704237101 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.38, "acc_stderr": 0.04878317312145633, "acc_norm": 0.38, "acc_norm_stderr": 0.04878317312145633 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.653179190751445, "acc_stderr": 0.036291466701596636, "acc_norm": 0.653179190751445, "acc_norm_stderr": 0.036291466701596636 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.5, "acc_stderr": 0.04975185951049946, "acc_norm": 0.5, "acc_norm_stderr": 0.04975185951049946 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.76, "acc_stderr": 0.04292346959909281, "acc_norm": 0.76, "acc_norm_stderr": 0.04292346959909281 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.5957446808510638, "acc_stderr": 0.032081157507886836, "acc_norm": 0.5957446808510638, "acc_norm_stderr": 0.032081157507886836 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.6228070175438597, "acc_stderr": 0.04559522141958216, "acc_norm": 0.6228070175438597, "acc_norm_stderr": 0.04559522141958216 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6344827586206897, "acc_stderr": 0.04013124195424386, "acc_norm": 0.6344827586206897, "acc_norm_stderr": 0.04013124195424386 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.43915343915343913, "acc_stderr": 0.025559920550531003, "acc_norm": 0.43915343915343913, "acc_norm_stderr": 0.025559920550531003 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.49206349206349204, "acc_stderr": 0.044715725362943486, "acc_norm": 0.49206349206349204, "acc_norm_stderr": 0.044715725362943486 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.4, "acc_stderr": 0.04923659639173309, "acc_norm": 0.4, "acc_norm_stderr": 0.04923659639173309 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.7806451612903226, "acc_stderr": 0.023540799358723274, "acc_norm": 0.7806451612903226, "acc_norm_stderr": 0.023540799358723274 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 0.5172413793103449, "acc_stderr": 
0.035158955511656986, "acc_norm": 0.5172413793103449, "acc_norm_stderr": 0.035158955511656986 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.74, "acc_stderr": 0.0440844002276808, "acc_norm": 0.74, "acc_norm_stderr": 0.0440844002276808 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.7575757575757576, "acc_stderr": 0.03346409881055953, "acc_norm": 0.7575757575757576, "acc_norm_stderr": 0.03346409881055953 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8333333333333334, "acc_stderr": 0.026552207828215282, "acc_norm": 0.8333333333333334, "acc_norm_stderr": 0.026552207828215282 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9119170984455959, "acc_stderr": 0.02045374660160103, "acc_norm": 0.9119170984455959, "acc_norm_stderr": 0.02045374660160103 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.6512820512820513, "acc_stderr": 0.024162780284017717, "acc_norm": 0.6512820512820513, "acc_norm_stderr": 0.024162780284017717 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.3814814814814815, "acc_stderr": 0.029616718927497593, "acc_norm": 0.3814814814814815, "acc_norm_stderr": 0.



