open-llm-leaderboard/details_upstage__Llama-2-70b-instruct-v2
数据集概述
数据集摘要
该数据集是在对模型 upstage/Llama-2-70b-instruct-v2 进行评估运行期间自动创建的,用于 Open LLM Leaderboard。
数据集组成
- 数据集包含 61 个配置,每个配置对应一个评估任务。
- 数据集由 1 次运行创建,每次运行在每个配置中都对应一个特定的分片,分片以该次运行的时间戳命名。
- "train" 分片始终指向最新的结果。
- 一个额外的配置 "results" 存储所有运行的聚合结果,用于计算和显示 Open LLM Leaderboard 上的聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset(
    "open-llm-leaderboard/details_upstage__Llama-2-70b-instruct-v2",
    "harness_truthfulqa_mc_0",
    split="train",
)
```
最新结果
以下是 2023-08-03T01:46:57.047903 运行的最新结果:
python { "all": { "acc": 0.7050740464217434, "acc_stderr": 0.03085018588043536, "acc_norm": 0.7087855823993987, "acc_norm_stderr": 0.03081992944181276, "mc1": 0.44430844553243576, "mc1_stderr": 0.017394586250743173, "mc2": 0.6224972679005382, "mc2_stderr": 0.014880875055625352 }, "harness|arc:challenge|25": { "acc": 0.6732081911262798, "acc_stderr": 0.013706665975587333, "acc_norm": 0.7107508532423208, "acc_norm_stderr": 0.013250012579393441 }, "harness|hellaswag|10": { "acc": 0.6974706233817964, "acc_stderr": 0.00458414401465495, "acc_norm": 0.8789085839474209, "acc_norm_stderr": 0.0032556675321152857 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.33, "acc_stderr": 0.04725815626252605, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252605 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.6518518518518519, "acc_stderr": 0.041153246103369526, "acc_norm": 0.6518518518518519, "acc_norm_stderr": 0.041153246103369526 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.8421052631578947, "acc_stderr": 0.029674167520101453, "acc_norm": 0.8421052631578947, "acc_norm_stderr": 0.029674167520101453 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.74, "acc_stderr": 0.044084400227680794, "acc_norm": 0.74, "acc_norm_stderr": 0.044084400227680794 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.7320754716981132, "acc_stderr": 0.027257260322494845, "acc_norm": 0.7320754716981132, "acc_norm_stderr": 0.027257260322494845 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.8402777777777778, "acc_stderr": 0.030635578972093274, "acc_norm": 0.8402777777777778, "acc_norm_stderr": 0.030635578972093274 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.49, "acc_stderr": 0.05024183937956912, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.6, "acc_stderr": 0.04923659639173309, "acc_norm": 0.6, "acc_norm_stderr": 0.04923659639173309 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.42, "acc_stderr": 0.049604496374885836, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.6589595375722543, "acc_stderr": 0.036146654241808254, "acc_norm": 0.6589595375722543, "acc_norm_stderr": 0.036146654241808254 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.4215686274509804, "acc_stderr": 0.04913595201274498, "acc_norm": 0.4215686274509804, "acc_norm_stderr": 0.04913595201274498 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.77, "acc_stderr": 0.04229525846816507, "acc_norm": 0.77, "acc_norm_stderr": 0.04229525846816507 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.7063829787234043, "acc_stderr": 0.029771642712491227, "acc_norm": 0.7063829787234043, "acc_norm_stderr": 0.029771642712491227 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.4649122807017544, "acc_stderr": 0.04692008381368909, "acc_norm": 0.4649122807017544, "acc_norm_stderr": 0.04692008381368909 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.6482758620689655, "acc_stderr": 0.0397923663749741, "acc_norm": 0.6482758620689655, "acc_norm_stderr": 0.0397923663749741 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.46825396825396826, "acc_stderr": 0.025699352832131792, "acc_norm": 0.46825396825396826, "acc_norm_stderr": 0.025699352832131792 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.46825396825396826, "acc_stderr": 0.04463112720677173, "acc_norm": 0.46825396825396826, "acc_norm_stderr": 0.04463112720677173 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.47, "acc_stderr": 0.05016135580465919, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.8096774193548387, "acc_stderr": 0.02233170761182307, "acc_norm": 0.8096774193548387, "acc_norm_stderr": 0.02233170761182307 }, "harness|hendrycksTest-high_school_chemistry|5": { "acc": 
0.5615763546798029, "acc_stderr": 0.03491207857486519, "acc_norm": 0.5615763546798029, "acc_norm_stderr": 0.03491207857486519 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.79, "acc_stderr": 0.040936018074033256, "acc_norm": 0.79, "acc_norm_stderr": 0.040936018074033256 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.8424242424242424, "acc_stderr": 0.02845038880528436, "acc_norm": 0.8424242424242424, "acc_norm_stderr": 0.02845038880528436 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.8737373737373737, "acc_stderr": 0.023664359402880242, "acc_norm": 0.8737373737373737, "acc_norm_stderr": 0.023664359402880242 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.9378238341968912, "acc_stderr": 0.017426974154240528, "acc_norm": 0.9378238341968912, "acc_norm_stderr": 0.017426974154240528 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.7102564102564103, "acc_stderr": 0.023000628243687968, "acc_norm": 0.7102564102564103, "acc_norm_stderr": 0.023000628243687968 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.31851851851851853, "acc_stderr": 0.028406533090608463, "acc_



