open-llm-leaderboard-old/details_gardner__TinyLlama-1.1B-SlimOrca-Function-Calling-3T
数据集概述
数据集简介
该数据集是在 Open LLM Leaderboard 上评估模型 gardner/TinyLlama-1.1B-SlimOrca-Function-Calling-3T 时自动创建的。数据集包含 63 个配置,每个配置对应一个评估任务。
数据集结构
数据集由 1 次运行创建。每次运行在每个配置中都对应一个特定的分割,分割名称为该次运行的时间戳。"train" 分割始终指向最新的结果。
额外配置
额外的配置 "results" 存储了所有运行的聚合结果,用于计算并在 Open LLM Leaderboard 上展示聚合指标。
数据加载示例
```python
from datasets import load_dataset

data = load_dataset("open-llm-leaderboard/details_gardner__TinyLlama-1.1B-SlimOrca-Function-Calling-3T", "harness_winogrande_5", split="train")
```
最新结果
以下是 2024-01-27T06:41:42.022481 运行的最新结果:
python { "all": { "acc": 0.2868856338605546, "acc_stderr": 0.03190338525939913, "acc_norm": 0.2887423811940406, "acc_norm_stderr": 0.03265770846944957, "mc1": 0.23011015911872704, "mc1_stderr": 0.014734557959807762, "mc2": 0.3674239002696778, "mc2_stderr": 0.014479746743393794 }, "harness|arc:challenge|25": { "acc": 0.3310580204778157, "acc_stderr": 0.013752062419817834, "acc_norm": 0.3609215017064846, "acc_norm_stderr": 0.01403476138617546 }, "harness|hellaswag|10": { "acc": 0.4547898824935272, "acc_stderr": 0.004969341773423514, "acc_norm": 0.5965943039235212, "acc_norm_stderr": 0.0048957821077864885 }, "harness|hendrycksTest-abstract_algebra|5": { "acc": 0.27, "acc_stderr": 0.044619604333847415, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847415 }, "harness|hendrycksTest-anatomy|5": { "acc": 0.4148148148148148, "acc_stderr": 0.04256193767901407, "acc_norm": 0.4148148148148148, "acc_norm_stderr": 0.04256193767901407 }, "harness|hendrycksTest-astronomy|5": { "acc": 0.23684210526315788, "acc_stderr": 0.034597776068105365, "acc_norm": 0.23684210526315788, "acc_norm_stderr": 0.034597776068105365 }, "harness|hendrycksTest-business_ethics|5": { "acc": 0.31, "acc_stderr": 0.04648231987117316, "acc_norm": 0.31, "acc_norm_stderr": 0.04648231987117316 }, "harness|hendrycksTest-clinical_knowledge|5": { "acc": 0.2792452830188679, "acc_stderr": 0.027611163402399715, "acc_norm": 0.2792452830188679, "acc_norm_stderr": 0.027611163402399715 }, "harness|hendrycksTest-college_biology|5": { "acc": 0.2569444444444444, "acc_stderr": 0.03653946969442099, "acc_norm": 0.2569444444444444, "acc_norm_stderr": 0.03653946969442099 }, "harness|hendrycksTest-college_chemistry|5": { "acc": 0.24, "acc_stderr": 0.04292346959909282, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909282 }, "harness|hendrycksTest-college_computer_science|5": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, 
"harness|hendrycksTest-college_mathematics|5": { "acc": 0.35, "acc_stderr": 0.047937248544110196, "acc_norm": 0.35, "acc_norm_stderr": 0.047937248544110196 }, "harness|hendrycksTest-college_medicine|5": { "acc": 0.23699421965317918, "acc_stderr": 0.03242414757483099, "acc_norm": 0.23699421965317918, "acc_norm_stderr": 0.03242414757483099 }, "harness|hendrycksTest-college_physics|5": { "acc": 0.1568627450980392, "acc_stderr": 0.03618664819936248, "acc_norm": 0.1568627450980392, "acc_norm_stderr": 0.03618664819936248 }, "harness|hendrycksTest-computer_security|5": { "acc": 0.33, "acc_stderr": 0.04725815626252605, "acc_norm": 0.33, "acc_norm_stderr": 0.04725815626252605 }, "harness|hendrycksTest-conceptual_physics|5": { "acc": 0.33191489361702126, "acc_stderr": 0.030783736757745647, "acc_norm": 0.33191489361702126, "acc_norm_stderr": 0.030783736757745647 }, "harness|hendrycksTest-econometrics|5": { "acc": 0.24561403508771928, "acc_stderr": 0.040493392977481425, "acc_norm": 0.24561403508771928, "acc_norm_stderr": 0.040493392977481425 }, "harness|hendrycksTest-electrical_engineering|5": { "acc": 0.2413793103448276, "acc_stderr": 0.03565998174135302, "acc_norm": 0.2413793103448276, "acc_norm_stderr": 0.03565998174135302 }, "harness|hendrycksTest-elementary_mathematics|5": { "acc": 0.2566137566137566, "acc_stderr": 0.022494510767503154, "acc_norm": 0.2566137566137566, "acc_norm_stderr": 0.022494510767503154 }, "harness|hendrycksTest-formal_logic|5": { "acc": 0.23015873015873015, "acc_stderr": 0.037649508797906066, "acc_norm": 0.23015873015873015, "acc_norm_stderr": 0.037649508797906066 }, "harness|hendrycksTest-global_facts|5": { "acc": 0.24, "acc_stderr": 0.04292346959909283, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909283 }, "harness|hendrycksTest-high_school_biology|5": { "acc": 0.27741935483870966, "acc_stderr": 0.025470196835900055, "acc_norm": 0.27741935483870966, "acc_norm_stderr": 0.025470196835900055 }, "harness|hendrycksTest-high_school_chemistry|5": { 
"acc": 0.2512315270935961, "acc_stderr": 0.030516530732694436, "acc_norm": 0.2512315270935961, "acc_norm_stderr": 0.030516530732694436 }, "harness|hendrycksTest-high_school_computer_science|5": { "acc": 0.26, "acc_stderr": 0.0440844002276808, "acc_norm": 0.26, "acc_norm_stderr": 0.0440844002276808 }, "harness|hendrycksTest-high_school_european_history|5": { "acc": 0.21818181818181817, "acc_stderr": 0.03225078108306289, "acc_norm": 0.21818181818181817, "acc_norm_stderr": 0.03225078108306289 }, "harness|hendrycksTest-high_school_geography|5": { "acc": 0.31313131313131315, "acc_stderr": 0.033042050878136525, "acc_norm": 0.31313131313131315, "acc_norm_stderr": 0.033042050878136525 }, "harness|hendrycksTest-high_school_government_and_politics|5": { "acc": 0.32124352331606215, "acc_stderr": 0.033699508685490674, "acc_norm": 0.32124352331606215, "acc_norm_stderr": 0.033699508685490674 }, "harness|hendrycksTest-high_school_macroeconomics|5": { "acc": 0.24871794871794872, "acc_stderr": 0.021916957709213803, "acc_norm": 0.24871794871794872, "acc_norm_stderr": 0.021916957709213803 }, "harness|hendrycksTest-high_school_mathematics|5": { "acc": 0.255555555555555




