Name: Muesli1/dclm-CORE-score
Creator: Muesli1
Published: 2026-04-21 00:00:54
License: MIT

下载链接：

https://hf-mirror.com/datasets/Muesli1/dclm-CORE-score

下载链接

链接失效反馈

官方服务：

资源简介：

---
license: mit
---

This data was directly extracted from the batches that [DCLM](https://github.com/mlfoundations/dclm/tree/361714bdd60bb9b7f4b2d8354cebbf0dec0c329e) creates internally to calculate the CORE v2 score.

It includes the following metrics:

- mmlu_fewshot
- hellaswag_zeroshot
- jeopardy
- bigbench_qa_wikidata
- arc_easy
- arc_challenge
- copa
- commonsense_qa
- piqa
- openbook_qa
- lambada_openai
- hellaswag
- winograd
- winogrande
- bigbench_dyck_languages
- agi_eval_lsat_ar
- bigbench_cs_algorithms
- bigbench_operators
- bigbench_repeat_copy_logic
- squad
- coqa
- boolq
- bigbench_language_identification

as defined by the official [mmlu_and_lowvar.yaml](https://github.com/mlfoundations/dclm/blob/361714bdd60bb9b7f4b2d8354cebbf0dec0c329e/eval/mmlu_and_lowvar.yaml) file.

To use this data to evaluate a given model to produce the CORE score (and equivalent other results as calculated by the DCLM codebase), you can use:

```python
import inspect
import json
import os
from pathlib import Path
from typing import Any

import pandas as pd
import torch
import tqdm
from torch.nn import functional as F
from transformers import AutoModelForCausalLM, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast


class InContextLearningLMAccuracy:

    def __init__(self):
        self.correct = torch.tensor(0.)
        self.total = torch.tensor(0.)

    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
            cont_tok_pred = outputs[batch_idx].index_select(
                dim=0,
                index=cont_idx - 1,
            ).argmax(dim=-1)
            cont_tok_targ = labels[batch_idx].index_select(
                dim=0,
                index=cont_idx - 1,
            )

            correct = (cont_tok_pred == cont_tok_targ).all().int().item()
            self.correct += correct
            self.total += torch.tensor(1.0)

    def compute(self):
        return self.correct / self.total


class InContextLearningMultipleChoiceAccuracy:

    def __init__(self):
        self.correct = torch.tensor(0.0)
        self.total = torch.tensor(0.0)

    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
        perplexities = []
        for batch_idx, cont_idx in enumerate(batch['continuation_indices']):
            # continuation indices refer to indices in the original input's token space
            cont_tok_logits = outputs[batch_idx].index_select(
                dim=0,
                index=cont_idx - 1,
            )
            # labels have been shifted left by one index, so the cont_idx needs to be shifted as well.
            cont_tok_targ = labels[batch_idx].index_select(
                dim=0,
                index=cont_idx - 1,
            )
            cross_entropy = F.cross_entropy(cont_tok_logits, cont_tok_targ)
            perplexity = torch.exp(cross_entropy)
            perplexities.append(perplexity)

        for (start, end), gold_idx in zip(
            batch['choice_groupings'],
            batch['gold_indices'],
        ):
            subset = perplexities[start:end]
            idx_min = subset.index(min(subset))

            if idx_min == gold_idx:
                self.correct += torch.tensor(1.0)
            self.total += torch.tensor(1.0)

    def compute(self):
        return self.correct.float() / self.total


CURRENT_VERSION = "v2"


def get_aggregated_results(data: dict[str, Any], eval_metadata: pd.DataFrame, aggregation_json: dict[str, Any],
                           version=CURRENT_VERSION):
    data["missing tasks"] = str(
        [task for task in eval_metadata["Eval Task"] if task not in data["eval_metrics"]["icl"]]
    )
    eval_metadata["results"] = eval_metadata["Eval Task"].map(data["eval_metrics"]["icl"])
    eval_metadata["centered results"] = (
        eval_metadata["results"].astype(float) - 0.01 * eval_metadata["Random baseline"].astype(float)
    ) / (1.0 - 0.01 * eval_metadata["Random baseline"].astype(float))
    result_df = eval_metadata.groupby("Task Category").agg({"centered results": "mean"}).reset_index()
    data["aggregated_task_categories_centered"] = result_df.set_index("Task Category").to_dict()["centered results"]
    data["aggregated_centered_results"] = eval_metadata["centered results"].mean()
    data["aggregated_results"] = eval_metadata["results"].mean()

    for key in aggregation_json:
        tasks = aggregation_json[key]
        data[key] = eval_metadata[eval_metadata["Eval Task"].isin(tasks)]["results"].mean()
        data[f"{key}_centered"] = eval_metadata[eval_metadata["Eval Task"].isin(tasks)]["centered results"].mean()

    # add the new names
    if 'low_variance_datasets_centered' in data:
        # missing task for Core:
        missing_tasks_for_core = [task for task in aggregation_json['low_variance_datasets'] if
                                  task not in data["eval_metrics"]["icl"]]
        if missing_tasks_for_core:
            data[f'Core_{version}'] = "N/A due to missing tasks: " + str(missing_tasks_for_core)
        else:
            data[f'Core_{version}'] = data['low_variance_datasets_centered']
    if 'aggregated_centered_results' in data:
        if data["missing tasks"] != "[]":
            data[f'Extended_{version}'] = "N/A due to missing tasks: " + data["missing tasks"]
        else:
            data[f'Extended_{version}'] = data['aggregated_centered_results']

    data['eval_version'] = version

    # Handle migration for old results
    if version == CURRENT_VERSION:
        if 'Core' in data and 'Core_v1' not in data:
            # If updating an older results file, migrate CORE --> Core_v1 (which won't be present)
            data['Core_v1'] = data['Core']
            del data['Core']
        if 'Extended' in data and 'Extended_v1' not in data:
            # If updating an older results file, migrate Extended --> Extended_v1 (which won't be present)
            data['Extended_v1'] = data['Extended']
            del data['Extended']

    # Set unversioned keys to point to current version if it is present
    if f'Core_{CURRENT_VERSION}' in data:
        data['Core'] = data[f'Core_{CURRENT_VERSION}']
    if f'Extended_{CURRENT_VERSION}' in data:
        data['Extended'] = data[f'Extended_{CURRENT_VERSION}']

    # Extra data
    data["eval_metrics"]["icl_centered"] = (
        eval_metadata[["Eval Task", "centered results"]]
        .dropna(subset=["centered results"])
        .set_index("Eval Task")["centered results"]
        .to_dict()
    )

    return data


@torch.inference_mode
def evaluate_core_score(core_dir: Path, model: PreTrainedModel, verbose: bool = True):
    device = model.device
    model.eval()

    model_forward_args = set(inspect.signature(model.forward).parameters.keys())
    task_dir_names = os.listdir(core_dir)

    # Reading tasks
    tasks: dict[str, list[str]] = {}
    for dir_name in sorted(task_dir_names):
        if (core_dir / dir_name).is_file():
            continue

        split_name = dir_name.split("__")
        task_name = split_name[1]
        if len(split_name) == 4:
            sub_task_name = split_name[3]
        else:
            sub_task_name = None

        if task_name in tasks:
            assert sub_task_name is not None
            tasks[task_name].append(dir_name)
        else:
            tasks[task_name] = [dir_name]

    evaluation_results = {}

    # Evaluating tasks
    for task_name, dir_names in tasks.items():
        eval_sum = torch.scalar_tensor(0.0, dtype=torch.float32, device=device)
        eval_amount = torch.scalar_tensor(0, dtype=torch.int32, device=device)

        for dir_name in dir_names:
            split_name = dir_name.split("__")
            if len(split_name) == 4:
                # print("Subtask", split_name[2])
                sub_task_name = split_name[3]
            else:
                sub_task_name = None

            task_title = task_name if sub_task_name is None else f"{task_name} - {sub_task_name}"

            with open(core_dir / dir_name / "metric_name.txt") as f:
                metric_name = f.readline()
            with open(core_dir / dir_name / "batch_amount.txt") as f:
                total_batch_amount = int(f.readline())

            if metric_name == "InContextLearningLMAccuracy":
                metric = InContextLearningLMAccuracy()
            elif metric_name == "InContextLearningMultipleChoiceAccuracy":
                metric = InContextLearningMultipleChoiceAccuracy()
            else:
                raise ValueError(f"Unknown metric: '{metric_name}'")

            batch_files = sorted(f for f in (core_dir / dir_name).glob(f"batch_*.pt") if f.is_file())
            with tqdm.tqdm(desc=task_title, total=total_batch_amount) as pbar:
                for file in batch_files:
                    batches: list[dict[str, torch.Tensor | list | str]] = torch.load(file, map_location=device)
                    for batch in batches:
                        assert batch["mode"] == "icl_task"
                        labels: torch.Tensor = batch.pop('labels')
                        # HF CausalLM models internally shift labels before computing loss, so we do the same here
                        labels[:, :-1] = labels[:, 1:].clone()
                        labels[:, -1] = -100

                        outputs: CausalLMOutputWithPast = model(
                            **{k: v for (k, v) in batch.items() if k in model_forward_args})
                        logits = outputs["logits"]
                        metric.update(batch, logits, labels)
                        pbar.update()

            metric_result = metric.compute()
            eval_sum += metric_result.item()
            eval_amount += 1

            if sub_task_name is not None and verbose:
                print(f"{task_title}: {metric_result.item():.3f}")

        mean_eval = (eval_sum / eval_amount).item()
        if verbose:
            print(f"{task_name}: {mean_eval:.3f}")
        evaluation_results[task_name] = mean_eval

    # Print direct results
    if verbose:
        print(json.dumps(evaluation_results, indent=4))

    # Get necessary metadata for CORE score calculation
    with open(core_dir / "additional_aggregation.json", "r") as f:
        aggregation_json = json.load(f)

    eval_metadata = pd.read_csv(core_dir / "eval_meta_data.csv")

    output = {
        "eval_metrics": {
            "icl": evaluation_results
        }
    }

    # Calculate CORE score
    output = get_aggregated_results(output, eval_metadata, aggregation_json)

    if verbose:
        print(json.dumps(output, indent=4))

    return output
```

with an example execution of

```python
evaluate_core_score(
    Path("path/to/downloaded/dataset/core"),
    AutoModelForCausalLM.from_pretrained(
        "allenai/OLMo-1B-0724-hf",
        trust_remote_code=True
    ).to(device="cuda"),
    verbose=True
)
```

giving the result:

```json
{
    "eval_metrics": {
        "icl": {
            "mmlu_fewshot": 0.28318944573402405,
            "hellaswag_zeroshot": 0.6577000021934509,
            "jeopardy": 0.2256999909877777,
            "bigbench_qa_wikidata": 0.6697999835014343,
            "arc_easy": 0.6444000005722046,
            "arc_challenge": 0.3463999927043915,
            "copa": 0.75,
            "commonsense_qa": 0.195700004696846,
            "piqa": 0.7638999819755554,
            "openbook_qa": 0.3659999966621399,
            "lambada_openai": 0.6104999780654907,
            "hellaswag": 0.6629999876022339,
            "winograd": 0.7985000014305115,
            "winogrande": 0.6179999709129333,
            "bigbench_dyck_languages": 0.2669999897480011,
            "agi_eval_lsat_ar": 0.2825999855995178,
            "bigbench_cs_algorithms": 0.4758000075817108,
            "bigbench_operators": 0.23810000717639923,
            "bigbench_repeat_copy_logic": 0.031199999153614044,
            "squad": 0.0,
            "coqa": 0.0340999998152256,
            "boolq": 0.6370000243186951,
            "bigbench_language_identification": 0.2700999975204468
        },
        "icl_centered": {
            "hellaswag_zeroshot": 0.5436000029246012,
            "jeopardy": 0.2256999909877777,
            "bigbench_qa_wikidata": 0.6697999835014343,
            "arc_easy": 0.5258666674296061,
            "arc_challenge": 0.12853332360585532,
            "mmlu_fewshot": 0.044252594312032066,
            "copa": 0.5,
            "commonsense_qa": -0.34723617303710885,
            "piqa": 0.5277999639511108,
            "openbook_qa": 0.15466666221618652,
            "lambada_openai": 0.6104999780654907,
            "hellaswag": 0.5506666501363119,
            "winograd": 0.597000002861023,
            "winogrande": 0.2359999418258667,
            "bigbench_language_identification": 0.026799996693929035,
            "bigbench_dyck_languages": 0.2669999897480011,
            "agi_eval_lsat_ar": 0.043466647466023765,
            "bigbench_cs_algorithms": 0.4758000075817108,
            "bigbench_operators": 0.23810000717639923,
            "bigbench_repeat_copy_logic": 0.031199999153614044,
            "squad": 0.0,
            "coqa": 0.0340999998152256,
            "boolq": 0.044736906101829135
        }
    },
    "missing tasks": "['mmlu_zeroshot', 'triviaqa_sm_sub', 'gsm8k_cot', 'agi_eval_sat_math_cot', 'aqua_cot', 'svamp_cot', 'bigbench_misconceptions', 'siqa', 'bigbench_novel_concepts', 'bigbench_strange_stories', 'bigbench_strategy_qa', 'bigbench_conlang_translation', 'bigbench_conceptual_combinations', 'bigbench_elementary_math_qa', 'bigbench_logical_deduction', 'simple_arithmetic_nospaces', 'simple_arithmetic_withspaces', 'math_qa', 'logi_qa', 'pubmed_qa_labeled', 'agi_eval_lsat_rc', 'agi_eval_lsat_lr', 'bigbench_understanding_fables', 'agi_eval_sat_en', 'winogender_mc_female', 'winogender_mc_male', 'enterprise_pii_classification', 'bbq', 'gpqa_main', 'gpqa_diamond']",
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.20880761328254713,
        "language understanding": 0.4274277620845371,
        "reading comprehension": 0.026278968639018244,
        "safety": NaN,
        "symbolic problem solving": 0.21111333022514978,
        "world knowledge": 0.3188305119673411
    },
    "aggregated_centered_results": 0.2664501366311704,
    "aggregated_results": 0.4273343194762002,
    "rw_small": 0.6089166651169459,
    "rw_small_centered": 0.32978947364795974,
    "95%_CI_above": 0.4416857097475302,
    "95%_CI_above_centered": 0.2884428834052838,
    "99%_CI_above": 0.4416857097475302,
    "99%_CI_above_centered": 0.2884428834052838,
    "low_variance_datasets": 0.43388635919175367,
    "low_variance_datasets_centered": 0.276550024918404,
    "Core_v2": 0.276550024918404,
    "Extended_v2": "N/A due to missing tasks: ['mmlu_zeroshot', 'triviaqa_sm_sub', 'gsm8k_cot', 'agi_eval_sat_math_cot', 'aqua_cot', 'svamp_cot', 'bigbench_misconceptions', 'siqa', 'bigbench_novel_concepts', 'bigbench_strange_stories', 'bigbench_strategy_qa', 'bigbench_conlang_translation', 'bigbench_conceptual_combinations', 'bigbench_elementary_math_qa', 'bigbench_logical_deduction', 'simple_arithmetic_nospaces', 'simple_arithmetic_withspaces', 'math_qa', 'logi_qa', 'pubmed_qa_labeled', 'agi_eval_lsat_rc', 'agi_eval_lsat_lr', 'bigbench_understanding_fables', 'agi_eval_sat_en', 'winogender_mc_female', 'winogender_mc_male', 'enterprise_pii_classification', 'bbq', 'gpqa_main', 'gpqa_diamond']",
    "eval_version": "v2",
    "Core": 0.276550024918404,
    "Extended": "N/A due to missing tasks: ['mmlu_zeroshot', 'triviaqa_sm_sub', 'gsm8k_cot', 'agi_eval_sat_math_cot', 'aqua_cot', 'svamp_cot', 'bigbench_misconceptions', 'siqa', 'bigbench_novel_concepts', 'bigbench_strange_stories', 'bigbench_strategy_qa', 'bigbench_conlang_translation', 'bigbench_conceptual_combinations', 'bigbench_elementary_math_qa', 'bigbench_logical_deduction', 'simple_arithmetic_nospaces', 'simple_arithmetic_withspaces', 'math_qa', 'logi_qa', 'pubmed_qa_labeled', 'agi_eval_lsat_rc', 'agi_eval_lsat_lr', 'bigbench_understanding_fables', 'agi_eval_sat_en', 'winogender_mc_female', 'winogender_mc_male', 'enterprise_pii_classification', 'bbq', 'gpqa_main', 'gpqa_diamond']"
}
```

which is equivalent (except missing some non-relevant fields) to the data produced by the DCLM codebase using

```bash
torchrun --nproc_per_node 1 eval/eval_openlm_ckpt.py --hf-model allenai/OLMo-1B-0724-hf --tokenizer allenai/OLMo-1B-0724-hf --eval-yaml "eval/mmlu_and_lowvar.yaml" --output-file exp_data/evals/olmo_eval_mmlu_and_lowvar.json --donot-compute-perplexity
```

应用场景：