Muesli1/dclm-CORE-score
收藏Hugging Face2026-04-21 更新2026-04-26 收录
下载链接:
https://hf-mirror.com/datasets/Muesli1/dclm-CORE-score
下载链接
链接失效反馈官方服务:
资源简介:
---
license: mit
---
This data was directly extracted from the batches that [DCLM](https://github.com/mlfoundations/dclm/tree/361714bdd60bb9b7f4b2d8354cebbf0dec0c329e) creates internally to calculate the CORE v2 score.
It covers the following evaluation tasks:
- mmlu_fewshot
- hellaswag_zeroshot
- jeopardy
- bigbench_qa_wikidata
- arc_easy
- arc_challenge
- copa
- commonsense_qa
- piqa
- openbook_qa
- lambada_openai
- hellaswag
- winograd
- winogrande
- bigbench_dyck_languages
- agi_eval_lsat_ar
- bigbench_cs_algorithms
- bigbench_operators
- bigbench_repeat_copy_logic
- squad
- coqa
- boolq
- bigbench_language_identification
as defined by the official [mmlu_and_lowvar.yaml](https://github.com/mlfoundations/dclm/blob/361714bdd60bb9b7f4b2d8354cebbf0dec0c329e/eval/mmlu_and_lowvar.yaml) file.
To evaluate a given model on this data and produce the CORE score (along with the other results that the DCLM codebase computes), you can use:
```python
import inspect
import json
import os
from pathlib import Path
from typing import Any
import pandas as pd
import torch
import tqdm
from torch.nn import functional as F
from transformers import AutoModelForCausalLM, PreTrainedModel
from transformers.modeling_outputs import CausalLMOutputWithPast
class InContextLearningLMAccuracy:
    """Exact-match accuracy for language-modelling continuations.

    A sample is counted as correct only if the arg-max prediction equals the
    target token at every continuation position of that sample.
    """

    def __init__(self):
        # Running counters, kept as float tensors.
        self.correct = torch.tensor(0.)
        self.total = torch.tensor(0.)

    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
        """Accumulate one batch into the running counters.

        Args:
            batch: Must provide ``'continuation_indices'``, one index tensor
                per sample.
            outputs: Model logits, indexed first by sample, then by position.
            labels: Target token ids, indexed like ``outputs``.
                NOTE(review): presumably already shifted left by one, which
                is why the indices are offset by one below -- confirm
                against the caller.
        """
        for row, cont_idx in enumerate(batch['continuation_indices']):
            # The same offset positions are read from logits and labels.
            positions = cont_idx - 1
            predicted = outputs[row].index_select(dim=0, index=positions).argmax(dim=-1)
            expected = labels[row].index_select(dim=0, index=positions)

            every_token_matches = (predicted == expected).all()
            self.correct += every_token_matches.int().item()
            self.total += torch.tensor(1.0)

    def compute(self):
        """Return the fraction of samples seen so far that matched exactly."""
        return self.correct / self.total
class InContextLearningMultipleChoiceAccuracy:
    """Multiple-choice accuracy based on per-choice continuation perplexity.

    Every row of a batch is one answer choice. Within each group of choices
    the one with the lowest perplexity is taken as the prediction and
    compared with the gold index.
    """

    def __init__(self):
        # Running counters, kept as float tensors.
        self.correct = torch.tensor(0.0)
        self.total = torch.tensor(0.0)

    def update(self, batch: dict, outputs: torch.Tensor, labels: torch.Tensor):
        """Accumulate one batch into the running counters.

        Args:
            batch: Must provide ``'continuation_indices'`` (one index tensor
                per row), ``'choice_groupings'`` (``(start, end)`` row ranges,
                one per question) and ``'gold_indices'`` (position of the
                correct choice inside each range).
            outputs: Model logits, indexed first by row, then by position.
            labels: Target token ids, indexed like ``outputs``.
        """
        # Continuation indices refer to the original input's token space.
        # Labels have been shifted left by one index, so the indices are
        # shifted by one as well before selecting logits and targets.
        perplexities = [
            torch.exp(
                F.cross_entropy(
                    outputs[row].index_select(dim=0, index=cont_idx - 1),
                    labels[row].index_select(dim=0, index=cont_idx - 1),
                )
            )
            for row, cont_idx in enumerate(batch['continuation_indices'])
        ]

        question_specs = zip(batch['choice_groupings'], batch['gold_indices'])
        for (start, end), gold_idx in question_specs:
            choices = perplexities[start:end]
            lowest = min(choices)
            predicted_choice = choices.index(lowest)
            if predicted_choice == gold_idx:
                self.correct += torch.tensor(1.0)
            self.total += torch.tensor(1.0)

    def compute(self):
        """Return the fraction of questions seen so far answered correctly."""
        return self.correct.float() / self.total
CURRENT_VERSION = "v2"


def get_aggregated_results(data: dict[str, Any], eval_metadata: pd.DataFrame, aggregation_json: dict[str, Any],
                           version=CURRENT_VERSION):
    """Aggregate per-task scores into category means and Core/Extended scores.

    Args:
        data: Result dict holding the raw per-task scores under
            ``data["eval_metrics"]["icl"]``. It is updated in place and
            returned.
        eval_metadata: Table with the columns "Eval Task", "Task Category"
            and "Random baseline" (multiplied by 0.01 before use). The
            columns "results" and "centered results" are written to it in
            place.
        aggregation_json: Maps an aggregate name to the task names it
            averages over. The Core score is taken from the
            "low_variance_datasets" aggregate.
        version: Suffix used for the ``Core_<version>`` and
            ``Extended_<version>`` keys.

    Returns:
        The same ``data`` dict, extended with the aggregated values.
    """
    icl_scores = data["eval_metrics"]["icl"]

    # Tasks listed in the metadata for which no score is available.
    data["missing tasks"] = str([task for task in eval_metadata["Eval Task"] if task not in icl_scores])

    # Centre every score on its random baseline: 0 is chance level, 1 is perfect.
    eval_metadata["results"] = eval_metadata["Eval Task"].map(icl_scores)
    chance_level = 0.01 * eval_metadata["Random baseline"].astype(float)
    eval_metadata["centered results"] = (
        (eval_metadata["results"].astype(float) - chance_level) / (1.0 - chance_level)
    )

    category_means = eval_metadata.groupby("Task Category").agg({"centered results": "mean"})
    data["aggregated_task_categories_centered"] = category_means.to_dict()["centered results"]
    data["aggregated_centered_results"] = eval_metadata["centered results"].mean()
    data["aggregated_results"] = eval_metadata["results"].mean()

    for key, tasks in aggregation_json.items():
        selected_rows = eval_metadata[eval_metadata["Eval Task"].isin(tasks)]
        data[key] = selected_rows["results"].mean()
        data[f"{key}_centered"] = selected_rows["centered results"].mean()

    # Add the new (versioned) names.
    if 'low_variance_datasets_centered' in data:
        # Core is only reported when every low-variance task has a score.
        missing_tasks_for_core = [
            task for task in aggregation_json['low_variance_datasets'] if task not in icl_scores
        ]
        if missing_tasks_for_core:
            core_value = "N/A due to missing tasks: " + str(missing_tasks_for_core)
        else:
            core_value = data['low_variance_datasets_centered']
        data[f'Core_{version}'] = core_value

    if 'aggregated_centered_results' in data:
        if data["missing tasks"] == "[]":
            extended_value = data['aggregated_centered_results']
        else:
            extended_value = "N/A due to missing tasks: " + data["missing tasks"]
        data[f'Extended_{version}'] = extended_value

    data['eval_version'] = version

    # Handle migration for old results: an unversioned key from an older
    # results file becomes the corresponding "_v1" key (which won't be present).
    if version == CURRENT_VERSION:
        for legacy_key in ('Core', 'Extended'):
            v1_key = f'{legacy_key}_v1'
            if legacy_key in data and v1_key not in data:
                data[v1_key] = data.pop(legacy_key)

    # Set unversioned keys to point to the current version if it is present.
    for plain_key in ('Core', 'Extended'):
        current_key = f'{plain_key}_{CURRENT_VERSION}'
        if current_key in data:
            data[plain_key] = data[current_key]

    # Extra data: centred score per task, leaving out tasks without a result.
    centered_per_task = eval_metadata[["Eval Task", "centered results"]].dropna(subset=["centered results"])
    data["eval_metrics"]["icl_centered"] = centered_per_task.set_index("Eval Task")["centered results"].to_dict()

    return data
def _parse_task_dir_name(dir_name: str) -> "tuple[str, str | None] | None":
    """Split a task directory name into ``(task_name, sub_task_name)``.

    Directory names are ``__``-separated: the second field is the task name
    and, when the name has exactly four fields, the fourth field is the
    sub-task name (otherwise the sub-task is ``None``). Returns ``None`` when
    the name has fewer than two fields and so cannot name a task.
    """
    fields = dir_name.split("__")
    if len(fields) < 2:
        return None
    sub_task_name = fields[3] if len(fields) == 4 else None
    return fields[1], sub_task_name


@torch.inference_mode()
def evaluate_core_score(core_dir: Path, model: PreTrainedModel, verbose: bool = True):
    """Evaluate ``model`` on the stored batches and compute the aggregated scores.

    ``core_dir`` is expected to contain ``additional_aggregation.json``,
    ``eval_meta_data.csv`` and one sub-directory per task or sub-task. Each of
    those sub-directories holds ``metric_name.txt``, ``batch_amount.txt`` and
    one or more ``batch_*.pt`` files, each storing a list of batches.

    Args:
        core_dir: Directory with the extracted evaluation data.
        model: Causal language model to evaluate. It is switched to eval mode
            and batches are loaded onto ``model.device``.
        verbose: When true, show progress bars and print per-task results and
            the final aggregated output.

    Returns:
        The dict produced by ``get_aggregated_results``; the raw per-task
        accuracies are stored under ``["eval_metrics"]["icl"]``.

    Raises:
        ValueError: If a task directory names a metric other than
            ``InContextLearningLMAccuracy`` or
            ``InContextLearningMultipleChoiceAccuracy``.
    """
    core_dir = Path(core_dir)  # also accept a plain string path
    device = model.device
    model.eval()
    # Batches may carry keys the model does not accept; only forward known ones.
    model_forward_args = set(inspect.signature(model.forward).parameters.keys())

    # Reading tasks: group the (sub-)task directories by task name.
    tasks: dict[str, list[str]] = {}
    for dir_name in sorted(os.listdir(core_dir)):
        if (core_dir / dir_name).is_file():
            continue
        parsed = _parse_task_dir_name(dir_name)
        if parsed is None:
            # Not a task directory (e.g. a stray cache folder): ignore it
            # instead of failing with an IndexError.
            continue
        task_name, sub_task_name = parsed
        if task_name in tasks:
            # Several directories for one task are only valid as sub-tasks.
            assert sub_task_name is not None, f"Duplicate task directory without sub-task: '{dir_name}'"
            tasks[task_name].append(dir_name)
        else:
            tasks[task_name] = [dir_name]

    evaluation_results = {}

    # Evaluating tasks
    for task_name, dir_names in tasks.items():
        # Accumulate in float32, as before, so reported numbers stay unchanged.
        eval_sum = torch.scalar_tensor(0.0, dtype=torch.float32, device=device)
        eval_amount = torch.scalar_tensor(0, dtype=torch.int32, device=device)

        for dir_name in dir_names:
            _, sub_task_name = _parse_task_dir_name(dir_name)
            task_title = task_name if sub_task_name is None else f"{task_name} - {sub_task_name}"
            task_dir = core_dir / dir_name

            with open(task_dir / "metric_name.txt") as f:
                # Strip so that a trailing newline does not break the lookup.
                metric_name = f.readline().strip()
            with open(task_dir / "batch_amount.txt") as f:
                total_batch_amount = int(f.readline())

            if metric_name == "InContextLearningLMAccuracy":
                metric = InContextLearningLMAccuracy()
            elif metric_name == "InContextLearningMultipleChoiceAccuracy":
                metric = InContextLearningMultipleChoiceAccuracy()
            else:
                raise ValueError(f"Unknown metric: '{metric_name}'")

            batch_files = sorted(f for f in task_dir.glob("batch_*.pt") if f.is_file())

            with tqdm.tqdm(desc=task_title, total=total_batch_amount) as pbar:
                for file in batch_files:
                    # NOTE(security): torch.load unpickles the file content;
                    # only use batch files obtained from a trusted source.
                    batches: list[dict[str, torch.Tensor | list | str]] = torch.load(file, map_location=device)
                    for batch in batches:
                        assert batch["mode"] == "icl_task"
                        labels: torch.Tensor = batch.pop('labels')
                        # HF CausalLM models internally shift labels before
                        # computing loss, so we do the same here.
                        labels[:, :-1] = labels[:, 1:].clone()
                        labels[:, -1] = -100

                        outputs: CausalLMOutputWithPast = model(
                            **{k: v for (k, v) in batch.items() if k in model_forward_args})
                        logits = outputs["logits"]
                        metric.update(batch, logits, labels)
                        pbar.update()

            metric_result = metric.compute()
            eval_sum += metric_result.item()
            eval_amount += 1
            if sub_task_name is not None and verbose:
                print(f"{task_title}: {metric_result.item():.3f}")

        # A task's score is the unweighted mean over its sub-tasks.
        mean_eval = (eval_sum / eval_amount).item()
        if verbose:
            print(f"{task_name}: {mean_eval:.3f}")
        evaluation_results[task_name] = mean_eval

    # Print direct results
    if verbose:
        print(json.dumps(evaluation_results, indent=4))

    # Get necessary metadata for CORE score calculation
    with open(core_dir / "additional_aggregation.json", "r") as f:
        aggregation_json = json.load(f)
    eval_metadata = pd.read_csv(core_dir / "eval_meta_data.csv")

    output = {
        "eval_metrics": {
            "icl": evaluation_results
        }
    }

    # Calculate CORE score
    output = get_aggregated_results(output, eval_metadata, aggregation_json)
    if verbose:
        print(json.dumps(output, indent=4))
    return output
```
with an example execution of
```python
# Point at the downloaded "core" directory and load the model onto the GPU,
# then run the evaluation with progress output enabled.
core_dir = Path("path/to/downloaded/dataset/core")
model = AutoModelForCausalLM.from_pretrained(
    "allenai/OLMo-1B-0724-hf",
    trust_remote_code=True,
).to(device="cuda")
evaluate_core_score(core_dir, model, verbose=True)
```
giving the result:
```json
{
"eval_metrics": {
"icl": {
"mmlu_fewshot": 0.28318944573402405,
"hellaswag_zeroshot": 0.6577000021934509,
"jeopardy": 0.2256999909877777,
"bigbench_qa_wikidata": 0.6697999835014343,
"arc_easy": 0.6444000005722046,
"arc_challenge": 0.3463999927043915,
"copa": 0.75,
"commonsense_qa": 0.195700004696846,
"piqa": 0.7638999819755554,
"openbook_qa": 0.3659999966621399,
"lambada_openai": 0.6104999780654907,
"hellaswag": 0.6629999876022339,
"winograd": 0.7985000014305115,
"winogrande": 0.6179999709129333,
"bigbench_dyck_languages": 0.2669999897480011,
"agi_eval_lsat_ar": 0.2825999855995178,
"bigbench_cs_algorithms": 0.4758000075817108,
"bigbench_operators": 0.23810000717639923,
"bigbench_repeat_copy_logic": 0.031199999153614044,
"squad": 0.0,
"coqa": 0.0340999998152256,
"boolq": 0.6370000243186951,
"bigbench_language_identification": 0.2700999975204468
},
"icl_centered": {
"hellaswag_zeroshot": 0.5436000029246012,
"jeopardy": 0.2256999909877777,
"bigbench_qa_wikidata": 0.6697999835014343,
"arc_easy": 0.5258666674296061,
"arc_challenge": 0.12853332360585532,
"mmlu_fewshot": 0.044252594312032066,
"copa": 0.5,
"commonsense_qa": -0.34723617303710885,
"piqa": 0.5277999639511108,
"openbook_qa": 0.15466666221618652,
"lambada_openai": 0.6104999780654907,
"hellaswag": 0.5506666501363119,
"winograd": 0.597000002861023,
"winogrande": 0.2359999418258667,
"bigbench_language_identification": 0.026799996693929035,
"bigbench_dyck_languages": 0.2669999897480011,
"agi_eval_lsat_ar": 0.043466647466023765,
"bigbench_cs_algorithms": 0.4758000075817108,
"bigbench_operators": 0.23810000717639923,
"bigbench_repeat_copy_logic": 0.031199999153614044,
"squad": 0.0,
"coqa": 0.0340999998152256,
"boolq": 0.044736906101829135
}
},
"missing tasks": "['mmlu_zeroshot', 'triviaqa_sm_sub', 'gsm8k_cot', 'agi_eval_sat_math_cot', 'aqua_cot', 'svamp_cot', 'bigbench_misconceptions', 'siqa', 'bigbench_novel_concepts', 'bigbench_strange_stories', 'bigbench_strategy_qa', 'bigbench_conlang_translation', 'bigbench_conceptual_combinations', 'bigbench_elementary_math_qa', 'bigbench_logical_deduction', 'simple_arithmetic_nospaces', 'simple_arithmetic_withspaces', 'math_qa', 'logi_qa', 'pubmed_qa_labeled', 'agi_eval_lsat_rc', 'agi_eval_lsat_lr', 'bigbench_understanding_fables', 'agi_eval_sat_en', 'winogender_mc_female', 'winogender_mc_male', 'enterprise_pii_classification', 'bbq', 'gpqa_main', 'gpqa_diamond']",
"aggregated_task_categories_centered": {
"commonsense reasoning": 0.20880761328254713,
"language understanding": 0.4274277620845371,
"reading comprehension": 0.026278968639018244,
"safety": NaN,
"symbolic problem solving": 0.21111333022514978,
"world knowledge": 0.3188305119673411
},
"aggregated_centered_results": 0.2664501366311704,
"aggregated_results": 0.4273343194762002,
"rw_small": 0.6089166651169459,
"rw_small_centered": 0.32978947364795974,
"95%_CI_above": 0.4416857097475302,
"95%_CI_above_centered": 0.2884428834052838,
"99%_CI_above": 0.4416857097475302,
"99%_CI_above_centered": 0.2884428834052838,
"low_variance_datasets": 0.43388635919175367,
"low_variance_datasets_centered": 0.276550024918404,
"Core_v2": 0.276550024918404,
"Extended_v2": "N/A due to missing tasks: ['mmlu_zeroshot', 'triviaqa_sm_sub', 'gsm8k_cot', 'agi_eval_sat_math_cot', 'aqua_cot', 'svamp_cot', 'bigbench_misconceptions', 'siqa', 'bigbench_novel_concepts', 'bigbench_strange_stories', 'bigbench_strategy_qa', 'bigbench_conlang_translation', 'bigbench_conceptual_combinations', 'bigbench_elementary_math_qa', 'bigbench_logical_deduction', 'simple_arithmetic_nospaces', 'simple_arithmetic_withspaces', 'math_qa', 'logi_qa', 'pubmed_qa_labeled', 'agi_eval_lsat_rc', 'agi_eval_lsat_lr', 'bigbench_understanding_fables', 'agi_eval_sat_en', 'winogender_mc_female', 'winogender_mc_male', 'enterprise_pii_classification', 'bbq', 'gpqa_main', 'gpqa_diamond']",
"eval_version": "v2",
"Core": 0.276550024918404,
"Extended": "N/A due to missing tasks: ['mmlu_zeroshot', 'triviaqa_sm_sub', 'gsm8k_cot', 'agi_eval_sat_math_cot', 'aqua_cot', 'svamp_cot', 'bigbench_misconceptions', 'siqa', 'bigbench_novel_concepts', 'bigbench_strange_stories', 'bigbench_strategy_qa', 'bigbench_conlang_translation', 'bigbench_conceptual_combinations', 'bigbench_elementary_math_qa', 'bigbench_logical_deduction', 'simple_arithmetic_nospaces', 'simple_arithmetic_withspaces', 'math_qa', 'logi_qa', 'pubmed_qa_labeled', 'agi_eval_lsat_rc', 'agi_eval_lsat_lr', 'bigbench_understanding_fables', 'agi_eval_sat_en', 'winogender_mc_female', 'winogender_mc_male', 'enterprise_pii_classification', 'bbq', 'gpqa_main', 'gpqa_diamond']"
}
```
which is equivalent (apart from a few omitted fields that are not relevant here) to the output produced by the DCLM codebase using
```bash
torchrun --nproc_per_node 1 eval/eval_openlm_ckpt.py --hf-model allenai/OLMo-1B-0724-hf --tokenizer allenai/OLMo-1B-0724-hf --eval-yaml "eval/mmlu_and_lowvar.yaml" --output-file exp_data/evals/olmo_eval_mmlu_and_lowvar.json --donot-compute-perplexity
```
提供机构:
Muesli1



