PrimeIntellect/Hendrycks-Math
收藏Hugging Face2025-11-29 更新2026-01-03 收录
下载链接:
https://hf-mirror.com/datasets/PrimeIntellect/Hendrycks-Math
下载链接
链接失效反馈官方服务:
资源简介:
---
license: apache-2.0
pretty_name: Hendrycks-Math
dataset_info:
features:
- name: question
dtype: string
- name: answer
dtype: string
- name: info
struct:
- name: problem_id
dtype: string
- name: task_type
dtype: string
- name: difficulty
dtype: float64
splits:
- name: train
num_bytes: 2049724
num_examples: 7474
download_size: 908348
dataset_size: 2049724
configs:
- config_name: default
data_files:
- split: train
path: data/train-*
---
# Hendrycks-Math
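Hendrycks-Math is a single `train` split of 7,474 math problems in question/answer form; each row also carries a `problem_id`, a `task_type` of `verifiable_math`, and a numeric `difficulty`. It is derived from `justus27/math-hendrycks-genesys-format`.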
## Generation
This dataset was created by running
````bash
uv run hendrycks-math.py -H -p
````
````python
# hendrycks-math.py
# /// script
# requires-python = ">=3.12"
# dependencies = ["datasets>=4.0.0", "jinja2"]
# ///
import argparse
import json
import sys
import time
from pathlib import Path
from typing import cast
from huggingface_hub import DatasetCard, DatasetCardData, create_repo, whoami
from datasets import Dataset, load_dataset
def prepare_hendrycks_math() -> Dataset:
    """Load the source math split and reshape it into the published schema.

    For every source row, ``prompt`` becomes ``question``, the ``ground_truth``
    entry of the JSON-encoded ``verification_info`` becomes ``answer``,
    ``problem_id`` is nested under ``info`` next to a constant ``task_type``,
    and ``difficulty`` is carried over unchanged. Every other source column is
    dropped.

    Returns:
        The converted dataset with columns ``question``, ``answer``, ``info``
        and ``difficulty``.
    """
    kept_columns = ["question", "answer", "info", "difficulty"]

    def process_example(example):
        # `verification_info` is stored as a JSON string; decode it to reach the answer.
        verification = json.loads(example["verification_info"])
        info = {"problem_id": example["problem_id"], "task_type": "verifiable_math"}
        return {
            "question": example["prompt"],
            "answer": verification["ground_truth"],
            "info": info,
            "difficulty": example["difficulty"],
        }

    source = load_dataset("justus27/math-hendrycks-genesys-format", split="train")
    converted = cast(Dataset, source).map(process_example)
    return converted.select_columns(kept_columns)
def push_card_to_hub(repo_name: str, push_to_hub: bool):
    """Render the dataset card from the template and upload it when requested.

    The card is always rendered (from ``templates/CARD.md``) with the dataset
    name, the command line used to invoke this script, and this script's own
    source text; it is only uploaded when ``push_to_hub`` is true.

    Args:
        repo_name: Target repository as ``<owner>/<dataset>``; must contain
            exactly one ``/``.
        push_to_hub: Whether to upload the rendered card to the Hub.
    """
    _owner, dataset_name = repo_name.split("/")
    card_meta = DatasetCardData(pretty_name=dataset_name, license="apache-2.0")

    # Values substituted into the card template.
    script_path = Path(__file__)
    cli_args = " ".join(sys.argv[1:])
    template_fields = {
        "dataset_name": dataset_name,
        "cmd": f"uv run {script_path.stem}.py {cli_args}",
        "source": script_path.read_text(encoding="utf-8", errors="replace"),
    }
    card = DatasetCard.from_template(
        card_data=card_meta,
        template_path="templates/CARD.md",
        **template_fields,
    )

    if not push_to_hub:
        print("ℹ️ Skipped pushing to HF Hub. To push, use the `--push-to-hub` or `-H` flag.")
        return

    print(f"Pushing to `{repo_name}`")
    card.push_to_hub(repo_name, repo_type="dataset")
    print(f"✅ Pushed card to `{repo_name}` to HF Hub")
def prepare_data(repo_name: str, push_to_hub: bool, private: bool):
    """Build the converted split, report how long it took, and optionally upload it.

    Args:
        repo_name: Target repository as ``<owner>/<dataset>``.
        push_to_hub: Whether to upload the prepared split to the Hub.
        private: Passed through as the ``private`` flag of the upload.
    """
    print("⚙️ Preparing hendrycks math split")
    started_at = time.time()
    split = prepare_hendrycks_math()
    elapsed = time.time() - started_at
    print(f"✅ Prepared hendrycks math split in {elapsed:.2f} seconds")

    if not push_to_hub:
        return

    split.push_to_hub(repo_name, private=private)
    print(f"✅ Pushed hendrycks math split to `{repo_name}` to HF Hub")
def main(repo_name: str, push_to_hub: bool, private: bool):
    """Build the dataset card and data split, uploading both when requested.

    Args:
        repo_name: Target repository as ``<owner>/<dataset>``.
        push_to_hub: When true, make sure the repository exists and upload the
            card and the data. When false, no repository is created and
            nothing is uploaded.
        private: Visibility used when creating the repository and pushing data.
    """
    if push_to_hub:
        # Creating the repository is a remote, authenticated side effect. The
        # caller only verifies write access when a push was requested, so a
        # dry run must not reach the Hub here either.
        create_repo(repo_name, private=private, repo_type="dataset", exist_ok=True)

    # The card goes first, then the data split.
    push_card_to_hub(repo_name, push_to_hub)
    prepare_data(repo_name, push_to_hub, private)

    if push_to_hub:
        print(f"✅ Pushed dataset to https://huggingface.co/datasets/{repo_name}")
def check_write_access(org: str):
    """Verify that the active Hugging Face token may write to repositories of ``org``.

    Access is confirmed only when the token's fine-grained scopes contain an
    entry whose entity name equals ``org`` and whose permissions include
    ``repo.write``.

    Args:
        org: User or organization name that will own the dataset repository.

    Raises:
        ValueError: If ``whoami()`` fails (reported as not being logged in), or
            if the token carries no matching fine-grained ``repo.write`` scope.
    """
    # Keep the try body to the single call that signals a missing or invalid
    # login. Payload lookups live outside it so that a token without
    # fine-grained scopes is reported as lacking write access rather than
    # being misreported as "not logged in".
    try:
        info = whoami()
    except Exception as err:
        raise ValueError("❌ You are not logged in. Please run `hf auth login` or `export HF_TOKEN=...`") from err

    # Read the payload defensively: any of these keys may be absent.
    access_token = (info.get("auth") or {}).get("accessToken") or {}
    token = access_token.get("displayName", "<unknown>")
    scoped = (access_token.get("fineGrained") or {}).get("scoped") or []

    is_authed = any(
        (entity.get("entity") or {}).get("name") == org and "repo.write" in (entity.get("permissions") or [])
        for entity in scoped
    )
    if not is_authed:
        raise ValueError(f"❌ Your current token `{token}` does not have write access to `{org}`")
    print(f"✅ Confirmed write access with token `{token}` to `{org}`")
if __name__ == "__main__":
    # Command-line entry point: parse options, validate them, then build and
    # (optionally) publish the dataset.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--username", "-U", default="PrimeIntellect", type=str, help="The username to push the dataset to."
    )
    parser.add_argument("--dataset-name", "-D", default="Hendrycks-Math", type=str, help="The dataset name.")
    parser.add_argument("--dataset-private", "-p", action="store_true", help="Whether to make the dataset private.")
    parser.add_argument("--push-to-hub", "-H", action="store_true", help="Whether to push the dataset to the hub.")
    args = parser.parse_args()

    # Validate args. `assert` is stripped under `python -O`, so report bad
    # input through the parser, which prints usage and exits with status 2.
    if "/" in args.dataset_name:
        parser.error("Dataset name must not include the username")

    # Only a real push needs (and checks) write access.
    if args.push_to_hub:
        check_write_access(args.username)

    main(
        repo_name=f"{args.username}/{args.dataset_name}",
        push_to_hub=args.push_to_hub,
        private=args.dataset_private,
    )
````
许可证:apache-2.0
显示名称:亨德里克斯数学(Hendrycks-Math)
数据集信息:
特征:
- 名称:question(问题),数据类型:字符串
- 名称:answer(答案),数据类型:字符串
- 名称:info(信息),结构体:
- 名称:problem_id(题目ID),数据类型:字符串
- 名称:task_type(任务类型),数据类型:字符串
- 名称:difficulty(难度),数据类型:float64(双精度浮点数)
拆分:
- 名称:train(训练集),字节数:2049724,示例数:7474
下载大小:908348,数据集总大小:2049724
配置:
- 配置名称:default(默认配置),数据文件:
- 拆分:train,路径:data/train-*
# 亨德里克斯数学(Hendrycks-Math)
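本数据集仅含一个 `train` 拆分,共 7,474 道数学题,采用问题/答案格式;每条样本还带有 `problem_id`、取值为 `verifiable_math` 的 `task_type` 以及数值型的 `difficulty`。数据源自 `justus27/math-hendrycks-genesys-format`。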
## 数据集生成
本数据集通过运行以下命令生成:
```bash
uv run hendrycks-math.py -H -p
```
python
# hendrycks-math.py
# /// script
# requires-python = ">=3.12"
# dependencies = ["datasets>=4.0.0", "jinja2"]
# ///
import argparse
import json
import sys
import time
from pathlib import Path
from typing import cast
from huggingface_hub import DatasetCard, DatasetCardData, create_repo, whoami
from datasets import Dataset, load_dataset
def prepare_hendrycks_math() -> Dataset:
    """Load the source math split and reshape it into the published schema.

    For every source row, ``prompt`` becomes ``question``, the ``ground_truth``
    entry of the JSON-encoded ``verification_info`` becomes ``answer``,
    ``problem_id`` is nested under ``info`` next to a constant ``task_type``,
    and ``difficulty`` is carried over unchanged. Every other source column is
    dropped.

    Returns:
        The converted dataset with columns ``question``, ``answer``, ``info``
        and ``difficulty``.
    """
    kept_columns = ["question", "answer", "info", "difficulty"]

    def process_example(example):
        # `verification_info` is stored as a JSON string; decode it to reach the answer.
        verification = json.loads(example["verification_info"])
        info = {"problem_id": example["problem_id"], "task_type": "verifiable_math"}
        return {
            "question": example["prompt"],
            "answer": verification["ground_truth"],
            "info": info,
            "difficulty": example["difficulty"],
        }

    source = load_dataset("justus27/math-hendrycks-genesys-format", split="train")
    converted = cast(Dataset, source).map(process_example)
    return converted.select_columns(kept_columns)
def push_card_to_hub(repo_name: str, push_to_hub: bool):
    """Render the dataset card from the template and upload it when requested.

    The card is always rendered (from ``templates/CARD.md``) with the dataset
    name, the command line used to invoke this script, and this script's own
    source text; it is only uploaded when ``push_to_hub`` is true.

    Args:
        repo_name: Target repository as ``<owner>/<dataset>``; must contain
            exactly one ``/``.
        push_to_hub: Whether to upload the rendered card to the Hub.
    """
    # Create the dataset card.
    _owner, dataset_name = repo_name.split("/")
    card_meta = DatasetCardData(pretty_name=dataset_name, license="apache-2.0")

    # Values substituted into the card template.
    script_path = Path(__file__)
    cli_args = " ".join(sys.argv[1:])
    template_fields = {
        "dataset_name": dataset_name,
        "cmd": f"uv run {script_path.stem}.py {cli_args}",
        "source": script_path.read_text(encoding="utf-8", errors="replace"),
    }
    card = DatasetCard.from_template(
        card_data=card_meta,
        template_path="templates/CARD.md",
        **template_fields,
    )

    # Push to the Hugging Face Hub only when asked to.
    if not push_to_hub:
        print("ℹ️ 跳过推送至Hugging Face Hub。如需推送,请使用`--push-to-hub`或`-H`参数。")
        return

    print(f"正在推送至 `{repo_name}`")
    card.push_to_hub(repo_name, repo_type="dataset")
    print(f"✅ 已将数据集卡片推送至Hugging Face Hub的`{repo_name}`")
def prepare_data(repo_name: str, push_to_hub: bool, private: bool):
    """Build the converted split, report how long it took, and optionally upload it.

    Args:
        repo_name: Target repository as ``<owner>/<dataset>``.
        push_to_hub: Whether to upload the prepared split to the Hub.
        private: Passed through as the ``private`` flag of the upload.
    """
    print("⚙️ 正在准备亨德里克斯数学数据集拆分")
    started_at = time.time()
    split = prepare_hendrycks_math()
    elapsed = time.time() - started_at
    print(f"✅ 已在{elapsed:.2f}秒内完成亨德里克斯数学数据集拆分的准备工作")

    if not push_to_hub:
        return

    split.push_to_hub(repo_name, private=private)
    print(f"✅ 已将亨德里克斯数学数据集拆分推送至Hugging Face Hub的`{repo_name}`")
def main(repo_name: str, push_to_hub: bool, private: bool):
    """Build the dataset card and data split, uploading both when requested.

    Args:
        repo_name: Target repository as ``<owner>/<dataset>``.
        push_to_hub: When true, make sure the repository exists and upload the
            card and the data. When false, no repository is created and
            nothing is uploaded.
        private: Visibility used when creating the repository and pushing data.
    """
    if push_to_hub:
        # Creating the repository is a remote, authenticated side effect. The
        # caller only verifies write access when a push was requested, so a
        # dry run must not reach the Hub here either.
        create_repo(repo_name, private=private, repo_type="dataset", exist_ok=True)

    # The card goes first, then the data split.
    push_card_to_hub(repo_name, push_to_hub)
    prepare_data(repo_name, push_to_hub, private)

    if push_to_hub:
        print(f"✅ 已将数据集推送至 https://huggingface.co/datasets/{repo_name}")
def check_write_access(org: str):
    """Verify that the active Hugging Face token may write to repositories of ``org``.

    Access is confirmed only when the token's fine-grained scopes contain an
    entry whose entity name equals ``org`` and whose permissions include
    ``repo.write``.

    Args:
        org: User or organization name that will own the dataset repository.

    Raises:
        ValueError: If ``whoami()`` fails (reported as not being logged in), or
            if the token carries no matching fine-grained ``repo.write`` scope.
    """
    # Keep the try body to the single call that signals a missing or invalid
    # login. Payload lookups live outside it so that a token without
    # fine-grained scopes is reported as lacking write access rather than
    # being misreported as "not logged in".
    try:
        info = whoami()
    except Exception as err:
        raise ValueError("❌ 您尚未登录,请运行`hf auth login`或设置环境变量`export HF_TOKEN=...`") from err

    # Read the payload defensively: any of these keys may be absent.
    access_token = (info.get("auth") or {}).get("accessToken") or {}
    token = access_token.get("displayName", "<unknown>")
    scoped = (access_token.get("fineGrained") or {}).get("scoped") or []

    is_authed = any(
        (entity.get("entity") or {}).get("name") == org and "repo.write" in (entity.get("permissions") or [])
        for entity in scoped
    )
    if not is_authed:
        raise ValueError(f"❌ 当前使用的令牌`{token}`不具备对`{org}`的写入权限")
    print(f"✅ 已确认令牌`{token}`具备对`{org}`的写入权限")
if __name__ == "__main__":
    # Command-line entry point: parse options, validate them, then build and
    # (optionally) publish the dataset.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--username", "-U", default="PrimeIntellect", type=str, help="要将数据集推送至的用户名。"
    )
    parser.add_argument("--dataset-name", "-D", default="Hendrycks-Math", type=str, help="数据集名称。")
    parser.add_argument("--dataset-private", "-p", action="store_true", help="是否将数据集设为私有。")
    parser.add_argument("--push-to-hub", "-H", action="store_true", help="是否将数据集推送至Hugging Face Hub。")
    args = parser.parse_args()

    # Validate args. `assert` is stripped under `python -O`, so report bad
    # input through the parser, which prints usage and exits with status 2.
    if "/" in args.dataset_name:
        parser.error("数据集名称不得包含用户名")

    # Only a real push needs (and checks) write access.
    if args.push_to_hub:
        check_write_access(args.username)

    main(
        repo_name=f"{args.username}/{args.dataset_name}",
        push_to_hub=args.push_to_hub,
        private=args.dataset_private,
    )
提供机构:
PrimeIntellect



