Hendrycks-Math
收藏 | 魔搭社区 | 2025-12-05 更新 | 2025-12-06 收录
下载链接:
https://modelscope.cn/datasets/PrimeIntellect/Hendrycks-Math
下载链接
链接失效反馈官方服务:
资源简介:
# Hendrycks-Math
<!-- Provide a quick summary of the dataset. -->
## Generation
This dataset was created by running:
````bash
uv run hendrycks-math.py -H -p
````
````python
# hendrycks-math.py
# /// script
# requires-python = ">=3.12"
# dependencies = ["datasets>=4.0.0", "jinja2"]
# ///
import argparse
import json
import sys
import time
from pathlib import Path
from typing import cast
from huggingface_hub import DatasetCard, DatasetCardData, create_repo, whoami
from datasets import Dataset, load_dataset
def prepare_hendrycks_math() -> Dataset:
    """Load the source train split and reshape every row into the published schema.

    Returns:
        A dataset restricted to the ``question``, ``answer``, ``info`` and
        ``difficulty`` columns.
    """
    kept_columns = ["question", "answer", "info", "difficulty"]

    def to_record(row):
        # The answer is the ``ground_truth`` entry of the JSON-encoded ``verification_info`` field.
        question = row["prompt"]
        answer = json.loads(row["verification_info"])["ground_truth"]
        info = {"problem_id": row["problem_id"], "task_type": "verifiable_math"}
        return {"question": question, "answer": answer, "info": info, "difficulty": row["difficulty"]}

    raw_split = cast(Dataset, load_dataset("justus27/math-hendrycks-genesys-format", split="train"))
    reshaped = raw_split.map(to_record)
    return reshaped.select_columns(kept_columns)
def push_card_to_hub(repo_name: str, push_to_hub: bool):
    """Render the dataset card from the template and optionally upload it.

    The card is always rendered (so the template and this script file are read
    even on a dry run); only the upload is conditional.

    Args:
        repo_name: Repository id in ``owner/name`` form; the part after the
            slash is used as the card's display name.
        push_to_hub: When true, the rendered card is pushed to the dataset
            repo; otherwise a hint about the push flag is printed.
    """
    # Build the dataset card
    _owner, dataset_name = repo_name.split("/")
    metadata = DatasetCardData(pretty_name=dataset_name, license="apache-2.0")

    # The card embeds both the command line used and the full text of this script.
    script_path = Path(__file__)
    invocation = f"uv run {script_path.stem}.py {' '.join(sys.argv[1:])}"
    script_source = script_path.read_text(encoding="utf-8", errors="replace")
    card = DatasetCard.from_template(
        card_data=metadata,
        template_path="templates/CARD.md",
        dataset_name=dataset_name,
        cmd=invocation,
        source=script_source,
    )

    # Dry run: report and stop before touching the Hub
    if not push_to_hub:
        print("ℹ️ Skipped pushing to HF Hub. To push, use the `--push-to-hub` or `-H` flag.")
        return

    # Upload to the HF hub
    print(f"Pushing to `{repo_name}`")
    card.push_to_hub(repo_name, repo_type="dataset")
    print(f"✅ Pushed card to `{repo_name}` to HF Hub")
def prepare_data(repo_name: str, push_to_hub: bool, private: bool):
    """Build the processed split, report how long it took, and optionally upload it.

    Args:
        repo_name: Repository id the dataset is pushed to.
        push_to_hub: When true, the prepared dataset is pushed to ``repo_name``.
        private: Forwarded as the ``private`` flag of the dataset push.
    """
    print("⚙️ Preparing hendrycks math split")
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    started = time.perf_counter()
    hendrycks_math = prepare_hendrycks_math()
    print(f"✅ Prepared hendrycks math split in {time.perf_counter() - started:.2f} seconds")
    if push_to_hub:
        hendrycks_math.push_to_hub(repo_name, private=private)
        print(f"✅ Pushed hendrycks math split to `{repo_name}` to HF Hub")
    # No explicit ``del`` needed: the local reference is dropped when the function returns.
def main(repo_name: str, push_to_hub: bool, private: bool):
    """Run the full pipeline: create the repo (if pushing), build the card, build the data.

    Args:
        repo_name: Repository id in ``owner/name`` form.
        push_to_hub: When true, the repo is created if needed and both the
            card and the dataset are uploaded. When false, nothing is sent to
            the Hub.
        private: Whether the repo and the pushed dataset are private.
    """
    if push_to_hub:
        # Only touch the Hub when a push was requested: a dry run must not
        # create a remote repo (and write access is only verified for pushes).
        create_repo(repo_name, private=private, repo_type="dataset", exist_ok=True)
    push_card_to_hub(repo_name, push_to_hub)
    prepare_data(repo_name, push_to_hub, private)
    if push_to_hub:
        print(f"✅ Pushed dataset to https://huggingface.co/datasets/{repo_name}")
def check_write_access(org: str):
    """Verify that the active token has ``repo.write`` permission for ``org``.

    Scans the scoped entries reported by ``whoami()`` for one whose entity
    name equals ``org`` and whose permissions include ``repo.write``.

    Args:
        org: Name compared against each scoped entity's name.

    Raises:
        ValueError: If ``whoami()`` fails or its payload lacks any of the
            fields read here, or if no scoped entry grants ``repo.write``
            for ``org``.
    """
    try:
        access_token = whoami()["auth"]["accessToken"]
        token = access_token["displayName"]
        # any() stops at the first matching scope instead of scanning the rest.
        is_authed = any(
            scope["entity"]["name"] == org and "repo.write" in scope["permissions"]
            for scope in access_token["fineGrained"]["scoped"]
        )
    except Exception as err:
        # NOTE(review): a payload without ``fineGrained.scoped`` also lands here
        # and is reported as "not logged in" — confirm whether other token
        # kinds should be accepted.
        # Chain the cause so the underlying failure stays visible in the traceback.
        raise ValueError("❌ You are not logged in. Please run `hf auth login` or `export HF_TOKEN=...`") from err
    if not is_authed:
        raise ValueError(f"❌ Your current token `{token}` does not have write access to `{org}`")
    print(f"✅ Confirmed write access with token `{token}` to `{org}`")
if __name__ == "__main__":
    # Command-line entry point: parse the flags, validate them, optionally
    # verify write access, then run the pipeline.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--username", "-U", default="PrimeIntellect", type=str, help="The username to push the dataset to."
    )
    parser.add_argument("--dataset-name", "-D", default="Hendrycks-Math", type=str, help="The dataset name.")
    parser.add_argument("--dataset-private", "-p", action="store_true", help="Whether to make the dataset private.")
    parser.add_argument("--push-to-hub", "-H", action="store_true", help="Whether to push the dataset to the hub.")
    args = parser.parse_args()

    # Validate args. parser.error() is used instead of ``assert`` because
    # asserts are stripped under ``python -O``; it prints the usage line and
    # exits with status 2, like any other argparse validation failure.
    if "/" in args.dataset_name:
        parser.error("Dataset name must not include the username")

    if args.push_to_hub:
        check_write_access(args.username)

    main(
        repo_name=f"{args.username}/{args.dataset_name}",
        push_to_hub=args.push_to_hub,
        private=args.dataset_private,
    )
````
# Hendrycks-Math
<!-- 请简要概述该数据集。 -->
## 数据集生成
本数据集通过执行以下命令构建生成:
````bash
uv run hendrycks-math.py -H -p
````
`python
# hendrycks-math.py
# /// script
# requires-python = ">=3.12"
# dependencies = ["datasets>=4.0.0", "jinja2"]
# ///
import argparse
import json
import sys
import time
from pathlib import Path
from typing import cast
from huggingface_hub import DatasetCard, DatasetCardData, create_repo, whoami
from datasets import Dataset, load_dataset
def prepare_hendrycks_math() -> Dataset:
    """Load the source train split and reshape every row into the published schema.

    Returns:
        A dataset restricted to the ``question``, ``answer``, ``info`` and
        ``difficulty`` columns.
    """
    kept_columns = ["question", "answer", "info", "difficulty"]

    def to_record(row):
        # The answer is the ``ground_truth`` entry of the JSON-encoded ``verification_info`` field.
        question = row["prompt"]
        answer = json.loads(row["verification_info"])["ground_truth"]
        info = {"problem_id": row["problem_id"], "task_type": "verifiable_math"}
        return {"question": question, "answer": answer, "info": info, "difficulty": row["difficulty"]}

    raw_split = cast(Dataset, load_dataset("justus27/math-hendrycks-genesys-format", split="train"))
    reshaped = raw_split.map(to_record)
    return reshaped.select_columns(kept_columns)
def push_card_to_hub(repo_name: str, push_to_hub: bool):
    """Render the dataset card from the template and optionally upload it.

    The card is always rendered (so the template and this script file are read
    even on a dry run); only the upload is conditional.

    Args:
        repo_name: Repository id in ``owner/name`` form; the part after the
            slash is used as the card's display name.
        push_to_hub: When true, the rendered card is pushed to the dataset
            repo; otherwise a hint about the push flag is printed.
    """
    # Build the dataset card
    _owner, dataset_name = repo_name.split("/")
    metadata = DatasetCardData(pretty_name=dataset_name, license="apache-2.0")

    # The card embeds both the command line used and the full text of this script.
    script_path = Path(__file__)
    invocation = f"uv run {script_path.stem}.py {' '.join(sys.argv[1:])}"
    script_source = script_path.read_text(encoding="utf-8", errors="replace")
    card = DatasetCard.from_template(
        card_data=metadata,
        template_path="templates/CARD.md",
        dataset_name=dataset_name,
        cmd=invocation,
        source=script_source,
    )

    # Dry run: report and stop before touching the Hub
    if not push_to_hub:
        print("ℹ️ 未推送至 Hugging Face Hub。如需推送,请使用 `--push-to-hub` 或 `-H` 参数。")
        return

    # Upload to the Hugging Face Hub
    print(f"正在推送至 `{repo_name}`")
    card.push_to_hub(repo_name, repo_type="dataset")
    print(f"✅ 已将数据集卡片推送至 HF Hub 的 `{repo_name}`")
def prepare_data(repo_name: str, push_to_hub: bool, private: bool):
    """Build the processed split, report how long it took, and optionally upload it.

    Args:
        repo_name: Repository id the dataset is pushed to.
        push_to_hub: When true, the prepared dataset is pushed to ``repo_name``.
        private: Forwarded as the ``private`` flag of the dataset push.
    """
    print("⚙️ 正在准备亨德里克斯数学数据集拆分")
    # perf_counter() is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    started = time.perf_counter()
    hendrycks_math = prepare_hendrycks_math()
    print(f"✅ 已在 {time.perf_counter() - started:.2f} 秒内完成亨德里克斯数学数据集拆分的准备工作")
    if push_to_hub:
        hendrycks_math.push_to_hub(repo_name, private=private)
        print(f"✅ 已将亨德里克斯数学数据集拆分推送至 HF Hub 的 `{repo_name}`")
    # No explicit ``del`` needed: the local reference is dropped when the function returns.
def main(repo_name: str, push_to_hub: bool, private: bool):
    """Run the full pipeline: create the repo (if pushing), build the card, build the data.

    Args:
        repo_name: Repository id in ``owner/name`` form.
        push_to_hub: When true, the repo is created if needed and both the
            card and the dataset are uploaded. When false, nothing is sent to
            the Hub.
        private: Whether the repo and the pushed dataset are private.
    """
    if push_to_hub:
        # Only touch the Hub when a push was requested: a dry run must not
        # create a remote repo (and write access is only verified for pushes).
        create_repo(repo_name, private=private, repo_type="dataset", exist_ok=True)
    push_card_to_hub(repo_name, push_to_hub)
    prepare_data(repo_name, push_to_hub, private)
    if push_to_hub:
        print(f"✅ 已将数据集推送至 https://huggingface.co/datasets/{repo_name}")
def check_write_access(org: str):
    """Verify that the active token has ``repo.write`` permission for ``org``.

    Scans the scoped entries reported by ``whoami()`` for one whose entity
    name equals ``org`` and whose permissions include ``repo.write``.

    Args:
        org: Name compared against each scoped entity's name.

    Raises:
        ValueError: If ``whoami()`` fails or its payload lacks any of the
            fields read here, or if no scoped entry grants ``repo.write``
            for ``org``.
    """
    try:
        access_token = whoami()["auth"]["accessToken"]
        token = access_token["displayName"]
        # any() stops at the first matching scope instead of scanning the rest.
        is_authed = any(
            scope["entity"]["name"] == org and "repo.write" in scope["permissions"]
            for scope in access_token["fineGrained"]["scoped"]
        )
    except Exception as err:
        # NOTE(review): a payload without ``fineGrained.scoped`` also lands here
        # and is reported as "not logged in" — confirm whether other token
        # kinds should be accepted.
        # Chain the cause so the underlying failure stays visible in the traceback.
        raise ValueError("❌ 您未登录,请执行 `hf auth login` 或设置环境变量 `HF_TOKEN=...`") from err
    if not is_authed:
        raise ValueError(f"❌ 当前令牌 `{token}` 不具备对 `{org}` 的写入权限")
    print(f"✅ 已确认令牌 `{token}` 对 `{org}` 具备写入权限")
if __name__ == "__main__":
    # Command-line entry point: parse the flags, validate them, optionally
    # verify write access, then run the pipeline.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--username", "-U", default="PrimeIntellect", type=str, help="用于指定推送数据集的用户名,默认为 PrimeIntellect。"
    )
    parser.add_argument("--dataset-name", "-D", default="Hendrycks-Math", type=str, help="用于指定数据集名称,默认为 Hendrycks-Math。")
    parser.add_argument("--dataset-private", "-p", action="store_true", help="用于指定数据集是否设为私有。")
    parser.add_argument("--push-to-hub", "-H", action="store_true", help="用于指定是否将数据集推送至 Hub。")
    args = parser.parse_args()

    # Validate the arguments. parser.error() is used instead of ``assert``
    # because asserts are stripped under ``python -O``; it prints the usage
    # line and exits with status 2, like any other argparse validation failure.
    if "/" in args.dataset_name:
        parser.error("数据集名称不得包含用户名")

    if args.push_to_hub:
        check_write_access(args.username)

    main(
        repo_name=f"{args.username}/{args.dataset_name}",
        push_to_hub=args.push_to_hub,
        private=args.dataset_private,
    )
`
提供机构:
maas
创建时间:
2025-11-30



