Hendrycks-Math

Name: Hendrycks-Math
Creator: maas
Published: 2025-12-05 16:57:42
License: 暂无描述

魔搭社区 · 2025-12-05 更新 · 2025-12-06 收录

下载链接：

https://modelscope.cn/datasets/PrimeIntellect/Hendrycks-Math

下载链接

链接失效反馈

官方服务：

资源简介：

# Hendrycks-Math

## Generation

This dataset was created by running

````bash
uv run hendrycks-math.py -H -p
````

````python
# hendrycks-math.py
# /// script
# requires-python = ">=3.12"
# dependencies = ["datasets>=4.0.0", "jinja2"]
# ///
import argparse
import json
import sys
import time
from pathlib import Path
from typing import cast

from huggingface_hub import DatasetCard, DatasetCardData, create_repo, whoami

from datasets import Dataset, load_dataset


def prepare_hendrycks_math() -> Dataset:
    hendrycks_math = cast(Dataset, load_dataset("justus27/math-hendrycks-genesys-format", split="train"))

    def process_example(example):
        return {
            "question": example["prompt"],
            "answer": json.loads(example["verification_info"])["ground_truth"],
            "info": {"problem_id": example["problem_id"], "task_type": "verifiable_math"},
            "difficulty": example["difficulty"],
        }

    return hendrycks_math.map(process_example).select_columns(["question", "answer", "info", "difficulty"])


def push_card_to_hub(repo_name: str, push_to_hub: bool):
    # Create dataset card
    _, dataset_name = repo_name.split("/")
    card_meta = DatasetCardData(
        pretty_name=dataset_name,
        license="apache-2.0",
    )

    card = DatasetCard.from_template(
        card_data=card_meta,
        template_path="templates/CARD.md",
        dataset_name=dataset_name,
        cmd=f"uv run {Path(__file__).stem}.py {' '.join(sys.argv[1:])}",
        source=Path(__file__).read_text(encoding="utf-8", errors="replace"),
    )

    # Push to HF hub
    if push_to_hub:
        print(f"Pushing to `{repo_name}`")
        card.push_to_hub(repo_name, repo_type="dataset")
        print(f"✅ Pushed card to `{repo_name}` to HF Hub")
    else:
        print("ℹ️ Skipped pushing to HF Hub. To push, use the `--push-to-hub` or `-H` flag.")


def prepare_data(repo_name: str, push_to_hub: bool, private: bool):
    print("⚙️ Preparing hendrycks math split")
    start_time = time.time()
    hendrycks_math = prepare_hendrycks_math()
    print(f"✅ Prepared hendrycks math split in {time.time() - start_time:.2f} seconds")
    if push_to_hub:
        hendrycks_math.push_to_hub(repo_name, private=private)
        print(f"✅ Pushed hendrycks math split to `{repo_name}` to HF Hub")
    del hendrycks_math


def main(repo_name: str, push_to_hub: bool, private: bool):
    create_repo(repo_name, private=private, repo_type="dataset", exist_ok=True)
    push_card_to_hub(repo_name, push_to_hub)
    prepare_data(repo_name, push_to_hub, private)
    if push_to_hub:
        print(f"✅ Pushed dataset to https://huggingface.co/datasets/{repo_name}")


def check_write_access(org: str):
    is_authed = False
    try:
        info = whoami()
        token = info["auth"]["accessToken"]["displayName"]
        for entity in info["auth"]["accessToken"]["fineGrained"]["scoped"]:
            if entity["entity"]["name"] == org and "repo.write" in entity["permissions"]:
                is_authed = True
    except Exception:
        raise ValueError("❌ You are not logged in. Please run `hf auth login` or `export HF_TOKEN=...`")
    if not is_authed:
        raise ValueError(f"❌ Your current token `{token}` does not have write access to `{org}`")
    print(f"✅ Confirmed write access with token `{token}` to `{org}`")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--username", "-U", default="PrimeIntellect", type=str, help="The username to push the dataset to."
    )
    parser.add_argument("--dataset-name", "-D", default="Hendrycks-Math", type=str, help="The dataset name.")
    parser.add_argument("--dataset-private", "-p", action="store_true", help="Whether to make the dataset private.")
    parser.add_argument("--push-to-hub", "-H", action="store_true", help="Whether to push the dataset to the hub.")
    args = parser.parse_args()

    # Validate args
    assert len(args.dataset_name.split("/")) == 1, "Dataset name must not include the username"
    if args.push_to_hub:
        check_write_access(args.username)

    main(
        repo_name=f"{args.username}/{args.dataset_name}",
        push_to_hub=args.push_to_hub,
        private=args.dataset_private,
    )
````

# Hendrycks-Math

## 数据集生成

本数据集通过执行以下命令构建生成：

````bash
uv run hendrycks-math.py -H -p
````

````python
# hendrycks-math.py
# /// script
# requires-python = ">=3.12"
# dependencies = ["datasets>=4.0.0", "jinja2"]
# ///
import argparse
import json
import sys
import time
from pathlib import Path
from typing import cast

from huggingface_hub import DatasetCard, DatasetCardData, create_repo, whoami

from datasets import Dataset, load_dataset


def prepare_hendrycks_math() -> Dataset:
    hendrycks_math = cast(Dataset, load_dataset("justus27/math-hendrycks-genesys-format", split="train"))

    def process_example(example):
        return {
            "question": example["prompt"],
            "answer": json.loads(example["verification_info"])["ground_truth"],
            "info": {"problem_id": example["problem_id"], "task_type": "verifiable_math"},
            "difficulty": example["difficulty"],
        }

    return hendrycks_math.map(process_example).select_columns(["question", "answer", "info", "difficulty"])


def push_card_to_hub(repo_name: str, push_to_hub: bool):
    # 创建数据集卡片
    _, dataset_name = repo_name.split("/")
    card_meta = DatasetCardData(
        pretty_name=dataset_name,
        license="apache-2.0",
    )

    card = DatasetCard.from_template(
        card_data=card_meta,
        template_path="templates/CARD.md",
        dataset_name=dataset_name,
        cmd=f"uv run {Path(__file__).stem}.py {' '.join(sys.argv[1:])}",
        source=Path(__file__).read_text(encoding="utf-8", errors="replace"),
    )

    # 推送至 Hugging Face Hub
    if push_to_hub:
        print(f"正在推送至 `{repo_name}`")
        card.push_to_hub(repo_name, repo_type="dataset")
        print(f"✅ 已将数据集卡片推送至 HF Hub 的 `{repo_name}`")
    else:
        print("ℹ️ 未推送至 Hugging Face Hub。如需推送，请使用 `--push-to-hub` 或 `-H` 参数。")


def prepare_data(repo_name: str, push_to_hub: bool, private: bool):
    print("⚙️ 正在准备亨德里克斯数学数据集拆分")
    start_time = time.time()
    hendrycks_math = prepare_hendrycks_math()
    print(f"✅ 已在 {time.time() - start_time:.2f} 秒内完成亨德里克斯数学数据集拆分的准备工作")
    if push_to_hub:
        hendrycks_math.push_to_hub(repo_name, private=private)
        print(f"✅ 已将亨德里克斯数学数据集拆分推送至 HF Hub 的 `{repo_name}`")
    del hendrycks_math


def main(repo_name: str, push_to_hub: bool, private: bool):
    create_repo(repo_name, private=private, repo_type="dataset", exist_ok=True)
    push_card_to_hub(repo_name, push_to_hub)
    prepare_data(repo_name, push_to_hub, private)
    if push_to_hub:
        print(f"✅ 已将数据集推送至 https://huggingface.co/datasets/{repo_name}")


def check_write_access(org: str):
    is_authed = False
    try:
        info = whoami()
        token = info["auth"]["accessToken"]["displayName"]
        for entity in info["auth"]["accessToken"]["fineGrained"]["scoped"]:
            if entity["entity"]["name"] == org and "repo.write" in entity["permissions"]:
                is_authed = True
    except Exception:
        raise ValueError("❌ 您未登录，请执行 `hf auth login` 或设置环境变量 `HF_TOKEN=...`")
    if not is_authed:
        raise ValueError(f"❌ 当前令牌 `{token}` 不具备对 `{org}` 的写入权限")
    print(f"✅ 已确认令牌 `{token}` 对 `{org}` 具备写入权限")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--username", "-U", default="PrimeIntellect", type=str, help="用于指定推送数据集的用户名，默认为 PrimeIntellect。"
    )
    parser.add_argument("--dataset-name", "-D", default="Hendrycks-Math", type=str, help="用于指定数据集名称，默认为 Hendrycks-Math。")
    parser.add_argument("--dataset-private", "-p", action="store_true", help="用于指定数据集是否设为私有。")
    parser.add_argument("--push-to-hub", "-H", action="store_true", help="用于指定是否将数据集推送至 Hub。")
    args = parser.parse_args()

    # 验证参数合法性
    assert len(args.dataset_name.split("/")) == 1, "数据集名称不得包含用户名"
    if args.push_to_hub:
        check_write_access(args.username)

    main(
        repo_name=f"{args.username}/{args.dataset_name}",
        push_to_hub=args.push_to_hub,
        private=args.dataset_private,
    )
````

提供机构：

maas

创建时间：

2025-11-30

5,000+

优质数据集

54 个

任务类型

进入经典数据集