five

PrimeIntellect/Hendrycks-Math

收藏
Hugging Face · 2025-11-29 更新 · 2026-01-03 收录
下载链接:
https://hf-mirror.com/datasets/PrimeIntellect/Hendrycks-Math
下载链接
链接失效反馈
官方服务:
资源简介:
---
license: apache-2.0
pretty_name: Hendrycks-Math
dataset_info:
  features:
  - name: question
    dtype: string
  - name: answer
    dtype: string
  - name: info
    struct:
    - name: problem_id
      dtype: string
    - name: task_type
      dtype: string
  - name: difficulty
    dtype: float64
  splits:
  - name: train
    num_bytes: 2049724
    num_examples: 7474
  download_size: 908348
  dataset_size: 2049724
configs:
- config_name: default
  data_files:
  - split: train
    path: data/train-*
---

# Hendrycks-Math

<!-- Provide a quick summary of the dataset. -->

## Generation

This dataset was created by running

````bash
uv run hendrycks-math.py -H -p
````

````python
# hendrycks-math.py
# /// script
# requires-python = ">=3.12"
# dependencies = ["datasets>=4.0.0", "jinja2"]
# ///
import argparse
import json
import sys
import time
from pathlib import Path
from typing import cast

from huggingface_hub import DatasetCard, DatasetCardData, create_repo, whoami

from datasets import Dataset, load_dataset


def prepare_hendrycks_math() -> Dataset:
    """Load the source train split and reshape it to the published columns.

    Returns:
        A dataset with exactly the columns `question`, `answer`, `info` and
        `difficulty`.
    """
    hendrycks_math = cast(Dataset, load_dataset("justus27/math-hendrycks-genesys-format", split="train"))

    def process_example(example):
        # `verification_info` is a JSON-encoded string; only its
        # `ground_truth` field is kept as the answer.
        return {
            "question": example["prompt"],
            "answer": json.loads(example["verification_info"])["ground_truth"],
            "info": {"problem_id": example["problem_id"], "task_type": "verifiable_math"},
            "difficulty": example["difficulty"],
        }

    return hendrycks_math.map(process_example).select_columns(["question", "answer", "info", "difficulty"])


def push_card_to_hub(repo_name: str, push_to_hub: bool):
    """Render the dataset card from the template and optionally push it.

    The card embeds the command line used for this run and the full text of
    this script.

    Args:
        repo_name: Repository id in `<owner>/<dataset>` form.
        push_to_hub: Push the rendered card when true; otherwise only print a
            notice that pushing was skipped.
    """
    # Create dataset card
    _, dataset_name = repo_name.split("/")
    card_meta = DatasetCardData(
        pretty_name=dataset_name,
        license="apache-2.0",
    )

    card = DatasetCard.from_template(
        card_data=card_meta,
        template_path="templates/CARD.md",
        dataset_name=dataset_name,
        cmd=f"uv run {Path(__file__).stem}.py {' '.join(sys.argv[1:])}",
        source=Path(__file__).read_text(encoding="utf-8", errors="replace"),
    )

    # Push to HF hub
    if push_to_hub:
        print(f"Pushing to `{repo_name}`")
        card.push_to_hub(repo_name, repo_type="dataset")
        print(f"✅ Pushed card to `{repo_name}` to HF Hub")
    else:
        print("ℹ️ Skipped pushing to HF Hub. To push, use the `--push-to-hub` or `-H` flag.")


def prepare_data(repo_name: str, push_to_hub: bool, private: bool):
    """Build the dataset split, report how long it took, and optionally push it.

    Args:
        repo_name: Repository id in `<owner>/<dataset>` form.
        push_to_hub: Push the prepared split when true.
        private: Forwarded to `Dataset.push_to_hub` as `private`.
    """
    print("⚙️ Preparing hendrycks math split")
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    hendrycks_math = prepare_hendrycks_math()
    print(f"✅ Prepared hendrycks math split in {time.perf_counter() - start_time:.2f} seconds")
    if push_to_hub:
        hendrycks_math.push_to_hub(repo_name, private=private)
        print(f"✅ Pushed hendrycks math split to `{repo_name}` to HF Hub")


def main(repo_name: str, push_to_hub: bool, private: bool):
    """Render the card and prepare the data, pushing both only when requested.

    Args:
        repo_name: Repository id in `<owner>/<dataset>` form.
        push_to_hub: When false, nothing is created or uploaded on the Hub.
        private: Whether the repository and pushed data should be private.
    """
    # Only touch the Hub when a push was requested; a dry run must not create
    # a remote repository as a side effect.
    if push_to_hub:
        create_repo(repo_name, private=private, repo_type="dataset", exist_ok=True)
    push_card_to_hub(repo_name, push_to_hub)
    prepare_data(repo_name, push_to_hub, private)
    if push_to_hub:
        print(f"✅ Pushed dataset to https://huggingface.co/datasets/{repo_name}")


def check_write_access(org: str):
    """Verify that the active token has a `repo.write` scope for `org`.

    Args:
        org: Name of the user or organisation that will own the dataset.

    Raises:
        ValueError: If `whoami()` fails or returns no access-token details, or
            if no scoped entry for `org` grants `repo.write`.
    """
    not_logged_in = "❌ You are not logged in. Please run `hf auth login` or `export HF_TOKEN=...`"

    try:
        info = whoami()
    except Exception as err:
        raise ValueError(not_logged_in) from err

    try:
        access_token = info["auth"]["accessToken"]
        token = access_token["displayName"]
    except (KeyError, TypeError) as err:
        raise ValueError(not_logged_in) from err

    # A payload without a `fineGrained.scoped` list is treated as "no write
    # access" instead of being misreported as "not logged in".
    # NOTE(review): tokens that are not fine-grained are presumably shaped
    # differently and are still rejected here, as before — confirm whether
    # they should be accepted.
    scoped = (access_token.get("fineGrained") or {}).get("scoped") or []
    is_authed = any(
        entry.get("entity", {}).get("name") == org and "repo.write" in entry.get("permissions", ())
        for entry in scoped
    )
    if not is_authed:
        raise ValueError(f"❌ Your current token `{token}` does not have write access to `{org}`")
    print(f"✅ Confirmed write access with token `{token}` to `{org}`")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--username", "-U", default="PrimeIntellect", type=str, help="The username to push the dataset to."
    )
    parser.add_argument("--dataset-name", "-D", default="Hendrycks-Math", type=str, help="The dataset name.")
    parser.add_argument("--dataset-private", "-p", action="store_true", help="Whether to make the dataset private.")
    parser.add_argument("--push-to-hub", "-H", action="store_true", help="Whether to push the dataset to the hub.")
    args = parser.parse_args()

    # Validate args. `assert` is stripped under `python -O`, so report the
    # problem through argparse instead.
    if "/" in args.dataset_name:
        parser.error("Dataset name must not include the username")
    if args.push_to_hub:
        check_write_access(args.username)

    main(
        repo_name=f"{args.username}/{args.dataset_name}",
        push_to_hub=args.push_to_hub,
        private=args.dataset_private,
    )
````

许可证:apache-2.0
显示名称:亨德里克斯数学(Hendrycks-Math)
数据集信息:
  特征:
  - 名称:question(问题),数据类型:字符串
  - 名称:answer(答案),数据类型:字符串
  - 名称:info(信息),结构体:
    - 名称:problem_id(题目ID),数据类型:字符串
    - 名称:task_type(任务类型),数据类型:字符串
  - 名称:difficulty(难度),数据类型:float64(双精度浮点数)
  拆分:
  - 名称:train(训练集),字节数:2049724,示例数:7474
  下载大小:908348,数据集总大小:2049724
配置:
- 配置名称:default(默认配置),数据文件:
  - 拆分:train,路径:data/train-*

# 亨德里克斯数学(Hendrycks-Math)

<!-- 请提供该数据集的简要概述。 -->

## 数据集生成

本数据集通过运行以下命令生成:

````bash
uv run hendrycks-math.py -H -p
````

````python
# hendrycks-math.py
# /// script
# requires-python = ">=3.12"
# dependencies = ["datasets>=4.0.0", "jinja2"]
# ///
import argparse
import json
import sys
import time
from pathlib import Path
from typing import cast

from huggingface_hub import DatasetCard, DatasetCardData, create_repo, whoami

from datasets import Dataset, load_dataset


def prepare_hendrycks_math() -> Dataset:
    """Load the source train split and reshape it to the published columns.

    Returns:
        A dataset with exactly the columns `question`, `answer`, `info` and
        `difficulty`.
    """
    hendrycks_math = cast(Dataset, load_dataset("justus27/math-hendrycks-genesys-format", split="train"))

    def process_example(example):
        # `verification_info` is a JSON-encoded string; only its
        # `ground_truth` field is kept as the answer.
        return {
            "question": example["prompt"],
            "answer": json.loads(example["verification_info"])["ground_truth"],
            "info": {"problem_id": example["problem_id"], "task_type": "verifiable_math"},
            "difficulty": example["difficulty"],
        }

    return hendrycks_math.map(process_example).select_columns(["question", "answer", "info", "difficulty"])


def push_card_to_hub(repo_name: str, push_to_hub: bool):
    """Render the dataset card from the template and optionally push it.

    The card embeds the command line used for this run and the full text of
    this script.

    Args:
        repo_name: Repository id in `<owner>/<dataset>` form.
        push_to_hub: Push the rendered card when true; otherwise only print a
            notice that pushing was skipped.
    """
    # Create dataset card
    _, dataset_name = repo_name.split("/")
    card_meta = DatasetCardData(
        pretty_name=dataset_name,
        license="apache-2.0",
    )

    card = DatasetCard.from_template(
        card_data=card_meta,
        template_path="templates/CARD.md",
        dataset_name=dataset_name,
        cmd=f"uv run {Path(__file__).stem}.py {' '.join(sys.argv[1:])}",
        source=Path(__file__).read_text(encoding="utf-8", errors="replace"),
    )

    # Push to the Hugging Face Hub
    if push_to_hub:
        print(f"正在推送至 `{repo_name}`")
        card.push_to_hub(repo_name, repo_type="dataset")
        print(f"✅ 已将数据集卡片推送至Hugging Face Hub的`{repo_name}`")
    else:
        print("ℹ️ 跳过推送至Hugging Face Hub。如需推送,请使用`--push-to-hub`或`-H`参数。")


def prepare_data(repo_name: str, push_to_hub: bool, private: bool):
    """Build the dataset split, report how long it took, and optionally push it.

    Args:
        repo_name: Repository id in `<owner>/<dataset>` form.
        push_to_hub: Push the prepared split when true.
        private: Forwarded to `Dataset.push_to_hub` as `private`.
    """
    print("⚙️ 正在准备亨德里克斯数学数据集拆分")
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    hendrycks_math = prepare_hendrycks_math()
    print(f"✅ 已在{time.perf_counter() - start_time:.2f}秒内完成亨德里克斯数学数据集拆分的准备工作")
    if push_to_hub:
        hendrycks_math.push_to_hub(repo_name, private=private)
        print(f"✅ 已将亨德里克斯数学数据集拆分推送至Hugging Face Hub的`{repo_name}`")


def main(repo_name: str, push_to_hub: bool, private: bool):
    """Render the card and prepare the data, pushing both only when requested.

    Args:
        repo_name: Repository id in `<owner>/<dataset>` form.
        push_to_hub: When false, nothing is created or uploaded on the Hub.
        private: Whether the repository and pushed data should be private.
    """
    # Only touch the Hub when a push was requested; a dry run must not create
    # a remote repository as a side effect.
    if push_to_hub:
        create_repo(repo_name, private=private, repo_type="dataset", exist_ok=True)
    push_card_to_hub(repo_name, push_to_hub)
    prepare_data(repo_name, push_to_hub, private)
    if push_to_hub:
        print(f"✅ 已将数据集推送至 https://huggingface.co/datasets/{repo_name}")


def check_write_access(org: str):
    """Verify that the active token has a `repo.write` scope for `org`.

    Args:
        org: Name of the user or organisation that will own the dataset.

    Raises:
        ValueError: If `whoami()` fails or returns no access-token details, or
            if no scoped entry for `org` grants `repo.write`.
    """
    not_logged_in = "❌ 您尚未登录,请运行`hf auth login`或设置环境变量`export HF_TOKEN=...`"

    try:
        info = whoami()
    except Exception as err:
        raise ValueError(not_logged_in) from err

    try:
        access_token = info["auth"]["accessToken"]
        token = access_token["displayName"]
    except (KeyError, TypeError) as err:
        raise ValueError(not_logged_in) from err

    # A payload without a `fineGrained.scoped` list is treated as "no write
    # access" instead of being misreported as "not logged in".
    # NOTE(review): tokens that are not fine-grained are presumably shaped
    # differently and are still rejected here, as before — confirm whether
    # they should be accepted.
    scoped = (access_token.get("fineGrained") or {}).get("scoped") or []
    is_authed = any(
        entry.get("entity", {}).get("name") == org and "repo.write" in entry.get("permissions", ())
        for entry in scoped
    )
    if not is_authed:
        raise ValueError(f"❌ 当前使用的令牌`{token}`不具备对`{org}`的写入权限")
    print(f"✅ 已确认令牌`{token}`具备对`{org}`的写入权限")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--username", "-U", default="PrimeIntellect", type=str, help="要将数据集推送至的用户名。"
    )
    parser.add_argument("--dataset-name", "-D", default="Hendrycks-Math", type=str, help="数据集名称。")
    parser.add_argument("--dataset-private", "-p", action="store_true", help="是否将数据集设为私有。")
    parser.add_argument("--push-to-hub", "-H", action="store_true", help="是否将数据集推送至Hugging Face Hub。")
    args = parser.parse_args()

    # Validate args. `assert` is stripped under `python -O`, so report the
    # problem through argparse instead.
    if "/" in args.dataset_name:
        parser.error("数据集名称不得包含用户名")
    if args.push_to_hub:
        check_write_access(args.username)

    main(
        repo_name=f"{args.username}/{args.dataset_name}",
        push_to_hub=args.push_to_hub,
        private=args.dataset_private,
    )
````
提供机构:
PrimeIntellect
5,000+
优质数据集
54 个
任务类型
进入经典数据集
二维码
社区交流群

面向社区/商业的数据集话题

二维码
科研交流群

面向高校/科研机构的开源数据集话题

数据驱动未来

携手共赢发展

商业合作