five

INTELLECT-3-SFT

收藏
魔搭社区2025-12-03 更新2025-09-06 收录
下载链接:
https://modelscope.cn/datasets/PrimeIntellect/INTELLECT-3-SFT
下载链接
链接失效反馈
官方服务:
资源简介:
# INTELLECT-3-SFT <!-- Provide a quick summary of the dataset. --> ## Generation This dataset was created by running ````bash uv run intellect-3-sft.py -H -S -1 ```` ````python # intellect-3-sft.py # /// script # requires-python = ">=3.12" # dependencies = ["datasets==3.6.0", "jinja2"] # /// import argparse import json import sys from pathlib import Path from typing import cast from huggingface_hub import DatasetCard, DatasetCardData, hf_hub_download, whoami from datasets import Dataset, DatasetDict, IterableDataset, load_dataset def remove_answer_tags(text: str) -> str: return text.replace("<answer>", "").replace("</answer>", "") def load_dataset_subset(name: str, subset: str | None, split: str, subset_size: int) -> Dataset: return Dataset.from_list( list(cast(IterableDataset, load_dataset(name, subset, split=split, streaming=True)).take(subset_size)) ) def prepare_math(subset_size: int) -> Dataset: if subset_size > 0: math = load_dataset_subset( name="nvidia/Nemotron-Post-Training-Dataset-v1", subset=None, split="math", subset_size=subset_size ) else: math = cast(Dataset, load_dataset("nvidia/Nemotron-Post-Training-Dataset-v1", split="math")) # Process math data def prepare_messages(example: dict) -> dict: # From: https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v1 MATH_SYSTEM_PROMPT = ( "Solve the following math problem. Explain your reasoning and put the final answer in \\boxed{}." ) assert len(example["messages"]) == 2 prompt, completion = example["messages"] assert len(prompt["tool_calls"]) == len(completion["tool_calls"]) == 0 del prompt["tool_calls"] del completion["tool_calls"] prompt = [{"role": "user", "content": f"{MATH_SYSTEM_PROMPT}\n\n{prompt['content']}"}] return {"prompt": prompt, "completion": [completion]} def add_source(_example: dict, index: int) -> dict: return {"source": {"dataset": "nvidia/Nemotron-Post-Training-Dataset-v1", "split": "math", "index": index}} return ( math.map(prepare_messages).map(add_source, with_indices=True).select_columns(["source", "prompt", "completion"]) ) def prepare_code(subset_size: int) -> Dataset: if subset_size > 0: code = load_dataset_subset( name="nvidia/Nemotron-Post-Training-Dataset-v1", subset=None, split="code", subset_size=subset_size ) else: code = cast(Dataset, load_dataset("nvidia/Nemotron-Post-Training-Dataset-v1", split="code")) # Process dataset hf_datasets = { "taco": load_dataset("BAAI/TACO", trust_remote_code=True), "apps": load_dataset("codeparrot/apps", trust_remote_code=True), "code_contests": load_dataset("deepmind/code_contests"), "open-r1/codeforces": load_dataset("open-r1/codeforces"), } def get_question(ds_name, split, index): # From https://huggingface.co/datasets/nvidia/OpenCodeReasoning-2#how-to-use-it benchmark = hf_datasets[ds_name][split][int(index)] # type: ignore if ds_name == "code_contests": if not benchmark["description"]: return None return benchmark["description"] elif ds_name in ["taco", "apps"]: return benchmark["question"] elif ds_name == "open-r1/codeforces": if not benchmark["description"]: return None question = benchmark["description"] if benchmark["input_format"]: question += "\n\nInput\n\n" + benchmark["input_format"] if benchmark["output_format"]: question += "\n\nOutput\n\n" + benchmark["output_format"] if benchmark["examples"]: question += "\n\nExamples" for example in benchmark["examples"]: if "input" in example: question += "\n\nInput\n\n" + example["input"] if "output" in example: question += "\n\nOutput\n\n" + example["output"] if benchmark["note"]: question += "\n\nNote\n\n" + benchmark["note"] return question return None def prepare_messages(example: dict) -> dict: # Extract prompt from external dataset metadata = json.loads(example["metadata"]) assert "dataset" in metadata and "split" in metadata and "index" in metadata ds_name, split, index = metadata["dataset"], metadata["split"], int(metadata["index"]) assert ds_name in list(hf_datasets.keys()) question = get_question(ds_name, split, index) assert question is not None assert example["messages"][0]["content"] == "-" # Prepare prompt and completion CODE_SYSTEM_PROMPT = "Write a solution for the following programming challenge. Provide a brief explanation of your approach, followed by the complete code." prompt = [{"role": "user", "content": f"{CODE_SYSTEM_PROMPT}\n\n{question}"}] completion = example["messages"][1] assert len(completion["tool_calls"]) == 0 del completion["tool_calls"] return {"prompt": prompt, "completion": [completion]} def add_source(example: dict, index: int) -> dict: return {"source": {"dataset": "nvidia/OpenCodeReasoning-2", "split": "code", "index": index}} return ( code.map(prepare_messages).map(add_source, with_indices=True).select_columns(["source", "prompt", "completion"]) ) def prepare_science(subset_size: int) -> Dataset: if subset_size > 0: science = load_dataset_subset( name="nvidia/OpenScienceReasoning-2", subset=None, split="train", subset_size=subset_size ) else: science = cast(Dataset, load_dataset("nvidia/OpenScienceReasoning-2", split="train")) # Process science data def prepare_messages(example: dict) -> dict: prompt = [{"role": "user", "content": example["input"]}] completion = [{"role": "assistant", "content": example["output"]}] return {"prompt": prompt, "completion": completion} def add_source(example: dict, index: int) -> dict: return {"source": {"dataset": "nvidia/OpenScienceReasoning-2", "split": "train", "index": index}} return ( science.map(prepare_messages) .map(add_source, with_indices=True) .select_columns(["source", "prompt", "completion"]) ) def prepare_if(subset_size: int) -> Dataset: # Download the JSONL file file_path = hf_hub_download( repo_id="a-m-team/AM-DeepSeek-R1-0528-Distilled", filename="if.jsonl", repo_type="dataset" ) # Load the JSONL file data = [] with open(file_path, "r") as f: for line in f: data.append(json.loads(line)) ifd = Dataset.from_list(data[:subset_size]) def prepare_messages(example: dict) -> dict: conversations = example["conversations"] assert len(conversations) == 2 assert conversations[0]["from"] == "human" assert conversations[1]["from"] == "assistant" assert isinstance(conversations[0]["value"], str) assert isinstance(conversations[1]["value"], str) prompt = [{"role": "user", "content": conversations[0]["value"]}] completion = [{"role": "assistant", "content": remove_answer_tags(conversations[1]["value"])}] return {"prompt": prompt, "completion": completion} def add_source(example: dict, index: int) -> dict: return { "source": { "dataset": "a-m-team/AM-DeepSeek-R1-0528-Distilled", "split": "train", "index": index, } } return ( ifd.map(prepare_messages).map(add_source, with_indices=True).select_columns(["source", "prompt", "completion"]) ) def prepare_chat(subset_size: int) -> Dataset: # Download the JSONL file file_path = hf_hub_download( repo_id="a-m-team/AM-DeepSeek-R1-0528-Distilled", filename="other.jsonl", repo_type="dataset" ) # Load the JSONL file data = [] with open(file_path, "r") as f: for line in f: data.append(json.loads(line)) chat = Dataset.from_list(data[:subset_size]) def prepare_messages(example: dict) -> dict: conversations = example["conversations"] assert len(conversations) == 2 assert conversations[0]["from"] == "human" assert conversations[1]["from"] == "assistant" assert isinstance(conversations[0]["value"], str) assert isinstance(conversations[1]["value"], str) prompt = [{"role": "user", "content": conversations[0]["value"]}] completion = [{"role": "assistant", "content": remove_answer_tags(conversations[1]["value"])}] return {"prompt": prompt, "completion": completion} def add_source(example: dict, index: int) -> dict: return { "source": { "dataset": "a-m-team/AM-DeepSeek-R1-0528-Distilled", "split": "train", "index": index, } } return ( chat.map(prepare_messages).map(add_source, with_indices=True).select_columns(["source", "prompt", "completion"]) ) def prepare_data(subset_size: int) -> DatasetDict: return DatasetDict( { "math": prepare_math(subset_size), "code": prepare_code(subset_size), "science": prepare_science(subset_size), "if": prepare_if(subset_size), "chat": prepare_chat(subset_size), } ) def main(repo_name: str, push_to_hub: bool, subset_size: int): # Prepare dataset dataset = prepare_data(subset_size) print(f"✅ Prepared dataset with {len(dataset):,} split") for split in dataset: print(f" - Split `{split}` with {len(dataset[split]):,} samples") # Create dataset card _, dataset_name = repo_name.split("/") card_meta = DatasetCardData( pretty_name=dataset_name, license="apache-2.0", ) card = DatasetCard.from_template( card_data=card_meta, template_path="templates/CARD.md", dataset_name=dataset_name, cmd=f"uv run intellect-3-sft.py {' '.join(sys.argv[1:])}", source=Path(__file__).read_text(encoding="utf-8", errors="replace"), ) # Push to HF hub if push_to_hub: print(f"Pushing to `{repo_name}`") dataset.push_to_hub(repo_name) card.push_to_hub(repo_name, repo_type="dataset") print(f"✅ Pushed dataset `{repo_name}` to HF Hub") else: print("ℹ️ Skipped pushing to HF Hub. To push, use the `--push-to-hub` or `-H` flag.") def check_write_access(org: str): is_authed = False try: info = whoami() token = info["auth"]["accessToken"]["displayName"] for entity in info["auth"]["accessToken"]["fineGrained"]["scoped"]: if entity["entity"]["name"] == org and "repo.write" in entity["permissions"]: is_authed = True except Exception: raise ValueError("❌ You are not logged in. Please run `hf auth login` or `export HF_TOKEN=...`") if not is_authed: raise ValueError(f"❌ Your current token `{token}` does not have write access to `{org}`") print(f"✅ Confirmed write access with token `{token}` to `{org}`") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--username", "-U", default="PrimeIntellect", type=str, help="The username to push the dataset to." ) parser.add_argument("--dataset-name", "-D", default="INTELLECT-3-SFT", type=str, help="The dataset name.") parser.add_argument("--subset-size", "-S", default=1000, type=int, help="The subset size to use.") parser.add_argument("--push-to-hub", "-H", action="store_true", help="Whether to push the dataset to the hub.") args = parser.parse_args() # Validate args assert len(args.dataset_name.split("/")) == 1, "Dataset name must not include the username" if args.push_to_hub: check_write_access(args.username) main(repo_name=f"{args.username}/{args.dataset_name}", push_to_hub=args.push_to_hub, subset_size=args.subset_size) ````

# INTELLECT-3-SFT <!-- 请简要概述该数据集。 --> ## 数据生成 本数据集通过执行以下命令生成: bash uv run intellect-3-sft.py -H -S -1 python # intellect-3-sft.py # /// 脚本配置 # 依赖Python版本 >=3.12 # 依赖包:["datasets==3.6.0", "jinja2"] # /// import argparse import json import sys from pathlib import Path from typing import cast from huggingface_hub import 数据集卡片(DatasetCard)、数据集卡片数据(DatasetCardData)、hf_hub_download、whoami from datasets import 数据集(Dataset)、数据集字典(DatasetDict)、可迭代数据集(IterableDataset)、加载数据集函数(load_dataset) def remove_answer_tags(text: str) -> str: """移除文本中的<answer>与</answer>标签""" return text.replace("<answer>", "").replace("</answer>", "") def load_dataset_subset(name: str, subset: str | None, split: str, subset_size: int) -> Dataset: """加载指定子集的数据集""" return Dataset.from_list( list(cast(IterableDataset, load_dataset(name, subset, split=split, streaming=True)).take(subset_size)) ) def prepare_math(subset_size: int) -> Dataset: if subset_size > 0: math = load_dataset_subset( name="nvidia/Nemotron-Post-Training-Dataset-v1", subset=None, split="math", subset_size=subset_size ) else: math = cast(Dataset, load_dataset("nvidia/Nemotron-Post-Training-Dataset-v1", split="math")) # 处理数学数据集 def prepare_messages(example: dict) -> dict: # 参考来源:https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v1 MATH_SYSTEM_PROMPT = ( "解决以下数学问题。请阐述推理过程,并将最终答案置于\boxed{}中。" ) assert len(example["messages"]) == 2 prompt, completion = example["messages"] assert len(prompt["tool_calls"]) == len(completion["tool_calls"]) == 0 del prompt["tool_calls"] del completion["tool_calls"] prompt = [{"role": "user", "content": f"{MATH_SYSTEM_PROMPT} {prompt['content']}"}] return {"prompt": prompt, "completion": [completion]} def add_source(_example: dict, index: int) -> dict: """添加数据集来源信息""" return {"source": {"dataset": "nvidia/Nemotron-Post-Training-Dataset-v1", "split": "math", "index": index}} return ( math.map(prepare_messages).map(add_source, with_indices=True).select_columns(["source", "prompt", "completion"]) ) def prepare_code(subset_size: int) -> Dataset: if subset_size > 0: code = load_dataset_subset( name="nvidia/Nemotron-Post-Training-Dataset-v1", subset=None, split="code", subset_size=subset_size ) else: code = cast(Dataset, load_dataset("nvidia/Nemotron-Post-Training-Dataset-v1", split="code")) # 处理代码数据集 hf_datasets = { "taco": load_dataset("BAAI/TACO", trust_remote_code=True), "apps": load_dataset("codeparrot/apps", trust_remote_code=True), "code_contests": load_dataset("deepmind/code_contests"), "open-r1/codeforces": load_dataset("open-r1/codeforces"), } def get_question(ds_name, split, index): # 参考来源:https://huggingface.co/datasets/nvidia/OpenCodeReasoning-2#how-to-use-it benchmark = hf_datasets[ds_name][split][int(index)] # type: ignore if ds_name == "code_contests": if not benchmark["description"]: return None return benchmark["description"] elif ds_name in ["taco", "apps"]: return benchmark["question"] elif ds_name == "open-r1/codeforces": if not benchmark["description"]: return None question = benchmark["description"] if benchmark["input_format"]: question += " 输入格式 " + benchmark["input_format"] if benchmark["output_format"]: question += " 输出格式 " + benchmark["output_format"] if benchmark["examples"]: question += " 示例" for example in benchmark["examples"]: if "input" in example: question += " 输入 " + example["input"] if "output" in example: question += " 输出 " + example["output"] if benchmark["note"]: question += " 备注 " + benchmark["note"] return question return None def prepare_messages(example: dict) -> dict: # 从外部数据集中提取提示词 metadata = json.loads(example["metadata"]) assert "dataset" in metadata and "split" in metadata and "index" in metadata ds_name, split, index = metadata["dataset"], metadata["split"], int(metadata["index"]) assert ds_name in list(hf_datasets.keys()) question = get_question(ds_name, split, index) assert question is not None assert example["messages"][0]["content"] == "-" # 准备提示与补全内容 CODE_SYSTEM_PROMPT = "为以下编程挑战编写解决方案。请简要阐述你的实现思路,随后给出完整代码。" prompt = [{"role": "user", "content": f"{CODE_SYSTEM_PROMPT} {question}"}] completion = example["messages"][1] assert len(completion["tool_calls"]) == 0 del completion["tool_calls"] return {"prompt": prompt, "completion": [completion]} def add_source(example: dict, index: int) -> dict: """添加数据集来源信息""" return {"source": {"dataset": "nvidia/OpenCodeReasoning-2", "split": "code", "index": index}} return ( code.map(prepare_messages).map(add_source, with_indices=True).select_columns(["source", "prompt", "completion"]) ) def prepare_science(subset_size: int) -> Dataset: if subset_size > 0: science = load_dataset_subset( name="nvidia/OpenScienceReasoning-2", subset=None, split="train", subset_size=subset_size ) else: science = cast(Dataset, load_dataset("nvidia/OpenScienceReasoning-2", split="train")) # 处理科学数据集 def prepare_messages(example: dict) -> dict: prompt = [{"role": "user", "content": example["input"]}] completion = [{"role": "assistant", "content": example["output"]}] return {"prompt": prompt, "completion": completion} def add_source(example: dict, index: int) -> dict: """添加数据集来源信息""" return {"source": {"dataset": "nvidia/OpenScienceReasoning-2", "split": "train", "index": index}} return ( science.map(prepare_messages) .map(add_source, with_indices=True) .select_columns(["source", "prompt", "completion"]) ) def prepare_if(subset_size: int) -> Dataset: # 从Hugging Face Hub下载JSONL格式文件 file_path = hf_hub_download( repo_id="a-m-team/AM-DeepSeek-R1-0528-Distilled", filename="if.jsonl", repo_type="dataset" ) # 加载JSONL文件 data = [] with open(file_path, "r") as f: for line in f: data.append(json.loads(line)) ifd = Dataset.from_list(data[:subset_size]) def prepare_messages(example: dict) -> dict: conversations = example["conversations"] assert len(conversations) == 2 assert conversations[0]["from"] == "human" assert conversations[1]["from"] == "assistant" assert isinstance(conversations[0]["value"], str) assert isinstance(conversations[1]["value"], str) prompt = [{"role": "user", "content": conversations[0]["value"]}] completion = [{"role": "assistant", "content": remove_answer_tags(conversations[1]["value"])}] return {"prompt": prompt, "completion": completion} def add_source(example: dict, index: int) -> dict: """添加数据集来源信息""" return { "source": { "dataset": "a-m-team/AM-DeepSeek-R1-0528-Distilled", "split": "train", "index": index, } } return ( ifd.map(prepare_messages).map(add_source, with_indices=True).select_columns(["source", "prompt", "completion"]) ) def prepare_chat(subset_size: int) -> Dataset: # 从Hugging Face Hub下载JSONL格式文件 file_path = hf_hub_download( repo_id="a-m-team/AM-DeepSeek-R1-0528-Distilled", filename="other.jsonl", repo_type="dataset" ) # 加载JSONL文件 data = [] with open(file_path, "r") as f: for line in f: data.append(json.loads(line)) chat = Dataset.from_list(data[:subset_size]) def prepare_messages(example: dict) -> dict: conversations = example["conversations"] assert len(conversations) == 2 assert conversations[0]["from"] == "human" assert conversations[1]["from"] == "assistant" assert isinstance(conversations[0]["value"], str) assert isinstance(conversations[1]["value"], str) prompt = [{"role": "user", "content": conversations[0]["value"]}] completion = [{"role": "assistant", "content": remove_answer_tags(conversations[1]["value"])}] return {"prompt": prompt, "completion": completion} def add_source(example: dict, index: int) -> dict: """添加数据集来源信息""" return { "source": { "dataset": "a-m-team/AM-DeepSeek-R1-0528-Distilled", "split": "train", "index": index, } } return ( chat.map(prepare_messages).map(add_source, with_indices=True).select_columns(["source", "prompt", "completion"]) ) def prepare_data(subset_size: int) -> DatasetDict: """整合所有数据集拆分""" return DatasetDict( { "math": prepare_math(subset_size), "code": prepare_code(subset_size), "science": prepare_science(subset_size), "if": prepare_if(subset_size), "chat": prepare_chat(subset_size), } ) def main(repo_name: str, push_to_hub: bool, subset_size: int): # 准备数据集 dataset = prepare_data(subset_size) print(f"✅ 已准备完成数据集,共包含{len(dataset):,}个拆分") for split in dataset: print(f" - 拆分 `{split}`,包含{len(dataset[split]):,}条样本") # 创建数据集卡片 _, dataset_name = repo_name.split("/") card_meta = DatasetCardData( pretty_name=dataset_name, license="apache-2.0", ) card = DatasetCard.from_template( card_data=card_meta, template_path="templates/CARD.md", dataset_name=dataset_name, cmd=f"uv run intellect-3-sft.py {' '.join(sys.argv[1:])}", source=Path(__file__).read_text(encoding="utf-8", errors="replace"), ) # 推送到Hugging Face Hub if push_to_hub: print(f"正在推送到仓库 `{repo_name}`") dataset.push_to_hub(repo_name) card.push_to_hub(repo_name, repo_type="dataset") print(f"✅ 已成功将数据集 `{repo_name}` 推送至Hugging Face Hub") else: print("ℹ️ 未执行推送操作。如需推送,请使用`--push-to-hub`或`-H`参数。") def check_write_access(org: str): is_authed = False try: info = whoami() token = info["auth"]["accessToken"]["displayName"] for entity in info["auth"]["accessToken"]["fineGrained"]["scoped"]: if entity["entity"]["name"] == org and "repo.write" in entity["permissions"]: is_authed = True except Exception: raise ValueError("❌ 未登录,请执行`hf auth login`或设置环境变量`HF_TOKEN=...`") if not is_authed: raise ValueError(f"❌ 当前令牌`{token}`不具备对`{org}`的写入权限") print(f"✅ 已确认令牌`{token}`具备对`{org}`的写入权限") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--username", "-U", default="PrimeIntellect", type=str, help="要推送至的用户名或组织名" ) parser.add_argument("--dataset-name", "-D", default="INTELLECT-3-SFT", type=str, help="数据集名称") parser.add_argument("--subset-size", "-S", default=1000, type=int, help="使用的子集样本数量") parser.add_argument("--push-to-hub", "-H", action="store_true", help="是否将数据集推送至Hub") args = parser.parse_args() # 验证参数合法性 assert len(args.dataset_name.split("/")) == 1, "数据集名称不得包含用户名" if args.push_to_hub: check_write_access(args.username) main(repo_name=f"{args.username}/{args.dataset_name}", push_to_hub=args.push_to_hub, subset_size=args.subset_size)
提供机构:
maas
创建时间:
2025-08-31
5,000+
优质数据集
54 个
任务类型
进入经典数据集
二维码
社区交流群

面向社区/商业的数据集话题

二维码
科研交流群

面向高校/科研机构的开源数据集话题

数据驱动未来

携手共赢发展

商业合作