INTELLECT-3-SFT
收藏魔搭社区2025-12-03 更新2025-09-06 收录
下载链接:
https://modelscope.cn/datasets/PrimeIntellect/INTELLECT-3-SFT
下载链接
链接失效反馈官方服务:
资源简介:
# INTELLECT-3-SFT
<!-- Provide a quick summary of the dataset. -->
## Generation
This dataset was created by running
````bash
uv run intellect-3-sft.py -H -S -1
````
````python
# intellect-3-sft.py
# /// script
# requires-python = ">=3.12"
# dependencies = ["datasets==3.6.0", "jinja2"]
# ///
import argparse
import json
import sys
from pathlib import Path
from typing import cast
from huggingface_hub import DatasetCard, DatasetCardData, hf_hub_download, whoami
from datasets import Dataset, DatasetDict, IterableDataset, load_dataset
def remove_answer_tags(text: str) -> str:
    """Return ``text`` with every literal ``<answer>`` / ``</answer>`` tag removed.

    Only the tags themselves are dropped; whatever sits between them is kept.
    """
    stripped = text
    # Opening tags are removed first, then closing tags, one full pass each.
    for tag in ("<answer>", "</answer>"):
        stripped = stripped.replace(tag, "")
    return stripped
def load_dataset_subset(name: str, subset: str | None, split: str, subset_size: int) -> Dataset:
    """Load the first ``subset_size`` rows of a dataset split as a ``Dataset``.

    The split is opened with ``streaming=True``; the first ``subset_size`` rows
    are taken from that stream and converted with ``Dataset.from_list``.

    Args:
        name: Dataset repository id passed to ``load_dataset``.
        subset: Optional configuration name passed to ``load_dataset``.
        split: Name of the split to read.
        subset_size: Number of leading rows to take from the stream.
    """
    stream = cast(IterableDataset, load_dataset(name, subset, split=split, streaming=True))
    head_rows = list(stream.take(subset_size))
    return Dataset.from_list(head_rows)
def prepare_math(subset_size: int) -> Dataset:
    """Build the ``math`` split from ``nvidia/Nemotron-Post-Training-Dataset-v1``.

    Args:
        subset_size: Number of leading rows to load. Any value ``<= 0`` loads
            the complete ``math`` split instead.

    Returns:
        A dataset restricted to the columns ``source``, ``prompt`` and
        ``completion``.
    """
    repo_id = "nvidia/Nemotron-Post-Training-Dataset-v1"
    split_name = "math"
    # From: https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v1
    instruction = "Solve the following math problem. Explain your reasoning and put the final answer in \\boxed{}."

    if subset_size > 0:
        rows = load_dataset_subset(name=repo_id, subset=None, split=split_name, subset_size=subset_size)
    else:
        rows = cast(Dataset, load_dataset(repo_id, split=split_name))

    def to_prompt_completion(example: dict) -> dict:
        """Turn a two-message row into ``prompt`` / ``completion`` message lists."""
        turns = example["messages"]
        assert len(turns) == 2
        user_turn, assistant_turn = turns
        # Both turns must carry an empty ``tool_calls`` list, which is then dropped.
        assert len(user_turn["tool_calls"]) == len(assistant_turn["tool_calls"]) == 0
        del user_turn["tool_calls"]
        del assistant_turn["tool_calls"]
        # The instruction is prepended to the problem inside the user message.
        user_message = {"role": "user", "content": f"{instruction}\n\n{user_turn['content']}"}
        return {"prompt": [user_message], "completion": [assistant_turn]}

    def tag_source(_example: dict, index: int) -> dict:
        """Record which dataset, split and row position the example came from."""
        return {"source": {"dataset": repo_id, "split": split_name, "index": index}}

    with_messages = rows.map(to_prompt_completion)
    with_source = with_messages.map(tag_source, with_indices=True)
    return with_source.select_columns(["source", "prompt", "completion"])
def prepare_code(subset_size: int) -> Dataset:
    """Build the ``code`` split from ``nvidia/Nemotron-Post-Training-Dataset-v1``.

    Each row's first message is asserted to be the placeholder ``"-"``; the
    real problem statement is looked up in one of four benchmark datasets
    using the ``dataset`` / ``split`` / ``index`` fields of the row's JSON
    ``metadata``.

    Args:
        subset_size: Number of leading rows to load. Any value ``<= 0`` loads
            the complete ``code`` split instead.

    Returns:
        A dataset restricted to the columns ``source``, ``prompt`` and
        ``completion``.
    """
    if subset_size > 0:
        rows = load_dataset_subset(
            name="nvidia/Nemotron-Post-Training-Dataset-v1", subset=None, split="code", subset_size=subset_size
        )
    else:
        rows = cast(Dataset, load_dataset("nvidia/Nemotron-Post-Training-Dataset-v1", split="code"))

    # Benchmark datasets that hold the problem statements, keyed by the name
    # used in each row's metadata.
    problem_sets = {
        "taco": load_dataset("BAAI/TACO", trust_remote_code=True),
        "apps": load_dataset("codeparrot/apps", trust_remote_code=True),
        "code_contests": load_dataset("deepmind/code_contests"),
        "open-r1/codeforces": load_dataset("open-r1/codeforces"),
    }

    def lookup_question(ds_name, split, index):
        """Return the problem statement for a benchmark row, or ``None`` if unavailable."""
        # From https://huggingface.co/datasets/nvidia/OpenCodeReasoning-2#how-to-use-it
        record = problem_sets[ds_name][split][int(index)]  # type: ignore
        if ds_name == "code_contests":
            return record["description"] or None
        if ds_name in ("taco", "apps"):
            return record["question"]
        if ds_name != "open-r1/codeforces":
            return None
        if not record["description"]:
            return None

        # Codeforces rows store the statement in pieces; stitch them together.
        sections = [record["description"]]
        if record["input_format"]:
            sections.append("\n\nInput\n\n" + record["input_format"])
        if record["output_format"]:
            sections.append("\n\nOutput\n\n" + record["output_format"])
        if record["examples"]:
            sections.append("\n\nExamples")
            for sample in record["examples"]:
                if "input" in sample:
                    sections.append("\n\nInput\n\n" + sample["input"])
                if "output" in sample:
                    sections.append("\n\nOutput\n\n" + sample["output"])
        if record["note"]:
            sections.append("\n\nNote\n\n" + record["note"])
        return "".join(sections)

    def to_prompt_completion(example: dict) -> dict:
        """Resolve the external question and pair it with the stored completion."""
        # Extract prompt from external dataset
        meta = json.loads(example["metadata"])
        assert "dataset" in meta and "split" in meta and "index" in meta
        ds_name = meta["dataset"]
        split = meta["split"]
        index = int(meta["index"])
        assert ds_name in problem_sets
        question = lookup_question(ds_name, split, index)
        assert question is not None
        assert example["messages"][0]["content"] == "-"

        # Prepare prompt and completion
        instruction = "Write a solution for the following programming challenge. Provide a brief explanation of your approach, followed by the complete code."
        user_message = {"role": "user", "content": f"{instruction}\n\n{question}"}
        answer = example["messages"][1]
        assert len(answer["tool_calls"]) == 0
        del answer["tool_calls"]
        return {"prompt": [user_message], "completion": [answer]}

    def tag_source(example: dict, index: int) -> dict:
        """Record the provenance label and row position of the example."""
        # NOTE(review): rows are loaded from nvidia/Nemotron-Post-Training-Dataset-v1
        # (split "code") but are labelled nvidia/OpenCodeReasoning-2 here —
        # confirm this label is intended.
        return {"source": {"dataset": "nvidia/OpenCodeReasoning-2", "split": "code", "index": index}}

    with_messages = rows.map(to_prompt_completion)
    with_source = with_messages.map(tag_source, with_indices=True)
    return with_source.select_columns(["source", "prompt", "completion"])
def prepare_science(subset_size: int) -> Dataset:
    """Build the ``science`` split from ``nvidia/OpenScienceReasoning-2``.

    Args:
        subset_size: Number of leading rows to load. Any value ``<= 0`` loads
            the complete ``train`` split instead.

    Returns:
        A dataset restricted to the columns ``source``, ``prompt`` and
        ``completion``.
    """
    repo_id = "nvidia/OpenScienceReasoning-2"
    split_name = "train"

    if subset_size > 0:
        rows = load_dataset_subset(name=repo_id, subset=None, split=split_name, subset_size=subset_size)
    else:
        rows = cast(Dataset, load_dataset(repo_id, split=split_name))

    def to_prompt_completion(example: dict) -> dict:
        """Wrap the row's ``input`` / ``output`` fields as user / assistant messages."""
        user_message = {"role": "user", "content": example["input"]}
        assistant_message = {"role": "assistant", "content": example["output"]}
        return {"prompt": [user_message], "completion": [assistant_message]}

    def tag_source(example: dict, index: int) -> dict:
        """Record which dataset, split and row position the example came from."""
        return {"source": {"dataset": repo_id, "split": split_name, "index": index}}

    with_messages = rows.map(to_prompt_completion)
    with_source = with_messages.map(tag_source, with_indices=True)
    return with_source.select_columns(["source", "prompt", "completion"])
def _prepare_am_distilled(filename: str, subset_size: int) -> Dataset:
    """Load one JSONL file of ``a-m-team/AM-DeepSeek-R1-0528-Distilled`` as prompt/completion rows.

    Args:
        filename: Name of the JSONL file inside the dataset repository.
        subset_size: Number of leading records to keep. Any value ``<= 0``
            keeps every record, matching how the other ``prepare_*`` functions
            treat a non-positive size.

    Returns:
        A dataset restricted to the columns ``source``, ``prompt`` and
        ``completion``.
    """
    repo_id = "a-m-team/AM-DeepSeek-R1-0528-Distilled"

    # Download the JSONL file
    file_path = hf_hub_download(repo_id=repo_id, filename=filename, repo_type="dataset")

    # A plain ``data[:subset_size]`` slice would drop the last record for -1 and
    # return nothing for 0, so a non-positive size is mapped to "no limit".
    limit = subset_size if subset_size > 0 else None

    # Load the JSONL file, stopping early once the requested number of records is read.
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if limit is not None and len(data) >= limit:
                break
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            data.append(json.loads(line))
    rows = Dataset.from_list(data)

    def prepare_messages(example: dict) -> dict:
        """Convert a two-turn human/assistant conversation into message lists."""
        conversations = example["conversations"]
        assert len(conversations) == 2
        assert conversations[0]["from"] == "human"
        assert conversations[1]["from"] == "assistant"
        assert isinstance(conversations[0]["value"], str)
        assert isinstance(conversations[1]["value"], str)
        prompt = [{"role": "user", "content": conversations[0]["value"]}]
        # ``<answer>`` / ``</answer>`` tags are stripped from the assistant reply.
        completion = [{"role": "assistant", "content": remove_answer_tags(conversations[1]["value"])}]
        return {"prompt": prompt, "completion": completion}

    def add_source(example: dict, index: int) -> dict:
        """Record the source repository, split label and row position."""
        return {
            "source": {
                "dataset": repo_id,
                "split": "train",
                "index": index,
            }
        }

    return (
        rows.map(prepare_messages).map(add_source, with_indices=True).select_columns(["source", "prompt", "completion"])
    )


def prepare_if(subset_size: int) -> Dataset:
    """Build the ``if`` split from ``if.jsonl`` of ``a-m-team/AM-DeepSeek-R1-0528-Distilled``.

    Args:
        subset_size: Number of leading records to keep; ``<= 0`` keeps all.
    """
    return _prepare_am_distilled("if.jsonl", subset_size)


def prepare_chat(subset_size: int) -> Dataset:
    """Build the ``chat`` split from ``other.jsonl`` of ``a-m-team/AM-DeepSeek-R1-0528-Distilled``.

    Args:
        subset_size: Number of leading records to keep; ``<= 0`` keeps all.
    """
    return _prepare_am_distilled("other.jsonl", subset_size)
def prepare_data(subset_size: int) -> DatasetDict:
    """Assemble all five splits into a single ``DatasetDict``.

    The splits are built in the order ``math``, ``code``, ``science``, ``if``,
    ``chat``, each receiving the same ``subset_size``.
    """
    builders = (
        ("math", prepare_math),
        ("code", prepare_code),
        ("science", prepare_science),
        ("if", prepare_if),
        ("chat", prepare_chat),
    )
    return DatasetDict({split_name: build(subset_size) for split_name, build in builders})
def main(repo_name: str, push_to_hub: bool, subset_size: int):
    """Prepare the dataset, render its card and optionally push both to the HF Hub.

    Args:
        repo_name: Target repository in ``<owner>/<dataset>`` form; it must
            contain exactly one ``/``.
        push_to_hub: When true, the dataset and the card are pushed to
            ``repo_name``; otherwise only a hint is printed.
        subset_size: Forwarded unchanged to ``prepare_data``.
    """
    # Prepare dataset
    dataset = prepare_data(subset_size)
    print(f"✅ Prepared dataset with {len(dataset):,} split")
    for split_name, split_rows in dataset.items():
        print(f" - Split `{split_name}` with {len(split_rows):,} samples")

    # Create dataset card (rendered even when nothing is pushed)
    _, dataset_name = repo_name.split("/")
    card_meta = DatasetCardData(
        pretty_name=dataset_name,
        license="apache-2.0",
    )
    invocation = f"uv run intellect-3-sft.py {' '.join(sys.argv[1:])}"
    script_source = Path(__file__).read_text(encoding="utf-8", errors="replace")
    card = DatasetCard.from_template(
        card_data=card_meta,
        template_path="templates/CARD.md",
        dataset_name=dataset_name,
        cmd=invocation,
        source=script_source,
    )

    # Push to HF hub
    if not push_to_hub:
        print("ℹ️ Skipped pushing to HF Hub. To push, use the `--push-to-hub` or `-H` flag.")
        return

    print(f"Pushing to `{repo_name}`")
    dataset.push_to_hub(repo_name)
    card.push_to_hub(repo_name, repo_type="dataset")
    print(f"✅ Pushed dataset `{repo_name}` to HF Hub")
def check_write_access(org: str):
    """Verify that the active Hugging Face token may write to ``org``.

    The token information returned by ``whoami()`` is searched for a scoped
    fine-grained entry whose entity name equals ``org`` and whose permissions
    include ``repo.write``. On success a confirmation line is printed.

    Args:
        org: Name of the user or organisation that will own the dataset.

    Raises:
        ValueError: If ``whoami()`` fails (reported as not being logged in),
            if the token information lacks the fine-grained permission fields
            read here, or if no scoped entry grants ``repo.write`` on ``org``.
    """
    # Only the ``whoami()`` call is reported as "not logged in"; the original
    # cause is chained so network or auth errors stay visible.
    try:
        info = whoami()
    except Exception as err:
        raise ValueError("❌ You are not logged in. Please run `hf auth login` or `export HF_TOKEN=...`") from err

    # A payload without these keys (presumably a token that is not
    # fine-grained — TODO confirm against the whoami response schema) is a
    # permissions problem, not a login problem, so it gets its own message.
    try:
        access_token = info["auth"]["accessToken"]
        token = access_token["displayName"]
        is_authed = any(
            entity["entity"]["name"] == org and "repo.write" in entity["permissions"]
            for entity in access_token["fineGrained"]["scoped"]
        )
    except (KeyError, TypeError) as err:
        raise ValueError(
            "❌ Could not read fine-grained permissions from your current token. "
            f"Please use a fine-grained token with `repo.write` access to `{org}`"
        ) from err

    if not is_authed:
        raise ValueError(f"❌ Your current token `{token}` does not have write access to `{org}`")
    print(f"✅ Confirmed write access with token `{token}` to `{org}`")
if __name__ == "__main__":
    # Command-line entry point: parse the flags, validate them, then run `main`.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--username", "-U", default="PrimeIntellect", type=str, help="The username to push the dataset to."
    )
    parser.add_argument("--dataset-name", "-D", default="INTELLECT-3-SFT", type=str, help="The dataset name.")
    parser.add_argument("--subset-size", "-S", default=1000, type=int, help="The subset size to use.")
    parser.add_argument("--push-to-hub", "-H", action="store_true", help="Whether to push the dataset to the hub.")
    args = parser.parse_args()

    # Validate args. `parser.error` is used instead of `assert` so the check
    # still runs under `python -O` and the user gets a usage message.
    if "/" in args.dataset_name:
        parser.error("Dataset name must not include the username")

    # Check credentials before the expensive dataset preparation starts.
    if args.push_to_hub:
        check_write_access(args.username)

    main(repo_name=f"{args.username}/{args.dataset_name}", push_to_hub=args.push_to_hub, subset_size=args.subset_size)
````
# INTELLECT-3-SFT
<!-- 请简要概述该数据集。 -->
## 数据生成
本数据集通过执行以下命令生成:
bash
uv run intellect-3-sft.py -H -S -1
python
# intellect-3-sft.py
# /// script
# requires-python = ">=3.12"  # 依赖的 Python 版本
# dependencies = ["datasets==3.6.0", "jinja2"]  # 依赖包
# ///
import argparse
import json
import sys
from pathlib import Path
from typing import cast
from huggingface_hub import DatasetCard, DatasetCardData, hf_hub_download, whoami  # 数据集卡片、数据集卡片数据
from datasets import Dataset, DatasetDict, IterableDataset, load_dataset  # 数据集、数据集字典、可迭代数据集、加载数据集函数
def remove_answer_tags(text: str) -> str:
"""移除文本中的<answer>与</answer>标签"""
return text.replace("<answer>", "").replace("</answer>", "")
def load_dataset_subset(name: str, subset: str | None, split: str, subset_size: int) -> Dataset:
"""加载指定子集的数据集"""
return Dataset.from_list(
list(cast(IterableDataset, load_dataset(name, subset, split=split, streaming=True)).take(subset_size))
)
def prepare_math(subset_size: int) -> Dataset:
if subset_size > 0:
math = load_dataset_subset(
name="nvidia/Nemotron-Post-Training-Dataset-v1", subset=None, split="math", subset_size=subset_size
)
else:
math = cast(Dataset, load_dataset("nvidia/Nemotron-Post-Training-Dataset-v1", split="math"))
# 处理数学数据集
def prepare_messages(example: dict) -> dict:
# 参考来源:https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v1
MATH_SYSTEM_PROMPT = (
"解决以下数学问题。请阐述推理过程,并将最终答案置于\\boxed{}中。"
)
assert len(example["messages"]) == 2
prompt, completion = example["messages"]
assert len(prompt["tool_calls"]) == len(completion["tool_calls"]) == 0
del prompt["tool_calls"]
del completion["tool_calls"]
prompt = [{"role": "user", "content": f"{MATH_SYSTEM_PROMPT}\n\n{prompt['content']}"}]
return {"prompt": prompt, "completion": [completion]}
def add_source(_example: dict, index: int) -> dict:
"""添加数据集来源信息"""
return {"source": {"dataset": "nvidia/Nemotron-Post-Training-Dataset-v1", "split": "math", "index": index}}
return (
math.map(prepare_messages).map(add_source, with_indices=True).select_columns(["source", "prompt", "completion"])
)
def prepare_code(subset_size: int) -> Dataset:
if subset_size > 0:
code = load_dataset_subset(
name="nvidia/Nemotron-Post-Training-Dataset-v1", subset=None, split="code", subset_size=subset_size
)
else:
code = cast(Dataset, load_dataset("nvidia/Nemotron-Post-Training-Dataset-v1", split="code"))
# 处理代码数据集
hf_datasets = {
"taco": load_dataset("BAAI/TACO", trust_remote_code=True),
"apps": load_dataset("codeparrot/apps", trust_remote_code=True),
"code_contests": load_dataset("deepmind/code_contests"),
"open-r1/codeforces": load_dataset("open-r1/codeforces"),
}
def get_question(ds_name, split, index):
# 参考来源:https://huggingface.co/datasets/nvidia/OpenCodeReasoning-2#how-to-use-it
benchmark = hf_datasets[ds_name][split][int(index)] # type: ignore
if ds_name == "code_contests":
if not benchmark["description"]:
return None
return benchmark["description"]
elif ds_name in ["taco", "apps"]:
return benchmark["question"]
elif ds_name == "open-r1/codeforces":
if not benchmark["description"]:
return None
question = benchmark["description"]
if benchmark["input_format"]:
question += "\n\n输入格式\n\n" + benchmark["input_format"]
if benchmark["output_format"]:
question += "\n\n输出格式\n\n" + benchmark["output_format"]
if benchmark["examples"]:
question += "\n\n示例"
for example in benchmark["examples"]:
if "input" in example:
question += "\n\n输入\n\n" + example["input"]
if "output" in example:
question += "\n\n输出\n\n" + example["output"]
if benchmark["note"]:
question += "\n\n备注\n\n" + benchmark["note"]
return question
return None
def prepare_messages(example: dict) -> dict:
# 从外部数据集中提取提示词
metadata = json.loads(example["metadata"])
assert "dataset" in metadata and "split" in metadata and "index" in metadata
ds_name, split, index = metadata["dataset"], metadata["split"], int(metadata["index"])
assert ds_name in list(hf_datasets.keys())
question = get_question(ds_name, split, index)
assert question is not None
assert example["messages"][0]["content"] == "-"
# 准备提示与补全内容
CODE_SYSTEM_PROMPT = "为以下编程挑战编写解决方案。请简要阐述你的实现思路,随后给出完整代码。"
prompt = [{"role": "user", "content": f"{CODE_SYSTEM_PROMPT}\n\n{question}"}]
completion = example["messages"][1]
assert len(completion["tool_calls"]) == 0
del completion["tool_calls"]
return {"prompt": prompt, "completion": [completion]}
def add_source(example: dict, index: int) -> dict:
"""添加数据集来源信息"""
return {"source": {"dataset": "nvidia/OpenCodeReasoning-2", "split": "code", "index": index}}
return (
code.map(prepare_messages).map(add_source, with_indices=True).select_columns(["source", "prompt", "completion"])
)
def prepare_science(subset_size: int) -> Dataset:
if subset_size > 0:
science = load_dataset_subset(
name="nvidia/OpenScienceReasoning-2", subset=None, split="train", subset_size=subset_size
)
else:
science = cast(Dataset, load_dataset("nvidia/OpenScienceReasoning-2", split="train"))
# 处理科学数据集
def prepare_messages(example: dict) -> dict:
prompt = [{"role": "user", "content": example["input"]}]
completion = [{"role": "assistant", "content": example["output"]}]
return {"prompt": prompt, "completion": completion}
def add_source(example: dict, index: int) -> dict:
"""添加数据集来源信息"""
return {"source": {"dataset": "nvidia/OpenScienceReasoning-2", "split": "train", "index": index}}
return (
science.map(prepare_messages)
.map(add_source, with_indices=True)
.select_columns(["source", "prompt", "completion"])
)
def prepare_if(subset_size: int) -> Dataset:
# 从Hugging Face Hub下载JSONL格式文件
file_path = hf_hub_download(
repo_id="a-m-team/AM-DeepSeek-R1-0528-Distilled", filename="if.jsonl", repo_type="dataset"
)
# 加载JSONL文件
data = []
with open(file_path, "r") as f:
for line in f:
data.append(json.loads(line))
ifd = Dataset.from_list(data[:subset_size])
def prepare_messages(example: dict) -> dict:
conversations = example["conversations"]
assert len(conversations) == 2
assert conversations[0]["from"] == "human"
assert conversations[1]["from"] == "assistant"
assert isinstance(conversations[0]["value"], str)
assert isinstance(conversations[1]["value"], str)
prompt = [{"role": "user", "content": conversations[0]["value"]}]
completion = [{"role": "assistant", "content": remove_answer_tags(conversations[1]["value"])}]
return {"prompt": prompt, "completion": completion}
def add_source(example: dict, index: int) -> dict:
"""添加数据集来源信息"""
return {
"source": {
"dataset": "a-m-team/AM-DeepSeek-R1-0528-Distilled",
"split": "train",
"index": index,
}
}
return (
ifd.map(prepare_messages).map(add_source, with_indices=True).select_columns(["source", "prompt", "completion"])
)
def prepare_chat(subset_size: int) -> Dataset:
# 从Hugging Face Hub下载JSONL格式文件
file_path = hf_hub_download(
repo_id="a-m-team/AM-DeepSeek-R1-0528-Distilled", filename="other.jsonl", repo_type="dataset"
)
# 加载JSONL文件
data = []
with open(file_path, "r") as f:
for line in f:
data.append(json.loads(line))
chat = Dataset.from_list(data[:subset_size])
def prepare_messages(example: dict) -> dict:
conversations = example["conversations"]
assert len(conversations) == 2
assert conversations[0]["from"] == "human"
assert conversations[1]["from"] == "assistant"
assert isinstance(conversations[0]["value"], str)
assert isinstance(conversations[1]["value"], str)
prompt = [{"role": "user", "content": conversations[0]["value"]}]
completion = [{"role": "assistant", "content": remove_answer_tags(conversations[1]["value"])}]
return {"prompt": prompt, "completion": completion}
def add_source(example: dict, index: int) -> dict:
"""添加数据集来源信息"""
return {
"source": {
"dataset": "a-m-team/AM-DeepSeek-R1-0528-Distilled",
"split": "train",
"index": index,
}
}
return (
chat.map(prepare_messages).map(add_source, with_indices=True).select_columns(["source", "prompt", "completion"])
)
def prepare_data(subset_size: int) -> DatasetDict:
"""整合所有数据集拆分"""
return DatasetDict(
{
"math": prepare_math(subset_size),
"code": prepare_code(subset_size),
"science": prepare_science(subset_size),
"if": prepare_if(subset_size),
"chat": prepare_chat(subset_size),
}
)
def main(repo_name: str, push_to_hub: bool, subset_size: int):
# 准备数据集
dataset = prepare_data(subset_size)
print(f"✅ 已准备完成数据集,共包含{len(dataset):,}个拆分")
for split in dataset:
print(f" - 拆分 `{split}`,包含{len(dataset[split]):,}条样本")
# 创建数据集卡片
_, dataset_name = repo_name.split("/")
card_meta = DatasetCardData(
pretty_name=dataset_name,
license="apache-2.0",
)
card = DatasetCard.from_template(
card_data=card_meta,
template_path="templates/CARD.md",
dataset_name=dataset_name,
cmd=f"uv run intellect-3-sft.py {' '.join(sys.argv[1:])}",
source=Path(__file__).read_text(encoding="utf-8", errors="replace"),
)
# 推送到Hugging Face Hub
if push_to_hub:
print(f"正在推送到仓库 `{repo_name}`")
dataset.push_to_hub(repo_name)
card.push_to_hub(repo_name, repo_type="dataset")
print(f"✅ 已成功将数据集 `{repo_name}` 推送至Hugging Face Hub")
else:
print("ℹ️ 未执行推送操作。如需推送,请使用`--push-to-hub`或`-H`参数。")
def check_write_access(org: str):
is_authed = False
try:
info = whoami()
token = info["auth"]["accessToken"]["displayName"]
for entity in info["auth"]["accessToken"]["fineGrained"]["scoped"]:
if entity["entity"]["name"] == org and "repo.write" in entity["permissions"]:
is_authed = True
except Exception:
raise ValueError("❌ 未登录,请执行`hf auth login`或设置环境变量`HF_TOKEN=...`")
if not is_authed:
raise ValueError(f"❌ 当前令牌`{token}`不具备对`{org}`的写入权限")
print(f"✅ 已确认令牌`{token}`具备对`{org}`的写入权限")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--username", "-U", default="PrimeIntellect", type=str, help="要推送至的用户名或组织名"
)
parser.add_argument("--dataset-name", "-D", default="INTELLECT-3-SFT", type=str, help="数据集名称")
parser.add_argument("--subset-size", "-S", default=1000, type=int, help="使用的子集样本数量")
parser.add_argument("--push-to-hub", "-H", action="store_true", help="是否将数据集推送至Hub")
args = parser.parse_args()
# 验证参数合法性
assert len(args.dataset_name.split("/")) == 1, "数据集名称不得包含用户名"
if args.push_to_hub:
check_write_access(args.username)
main(repo_name=f"{args.username}/{args.dataset_name}", push_to_hub=args.push_to_hub, subset_size=args.subset_size)
提供机构:
maas
创建时间:
2025-08-31



