Felladrin/ChatML-WebGLM-QA
数据集概述
许可证
- Apache 2.0
任务类别
- 问答
- 文本生成
语言
- 英语
数据集大小
- 10K<n<100K
数据集格式
- ChatML
数据集转换代码
import random
import re

import pandas
from datasets import load_dataset
from transformers import AutoTokenizer

# Tokenizer whose chat template is applied to each conversation below.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="Felladrin/Llama-160M-Chat-v1"
)

# Training split of the source dataset that gets converted.
dataset = load_dataset("THUDM/webglm-qa", split="train")
def format(columns):
    """Render one dataset row as a chat-template-formatted string.

    Builds a two-message conversation: the user message holds the question
    and a bulleted list of references (question-first or context-first,
    chosen at random with equal probability); the assistant message holds
    the answer with bracketed numeric markers removed.

    NOTE: the name shadows the ``format`` builtin; it is kept because the
    rest of the script calls the function by this name.

    Args:
        columns: Mapping for a single row. The keys ``"question"`` and
            ``"answer"`` (strings) and ``"references"`` (an iterable of
            strings) are read.

    Returns:
        Whatever ``tokenizer.apply_chat_template(messages, tokenize=False)``
        returns for the two-message conversation.
    """
    # One "- item" bullet per reference, each on its own line.
    references = "\n".join(
        f"- {reference.strip()}" for reference in columns["references"]
    )
    question = columns["question"].strip()
    answer = columns["answer"].strip()

    # Strip bracketed digit runs such as "[1]" or "[12]" from the answer.
    # NOTE(review): presumably these are citation markers pointing at the
    # references - confirm against the source dataset.
    assistant_message = re.sub(r"\[\d+\]", "", answer)

    # Randomize the section order so neither layout dominates the output.
    if random.random() < 0.5:
        user_message = f"Question:\n{question}\n\nContext:\n{references}"
    else:
        user_message = f"Context:\n{references}\n\nQuestion:\n{question}"

    messages = [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": assistant_message},
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False)
# Convert every row to its chat-formatted text, then write the results as a
# single-column ("text") Parquet file without the DataFrame index.
rendered_rows = list(map(format, dataset))
frame = pandas.DataFrame({"text": rendered_rows})
frame.to_parquet("train.parquet", index=False)



