shuyuej/metamath_gsm8k
收藏GSM8K训练集
数据集使用
运行以下命令加载数据:

```python
from datasets import load_dataset

dataset = load_dataset("shuyuej/metamath_gsm8k")
dataset = dataset["train"]
print(dataset)
```
数据集修改代码
python
# coding=utf-8
import re

import jsonlines
from datasets import load_dataset, Features, Value
def _pad_operator(sentence, operator):
    """Return *sentence* with a space on each side of interior *operator* chars.

    An occurrence at the very first or very last position is left untouched.
    A side that already has a space does not get a second one.
    """
    positions = [i for i, char in enumerate(sentence) if char == operator]
    # Walk right-to-left so an insertion never shifts a position that is
    # still waiting to be processed.
    for index in reversed(positions):
        if not 0 < index < len(sentence) - 1:
            continue
        left = "" if sentence[index - 1] == " " else " "
        right = "" if sentence[index + 1] == " " else " "
        sentence = (
            sentence[:index] + left + operator + right + sentence[index + 1:]
        )
    return sentence


def clean_up(sentence):
    """Normalize a GSM8K-style answer string.

    Steps, in order:

    1. In the (up to) 20 characters to the left of every "<<" marker,
       replace "x"/"X" with "*".
    2. Remove every "<<...>>" annotation.
    3. Put a space on each side of interior "*", "+", "-" and "=" characters.
    4. Turn a bare leading-dot decimal (" .5") into " 0.5".
    5. If no line ends with ".", add "." in front of every newline.

    Args:
        sentence: The answer text to clean.

    Returns:
        The cleaned text.
    """
    # Locate every "<<" up front; step 1 only swaps single characters, so
    # these offsets stay valid while the sentence is being rewritten.
    marker_positions = [m.start() for m in re.finditer(r"<<", sentence)]
    for end in marker_positions:
        # Clamp at 0: a negative start would wrap around and yield an empty
        # (or wrong) window when the marker sits in the first 20 characters.
        start = max(0, end - 20)
        window = sentence[start:end]
        if "x" in window or "X" in window:
            # NOTE(review): this also rewrites "x" inside ordinary words that
            # fall in the window (e.g. "six") - kept as in the original.
            fixed = window.replace("x", "*").replace("X", "*")
            # Splice in place so identical text elsewhere is not touched.
            sentence = sentence[:start] + fixed + sentence[end:]

    # Drop the calculator annotations between "<<" and ">>".
    sentence = re.sub(r"<<(.*?)>>", "", sentence)

    # Space out the arithmetic operators, one operator at a time.
    for operator in ("*", "+", "-", "="):
        sentence = _pad_operator(sentence, operator)

    # Prefix "0" to decimals written without a leading digit.
    dot_positions = [m.start() for m in re.finditer(r"\.", sentence)]
    for position in reversed(dot_positions):
        starts_token = position == 0 or sentence[position - 1].isspace()
        # Bounds check: a trailing "." has no following character.
        before_digit = (
            position + 1 < len(sentence) and sentence[position + 1].isdigit()
        )
        if starts_token and before_digit:
            sentence = sentence[:position] + "0" + sentence[position:]

    # If no line is terminated by ".", terminate every line with one.
    if ".\n" not in sentence:
        sentence = sentence.replace("\n", ".\n")

    return sentence
# Load the training split from the local "train.jsonl" file, with both
# fields typed as strings.
context_feat = Features(
    {
        "question": Value(dtype="string", id=None),
        "answer": Value(dtype="string", id=None),
    }
)
train_set = load_dataset(
    "json", data_files="train.jsonl", split="train", features=context_feat
)

data = []
for example in train_set:
    # The final numeric answer follows the "#### " marker; drop thousands
    # separators before converting it to an int.
    number = example["answer"].split("#### ")[1]
    number = int(number.replace(",", ""))
    # NOTE(review): the separator in front of "The answer is:" is shown as a
    # single space here; it may originally have been a newline - confirm.
    append = " The answer is: " + str(number)
    answer = example["answer"] + append
    answer = clean_up(sentence=answer)
    question = example["question"]
    data.append({"question": question, "answer": answer})

# Save the modified data to a jsonl file.
output_file = "gsm8k_train.jsonl"
with jsonlines.open(output_file, "w") as writer:
    writer.write_all(data)
print(f"Modified data saved to {output_file}")




