pietrolesci/mpe
数据集概述
原始数据集链接:这里。
数据集处理
与原始数据集相同的数据和分割。新增以下列:
- `premise`：将 `premise1`、`premise2`、`premise3` 和 `premise4` 以空格连接起来。
- `label`：使用以下映射对 `gold_label` 进行编码：`{"entailment": 0, "neutral": 1, "contradiction": 2}`。
数据集创建代码
python import pandas as pd from datasets import Features, Value, ClassLabel, Dataset, DatasetDict from pathlib import Path
# Read data: load every "*.txt" file found (recursively) under `path` into a
# DataFrame, keyed by split name.
path = Path("<path to files>")
datasets = {}
for dataset_path in path.rglob("*.txt"):
    # The premise/hypothesis fields hold multi-word sentences, so a
    # single-space separator would tear every sentence apart. The files are
    # presumably tab-delimited -- confirm against the raw data.
    df = pd.read_csv(dataset_path, sep="\t")
    # The split name is the part of the file name between the first "_" and
    # the following "." (a name shaped like "x_train.txt" yields "train").
    datasets[dataset_path.name.split("_")[1].split(".")[0]] = df
# Schema applied to every split. It does not depend on the split being
# processed, so it is built once instead of on every loop iteration.
features = Features({
    "premise1": Value(dtype="string", id=None),
    "premise2": Value(dtype="string", id=None),
    "premise3": Value(dtype="string", id=None),
    "premise4": Value(dtype="string", id=None),
    "premise": Value(dtype="string", id=None),
    "hypothesis": Value(dtype="string", id=None),
    "entailment_judgments": Value(dtype="int32"),
    "neutral_judgments": Value(dtype="int32"),
    "contradiction_judgments": Value(dtype="int32"),
    "gold_label": Value(dtype="string"),
    "label": ClassLabel(num_classes=3, names=["entailment", "neutral", "contradiction"]),
})

# Clean each raw split and convert it to a `Dataset`, keyed by split name.
ds = {}
for name, df_ in datasets.items():
    # Work on a copy so the raw frames in `datasets` are left untouched.
    df = df_.copy()

    # Fix parsing error for the dev split: one row has the judgment count and
    # the gold label fused into a single field ("3 contradiction"), which
    # leaves its `gold_label` empty.
    if name == "dev":
        df.loc[df["contradiction_judgments"] == "3 contradiction", "contradiction_judgments"] = 3
        df.loc[df["gold_label"].isna(), "gold_label"] = "contradiction"

    # Check that no NaNs remain
    assert df.isna().sum().sum() == 0

    # Fix dtypes: the judgment counts must be integers
    for col in ("entailment_judgments", "neutral_judgments", "contradiction_judgments"):
        df[col] = df[col].astype(int)

    # Fix premise columns: keep only what follows the first "/".
    # `n=1` splits on the first "/" only, so premise text that itself
    # contains a "/" is no longer truncated.
    premise_cols = [f"premise{i}" for i in range(1, 4 + 1)]
    for col in premise_cols:
        df[col] = df[col].str.split("/", n=1, expand=True)[1]
    # Concatenate the four premises into one space-separated string
    df["premise"] = df[premise_cols].agg(" ".join, axis=1)

    # Encode labels
    df["label"] = df["gold_label"].map({"entailment": 0, "neutral": 1, "contradiction": 2})

    # Cast to dataset
    ds[name] = Dataset.from_pandas(df, features=features)
# Push to hub: wrap the per-split datasets in a DatasetDict and upload it.
ds = DatasetDict(ds)
ds.push_to_hub(repo_id="mpe", token="<token>")
# Check overlap between splits: for every pair of splits, count the rows that
# share the same (premise, hypothesis, label) triple.
from itertools import combinations

for left, right in combinations(ds.keys(), 2):
    shared = pd.merge(
        ds[left].to_pandas(),
        ds[right].to_pandas(),
        on=["premise", "hypothesis", "label"],
        how="inner",
    )
    print(f"{left} - {right}: ", len(shared))
#> dev - test: 0
#> dev - train: 0
#> test - train: 0



