MAVEN-FACT
MAVEN-FACT 数据集概述
数据集简介
MAVEN-FACT 是一个基于 MAVEN 数据集的大规模高质量事件事实性检测数据集。它包含 112,276 个事件的事实性标注和非事实性事件的支持证据标注。
数据获取
数据集可以从 Google Drive 获取。
数据格式
每个 .jsonl 文件是 MAVEN-FACT 的一个子集,文件中的每一行是一个文档的 JSON 字符串。train.jsonl 和 valid.jsonl 的 JSON 格式示例如下:
json5
{
"id": "364ed14fc610df6e25a2f446e2b2d2ab", // 每个文档的唯一字符串
"title": "Expedition of the Thousand", // 文档标题
"document": "The Expedition of the Thousand ( Italian Spedizione dei Mille ) was an event of the Italian Risorgimento that took place in 1860 . a corps of volunteers led by giuseppe garibaldi sailed from quarto , near genoa ( now quarto dei mille ) and landed in marsala , sicily , in order to conquer the kingdom of the two sicilies , ruled by the house of bourbon-two sicilies . The project was an ambitious and risky venture aiming to conquer , with a thousand men , a kingdom with a larger regular army and a more powerful navy . The expedition was a success and concluded with a plebiscite that brought Naples and Sicily into the Kingdom of Sardinia , the last territorial conquest before the creation of the Kingdom of Italy on 17 March 1861 . The sea venture was the only desired action that was jointly decided by the four fathers of the nation Giuseppe Mazzini , Giuseppe Garibaldi , Victor Emmanuel II , and Camillo Cavour , pursuing divergent goals . However , the Expedition was instigated by Francesco Crispi , who utilized his political influence to bolster the Italian unification project . The various groups participated in the expedition for a variety of reasons : for Garibaldi , it was to achieve a united Italy ; to the Sicilian bourgeoisie , an independent Sicily as part of the kingdom of Italy , and for common people , land distribution and the end of oppression .", // 文档内容
"tokens": [ // 分词后的文档内容,每个元素是一个分词后的句子
[
"The", "project", "was", "an", "ambitious", "and", "risky", "venture",
"aiming", "to", "conquer", ",", "with", "a", "thousand", "men", ",",
"a", "kingdom", "with", "a", "larger", "regular", "army", "and", "a",
"more", "powerful", "navy", ".",
],
],
"sentences": [ // 未分词的句子,每个元素是一个句子(字符串)
"The project was an ambitious and risky venture aiming to conquer, with a thousand men, a kingdom with a larger regular army and a more powerful navy.",
],
"has_arguments": true, // 文档是否包含 arguments 属性
"events": [ // 标注的事件列表,每个元素是一个事件(共指链)
{
"id": "EVENT_c027e659d7fe424a0a57ecbe35b3a7f9", // 事件的唯一字符串
"type": "Conquering", // 事件类型
"type_id": 21, // 事件类型的数值 ID,与 MAVEN 一致
"mention": [ // 共指事件提及的列表,每个元素是一个字典,它们之间有共指关系
{
"id": "cfd1fa5450f7f4a3ce3d6ae48ca642d3", // 事件提及的唯一字符串
"trigger_word": "conquer", // 触发词或短语
"sent_id": 1, // 对应的句子索引,从 0 开始
"offset": [30,31], // 触发词在 tokens 列表中的偏移量
"factuality": "PS+", // 事件提及的事实性值
"evidence_word": ["in", "order", "to"], // 支持事实性值的词列表(仅对非事实性事件)
"evidence_offset": [ // 支持词的偏移量列表,每个元素是 [句子索引, 偏移量]
[1, 27], [1, 28], [1, 29]
]
},
],
"arguments": [ // 与事件相关的论元列表,每个元素是一个字典
{
"mentions": [ // 论元提及的列表
{
"mention": "a corps of volunteers led by giuseppe garibaldi", // 论元词或短语
"offset": [137, 184] // 论元提及在文档中的字符偏移量
}
],
"type": "Agent" // 论元类型
},
]
},
],
"TIMEX": [ // 标注的时间表达式列表,每个元素是一个字典
{
"id": "TIME_c61b2c2b8b8c6656a1cc8443fed8c58a", // 时间表达式的唯一字符串
"mention": "1860", // 时间表达式的提及
"type": "DATE", // 时间表达式的类型
"sent_id": 0, // 对应的句子索引,从 0 开始
"offset": [24, 25] // 时间表达式在 tokens 列表中的偏移量
},
],
"temporal_relations": { // 事件(和时间表达式)之间的时间关系列表
"BEFORE": [ // BEFORE 类型的时间关系列表
["EVENT_id_1", "EVENT_id_2"], // 时间关系实例,表示 EVENT_id_1 在 EVENT_id_2 之前
],
"OVERLAP": [ // 以下类型类似
["EVENT_id_1", "EVENT_id_2"],
],
"CONTAINS": [
["EVENT_id_1", "EVENT_id_2"],
],
"SIMULTANEOUS": [
["EVENT_id_1", "EVENT_id_2"],
],
"ENDS-ON": [
["EVENT_id_1", "EVENT_id_2"],
],
"BEGINS-ON": [
["EVENT_id_1", "EVENT_id_2"],
]
},
"causal_relation": { // 事件之间的因果关系列表
"CAUSE": [ // CAUSE 类型的因果关系列表
["EVENT_id_1", "EVENT_id_2"], // 因果关系实例,表示 EVENT_id_1 导致 EVENT_id_2
],
"PRECONDITION": [ // PRECONDITION 类型类似
["EVENT_id_1", "EVENT_id_2"],
]
},
"subevent_relations": [ // 事件之间的子事件关系列表
["EVENT_id_1", "EVENT_id_2"], // 子事件关系实例,表示 EVENT_id_2 是 EVENT_id_1 的子事件
]
}




