KETI-AIR/kor_nq_open

Name: KETI-AIR/kor_nq_open
Creator: KETI-AIR
Published: 2023-12-05 06:39:46
License: cc-by-sa-3.0

Hugging Face · 2023-12-05 更新 · 2024-03-04 收录

下载链接：

https://hf-mirror.com/datasets/KETI-AIR/kor_nq_open

下载链接

链接失效反馈

官方服务：

资源简介：

---
configs:
- config_name: default
  data_files:
  - split: train
    path: data/train-*
  - split: validation
    path: data/validation-*
dataset_info:
  features:
  - name: data_index_by_user
    dtype: int32
  - name: question
    dtype: string
  - name: answer
    sequence: string
  splits:
  - name: train
    num_bytes: 8520218
    num_examples: 87925
  - name: validation
    num_bytes: 394518
    num_examples: 3610
  download_size: 5925491
  dataset_size: 8914736
license: cc-by-sa-3.0
---

# Dataset Card for "kor_nq_open"

[More Information needed](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)

# Source Data Citation Information

```
@article{doi:10.1162/tacl\_a\_00276,
  author = {Kwiatkowski, Tom and Palomaki, Jennimaria and Redfield, Olivia and Collins, Michael and Parikh, Ankur and Alberti, Chris and Epstein, Danielle and Polosukhin, Illia and Devlin, Jacob and Lee, Kenton and Toutanova, Kristina and Jones, Llion and Kelcey, Matthew and Chang, Ming-Wei and Dai, Andrew M. and Uszkoreit, Jakob and Le, Quoc and Petrov, Slav},
  title = {Natural Questions: A Benchmark for Question Answering Research},
  journal = {Transactions of the Association for Computational Linguistics},
  volume = {7},
  number = {},
  pages = {453-466},
  year = {2019},
  doi = {10.1162/tacl\_a\_00276},
  URL = { https://doi.org/10.1162/tacl_a_00276 },
  eprint = { https://doi.org/10.1162/tacl_a_00276 },
  abstract = { We present the Natural Questions corpus, a question answering data set. Questions consist of real anonymized, aggregated queries issued to the Google search engine. An annotator is presented with a question along with a Wikipedia page from the top 5 search results, and annotates a long answer (typically a paragraph) and a short answer (one or more entities) if present on the page, or marks null if no long/short answer is present. The public release consists of 307,373 training examples with single annotations; 7,830 examples with 5-way annotations for development data; and a further 7,842 examples with 5-way annotated sequestered as test data. We present experiments validating quality of the data. We also describe analysis of 25-way annotations on 302 examples, giving insights into human variability on the annotation task. We introduce robust metrics for the purposes of evaluating question answering systems; demonstrate high human upper bounds on these metrics; and establish baseline results using competitive methods drawn from related literature. }
}

@inproceedings{lee-etal-2019-latent,
  title = "Latent Retrieval for Weakly Supervised Open Domain Question Answering",
  author = "Lee, Kenton and Chang, Ming-Wei and Toutanova, Kristina",
  booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics",
  month = jul,
  year = "2019",
  address = "Florence, Italy",
  publisher = "Association for Computational Linguistics",
  url = "https://www.aclweb.org/anthology/P19-1612",
  doi = "10.18653/v1/P19-1612",
  pages = "6086--6096",
  abstract = "Recent work on open domain question answering (QA) assumes strong supervision of the supporting evidence and/or assumes a blackbox information retrieval (IR) system to retrieve evidence candidates. We argue that both are suboptimal, since gold evidence is not always available, and QA is fundamentally different from IR. We show for the first time that it is possible to jointly learn the retriever and reader from question-answer string pairs and without any IR system. In this setting, evidence retrieval from all of Wikipedia is treated as a latent variable. Since this is impractical to learn from scratch, we pre-train the retriever with an Inverse Cloze Task. We evaluate on open versions of five QA datasets. On datasets where the questioner already knows the answer, a traditional IR system such as BM25 is sufficient. On datasets where a user is genuinely seeking an answer, we show that learned retrieval is crucial, outperforming BM25 by up to 19 points in exact match.",
}
```

提供机构：

KETI-AIR

原始信息汇总

数据集概述

数据集配置

默认配置：
- 训练集：路径为 data/train-*
- 验证集：路径为 data/validation-*

数据特征

data_index_by_user：数据类型为 int32
question：数据类型为 string
answer：数据类型为 string 序列

数据集划分

训练集：
- 字节数：8520218
- 样本数：87925
验证集：
- 字节数：394518
- 样本数：3610

数据集大小

下载大小：5925491 字节
数据集大小：8914736 字节

许可

许可证：cc-by-sa-3.0

数据来源引用

% Source-data citation: the Natural Questions corpus paper (TACL, vol. 7, 2019).
@article{doi:10.1162/tacl_a_00276,
  author   = {Kwiatkowski, Tom and Palomaki, Jennimaria and Redfield, Olivia and Collins, Michael and Parikh, Ankur and Alberti, Chris and Epstein, Danielle and Polosukhin, Illia and Devlin, Jacob and Lee, Kenton and Toutanova, Kristina and Jones, Llion and Kelcey, Matthew and Chang, Ming-Wei and Dai, Andrew M. and Uszkoreit, Jakob and Le, Quoc and Petrov, Slav},
  title    = {{Natural Questions}: A Benchmark for Question Answering Research},
  journal  = {Transactions of the Association for Computational Linguistics},
  volume   = {7},
  pages    = {453--466},
  year     = {2019},
  doi      = {10.1162/tacl_a_00276},
  url      = {https://doi.org/10.1162/tacl_a_00276},
  abstract = {我们提出自然问题语料库，一个问答数据集。问题由向Google搜索引擎发出的真实匿名聚合查询组成。注释者会看到一个问题以及来自前5个搜索结果的维基百科页面，并在页面上注释一个长答案（通常是一个段落）和一个短答案（一个或多个实体），如果页面不存在长/短答案，则标记为空。公开发布包含307,373个单注释的训练样本；7,830个5向注释的开发数据样本；以及另外7,842个5向注释的测试数据样本。我们进行了实验，验证了数据的质量。我们还描述了302个样本的25向注释分析，为人类在注释任务上的变异性提供了见解。我们引入了用于评估问答系统的稳健指标；展示了人类在这些指标上的上限；并使用相关文献中的竞争方法建立了基线结果。},
}

% Source-data citation: the ACL 2019 paper on latent retrieval for open-domain QA.
@inproceedings{lee-etal-2019-latent,
  author    = {Lee, Kenton and Chang, Ming-Wei and Toutanova, Kristina},
  title     = {Latent Retrieval for Weakly Supervised Open Domain Question Answering},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Florence, Italy},
  month     = jul,
  year      = {2019},
  pages     = {6086--6096},
  doi       = {10.18653/v1/P19-1612},
  url       = {https://www.aclweb.org/anthology/P19-1612},
  abstract  = {最近关于开放域问答（QA）的工作假设了对支持证据的强监督和/或假设了一个黑盒信息检索（IR）系统来检索证据候选。我们认为这两者都是次优的，因为黄金证据并不总是可用，而且QA从根本上不同于IR。我们首次展示了可以从问题-答案字符串对中联合学习检索器和阅读器，而无需任何IR系统。在这种设置中，从所有维基百科中检索证据被视为一个潜在变量。由于从头开始学习是不切实际的，我们使用逆完形填空任务预训练了检索器。我们在五个QA数据集的开放版本上进行评估。在提问者已经知道答案的数据集上，传统的IR系统如BM25是足够的。在用户真正寻求答案的数据集上，我们展示了学习检索是至关重要的，比BM25的完全匹配高出19个百分点。},
}

5,000+

优质数据集

54 个

任务类型

进入经典数据集