answerdotai/STITCH
收藏 | Hugging Face | 2024-04-19 更新 | 2024-06-12 收录
下载链接:
https://hf-mirror.com/datasets/answerdotai/STITCH
下载链接
链接失效反馈 | 官方服务:
资源简介:
---
dataset_info:
- config_name: all_corpus
  features:
  - name: id
    dtype: string
  - name: content
    dtype: string
  - name: content_translated
    dtype: string
  splits:
  - name: test
    num_bytes: 5761705
    num_examples: 2799
  download_size: 0
  dataset_size: 5761705
- config_name: all_known_questions
  features:
  - name: question
    dtype: string
  - name: category
    dtype: string
  - name: subcategory
    dtype: string
  - name: correct
    dtype: string
  - name: incorrect
    sequence: string
  - name: noise_article_ids
    sequence: int64
  - name: relevant_doc_ids
    sequence: int64
  - name: correct_letter
    dtype: string
  - name: answers_string
    dtype: string
  - name: question_translated
    dtype: string
  - name: correct_translated
    dtype: string
  - name: incorrect_translated
    sequence: string
  - name: answers_string_translated
    dtype: string
  - name: subset
    dtype: string
  - name: document
    dtype: string
  - name: noise_ids
    sequence: int64
  splits:
  - name: test
    num_bytes: 498457
    num_examples: 108
  download_size: 0
  dataset_size: 498457
- config_name: all_questions
  features:
  - name: question
    dtype: string
  - name: category
    dtype: string
  - name: subcategory
    dtype: string
  - name: correct
    dtype: string
  - name: incorrect
    sequence: string
  - name: noise_article_ids
    sequence: int64
  - name: relevant_doc_ids
    sequence: int64
  - name: correct_letter
    dtype: string
  - name: answers_string
    dtype: string
  - name: question_translated
    dtype: string
  - name: correct_translated
    dtype: string
  - name: incorrect_translated
    sequence: string
  - name: answers_string_translated
    dtype: string
  - name: subset
    dtype: string
  - name: document
    dtype: string
  - name: full_document
    dtype: string
  - name: noise_docs
    dtype: 'null'
  - name: noise_ids
    sequence: int64
  splits:
  - name: test
    num_bytes: 1674811
    num_examples: 162
  download_size: 0
  dataset_size: 1674811
- config_name: biomrc_corpus
  features:
  - name: id
    dtype: string
  - name: content
    dtype: string
  splits:
  - name: test
    num_bytes: 1866866
    num_examples: 915
  download_size: 0
  dataset_size: 1866866
- config_name: biomrc_known_questions
  features:
  - name: question
    dtype: string
  - name: correct
    dtype: string
  - name: incorrect
    sequence: string
  - name: answers_string
    dtype: string
  - name: correct_letter
    dtype: string
  - name: document
    dtype: string
  - name: noise_ids
    sequence: int64
  - name: subset
    dtype: string
  splits:
  - name: test
    num_bytes: 227169
    num_examples: 54
  download_size: 0
  dataset_size: 227169
- config_name: biomrc_questions
  features:
  - name: question
    dtype: string
  - name: correct
    dtype: string
  - name: incorrect
    sequence: string
  - name: answers_string
    dtype: string
  - name: correct_letter
    dtype: string
  - name: document
    dtype: string
  - name: noise_ids
    sequence: int64
  - name: subset
    dtype: string
  splits:
  - name: test
    num_bytes: 210272
    num_examples: 54
  download_size: 0
  dataset_size: 210272
- config_name: bsard_corpus
  features:
  - name: id
    dtype: string
  - name: content
    dtype: string
  - name: content_translated
    dtype: string
  splits:
  - name: test
    num_bytes: 3447233
    num_examples: 1858
  download_size: 0
  dataset_size: 3447233
- config_name: bsard_known_questions
  features:
  - name: question
    dtype: string
  - name: category
    dtype: string
  - name: subcategory
    dtype: string
  - name: correct
    dtype: string
  - name: incorrect
    sequence: string
  - name: noise_article_ids
    sequence: int64
  - name: relevant_doc_ids
    sequence: int64
  - name: correct_letter
    dtype: string
  - name: answers_string
    dtype: string
  - name: question_translated
    dtype: string
  - name: correct_translated
    dtype: string
  - name: incorrect_translated
    sequence: string
  - name: answers_string_translated
    dtype: string
  - name: subset
    dtype: string
  splits:
  - name: test
    num_bytes: 269058
    num_examples: 54
  download_size: 0
  dataset_size: 269058
- config_name: bsard_questions
  features:
  - name: question
    dtype: string
  - name: category
    dtype: string
  - name: subcategory
    dtype: string
  - name: correct
    dtype: string
  - name: incorrect
    sequence: string
  - name: noise_article_ids
    sequence: int64
  - name: relevant_doc_ids
    sequence: int64
  - name: correct_letter
    dtype: string
  - name: answers_string
    dtype: string
  - name: question_translated
    dtype: string
  - name: correct_translated
    dtype: string
  - name: incorrect_translated
    sequence: string
  - name: answers_string_translated
    dtype: string
  - name: subset
    dtype: string
  splits:
  - name: test
    num_bytes: 284408
    num_examples: 54
  download_size: 0
  dataset_size: 284408
- config_name: proxima_corpus
  features:
  - name: id
    dtype: string
  - name: content
    dtype: string
  splits:
  - name: test
    num_bytes: 443723
    num_examples: 26
  download_size: 0
  dataset_size: 443723
- config_name: proxima_questions
  features:
  - name: question
    dtype: string
  - name: correct
    dtype: string
  - name: incorrect
    sequence: string
  - name: answers_string
    dtype: string
  - name: correct_letter
    dtype: string
  - name: document
    dtype: string
  - name: full_document
    dtype: string
  - name: noise_docs
    dtype: 'null'
  - name: subset
    dtype: string
  splits:
  - name: test
    num_bytes: 1175448
    num_examples: 54
  download_size: 0
  dataset_size: 1175448
configs:
- config_name: all_corpus
  data_files:
  - split: test
    path: all_corpus/test-*
- config_name: all_known_questions
  data_files:
  - split: test
    path: all_known_questions/test-*
- config_name: all_questions
  data_files:
  - split: test
    path: all_questions/test-*
- config_name: biomrc_corpus
  data_files:
  - split: test
    path: biomrc_corpus/test-*
- config_name: biomrc_known_questions
  data_files:
  - split: test
    path: biomrc_known_questions/test-*
- config_name: biomrc_questions
  data_files:
  - split: test
    path: biomrc_questions/test-*
- config_name: bsard_corpus
  data_files:
  - split: test
    path: bsard_corpus/test-*
- config_name: bsard_known_questions
  data_files:
  - split: test
    path: bsard_known_questions/test-*
- config_name: bsard_questions
  data_files:
  - split: test
    path: bsard_questions/test-*
- config_name: proxima_corpus
  data_files:
  - split: test
    path: proxima_corpus/test-*
- config_name: proxima_questions
  data_files:
  - split: test
    path: proxima_questions/test-*
---
# Dataset Card for "STITCH"
[More Information needed](https://github.com/huggingface/datasets/blob/main/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
提供机构:
answerdotai
原始信息汇总
数据集概述
1. all_corpus
- 特征:
- id: 字符串类型
- content: 字符串类型
- content_translated: 字符串类型
- 分割:
- test: 2799个样本,数据大小5761705字节
2. all_known_questions
- 特征:
- question: 字符串类型
- category: 字符串类型
- subcategory: 字符串类型
- correct: 字符串类型
- incorrect: 字符串序列类型
- noise_article_ids: 整数序列类型
- relevant_doc_ids: 整数序列类型
- correct_letter: 字符串类型
- answers_string: 字符串类型
- question_translated: 字符串类型
- correct_translated: 字符串类型
- incorrect_translated: 字符串序列类型
- answers_string_translated: 字符串类型
- subset: 字符串类型
- document: 字符串类型
- noise_ids: 整数序列类型
- 分割:
- test: 108个样本,数据大小498457字节
3. all_questions
- 特征:
- question: 字符串类型
- category: 字符串类型
- subcategory: 字符串类型
- correct: 字符串类型
- incorrect: 字符串序列类型
- noise_article_ids: 整数序列类型
- relevant_doc_ids: 整数序列类型
- correct_letter: 字符串类型
- answers_string: 字符串类型
- question_translated: 字符串类型
- correct_translated: 字符串类型
- incorrect_translated: 字符串序列类型
- answers_string_translated: 字符串类型
- subset: 字符串类型
- document: 字符串类型
- full_document: 字符串类型
- noise_docs: null类型
- noise_ids: 整数序列类型
- 分割:
- test: 162个样本,数据大小1674811字节
4. biomrc_corpus
- 特征:
- id: 字符串类型
- content: 字符串类型
- 分割:
- test: 915个样本,数据大小1866866字节
5. biomrc_known_questions
- 特征:
- question: 字符串类型
- correct: 字符串类型
- incorrect: 字符串序列类型
- answers_string: 字符串类型
- correct_letter: 字符串类型
- document: 字符串类型
- noise_ids: 整数序列类型
- subset: 字符串类型
- 分割:
- test: 54个样本,数据大小227169字节
6. biomrc_questions
- 特征:
- question: 字符串类型
- correct: 字符串类型
- incorrect: 字符串序列类型
- answers_string: 字符串类型
- correct_letter: 字符串类型
- document: 字符串类型
- noise_ids: 整数序列类型
- subset: 字符串类型
- 分割:
- test: 54个样本,数据大小210272字节
7. bsard_corpus
- 特征:
- id: 字符串类型
- content: 字符串类型
- content_translated: 字符串类型
- 分割:
- test: 1858个样本,数据大小3447233字节
8. bsard_known_questions
- 特征:
- question: 字符串类型
- category: 字符串类型
- subcategory: 字符串类型
- correct: 字符串类型
- incorrect: 字符串序列类型
- noise_article_ids: 整数序列类型
- relevant_doc_ids: 整数序列类型
- correct_letter: 字符串类型
- answers_string: 字符串类型
- question_translated: 字符串类型
- correct_translated: 字符串类型
- incorrect_translated: 字符串序列类型
- answers_string_translated: 字符串类型
- subset: 字符串类型
- 分割:
- test: 54个样本,数据大小269058字节
9. bsard_questions
- 特征:
- question: 字符串类型
- category: 字符串类型
- subcategory: 字符串类型
- correct: 字符串类型
- incorrect: 字符串序列类型
- noise_article_ids: 整数序列类型
- relevant_doc_ids: 整数序列类型
- correct_letter: 字符串类型
- answers_string: 字符串类型
- question_translated: 字符串类型
- correct_translated: 字符串类型
- incorrect_translated: 字符串序列类型
- answers_string_translated: 字符串类型
- subset: 字符串类型
- 分割:
- test: 54个样本,数据大小284408字节
10. proxima_corpus
- 特征:
- id: 字符串类型
- content: 字符串类型
- 分割:
- test: 26个样本,数据大小443723字节
11. proxima_questions
- 特征:
- question: 字符串类型
- correct: 字符串类型
- incorrect: 字符串序列类型
- answers_string: 字符串类型
- correct_letter: 字符串类型
- document: 字符串类型
- full_document: 字符串类型
- noise_docs: null类型
- subset: 字符串类型
- 分割:
- test: 54个样本,数据大小1175448字节



