MLP-Lemma/Instruct-datasets-preprocessed
收藏 · Hugging Face · 2024-05-13 更新 · 2024-06-12 收录
下载链接:
https://hf-mirror.com/datasets/MLP-Lemma/Instruct-datasets-preprocessed
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: BigPatent
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 2187341940
    num_examples: 41383
  download_size: 430231363
  dataset_size: 2187341940
- config_name: BookSum
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 560667212
    num_examples: 9371
  download_size: 131314418
  dataset_size: 560667212
- config_name: BoolQ
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 26210184
    num_examples: 9426
  download_size: 3930274
  dataset_size: 26210184
- config_name: CNN-DM
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 1001044244
    num_examples: 99951
  download_size: 224147945
  dataset_size: 1001044244
- config_name: CosmosQA
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 86154456
    num_examples: 25262
  download_size: 11075155
  dataset_size: 86154456
- config_name: DROP
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 41230848
    num_examples: 9977
  download_size: 6663863
  dataset_size: 41230848
- config_name: GovReport
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 1484331476
    num_examples: 16590
  download_size: 314257236
  dataset_size: 1484331476
- config_name: HotpotQA
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 1184530448
    num_examples: 90208
  download_size: 244328551
  dataset_size: 1184530448
- config_name: LongAlpaca
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 751565872
    num_examples: 7573
  download_size: 148032145
  dataset_size: 751565872
- config_name: MultiNews
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 1084301372
    num_examples: 44398
  download_size: 251648587
  dataset_size: 1084301372
- config_name: MultiRC
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 52418040
    num_examples: 12025
  download_size: 2643742
  dataset_size: 52418040
- config_name: NarrativeQA
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 8622923508
    num_examples: 12857
  download_size: 1844849479
  dataset_size: 8622923508
- config_name: QMsum
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 149132292
    num_examples: 1257
  download_size: 22634545
  dataset_size: 149132292
- config_name: Qasper
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 119525772
    num_examples: 2461
  download_size: 23057137
  dataset_size: 119525772
- config_name: Quality
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 144079688
    num_examples: 2523
  download_size: 24262763
  dataset_size: 144079688
- config_name: ReCoRD
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 396630060
    num_examples: 100682
  download_size: 69341165
  dataset_size: 396630060
- config_name: SQuAD
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 263068320
    num_examples: 87580
  download_size: 35289932
  dataset_size: 263068320
- config_name: TriviaQA
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 4937139228
    num_examples: 52359
  download_size: 1021208372
  dataset_size: 4937139228
- config_name: XSum
  features:
  - name: input_ids
    sequence: int32
  - name: input_sentences_ids
    sequence:
      sequence: int64
  - name: labels
    sequence: int64
  - name: inst_ids
    sequence:
      sequence: int64
  splits:
  - name: train
    num_bytes: 586055844
    num_examples: 99827
  download_size: 125544836
  dataset_size: 586055844
configs:
- config_name: BigPatent
  data_files:
  - split: train
    path: BigPatent/train-*
- config_name: BookSum
  data_files:
  - split: train
    path: BookSum/train-*
- config_name: BoolQ
  data_files:
  - split: train
    path: BoolQ/train-*
- config_name: CNN-DM
  data_files:
  - split: train
    path: CNN-DM/train-*
- config_name: CosmosQA
  data_files:
  - split: train
    path: CosmosQA/train-*
- config_name: DROP
  data_files:
  - split: train
    path: DROP/train-*
- config_name: GovReport
  data_files:
  - split: train
    path: GovReport/train-*
- config_name: HotpotQA
  data_files:
  - split: train
    path: HotpotQA/train-*
- config_name: LongAlpaca
  data_files:
  - split: train
    path: LongAlpaca/train-*
- config_name: MultiNews
  data_files:
  - split: train
    path: MultiNews/train-*
- config_name: MultiRC
  data_files:
  - split: train
    path: MultiRC/train-*
- config_name: NarrativeQA
  data_files:
  - split: train
    path: NarrativeQA/train-*
- config_name: QMsum
  data_files:
  - split: train
    path: QMsum/train-*
- config_name: Qasper
  data_files:
  - split: train
    path: Qasper/train-*
- config_name: Quality
  data_files:
  - split: train
    path: Quality/train-*
- config_name: ReCoRD
  data_files:
  - split: train
    path: ReCoRD/train-*
- config_name: SQuAD
  data_files:
  - split: train
    path: SQuAD/train-*
- config_name: TriviaQA
  data_files:
  - split: train
    path: TriviaQA/train-*
- config_name: XSum
  data_files:
  - split: train
    path: XSum/train-*
---
This dataset consists of multiple sub-datasets, each with its own configuration name, feature schema, and training-split details. Every sub-dataset exposes the same four features: input_ids (a sequence of int32), labels (a sequence of int64), and input_sentences_ids and inst_ids (each a nested sequence, i.e. a sequence of int64 sequences). The sub-datasets cover various domains such as patents, books, news, and question answering. For each sub-dataset, the metadata lists the number of examples and the size in bytes of the training split, together with the download size and total dataset size.
提供机构:
MLP-Lemma
原始信息汇总
数据集概述
BigPatent
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 2187341940 字节
- 示例数量: 41383
BookSum
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 560667212 字节
- 示例数量: 9371
BoolQ
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 26210184 字节
- 示例数量: 9426
CNN-DM
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 1001044244 字节
- 示例数量: 99951
CosmosQA
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 86154456 字节
- 示例数量: 25262
DROP
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 41230848 字节
- 示例数量: 9977
GovReport
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 1484331476 字节
- 示例数量: 16590
HotpotQA
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 1184530448 字节
- 示例数量: 90208
LongAlpaca
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 751565872 字节
- 示例数量: 7573
MultiNews
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 1084301372 字节
- 示例数量: 44398
MultiRC
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 52418040 字节
- 示例数量: 12025
NarrativeQA
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 8622923508 字节
- 示例数量: 12857
QMsum
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 149132292 字节
- 示例数量: 1257
Qasper
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 119525772 字节
- 示例数量: 2461
Quality
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 144079688 字节
- 示例数量: 2523
ReCoRD
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 396630060 字节
- 示例数量: 100682
SQuAD
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 263068320 字节
- 示例数量: 87580
TriviaQA
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 4937139228 字节
- 示例数量: 52359
XSum
- 特征:
- input_ids: 序列类型为 int32
- input_sentences_ids: 嵌套序列(序列的序列),元素类型为 int64
- labels: 序列类型为 int64
- inst_ids: 嵌套序列(序列的序列),元素类型为 int64
- 训练集:
- 大小: 586055844 字节
- 示例数量: 99827



