Cognitive-Lab/Aya_Marathi
收藏 | Hugging Face | 2024-03-19 更新 | 2024-06-15 收录
下载链接:
https://hf-mirror.com/datasets/Cognitive-Lab/Aya_Marathi
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: complete_dataset
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 4119380566
num_examples: 3575683
download_size: 1356608562
dataset_size: 4119380566
- config_name: templated_indic_paraphrase
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 773026
num_examples: 1001
download_size: 255328
dataset_size: 773026
- config_name: templated_indic_sentiment
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 745782
num_examples: 1156
download_size: 307088
dataset_size: 745782
- config_name: templated_xlel_wd
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 1429815
num_examples: 1161
download_size: 503445
dataset_size: 1429815
- config_name: translated_adversarial_qa
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 22931274
num_examples: 10000
download_size: 5791791
dataset_size: 22931274
- config_name: translated_cnn_dailymail
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 612614699
num_examples: 100000
download_size: 225268596
dataset_size: 612614699
- config_name: translated_dolly
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 30988209
num_examples: 14808
download_size: 12027773
dataset_size: 30988209
- config_name: translated_flan_coqa
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 40235091
num_examples: 6409
download_size: 15430700
dataset_size: 40235091
- config_name: translated_flan_cot
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 98331455
num_examples: 91910
download_size: 34295182
dataset_size: 98331455
- config_name: translated_flan_gem_wiki
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 160449052
num_examples: 27147
download_size: 58344118
dataset_size: 160449052
- config_name: translated_flan_lambada
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 2885792
num_examples: 4279
download_size: 1068206
dataset_size: 2885792
- config_name: translated_flan_qa
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 433734
num_examples: 540
download_size: 154930
dataset_size: 433734
- config_name: translated_hotpotqa
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 171545509
num_examples: 355476
download_size: 51033087
dataset_size: 171545509
- config_name: translated_joke_explaination
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 1334320
num_examples: 754
download_size: 268958
dataset_size: 1334320
- config_name: translated_mintaka
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 5691487
num_examples: 14000
download_size: 989653
dataset_size: 5691487
- config_name: translated_nqopen
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 53241715
num_examples: 175850
download_size: 15297113
dataset_size: 53241715
- config_name: translated_paws
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 44574443
num_examples: 49401
download_size: 6122839
dataset_size: 44574443
- config_name: translated_piqa
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 17321849
num_examples: 16113
download_size: 5006389
dataset_size: 17321849
- config_name: translated_soda
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 977442017
num_examples: 1191582
download_size: 283089235
dataset_size: 977442017
- config_name: translated_wiki_split
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 1019477318
num_examples: 989944
download_size: 319167021
dataset_size: 1019477318
- config_name: translated_wikiqa
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 740836
num_examples: 1040
download_size: 266979
dataset_size: 740836
- config_name: translated_xlel_wd
features:
- name: targets
dtype: string
- name: task_type
dtype: string
- name: id
dtype: int64
- name: template_id
dtype: int64
- name: dataset_name
dtype: string
- name: script
dtype: string
- name: split
dtype: string
- name: inputs
dtype: string
- name: sub_dataset_name
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 856193143
num_examples: 523112
download_size: 321169799
dataset_size: 856193143
configs:
- config_name: complete_dataset
data_files:
- split: train
path: complete_dataset/train-*
- config_name: templated_indic_paraphrase
data_files:
- split: train
path: templated_indic_paraphrase/train-*
- config_name: templated_indic_sentiment
data_files:
- split: train
path: templated_indic_sentiment/train-*
- config_name: templated_xlel_wd
data_files:
- split: train
path: templated_xlel_wd/train-*
- config_name: translated_adversarial_qa
data_files:
- split: train
path: translated_adversarial_qa/train-*
- config_name: translated_cnn_dailymail
data_files:
- split: train
path: translated_cnn_dailymail/train-*
- config_name: translated_dolly
data_files:
- split: train
path: translated_dolly/train-*
- config_name: translated_flan_coqa
data_files:
- split: train
path: translated_flan_coqa/train-*
- config_name: translated_flan_cot
data_files:
- split: train
path: translated_flan_cot/train-*
- config_name: translated_flan_gem_wiki
data_files:
- split: train
path: translated_flan_gem_wiki/train-*
- config_name: translated_flan_lambada
data_files:
- split: train
path: translated_flan_lambada/train-*
- config_name: translated_flan_qa
data_files:
- split: train
path: translated_flan_qa/train-*
- config_name: translated_hotpotqa
data_files:
- split: train
path: translated_hotpotqa/train-*
- config_name: translated_joke_explaination
data_files:
- split: train
path: translated_joke_explaination/train-*
- config_name: translated_mintaka
data_files:
- split: train
path: translated_mintaka/train-*
- config_name: translated_nqopen
data_files:
- split: train
path: translated_nqopen/train-*
- config_name: translated_paws
data_files:
- split: train
path: translated_paws/train-*
- config_name: translated_piqa
data_files:
- split: train
path: translated_piqa/train-*
- config_name: translated_soda
data_files:
- split: train
path: translated_soda/train-*
- config_name: translated_wiki_split
data_files:
- split: train
path: translated_wiki_split/train-*
- config_name: translated_wikiqa
data_files:
- split: train
path: translated_wikiqa/train-*
- config_name: translated_xlel_wd
data_files:
- split: train
path: translated_xlel_wd/train-*
---
提供机构:
Cognitive-Lab
原始信息汇总
数据集概述
数据集配置
complete_dataset
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 4119380566
    - 样本数: 3575683
    - 下载大小: 1356608562
    - 数据集大小: 4119380566
templated_indic_paraphrase
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 773026
    - 样本数: 1001
    - 下载大小: 255328
    - 数据集大小: 773026
templated_indic_sentiment
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 745782
    - 样本数: 1156
    - 下载大小: 307088
    - 数据集大小: 745782
templated_xlel_wd
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 1429815
    - 样本数: 1161
    - 下载大小: 503445
    - 数据集大小: 1429815
translated_adversarial_qa
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 22931274
    - 样本数: 10000
    - 下载大小: 5791791
    - 数据集大小: 22931274
translated_cnn_dailymail
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 612614699
    - 样本数: 100000
    - 下载大小: 225268596
    - 数据集大小: 612614699
translated_dolly
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 30988209
    - 样本数: 14808
    - 下载大小: 12027773
    - 数据集大小: 30988209
translated_flan_coqa
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 40235091
    - 样本数: 6409
    - 下载大小: 15430700
    - 数据集大小: 40235091
translated_flan_cot
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 98331455
    - 样本数: 91910
    - 下载大小: 34295182
    - 数据集大小: 98331455
translated_flan_gem_wiki
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 160449052
    - 样本数: 27147
    - 下载大小: 58344118
    - 数据集大小: 160449052
translated_flan_lambada
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 2885792
    - 样本数: 4279
    - 下载大小: 1068206
    - 数据集大小: 2885792
translated_flan_qa
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 433734
    - 样本数: 540
    - 下载大小: 154930
    - 数据集大小: 433734
translated_hotpotqa
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 171545509
    - 样本数: 355476
    - 下载大小: 51033087
    - 数据集大小: 171545509
translated_joke_explaination
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 1334320
    - 样本数: 754
    - 下载大小: 268958
    - 数据集大小: 1334320
translated_mintaka
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 5691487
    - 样本数: 14000
    - 下载大小: 989653
    - 数据集大小: 5691487
translated_nqopen
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 53241715
    - 样本数: 175850
    - 下载大小: 15297113
    - 数据集大小: 53241715
translated_paws
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 44574443
    - 样本数: 49401
    - 下载大小: 6122839
    - 数据集大小: 44574443
translated_piqa
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 17321849
    - 样本数: 16113
    - 下载大小: 5006389
    - 数据集大小: 17321849
translated_soda
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 977442017
    - 样本数: 1191582
    - 下载大小: 283089235
    - 数据集大小: 977442017
translated_wiki_split
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 1019477318
    - 样本数: 989944
    - 下载大小: 319167021
    - 数据集大小: 1019477318
translated_wikiqa
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 740836
    - 样本数: 1040
    - 下载大小: 266979
    - 数据集大小: 740836
translated_xlel_wd
- 特征:
  - targets: string
  - task_type: string
  - id: int64
  - template_id: int64
  - dataset_name: string
  - script: string
  - split: string
  - inputs: string
  - sub_dataset_name: string
  - language: string
- 分割:
  - train:
    - 字节数: 856193143
    - 样本数: 523112
    - 下载大小: 321169799
    - 数据集大小: 856193143



