ai4bharat/clean-doc-bench
收藏 | Hugging Face | 2026-04-20 更新 | 2026-04-26 收录
下载链接:
https://hf-mirror.com/datasets/ai4bharat/clean-doc-bench
下载链接
链接失效反馈 | 官方服务:
资源简介:
---
# Per-configuration metadata, one entry per language.
# Every entry declares the same four string columns (id, language, domain,
# document) and a single `train` split; `dataset_size` repeats that split's
# `num_bytes`. `num_examples` is not uniform across languages (it ranges from
# 38 for santali to 109 for sanskrit).
dataset_info:
- config_name: assamese
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 656491
    num_examples: 100
  download_size: 254086
  dataset_size: 656491
- config_name: bengali
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 529401
    num_examples: 100
  download_size: 201472
  dataset_size: 529401
- config_name: bodo
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 503610
    num_examples: 101
  download_size: 187294
  dataset_size: 503610
- config_name: dogri
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 502454
    num_examples: 100
  download_size: 182308
  dataset_size: 502454
- config_name: gujarati
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 617887
    num_examples: 100
  download_size: 241802
  dataset_size: 617887
- config_name: hindi
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 365409
    num_examples: 100
  download_size: 151100
  dataset_size: 365409
- config_name: kannada
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 882790
    num_examples: 100
  download_size: 338145
  dataset_size: 882790
- config_name: kashmiri
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 360432
    num_examples: 99
  download_size: 178585
  dataset_size: 360432
- config_name: konkani
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 622046
    num_examples: 100
  download_size: 250837
  dataset_size: 622046
- config_name: maithili
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 548236
    num_examples: 100
  download_size: 217541
  dataset_size: 548236
- config_name: malayalam
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 769387
    num_examples: 100
  download_size: 273343
  dataset_size: 769387
- config_name: manipuri
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 240737
    num_examples: 55
  download_size: 96369
  dataset_size: 240737
- config_name: marathi
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 636958
    num_examples: 100
  download_size: 253720
  dataset_size: 636958
- config_name: nepali
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 611206
    num_examples: 101
  download_size: 227445
  dataset_size: 611206
- config_name: odia
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 558774
    num_examples: 100
  download_size: 199991
  dataset_size: 558774
- config_name: punjabi
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 436374
    num_examples: 100
  download_size: 176331
  dataset_size: 436374
- config_name: sanskrit
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 611844
    num_examples: 109
  download_size: 233119
  dataset_size: 611844
- config_name: santali
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 200331
    num_examples: 38
  download_size: 77457
  dataset_size: 200331
- config_name: sindhi
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 160139
    num_examples: 99
  download_size: 71231
  dataset_size: 160139
- config_name: tamil
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 727120
    num_examples: 100
  download_size: 250347
  dataset_size: 727120
- config_name: telugu
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 556385
    num_examples: 100
  download_size: 209993
  dataset_size: 556385
- config_name: urdu
  features:
  - name: id
    dtype: string
  - name: language
    dtype: string
  - name: domain
    dtype: string
  - name: document
    dtype: string
  splits:
  - name: train
    num_bytes: 300799
    num_examples: 100
  download_size: 153373
  dataset_size: 300799
# Maps each configuration name to the file glob holding its `train` split.
# Every glob follows the same pattern: <config_name>/train-*
configs:
- config_name: assamese
  data_files:
  - split: train
    path: assamese/train-*
- config_name: bengali
  data_files:
  - split: train
    path: bengali/train-*
- config_name: bodo
  data_files:
  - split: train
    path: bodo/train-*
- config_name: dogri
  data_files:
  - split: train
    path: dogri/train-*
- config_name: gujarati
  data_files:
  - split: train
    path: gujarati/train-*
- config_name: hindi
  data_files:
  - split: train
    path: hindi/train-*
- config_name: kannada
  data_files:
  - split: train
    path: kannada/train-*
- config_name: kashmiri
  data_files:
  - split: train
    path: kashmiri/train-*
- config_name: konkani
  data_files:
  - split: train
    path: konkani/train-*
- config_name: maithili
  data_files:
  - split: train
    path: maithili/train-*
- config_name: malayalam
  data_files:
  - split: train
    path: malayalam/train-*
- config_name: manipuri
  data_files:
  - split: train
    path: manipuri/train-*
- config_name: marathi
  data_files:
  - split: train
    path: marathi/train-*
- config_name: nepali
  data_files:
  - split: train
    path: nepali/train-*
- config_name: odia
  data_files:
  - split: train
    path: odia/train-*
- config_name: punjabi
  data_files:
  - split: train
    path: punjabi/train-*
- config_name: sanskrit
  data_files:
  - split: train
    path: sanskrit/train-*
- config_name: santali
  data_files:
  - split: train
    path: santali/train-*
- config_name: sindhi
  data_files:
  - split: train
    path: sindhi/train-*
- config_name: tamil
  data_files:
  - split: train
    path: tamil/train-*
- config_name: telugu
  data_files:
  - split: train
    path: telugu/train-*
- config_name: urdu
  data_files:
  - split: train
    path: urdu/train-*
---
提供机构:
ai4bharat



