AtharvImmverse/indic-mozhi-ocr
收藏 | Hugging Face | 2026-03-24 更新 | 2026-03-29 收录
下载链接:
https://hf-mirror.com/datasets/AtharvImmverse/indic-mozhi-ocr
下载链接
链接失效反馈 | 官方服务:
资源简介:
---
# Dataset-card metadata for AtharvImmverse/indic-mozhi-ocr.
# `dataset_info` lists, per config, the feature schema (id, image, text)
# and the size of each split; `configs` maps every config/split pair to
# its data-file glob.
# NOTE(review): num_bytes / dataset_size are fractional in this card;
# byte counts are normally integers - confirm with whatever generated them.
dataset_info:
- config_name: assamese
  features:
  - name: id
    dtype: string
  - name: image
    dtype: image
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 265570997.392
    num_examples: 79697
  - name: validation
    num_bytes: 33100702.895
    num_examples: 9945
  - name: test
    num_bytes: 33623172.86
    num_examples: 10146
  download_size: 299353644
  dataset_size: 332294873.147
- config_name: bengali
  features:
  - name: id
    dtype: string
  - name: image
    dtype: image
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 268847938.428
    num_examples: 80113
  - name: validation
    num_bytes: 32428383.579
    num_examples: 9787
  - name: test
    num_bytes: 33902827.992
    num_examples: 10113
  download_size: 299852910
  dataset_size: 335179149.99899995
- config_name: gujarati
  features:
  - name: id
    dtype: string
  - name: image
    dtype: image
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 253966384.5
    num_examples: 79910
  - name: validation
    num_bytes: 31526917.44
    num_examples: 10016
  - name: test
    num_bytes: 32090495.43
    num_examples: 10090
  download_size: 281263236
  dataset_size: 317583797.37
- config_name: hindi
  features:
  - name: id
    dtype: string
  - name: image
    dtype: image
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 198049404.138
    num_examples: 79762
  - name: validation
    num_bytes: 24776946.202
    num_examples: 10114
  - name: test
    num_bytes: 25277613.658
    num_examples: 10173
  download_size: 204512634
  dataset_size: 248103963.998
- config_name: kannada
  features:
  - name: id
    dtype: string
  - name: image
    dtype: image
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 376152021.21
    num_examples: 80085
  - name: validation
    num_bytes: 47514270.672
    num_examples: 10088
  - name: test
    num_bytes: 46340967.258
    num_examples: 9838
  download_size: 446867230
  dataset_size: 470007259.14
- config_name: malayalam
  features:
  - name: id
    dtype: string
  - name: image
    dtype: image
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 532147736.13
    num_examples: 80146
  - name: validation
    num_bytes: 66376475.321
    num_examples: 9893
  - name: test
    num_bytes: 66580820.18
    num_examples: 9980
  download_size: 655174267
  dataset_size: 665105031.6309999
- config_name: manipuri
  features:
  - name: id
    dtype: string
  - name: image
    dtype: image
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 321182248.307
    num_examples: 79691
  - name: validation
    num_bytes: 41233541.808
    num_examples: 10254
  - name: test
    num_bytes: 40502308.484
    num_examples: 10061
  download_size: 376148254
  dataset_size: 402918098.599
- config_name: marathi
  features:
  - name: id
    dtype: string
  - name: image
    dtype: image
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 290388545.043
    num_examples: 80151
  - name: validation
    num_bytes: 36470130.49
    num_examples: 10005
  - name: test
    num_bytes: 36164887.53
    num_examples: 9855
  download_size: 331383977
  dataset_size: 363023563.06299996
- config_name: oriya
  features:
  - name: id
    dtype: string
  - name: image
    dtype: image
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 322233586.795
    num_examples: 79945
  - name: validation
    num_bytes: 40426447.639
    num_examples: 10089
  - name: test
    num_bytes: 40036025.48
    num_examples: 9994
  download_size: 374008114
  dataset_size: 402696059.91400003
- config_name: punjabi
  features:
  - name: id
    dtype: string
  - name: image
    dtype: image
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 221962277.765
    num_examples: 79931
  - name: validation
    num_bytes: 27908428.392
    num_examples: 10036
  - name: test
    num_bytes: 27888736.212
    num_examples: 10038
  download_size: 236697029
  dataset_size: 277759442.36899996
- config_name: tamil
  features:
  - name: id
    dtype: string
  - name: image
    dtype: image
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 475740415.76
    num_examples: 80022
  - name: validation
    num_bytes: 59824654.814
    num_examples: 10021
  - name: test
    num_bytes: 59222798.18
    num_examples: 9974
  download_size: 581365184
  dataset_size: 594787868.754
- config_name: telugu
  features:
  - name: id
    dtype: string
  - name: image
    dtype: image
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 359428295.993
    num_examples: 80337
  - name: validation
    num_bytes: 44229025.172
    num_examples: 9811
  - name: test
    num_bytes: 44031609.668
    num_examples: 9876
  download_size: 423945117
  dataset_size: 447688930.83299994
- config_name: urdu
  features:
  - name: id
    dtype: string
  - name: image
    dtype: image
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 102975635.2
    num_examples: 9100
  - name: validation
    num_bytes: 13033674.31
    num_examples: 1138
  - name: test
    num_bytes: 13082758.144
    num_examples: 1137
  download_size: 129029608
  dataset_size: 129092067.654
# One entry per config declared in dataset_info; each split points at a
# `<config>/<split>-*` glob.
configs:
- config_name: assamese
  data_files:
  - split: train
    path: assamese/train-*
  - split: validation
    path: assamese/validation-*
  - split: test
    path: assamese/test-*
- config_name: bengali
  data_files:
  - split: train
    path: bengali/train-*
  - split: validation
    path: bengali/validation-*
  - split: test
    path: bengali/test-*
- config_name: gujarati
  data_files:
  - split: train
    path: gujarati/train-*
  - split: validation
    path: gujarati/validation-*
  - split: test
    path: gujarati/test-*
- config_name: hindi
  data_files:
  - split: train
    path: hindi/train-*
  - split: validation
    path: hindi/validation-*
  - split: test
    path: hindi/test-*
- config_name: kannada
  data_files:
  - split: train
    path: kannada/train-*
  - split: validation
    path: kannada/validation-*
  - split: test
    path: kannada/test-*
- config_name: malayalam
  data_files:
  - split: train
    path: malayalam/train-*
  - split: validation
    path: malayalam/validation-*
  - split: test
    path: malayalam/test-*
- config_name: manipuri
  data_files:
  - split: train
    path: manipuri/train-*
  - split: validation
    path: manipuri/validation-*
  - split: test
    path: manipuri/test-*
- config_name: marathi
  data_files:
  - split: train
    path: marathi/train-*
  - split: validation
    path: marathi/validation-*
  - split: test
    path: marathi/test-*
- config_name: oriya
  data_files:
  - split: train
    path: oriya/train-*
  - split: validation
    path: oriya/validation-*
  - split: test
    path: oriya/test-*
- config_name: punjabi
  data_files:
  - split: train
    path: punjabi/train-*
  - split: validation
    path: punjabi/validation-*
  - split: test
    path: punjabi/test-*
- config_name: tamil
  data_files:
  - split: train
    path: tamil/train-*
  - split: validation
    path: tamil/validation-*
  - split: test
    path: tamil/test-*
- config_name: telugu
  data_files:
  - split: train
    path: telugu/train-*
  - split: validation
    path: telugu/validation-*
  - split: test
    path: telugu/test-*
- config_name: urdu
  data_files:
  - split: train
    path: urdu/train-*
  - split: validation
    path: urdu/validation-*
  - split: test
    path: urdu/test-*
---
提供机构:
AtharvImmverse



