five

AtharvImmverse/indic-mozhi-ocr

收藏
Hugging Face | 2026-03-24 更新 | 2026-03-29 收录
下载链接:
https://hf-mirror.com/datasets/AtharvImmverse/indic-mozhi-ocr
下载链接
链接失效反馈
官方服务:
资源简介:
--- dataset_info: - config_name: assamese features: - name: id dtype: string - name: image dtype: image - name: text dtype: string splits: - name: train num_bytes: 265570997.392 num_examples: 79697 - name: validation num_bytes: 33100702.895 num_examples: 9945 - name: test num_bytes: 33623172.86 num_examples: 10146 download_size: 299353644 dataset_size: 332294873.147 - config_name: bengali features: - name: id dtype: string - name: image dtype: image - name: text dtype: string splits: - name: train num_bytes: 268847938.428 num_examples: 80113 - name: validation num_bytes: 32428383.579 num_examples: 9787 - name: test num_bytes: 33902827.992 num_examples: 10113 download_size: 299852910 dataset_size: 335179149.99899995 - config_name: gujarati features: - name: id dtype: string - name: image dtype: image - name: text dtype: string splits: - name: train num_bytes: 253966384.5 num_examples: 79910 - name: validation num_bytes: 31526917.44 num_examples: 10016 - name: test num_bytes: 32090495.43 num_examples: 10090 download_size: 281263236 dataset_size: 317583797.37 - config_name: hindi features: - name: id dtype: string - name: image dtype: image - name: text dtype: string splits: - name: train num_bytes: 198049404.138 num_examples: 79762 - name: validation num_bytes: 24776946.202 num_examples: 10114 - name: test num_bytes: 25277613.658 num_examples: 10173 download_size: 204512634 dataset_size: 248103963.998 - config_name: kannada features: - name: id dtype: string - name: image dtype: image - name: text dtype: string splits: - name: train num_bytes: 376152021.21 num_examples: 80085 - name: validation num_bytes: 47514270.672 num_examples: 10088 - name: test num_bytes: 46340967.258 num_examples: 9838 download_size: 446867230 dataset_size: 470007259.14 - config_name: malayalam features: - name: id dtype: string - name: image dtype: image - name: text dtype: string splits: - name: train num_bytes: 532147736.13 num_examples: 80146 - name: validation num_bytes: 66376475.321 
num_examples: 9893 - name: test num_bytes: 66580820.18 num_examples: 9980 download_size: 655174267 dataset_size: 665105031.6309999 - config_name: manipuri features: - name: id dtype: string - name: image dtype: image - name: text dtype: string splits: - name: train num_bytes: 321182248.307 num_examples: 79691 - name: validation num_bytes: 41233541.808 num_examples: 10254 - name: test num_bytes: 40502308.484 num_examples: 10061 download_size: 376148254 dataset_size: 402918098.599 - config_name: marathi features: - name: id dtype: string - name: image dtype: image - name: text dtype: string splits: - name: train num_bytes: 290388545.043 num_examples: 80151 - name: validation num_bytes: 36470130.49 num_examples: 10005 - name: test num_bytes: 36164887.53 num_examples: 9855 download_size: 331383977 dataset_size: 363023563.06299996 - config_name: oriya features: - name: id dtype: string - name: image dtype: image - name: text dtype: string splits: - name: train num_bytes: 322233586.795 num_examples: 79945 - name: validation num_bytes: 40426447.639 num_examples: 10089 - name: test num_bytes: 40036025.48 num_examples: 9994 download_size: 374008114 dataset_size: 402696059.91400003 - config_name: punjabi features: - name: id dtype: string - name: image dtype: image - name: text dtype: string splits: - name: train num_bytes: 221962277.765 num_examples: 79931 - name: validation num_bytes: 27908428.392 num_examples: 10036 - name: test num_bytes: 27888736.212 num_examples: 10038 download_size: 236697029 dataset_size: 277759442.36899996 - config_name: tamil features: - name: id dtype: string - name: image dtype: image - name: text dtype: string splits: - name: train num_bytes: 475740415.76 num_examples: 80022 - name: validation num_bytes: 59824654.814 num_examples: 10021 - name: test num_bytes: 59222798.18 num_examples: 9974 download_size: 581365184 dataset_size: 594787868.754 - config_name: telugu features: - name: id dtype: string - name: image dtype: image - name: text dtype: 
string splits: - name: train num_bytes: 359428295.993 num_examples: 80337 - name: validation num_bytes: 44229025.172 num_examples: 9811 - name: test num_bytes: 44031609.668 num_examples: 9876 download_size: 423945117 dataset_size: 447688930.83299994 - config_name: urdu features: - name: id dtype: string - name: image dtype: image - name: text dtype: string splits: - name: train num_bytes: 102975635.2 num_examples: 9100 - name: validation num_bytes: 13033674.31 num_examples: 1138 - name: test num_bytes: 13082758.144 num_examples: 1137 download_size: 129029608 dataset_size: 129092067.654 configs: - config_name: assamese data_files: - split: train path: assamese/train-* - split: validation path: assamese/validation-* - split: test path: assamese/test-* - config_name: bengali data_files: - split: train path: bengali/train-* - split: validation path: bengali/validation-* - split: test path: bengali/test-* - config_name: gujarati data_files: - split: train path: gujarati/train-* - split: validation path: gujarati/validation-* - split: test path: gujarati/test-* - config_name: hindi data_files: - split: train path: hindi/train-* - split: validation path: hindi/validation-* - split: test path: hindi/test-* - config_name: kannada data_files: - split: train path: kannada/train-* - split: validation path: kannada/validation-* - split: test path: kannada/test-* - config_name: malayalam data_files: - split: train path: malayalam/train-* - split: validation path: malayalam/validation-* - split: test path: malayalam/test-* - config_name: manipuri data_files: - split: train path: manipuri/train-* - split: validation path: manipuri/validation-* - split: test path: manipuri/test-* - config_name: marathi data_files: - split: train path: marathi/train-* - split: validation path: marathi/validation-* - split: test path: marathi/test-* - config_name: oriya data_files: - split: train path: oriya/train-* - split: validation path: oriya/validation-* - split: test path: oriya/test-* - 
config_name: punjabi data_files: - split: train path: punjabi/train-* - split: validation path: punjabi/validation-* - split: test path: punjabi/test-* - config_name: tamil data_files: - split: train path: tamil/train-* - split: validation path: tamil/validation-* - split: test path: tamil/test-* - config_name: telugu data_files: - split: train path: telugu/train-* - split: validation path: telugu/validation-* - split: test path: telugu/test-* - config_name: urdu data_files: - split: train path: urdu/train-* - split: validation path: urdu/validation-* - split: test path: urdu/test-* ---
提供机构:
AtharvImmverse
5,000+
优质数据集
54 个
任务类型
进入经典数据集
二维码
社区交流群

面向社区/商业的数据集话题

二维码
科研交流群

面向高校/科研机构的开源数据集话题

数据驱动未来

携手共赢发展

商业合作