deepdml/fleurs-neucodec
收藏Hugging Face2026-04-08 更新2026-03-29 收录
下载链接:
https://hf-mirror.com/datasets/deepdml/fleurs-neucodec
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: af_za
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 2674598
num_examples: 1032
- name: validation
num_bytes: 469653
num_examples: 198
- name: test
num_bytes: 660480
num_examples: 264
download_size: 2481292
dataset_size: 3804731
- config_name: am_et
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 8152555
num_examples: 3163
- name: validation
num_bytes: 480967
num_examples: 223
- name: test
num_bytes: 1186649
num_examples: 516
download_size: 6039583
dataset_size: 9820171
- config_name: ar_eg
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 4461675
num_examples: 2104
- name: validation
num_bytes: 643845
num_examples: 295
- name: test
num_bytes: 958209
num_examples: 428
download_size: 7992262
dataset_size: 6063729
- config_name: as_in
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7838734
num_examples: 2812
- name: validation
num_bytes: 1034094
num_examples: 418
- name: test
num_bytes: 2546934
num_examples: 984
download_size: 6835033
dataset_size: 11419762
- config_name: ast_es
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5555866
num_examples: 2511
- name: validation
num_bytes: 730349
num_examples: 398
- name: test
num_bytes: 1801941
num_examples: 946
download_size: 5165514
dataset_size: 8088156
- config_name: az_az
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6840300
num_examples: 2665
- name: validation
num_bytes: 991003
num_examples: 400
- name: test
num_bytes: 2376591
num_examples: 923
download_size: 6176359
dataset_size: 10207894
- config_name: be_by
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6973946
num_examples: 2433
- name: validation
num_bytes: 1207162
num_examples: 408
- name: test
num_bytes: 2951263
num_examples: 967
download_size: 6586056
dataset_size: 11132371
- config_name: bg_bg
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6992579
num_examples: 2973
- name: validation
num_bytes: 779567
num_examples: 395
- name: test
num_bytes: 1369891
num_examples: 658
download_size: 5697042
dataset_size: 9142037
- config_name: bn_in
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7874521
num_examples: 3006
- name: validation
num_bytes: 1062282
num_examples: 402
- name: test
num_bytes: 2517463
num_examples: 920
download_size: 6855687
dataset_size: 11454266
- config_name: bs_ba
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7349746
num_examples: 3091
- name: validation
num_bytes: 982530
num_examples: 400
- name: test
num_bytes: 2323555
num_examples: 925
download_size: 6481850
dataset_size: 10655831
- config_name: ca_es
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5461798
num_examples: 2300
- name: validation
num_bytes: 952851
num_examples: 404
- name: test
num_bytes: 2303993
num_examples: 940
download_size: 10906478
dataset_size: 8718642
- config_name: ceb_ph
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 8932815
num_examples: 3261
- name: validation
num_bytes: 644106
num_examples: 225
- name: test
num_bytes: 1622539
num_examples: 541
download_size: 6783332
dataset_size: 11199460
- config_name: ckb_iq
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7684223
num_examples: 3040
- name: validation
num_bytes: 906946
num_examples: 386
- name: test
num_bytes: 2202021
num_examples: 922
download_size: 6490917
dataset_size: 10793190
- config_name: cmn_hans_cn
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7185724
num_examples: 3246
- name: validation
num_bytes: 937433
num_examples: 409
- name: test
num_bytes: 2265405
num_examples: 945
download_size: 6456784
dataset_size: 10388562
- config_name: cs_cz
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6220359
num_examples: 2811
- name: validation
num_bytes: 728541
num_examples: 305
- name: test
num_bytes: 1799684
num_examples: 723
download_size: 5399635
dataset_size: 8748584
- config_name: cy_gb
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 8935350
num_examples: 3427
- name: validation
num_bytes: 1320018
num_examples: 447
- name: test
num_bytes: 3134709
num_examples: 1021
download_size: 8054066
dataset_size: 13390077
- config_name: da_dk
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5527571
num_examples: 2465
- name: validation
num_bytes: 862498
num_examples: 395
- name: test
num_bytes: 2161795
num_examples: 930
download_size: 5247344
dataset_size: 8551864
- config_name: de_de
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6631036
num_examples: 2987
- name: validation
num_bytes: 927034
num_examples: 363
- name: test
num_bytes: 2312448
num_examples: 862
download_size: 5994007
dataset_size: 9870518
- config_name: el_gr
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7392728
num_examples: 3215
- name: validation
num_bytes: 565178
num_examples: 271
- name: test
num_bytes: 1404500
num_examples: 650
download_size: 5925312
dataset_size: 9362406
- config_name: en_us
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5524510
num_examples: 2602
- name: validation
num_bytes: 772092
num_examples: 394
- name: test
num_bytes: 1309507
num_examples: 647
download_size: 4860355
dataset_size: 7606109
- config_name: es_419
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6483654
num_examples: 2796
- name: validation
num_bytes: 994474
num_examples: 408
- name: test
num_bytes: 2271000
num_examples: 908
download_size: 5947646
dataset_size: 9749128
- config_name: et_ee
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5380471
num_examples: 2501
- name: validation
num_bytes: 912961
num_examples: 387
- name: test
num_bytes: 2209176
num_examples: 893
download_size: 5231292
dataset_size: 8502608
- config_name: fa_ir
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 8846677
num_examples: 3101
- name: validation
num_bytes: 1117914
num_examples: 369
- name: test
num_bytes: 2707410
num_examples: 871
download_size: 7606459
dataset_size: 12672001
- config_name: ff_sn
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 10161946
num_examples: 3235
- name: validation
num_bytes: 757360
num_examples: 273
- name: test
num_bytes: 1924061
num_examples: 660
download_size: 7517093
dataset_size: 12843367
- config_name: fi_fi
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6480514
num_examples: 2704
- name: validation
num_bytes: 1037258
num_examples: 415
- name: test
num_bytes: 2424247
num_examples: 918
download_size: 6092276
dataset_size: 9942019
- config_name: fil_ph
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5647352
num_examples: 1884
- name: validation
num_bytes: 1443329
num_examples: 418
- name: test
num_bytes: 3497208
num_examples: 964
download_size: 6178889
dataset_size: 10587889
- config_name: fr_fr
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7593430
num_examples: 3193
- name: validation
num_bytes: 586548
num_examples: 289
- name: test
num_bytes: 1438209
num_examples: 676
download_size: 6024280
dataset_size: 9618187
- config_name: ga_ie
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 8869591
num_examples: 2845
- name: validation
num_bytes: 1087938
num_examples: 369
- name: test
num_bytes: 2534016
num_examples: 842
download_size: 7363897
dataset_size: 12491545
- config_name: gl_es
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 4922061
num_examples: 2175
- name: validation
num_bytes: 773155
num_examples: 395
- name: test
num_bytes: 1920946
num_examples: 927
download_size: 9673610
dataset_size: 7616162
- config_name: gu_in
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6615605
num_examples: 3145
- name: validation
num_bytes: 880495
num_examples: 432
- name: test
num_bytes: 2148116
num_examples: 1000
download_size: 6009761
dataset_size: 9644216
- config_name: ha_ng
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 9969323
num_examples: 3259
- name: validation
num_bytes: 1115634
num_examples: 296
- name: test
num_bytes: 2435472
num_examples: 621
download_size: 7989604
dataset_size: 13520429
- config_name: he_il
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6985386
num_examples: 3242
- name: validation
num_bytes: 606115
num_examples: 328
- name: test
num_bytes: 1515631
num_examples: 792
download_size: 5790999
dataset_size: 9107132
- config_name: hi_in
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 4898980
num_examples: 2120
- name: validation
num_bytes: 524598
num_examples: 239
- name: test
num_bytes: 987108
num_examples: 418
download_size: 4155278
dataset_size: 6410686
- config_name: hr_hr
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 8653788
num_examples: 3461
- name: validation
num_bytes: 755704
num_examples: 377
- name: test
num_bytes: 1892642
num_examples: 914
download_size: 6884181
dataset_size: 11302134
- config_name: hu_hu
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6842174
num_examples: 3095
- name: validation
num_bytes: 981755
num_examples: 407
- name: test
num_bytes: 2254837
num_examples: 905
download_size: 6218333
dataset_size: 10078766
- config_name: hy_am
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7619250
num_examples: 3053
- name: validation
num_bytes: 888859
num_examples: 395
- name: test
num_bytes: 2210714
num_examples: 932
download_size: 6434386
dataset_size: 10718823
- config_name: id_id
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6677360
num_examples: 2579
- name: validation
num_bytes: 851560
num_examples: 350
- name: test
num_bytes: 1737061
num_examples: 687
download_size: 5714158
dataset_size: 9265981
- config_name: ig_ng
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 10074523
num_examples: 2839
- name: validation
num_bytes: 1386397
num_examples: 413
- name: test
num_bytes: 3512213
num_examples: 969
download_size: 8570497
dataset_size: 14973133
- config_name: is_is
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 2096749
num_examples: 926
- name: validation
num_bytes: 88270
num_examples: 36
- name: test
num_bytes: 131252
num_examples: 46
download_size: 1517809
dataset_size: 2316271
- config_name: it_it
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6631336
num_examples: 3030
- name: validation
num_bytes: 1132559
num_examples: 391
- name: test
num_bytes: 2577973
num_examples: 865
download_size: 6349035
dataset_size: 10341868
- config_name: ja_jp
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5457068
num_examples: 2292
- name: validation
num_bytes: 690777
num_examples: 266
- name: test
num_bytes: 1734346
num_examples: 650
download_size: 4915501
dataset_size: 7882191
- config_name: jv_id
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 8211757
num_examples: 3051
- name: validation
num_bytes: 815491
num_examples: 295
- name: test
num_bytes: 2061097
num_examples: 728
download_size: 6703062
dataset_size: 11088345
- config_name: ka_ge
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 3723501
num_examples: 1491
- name: validation
num_bytes: 898213
num_examples: 409
- name: test
num_bytes: 2259061
num_examples: 979
download_size: 4253080
dataset_size: 6880775
- config_name: kam_ke
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 10790396
num_examples: 3340
- name: validation
num_bytes: 1057282
num_examples: 338
- name: test
num_bytes: 2727307
num_examples: 827
download_size: 8622958
dataset_size: 14574985
- config_name: kea_cv
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7705834
num_examples: 2715
- name: validation
num_bytes: 970985
num_examples: 366
- name: test
num_bytes: 2395683
num_examples: 864
download_size: 6633006
dataset_size: 11072502
- config_name: kk_kz
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 8654476
num_examples: 3200
- name: validation
num_bytes: 1116808
num_examples: 369
- name: test
num_bytes: 2803105
num_examples: 856
download_size: 13283394
dataset_size: 12574389
- config_name: km_kh
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5197343
num_examples: 1675
- name: validation
num_bytes: 944319
num_examples: 326
- name: test
num_bytes: 2306769
num_examples: 771
download_size: 5109206
dataset_size: 8448431
- config_name: kn_in
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6076680
num_examples: 2283
- name: validation
num_bytes: 953942
num_examples: 368
- name: test
num_bytes: 2329006
num_examples: 838
download_size: 5764998
dataset_size: 9359628
- config_name: ko_kr
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5828089
num_examples: 2307
- name: validation
num_bytes: 566914
num_examples: 226
- name: test
num_bytes: 979867
num_examples: 382
download_size: 4634073
dataset_size: 7374870
- config_name: ky_kg
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6856651
num_examples: 2818
- name: validation
num_bytes: 981710
num_examples: 422
- name: test
num_bytes: 2385553
num_examples: 977
download_size: 6045261
dataset_size: 10223914
- config_name: lb_lu
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6216735
num_examples: 2502
- name: validation
num_bytes: 823189
num_examples: 408
- name: test
num_bytes: 2012863
num_examples: 934
download_size: 5569676
dataset_size: 9052787
- config_name: lg_ug
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 9222118
num_examples: 2478
- name: validation
num_bytes: 1017861
num_examples: 306
- name: test
num_bytes: 2485663
num_examples: 723
download_size: 7356228
dataset_size: 12725642
- config_name: ln_cd
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 13299414
num_examples: 3350
- name: validation
num_bytes: 777227
num_examples: 209
- name: test
num_bytes: 1884053
num_examples: 478
download_size: 9156053
dataset_size: 15960694
- config_name: lo_la
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5381591
num_examples: 1809
- name: validation
num_bytes: 423024
num_examples: 191
- name: test
num_bytes: 1000636
num_examples: 404
download_size: 4080214
dataset_size: 6805251
- config_name: lt_lt
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7169812
num_examples: 2937
- name: validation
num_bytes: 863579
num_examples: 416
- name: test
num_bytes: 2186001
num_examples: 986
download_size: 6199043
dataset_size: 10219392
- config_name: luo_ke
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7410676
num_examples: 2384
- name: validation
num_bytes: 273189
num_examples: 102
- name: test
num_bytes: 714425
num_examples: 256
download_size: 5114906
dataset_size: 8398290
- config_name: lv_lv
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 4807633
num_examples: 2110
- name: validation
num_bytes: 826551
num_examples: 356
- name: test
num_bytes: 2085756
num_examples: 851
download_size: 4797068
dataset_size: 7719940
- config_name: mi_nz
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 13250252
num_examples: 3249
- name: validation
num_bytes: 1687871
num_examples: 429
- name: test
num_bytes: 4257056
num_examples: 1008
download_size: 11029260
dataset_size: 19195179
- config_name: mk_mk
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5018626
num_examples: 2337
- name: validation
num_bytes: 947152
num_examples: 415
- name: test
num_bytes: 2354578
num_examples: 973
download_size: 9058245
dataset_size: 8320356
- config_name: ml_in
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7393010
num_examples: 3043
- name: validation
num_bytes: 1230561
num_examples: 418
- name: test
num_bytes: 2861822
num_examples: 958
download_size: 6873942
dataset_size: 11485393
- config_name: mn_mn
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 8459959
num_examples: 3074
- name: validation
num_bytes: 859179
num_examples: 405
- name: test
num_bytes: 2100263
num_examples: 949
download_size: 6827128
dataset_size: 11419401
- config_name: mr_in
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 8767758
num_examples: 3269
- name: validation
num_bytes: 1145883
num_examples: 443
- name: test
num_bytes: 2826769
num_examples: 1015
download_size: 7778139
dataset_size: 12740410
- config_name: ms_my
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7010332
num_examples: 2667
- name: validation
num_bytes: 681778
num_examples: 324
- name: test
num_bytes: 1667091
num_examples: 749
download_size: 5749549
dataset_size: 9359201
- config_name: mt_mt
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7280378
num_examples: 2895
- name: validation
num_bytes: 1095283
num_examples: 404
- name: test
num_bytes: 2599894
num_examples: 926
download_size: 6592968
dataset_size: 10975555
- config_name: my_mm
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 8891367
num_examples: 3058
- name: validation
num_bytes: 1201430
num_examples: 384
- name: test
num_bytes: 2789781
num_examples: 880
download_size: 7637855
dataset_size: 12882578
- config_name: nb_no
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 8016740
num_examples: 3167
- name: validation
num_bytes: 422981
num_examples: 163
- name: test
num_bytes: 915410
num_examples: 357
download_size: 5815763
dataset_size: 9355131
- config_name: ne_np
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 8266083
num_examples: 3331
- name: validation
num_bytes: 667715
num_examples: 305
- name: test
num_bytes: 1680720
num_examples: 726
download_size: 6594697
dataset_size: 10614518
- config_name: nl_nl
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5662932
num_examples: 2918
- name: validation
num_bytes: 333005
num_examples: 171
- name: test
num_bytes: 715459
num_examples: 364
download_size: 4250650
dataset_size: 6711396
- config_name: nso_za
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 9540682
num_examples: 1990
- name: validation
num_bytes: 1334538
num_examples: 363
- name: test
num_bytes: 3065454
num_examples: 790
download_size: 7917510
dataset_size: 13940674
- config_name: ny_mw
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7964147
num_examples: 2694
- name: validation
num_bytes: 1002159
num_examples: 311
- name: test
num_bytes: 2569668
num_examples: 761
download_size: 6656435
dataset_size: 11535974
- config_name: oc_fr
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 10442703
num_examples: 3379
- name: validation
num_bytes: 1320904
num_examples: 427
- name: test
num_bytes: 3295520
num_examples: 998
download_size: 8859902
dataset_size: 15059127
- config_name: om_et
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 4878103
num_examples: 1701
- name: validation
num_bytes: 44058
num_examples: 19
- name: test
num_bytes: 95607
num_examples: 41
download_size: 3067745
dataset_size: 5017768
- config_name: or_in
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 2527557
num_examples: 1081
- name: validation
num_bytes: 912794
num_examples: 392
- name: test
num_bytes: 2178538
num_examples: 883
download_size: 3512317
dataset_size: 5618889
- config_name: pa_in
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 4677347
num_examples: 1923
- name: validation
num_bytes: 550134
num_examples: 251
- name: test
num_bytes: 1350674
num_examples: 574
download_size: 4124886
dataset_size: 6578155
- config_name: pl_pl
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6752707
num_examples: 2841
- name: validation
num_bytes: 624610
num_examples: 338
- name: test
num_bytes: 1513897
num_examples: 758
download_size: 5484038
dataset_size: 8891214
- config_name: ps_af
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6523119
num_examples: 2513
- name: validation
num_bytes: 528227
num_examples: 217
- name: test
num_bytes: 1295216
num_examples: 512
download_size: 5126702
dataset_size: 8346562
- config_name: pt_br
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7469492
num_examples: 2793
- name: validation
num_bytes: 948190
num_examples: 386
- name: test
num_bytes: 2377456
num_examples: 919
download_size: 6550140
dataset_size: 10795138
- config_name: ro_ro
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7434753
num_examples: 2891
- name: validation
num_bytes: 794566
num_examples: 387
- name: test
num_bytes: 1863493
num_examples: 883
download_size: 6161795
dataset_size: 10092812
- config_name: ru_ru
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5925332
num_examples: 2562
- name: validation
num_bytes: 793946
num_examples: 356
- name: test
num_bytes: 1837048
num_examples: 775
download_size: 5330969
dataset_size: 8556326
- config_name: sd_in
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 9016634
num_examples: 3443
- name: validation
num_bytes: 978036
num_examples: 426
- name: test
num_bytes: 2418790
num_examples: 980
download_size: 7512276
dataset_size: 12413460
- config_name: sk_sk
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 4323532
num_examples: 1957
- name: validation
num_bytes: 800149
num_examples: 352
- name: test
num_bytes: 1920862
num_examples: 792
download_size: 4348813
dataset_size: 7044543
- config_name: sl_si
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5717153
num_examples: 2512
- name: validation
num_bytes: 660422
num_examples: 349
- name: test
num_bytes: 1675844
num_examples: 834
download_size: 4953783
dataset_size: 8053419
- config_name: sn_zw
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7308792
num_examples: 2463
- name: validation
num_bytes: 1131817
num_examples: 393
- name: test
num_bytes: 2794764
num_examples: 925
download_size: 6622484
dataset_size: 11235373
- config_name: so_so
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 9719921
num_examples: 3148
- name: validation
num_bytes: 1136749
num_examples: 432
- name: test
num_bytes: 2869504
num_examples: 1019
download_size: 8213841
dataset_size: 13726174
- config_name: sr_rs
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7824032
num_examples: 2944
- name: validation
num_bytes: 614595
num_examples: 290
- name: test
num_bytes: 1564507
num_examples: 700
download_size: 6108297
dataset_size: 10003134
- config_name: sv_se
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6139966
num_examples: 2385
- name: validation
num_bytes: 724956
num_examples: 330
- name: test
num_bytes: 1717316
num_examples: 759
download_size: 5263829
dataset_size: 8582238
- config_name: sw_ke
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 9853821
num_examples: 3070
- name: validation
num_bytes: 587422
num_examples: 211
- name: test
num_bytes: 1411960
num_examples: 487
download_size: 7089436
dataset_size: 11853203
- config_name: ta_in
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6367761
num_examples: 2367
- name: validation
num_bytes: 921690
num_examples: 377
- name: test
num_bytes: 1563661
num_examples: 591
download_size: 5551970
dataset_size: 8853112
- config_name: te_in
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5815785
num_examples: 2302
- name: validation
num_bytes: 658792
num_examples: 311
- name: test
num_bytes: 1066554
num_examples: 472
download_size: 4809066
dataset_size: 7541131
- config_name: tg_tj
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6307404
num_examples: 2298
- name: validation
num_bytes: 694155
num_examples: 240
- name: test
num_bytes: 1773823
num_examples: 600
download_size: 5132344
dataset_size: 8775382
- config_name: th_th
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6244649
num_examples: 2602
- name: validation
num_bytes: 1049175
num_examples: 439
- name: test
num_bytes: 2514143
num_examples: 1021
download_size: 5968386
dataset_size: 9807967
- config_name: tr_tr
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6103326
num_examples: 2526
- name: validation
num_bytes: 823153
num_examples: 338
- name: test
num_bytes: 1912738
num_examples: 743
download_size: 5502046
dataset_size: 8839217
- config_name: uk_ua
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6646786
num_examples: 2810
- name: validation
num_bytes: 693905
num_examples: 325
- name: test
num_bytes: 1666076
num_examples: 750
download_size: 5590233
dataset_size: 9006767
- config_name: umb_ao
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 8221357
num_examples: 1597
- name: validation
num_bytes: 689247
num_examples: 135
- name: test
num_bytes: 1995904
num_examples: 379
download_size: 6301136
dataset_size: 10906508
- config_name: ur_pk
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5166662
num_examples: 2109
- name: validation
num_bytes: 556691
num_examples: 267
- name: test
num_bytes: 601148
num_examples: 299
download_size: 3929974
dataset_size: 6324501
- config_name: uz_uz
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7407165
num_examples: 2943
- name: validation
num_bytes: 855447
num_examples: 363
- name: test
num_bytes: 2085574
num_examples: 862
download_size: 6226305
dataset_size: 10348186
- config_name: vi_vn
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6685954
num_examples: 2994
- name: validation
num_bytes: 877361
num_examples: 361
- name: test
num_bytes: 2205075
num_examples: 857
download_size: 5949654
dataset_size: 9768390
- config_name: wo_sn
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 6395646
num_examples: 2279
- name: validation
num_bytes: 560712
num_examples: 169
- name: test
num_bytes: 1276391
num_examples: 371
download_size: 4889392
dataset_size: 8232749
- config_name: xh_za
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 9776050
num_examples: 3466
- name: validation
num_bytes: 1131158
num_examples: 446
- name: test
num_bytes: 2774749
num_examples: 1041
download_size: 7845439
dataset_size: 13681957
- config_name: yo_ng
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 7340819
num_examples: 2339
- name: validation
num_bytes: 1251912
num_examples: 378
- name: test
num_bytes: 2752857
num_examples: 831
download_size: 6768330
dataset_size: 11345588
- config_name: yue_hant_hk
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 5337604
num_examples: 1939
- name: validation
num_bytes: 841394
num_examples: 362
- name: test
num_bytes: 1953243
num_examples: 819
download_size: 4929177
dataset_size: 8132241
- config_name: zu_za
features:
- name: audio_path
dtype: string
- name: duration
dtype: float32
- name: codes
sequence: int32
- name: language
dtype: string
splits:
- name: train
num_bytes: 10842601
num_examples: 2858
- name: validation
num_bytes: 1117482
num_examples: 354
- name: test
num_bytes: 2832400
num_examples: 854
download_size: 8476982
dataset_size: 14792483
configs:
- config_name: af_za
data_files:
- split: train
path: af_za/train-*
- split: validation
path: af_za/validation-*
- split: test
path: af_za/test-*
- config_name: am_et
data_files:
- split: train
path: am_et/train-*
- split: validation
path: am_et/validation-*
- split: test
path: am_et/test-*
- config_name: ar_eg
data_files:
- split: train
path: ar_eg/train-*
- split: validation
path: ar_eg/validation-*
- split: test
path: ar_eg/test-*
- config_name: as_in
data_files:
- split: train
path: as_in/train-*
- split: validation
path: as_in/validation-*
- split: test
path: as_in/test-*
- config_name: ast_es
data_files:
- split: train
path: ast_es/train-*
- split: validation
path: ast_es/validation-*
- split: test
path: ast_es/test-*
- config_name: az_az
data_files:
- split: train
path: az_az/train-*
- split: validation
path: az_az/validation-*
- split: test
path: az_az/test-*
- config_name: be_by
data_files:
- split: train
path: be_by/train-*
- split: validation
path: be_by/validation-*
- split: test
path: be_by/test-*
- config_name: bg_bg
data_files:
- split: train
path: bg_bg/train-*
- split: validation
path: bg_bg/validation-*
- split: test
path: bg_bg/test-*
- config_name: bn_in
data_files:
- split: train
path: bn_in/train-*
- split: validation
path: bn_in/validation-*
- split: test
path: bn_in/test-*
- config_name: bs_ba
data_files:
- split: train
path: bs_ba/train-*
- split: validation
path: bs_ba/validation-*
- split: test
path: bs_ba/test-*
- config_name: ca_es
data_files:
- split: train
path: ca_es/train-*
- split: validation
path: ca_es/validation-*
- split: test
path: ca_es/test-*
- config_name: ceb_ph
data_files:
- split: train
path: ceb_ph/train-*
- split: validation
path: ceb_ph/validation-*
- split: test
path: ceb_ph/test-*
- config_name: ckb_iq
data_files:
- split: train
path: ckb_iq/train-*
- split: validation
path: ckb_iq/validation-*
- split: test
path: ckb_iq/test-*
- config_name: cmn_hans_cn
data_files:
- split: train
path: cmn_hans_cn/train-*
- split: validation
path: cmn_hans_cn/validation-*
- split: test
path: cmn_hans_cn/test-*
- config_name: cs_cz
data_files:
- split: train
path: cs_cz/train-*
- split: validation
path: cs_cz/validation-*
- split: test
path: cs_cz/test-*
- config_name: cy_gb
data_files:
- split: train
path: cy_gb/train-*
- split: validation
path: cy_gb/validation-*
- split: test
path: cy_gb/test-*
- config_name: da_dk
data_files:
- split: train
path: da_dk/train-*
- split: validation
path: da_dk/validation-*
- split: test
path: da_dk/test-*
- config_name: de_de
data_files:
- split: train
path: de_de/train-*
- split: validation
path: de_de/validation-*
- split: test
path: de_de/test-*
- config_name: el_gr
data_files:
- split: train
path: el_gr/train-*
- split: validation
path: el_gr/validation-*
- split: test
path: el_gr/test-*
- config_name: en_us
data_files:
- split: train
path: en_us/train-*
- split: validation
path: en_us/validation-*
- split: test
path: en_us/test-*
- config_name: es_419
data_files:
- split: train
path: es_419/train-*
- split: validation
path: es_419/validation-*
- split: test
path: es_419/test-*
- config_name: et_ee
data_files:
- split: train
path: et_ee/train-*
- split: validation
path: et_ee/validation-*
- split: test
path: et_ee/test-*
- config_name: fa_ir
data_files:
- split: train
path: fa_ir/train-*
- split: validation
path: fa_ir/validation-*
- split: test
path: fa_ir/test-*
- config_name: ff_sn
data_files:
- split: train
path: ff_sn/train-*
- split: validation
path: ff_sn/validation-*
- split: test
path: ff_sn/test-*
- config_name: fi_fi
data_files:
- split: train
path: fi_fi/train-*
- split: validation
path: fi_fi/validation-*
- split: test
path: fi_fi/test-*
- config_name: fil_ph
data_files:
- split: train
path: fil_ph/train-*
- split: validation
path: fil_ph/validation-*
- split: test
path: fil_ph/test-*
- config_name: fr_fr
data_files:
- split: train
path: fr_fr/train-*
- split: validation
path: fr_fr/validation-*
- split: test
path: fr_fr/test-*
- config_name: ga_ie
data_files:
- split: train
path: ga_ie/train-*
- split: validation
path: ga_ie/validation-*
- split: test
path: ga_ie/test-*
- config_name: gl_es
data_files:
- split: train
path: gl_es/train-*
- split: validation
path: gl_es/validation-*
- split: test
path: gl_es/test-*
- config_name: gu_in
data_files:
- split: train
path: gu_in/train-*
- split: validation
path: gu_in/validation-*
- split: test
path: gu_in/test-*
- config_name: ha_ng
data_files:
- split: train
path: ha_ng/train-*
- split: validation
path: ha_ng/validation-*
- split: test
path: ha_ng/test-*
- config_name: he_il
data_files:
- split: train
path: he_il/train-*
- split: validation
path: he_il/validation-*
- split: test
path: he_il/test-*
- config_name: hi_in
data_files:
- split: train
path: hi_in/train-*
- split: validation
path: hi_in/validation-*
- split: test
path: hi_in/test-*
- config_name: hr_hr
data_files:
- split: train
path: hr_hr/train-*
- split: validation
path: hr_hr/validation-*
- split: test
path: hr_hr/test-*
- config_name: hu_hu
data_files:
- split: train
path: hu_hu/train-*
- split: validation
path: hu_hu/validation-*
- split: test
path: hu_hu/test-*
- config_name: hy_am
data_files:
- split: train
path: hy_am/train-*
- split: validation
path: hy_am/validation-*
- split: test
path: hy_am/test-*
- config_name: id_id
data_files:
- split: train
path: id_id/train-*
- split: validation
path: id_id/validation-*
- split: test
path: id_id/test-*
- config_name: ig_ng
data_files:
- split: train
path: ig_ng/train-*
- split: validation
path: ig_ng/validation-*
- split: test
path: ig_ng/test-*
- config_name: is_is
data_files:
- split: train
path: is_is/train-*
- split: validation
path: is_is/validation-*
- split: test
path: is_is/test-*
- config_name: it_it
data_files:
- split: train
path: it_it/train-*
- split: validation
path: it_it/validation-*
- split: test
path: it_it/test-*
- config_name: ja_jp
data_files:
- split: train
path: ja_jp/train-*
- split: validation
path: ja_jp/validation-*
- split: test
path: ja_jp/test-*
- config_name: jv_id
data_files:
- split: train
path: jv_id/train-*
- split: validation
path: jv_id/validation-*
- split: test
path: jv_id/test-*
- config_name: ka_ge
data_files:
- split: train
path: ka_ge/train-*
- split: validation
path: ka_ge/validation-*
- split: test
path: ka_ge/test-*
- config_name: kam_ke
data_files:
- split: train
path: kam_ke/train-*
- split: validation
path: kam_ke/validation-*
- split: test
path: kam_ke/test-*
- config_name: kea_cv
data_files:
- split: train
path: kea_cv/train-*
- split: validation
path: kea_cv/validation-*
- split: test
path: kea_cv/test-*
- config_name: kk_kz
data_files:
- split: train
path: kk_kz/train-*
- split: validation
path: kk_kz/validation-*
- split: test
path: kk_kz/test-*
- config_name: km_kh
data_files:
- split: train
path: km_kh/train-*
- split: validation
path: km_kh/validation-*
- split: test
path: km_kh/test-*
- config_name: kn_in
data_files:
- split: train
path: kn_in/train-*
- split: validation
path: kn_in/validation-*
- split: test
path: kn_in/test-*
- config_name: ko_kr
data_files:
- split: train
path: ko_kr/train-*
- split: validation
path: ko_kr/validation-*
- split: test
path: ko_kr/test-*
- config_name: ky_kg
data_files:
- split: train
path: ky_kg/train-*
- split: validation
path: ky_kg/validation-*
- split: test
path: ky_kg/test-*
- config_name: lb_lu
data_files:
- split: train
path: lb_lu/train-*
- split: validation
path: lb_lu/validation-*
- split: test
path: lb_lu/test-*
- config_name: lg_ug
data_files:
- split: train
path: lg_ug/train-*
- split: validation
path: lg_ug/validation-*
- split: test
path: lg_ug/test-*
- config_name: ln_cd
data_files:
- split: train
path: ln_cd/train-*
- split: validation
path: ln_cd/validation-*
- split: test
path: ln_cd/test-*
- config_name: lo_la
data_files:
- split: train
path: lo_la/train-*
- split: validation
path: lo_la/validation-*
- split: test
path: lo_la/test-*
- config_name: lt_lt
data_files:
- split: train
path: lt_lt/train-*
- split: validation
path: lt_lt/validation-*
- split: test
path: lt_lt/test-*
- config_name: luo_ke
data_files:
- split: train
path: luo_ke/train-*
- split: validation
path: luo_ke/validation-*
- split: test
path: luo_ke/test-*
- config_name: lv_lv
data_files:
- split: train
path: lv_lv/train-*
- split: validation
path: lv_lv/validation-*
- split: test
path: lv_lv/test-*
- config_name: mi_nz
data_files:
- split: train
path: mi_nz/train-*
- split: validation
path: mi_nz/validation-*
- split: test
path: mi_nz/test-*
- config_name: mk_mk
data_files:
- split: train
path: mk_mk/train-*
- split: validation
path: mk_mk/validation-*
- split: test
path: mk_mk/test-*
- config_name: ml_in
data_files:
- split: train
path: ml_in/train-*
- split: validation
path: ml_in/validation-*
- split: test
path: ml_in/test-*
- config_name: mn_mn
data_files:
- split: train
path: mn_mn/train-*
- split: validation
path: mn_mn/validation-*
- split: test
path: mn_mn/test-*
- config_name: mr_in
data_files:
- split: train
path: mr_in/train-*
- split: validation
path: mr_in/validation-*
- split: test
path: mr_in/test-*
- config_name: ms_my
data_files:
- split: train
path: ms_my/train-*
- split: validation
path: ms_my/validation-*
- split: test
path: ms_my/test-*
- config_name: mt_mt
data_files:
- split: train
path: mt_mt/train-*
- split: validation
path: mt_mt/validation-*
- split: test
path: mt_mt/test-*
- config_name: my_mm
data_files:
- split: train
path: my_mm/train-*
- split: validation
path: my_mm/validation-*
- split: test
path: my_mm/test-*
- config_name: nb_no
data_files:
- split: train
path: nb_no/train-*
- split: validation
path: nb_no/validation-*
- split: test
path: nb_no/test-*
- config_name: ne_np
data_files:
- split: train
path: ne_np/train-*
- split: validation
path: ne_np/validation-*
- split: test
path: ne_np/test-*
- config_name: nl_nl
data_files:
- split: train
path: nl_nl/train-*
- split: validation
path: nl_nl/validation-*
- split: test
path: nl_nl/test-*
- config_name: nso_za
data_files:
- split: train
path: nso_za/train-*
- split: validation
path: nso_za/validation-*
- split: test
path: nso_za/test-*
- config_name: ny_mw
data_files:
- split: train
path: ny_mw/train-*
- split: validation
path: ny_mw/validation-*
- split: test
path: ny_mw/test-*
- config_name: oc_fr
data_files:
- split: train
path: oc_fr/train-*
- split: validation
path: oc_fr/validation-*
- split: test
path: oc_fr/test-*
- config_name: om_et
data_files:
- split: train
path: om_et/train-*
- split: validation
path: om_et/validation-*
- split: test
path: om_et/test-*
- config_name: or_in
data_files:
- split: train
path: or_in/train-*
- split: validation
path: or_in/validation-*
- split: test
path: or_in/test-*
- config_name: pa_in
data_files:
- split: train
path: pa_in/train-*
- split: validation
path: pa_in/validation-*
- split: test
path: pa_in/test-*
- config_name: pl_pl
data_files:
- split: train
path: pl_pl/train-*
- split: validation
path: pl_pl/validation-*
- split: test
path: pl_pl/test-*
- config_name: ps_af
data_files:
- split: train
path: ps_af/train-*
- split: validation
path: ps_af/validation-*
- split: test
path: ps_af/test-*
- config_name: pt_br
data_files:
- split: train
path: pt_br/train-*
- split: validation
path: pt_br/validation-*
- split: test
path: pt_br/test-*
- config_name: ro_ro
data_files:
- split: train
path: ro_ro/train-*
- split: validation
path: ro_ro/validation-*
- split: test
path: ro_ro/test-*
- config_name: ru_ru
data_files:
- split: train
path: ru_ru/train-*
- split: validation
path: ru_ru/validation-*
- split: test
path: ru_ru/test-*
- config_name: sd_in
data_files:
- split: train
path: sd_in/train-*
- split: validation
path: sd_in/validation-*
- split: test
path: sd_in/test-*
- config_name: sk_sk
data_files:
- split: train
path: sk_sk/train-*
- split: validation
path: sk_sk/validation-*
- split: test
path: sk_sk/test-*
- config_name: sl_si
data_files:
- split: train
path: sl_si/train-*
- split: validation
path: sl_si/validation-*
- split: test
path: sl_si/test-*
- config_name: sn_zw
data_files:
- split: train
path: sn_zw/train-*
- split: validation
path: sn_zw/validation-*
- split: test
path: sn_zw/test-*
- config_name: so_so
data_files:
- split: train
path: so_so/train-*
- split: validation
path: so_so/validation-*
- split: test
path: so_so/test-*
- config_name: sr_rs
data_files:
- split: train
path: sr_rs/train-*
- split: validation
path: sr_rs/validation-*
- split: test
path: sr_rs/test-*
- config_name: sv_se
data_files:
- split: train
path: sv_se/train-*
- split: validation
path: sv_se/validation-*
- split: test
path: sv_se/test-*
- config_name: sw_ke
data_files:
- split: train
path: sw_ke/train-*
- split: validation
path: sw_ke/validation-*
- split: test
path: sw_ke/test-*
- config_name: ta_in
data_files:
- split: train
path: ta_in/train-*
- split: validation
path: ta_in/validation-*
- split: test
path: ta_in/test-*
- config_name: te_in
data_files:
- split: train
path: te_in/train-*
- split: validation
path: te_in/validation-*
- split: test
path: te_in/test-*
- config_name: tg_tj
data_files:
- split: train
path: tg_tj/train-*
- split: validation
path: tg_tj/validation-*
- split: test
path: tg_tj/test-*
- config_name: th_th
data_files:
- split: train
path: th_th/train-*
- split: validation
path: th_th/validation-*
- split: test
path: th_th/test-*
- config_name: tr_tr
data_files:
- split: train
path: tr_tr/train-*
- split: validation
path: tr_tr/validation-*
- split: test
path: tr_tr/test-*
- config_name: uk_ua
data_files:
- split: train
path: uk_ua/train-*
- split: validation
path: uk_ua/validation-*
- split: test
path: uk_ua/test-*
- config_name: umb_ao
data_files:
- split: train
path: umb_ao/train-*
- split: validation
path: umb_ao/validation-*
- split: test
path: umb_ao/test-*
- config_name: ur_pk
data_files:
- split: train
path: ur_pk/train-*
- split: validation
path: ur_pk/validation-*
- split: test
path: ur_pk/test-*
- config_name: uz_uz
data_files:
- split: train
path: uz_uz/train-*
- split: validation
path: uz_uz/validation-*
- split: test
path: uz_uz/test-*
- config_name: vi_vn
data_files:
- split: train
path: vi_vn/train-*
- split: validation
path: vi_vn/validation-*
- split: test
path: vi_vn/test-*
- config_name: wo_sn
data_files:
- split: train
path: wo_sn/train-*
- split: validation
path: wo_sn/validation-*
- split: test
path: wo_sn/test-*
- config_name: xh_za
data_files:
- split: train
path: xh_za/train-*
- split: validation
path: xh_za/validation-*
- split: test
path: xh_za/test-*
- config_name: yo_ng
data_files:
- split: train
path: yo_ng/train-*
- split: validation
path: yo_ng/validation-*
- split: test
path: yo_ng/test-*
- config_name: yue_hant_hk
data_files:
- split: train
path: yue_hant_hk/train-*
- split: validation
path: yue_hant_hk/validation-*
- split: test
path: yue_hant_hk/test-*
- config_name: zu_za
data_files:
- split: train
path: zu_za/train-*
- split: validation
path: zu_za/validation-*
- split: test
path: zu_za/test-*
size_categories:
- 100K<n<1M
---
# Dataset
## Dataset Statistics
This table shows the number of examples per language configuration and split.
| config_name | train_examples | validation_examples | test_examples |
|---|---:|---:|---:|
| af_za | 1032 | 198 | 264 |
| am_et | 3163 | 223 | 516 |
| ar_eg | 2104 | 295 | 428 |
| as_in | 2812 | 418 | 984 |
| ast_es | 2511 | 398 | 946 |
| az_az | 2665 | 400 | 923 |
| be_by | 2433 | 408 | 967 |
| bg_bg | 2973 | 395 | 658 |
| bn_in | 3006 | 402 | 920 |
| bs_ba | 3091 | 400 | 925 |
| ca_es | 2300 | 404 | 940 |
| ceb_ph | 3261 | 225 | 541 |
| ckb_iq | 3040 | 386 | 922 |
| cmn_hans_cn | 3246 | 409 | 945 |
| cs_cz | 2811 | 305 | 723 |
| cy_gb | 3427 | 447 | 1021 |
| da_dk | 2465 | 395 | 930 |
| de_de | 2987 | 363 | 862 |
| el_gr | 3215 | 271 | 650 |
| en_us | 2602 | 394 | 647 |
| es_419 | 2796 | 408 | 908 |
| et_ee | 2501 | 387 | 893 |
| fa_ir | 3101 | 369 | 871 |
| ff_sn | 3235 | 273 | 660 |
| fi_fi | 2704 | 415 | 918 |
| fil_ph | 1884 | 418 | 964 |
| fr_fr | 3193 | 289 | 676 |
| ga_ie | 2845 | 369 | 842 |
| gl_es | 2175 | 395 | 927 |
| gu_in | 3145 | 432 | 1000 |
| ha_ng | 3259 | 296 | 621 |
| he_il | 3242 | 328 | 792 |
| hi_in | 2120 | 239 | 418 |
| hr_hr | 3461 | 377 | 914 |
| hu_hu | 3095 | 407 | 905 |
| hy_am | 3053 | 395 | 932 |
| id_id | 2579 | 350 | 687 |
| ig_ng | 2839 | 413 | 969 |
| is_is | 926 | 36 | 46 |
| it_it | 3030 | 391 | 865 |
| ja_jp | 2292 | 266 | 650 |
| jv_id | 3051 | 295 | 728 |
| ka_ge | 1491 | 409 | 979 |
| kam_ke | 3340 | 338 | 827 |
| kea_cv | 2715 | 366 | 864 |
| kk_kz | 3200 | 369 | 856 |
| km_kh | 1675 | 326 | 771 |
| kn_in | 2283 | 368 | 838 |
| ko_kr | 2307 | 226 | 382 |
| ky_kg | 2818 | 422 | 977 |
| lb_lu | 2502 | 408 | 934 |
| lg_ug | 2478 | 306 | 723 |
| ln_cd | 3350 | 209 | 478 |
| lo_la | 1809 | 191 | 404 |
| lt_lt | 2937 | 416 | 986 |
| luo_ke | 2384 | 102 | 256 |
| lv_lv | 2110 | 356 | 851 |
| mi_nz | 3249 | 429 | 1008 |
| mk_mk | 2337 | 415 | 973 |
| ml_in | 3043 | 418 | 958 |
| mn_mn | 3074 | 405 | 949 |
| mr_in | 3269 | 443 | 1015 |
| ms_my | 2667 | 324 | 749 |
| mt_mt | 2895 | 404 | 926 |
| my_mm | 3058 | 384 | 880 |
| nb_no | 3167 | 163 | 357 |
| ne_np | 3331 | 305 | 726 |
| nl_nl | 2918 | 171 | 364 |
| nso_za | 1990 | 363 | 790 |
| ny_mw | 2694 | 311 | 761 |
| oc_fr | 3379 | 427 | 998 |
| om_et | 1701 | 19 | 41 |
| or_in | 1081 | 392 | 883 |
| pa_in | 1923 | 251 | 574 |
| pl_pl | 2841 | 338 | 758 |
| ps_af | 2513 | 217 | 512 |
| pt_br | 2793 | 386 | 919 |
| ro_ro | 2891 | 387 | 883 |
| ru_ru | 2562 | 356 | 775 |
| sd_in | 3443 | 426 | 980 |
| sk_sk | 1957 | 352 | 792 |
| sl_si | 2512 | 349 | 834 |
| sn_zw | 2463 | 393 | 925 |
| so_so | 3148 | 432 | 1019 |
| sr_rs | 2944 | 290 | 700 |
| sv_se | 2385 | 330 | 759 |
| sw_ke | 3070 | 211 | 487 |
| ta_in | 2367 | 377 | 591 |
| te_in | 2302 | 311 | 472 |
| tg_tj | 2298 | 240 | 600 |
| th_th | 2602 | 439 | 1021 |
| tr_tr | 2526 | 338 | 743 |
| uk_ua | 2810 | 325 | 750 |
| umb_ao | 1597 | 135 | 379 |
| ur_pk | 2109 | 267 | 299 |
| uz_uz | 2943 | 363 | 862 |
| vi_vn | 2994 | 361 | 857 |
| wo_sn | 2279 | 169 | 371 |
| xh_za | 3466 | 446 | 1041 |
| yo_ng | 2339 | 378 | 831 |
| yue_hant_hk | 1939 | 362 | 819 |
| zu_za | 2858 | 354 | 854 |
提供机构:
deepdml



