catherinearnett/bilingual-tokenizer-training-data
收藏Hugging Face2026-02-21 更新2026-03-29 收录
下载链接:
https://hf-mirror.com/datasets/catherinearnett/bilingual-tokenizer-training-data
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: afr_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524687226
num_examples: 133023
download_size: 319847528
dataset_size: 524687226
- config_name: afr_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524694799
num_examples: 132371
download_size: 319644890
dataset_size: 524694799
- config_name: afr_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524692766
num_examples: 132092
download_size: 319450791
dataset_size: 524692766
- config_name: als_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524874994
num_examples: 194670
download_size: 311002870
dataset_size: 524874994
- config_name: als_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524867677
num_examples: 193139
download_size: 310962316
dataset_size: 524867677
- config_name: als_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524863939
num_examples: 191782
download_size: 310730428
dataset_size: 524863939
- config_name: amh_Ethi_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524547635
num_examples: 82696
download_size: 255304653
dataset_size: 524547635
- config_name: amh_Ethi_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524538235
num_examples: 82768
download_size: 255112337
dataset_size: 524538235
- config_name: amh_Ethi_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524537653
num_examples: 81774
download_size: 255367699
dataset_size: 524537653
- config_name: arb_Arab_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524559851
num_examples: 88985
download_size: 254428922
dataset_size: 524559851
- config_name: arb_Arab_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524552742
num_examples: 87803
download_size: 254388829
dataset_size: 524552742
- config_name: arb_Arab_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524586834
num_examples: 86720
download_size: 254500227
dataset_size: 524586834
- config_name: arz_Arab_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524695324
num_examples: 135048
download_size: 261893512
dataset_size: 524695324
- config_name: arz_Arab_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524695907
num_examples: 134336
download_size: 261613138
dataset_size: 524695907
- config_name: arz_Arab_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524696738
num_examples: 134226
download_size: 261929769
dataset_size: 524696738
- config_name: asm_Beng_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524582503
num_examples: 93860
download_size: 198605728
dataset_size: 524582503
- config_name: asm_Beng_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524577081
num_examples: 94308
download_size: 198635319
dataset_size: 524577081
- config_name: azj_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524739586
num_examples: 150193
download_size: 291394017
dataset_size: 524739586
- config_name: azj_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524751680
num_examples: 151073
download_size: 291535431
dataset_size: 524751680
- config_name: azj_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524746310
num_examples: 150448
download_size: 291579288
dataset_size: 524746310
- config_name: bel_Cyrl_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524565884
num_examples: 84920
download_size: 264825116
dataset_size: 524565884
- config_name: bel_Cyrl_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524543806
num_examples: 85180
download_size: 264984767
dataset_size: 524543806
- config_name: bel_Cyrl_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524539657
num_examples: 83633
download_size: 265010640
dataset_size: 524539657
- config_name: ben_Beng_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524516180
num_examples: 76060
download_size: 195463398
dataset_size: 524516180
- config_name: ben_Beng_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524528603
num_examples: 76728
download_size: 195459516
dataset_size: 524528603
- config_name: ben_Beng_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524520785
num_examples: 76489
download_size: 195389523
dataset_size: 524520785
- config_name: bod_Tibt_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524411062
num_examples: 39668
download_size: 157020299
dataset_size: 524411062
- config_name: bod_Tibt_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524411130
num_examples: 40677
download_size: 157015499
dataset_size: 524411130
- config_name: bod_Tibt_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524514034
num_examples: 39476
download_size: 157174103
dataset_size: 524514034
- config_name: bos_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524832393
num_examples: 181102
download_size: 344064443
dataset_size: 524832393
- config_name: bos_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524842815
num_examples: 183578
download_size: 344159104
dataset_size: 524842815
- config_name: bos_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524837746
num_examples: 182957
download_size: 344287871
dataset_size: 524837746
- config_name: bul_Cyrl_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524559183
num_examples: 90329
download_size: 251012513
dataset_size: 524559183
- config_name: bul_Cyrl_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524561028
num_examples: 90436
download_size: 250929246
dataset_size: 524561028
- config_name: bul_Cyrl_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524569830
num_examples: 90403
download_size: 251096310
dataset_size: 524569830
- config_name: cat_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524886635
num_examples: 199269
download_size: 322850858
dataset_size: 524886635
- config_name: cat_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524894753
num_examples: 198447
download_size: 322894952
dataset_size: 524894753
- config_name: cat_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524886254
num_examples: 198632
download_size: 322829763
dataset_size: 524886254
- config_name: ces_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524786171
num_examples: 164702
download_size: 345450793
dataset_size: 524786171
- config_name: ces_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524794974
num_examples: 165415
download_size: 345320048
dataset_size: 524794974
- config_name: ces_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524783084
num_examples: 164324
download_size: 344766675
dataset_size: 524783084
- config_name: ckb_Arab_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524609143
num_examples: 106517
download_size: 244258716
dataset_size: 524609143
- config_name: ckb_Arab_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524615403
num_examples: 108569
download_size: 244256043
dataset_size: 524615403
- config_name: ckb_Arab_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524618892
num_examples: 109496
download_size: 244332374
dataset_size: 524618892
- config_name: cym_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524797416
num_examples: 168461
download_size: 318803973
dataset_size: 524797416
- config_name: cym_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524803889
num_examples: 170281
download_size: 318996842
dataset_size: 524803889
- config_name: cym_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524795720
num_examples: 167834
download_size: 318559285
dataset_size: 524795720
- config_name: dan_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524822241
num_examples: 176101
download_size: 316598856
dataset_size: 524822241
- config_name: dan_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524831446
num_examples: 175841
download_size: 316852838
dataset_size: 524831446
- config_name: dan_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524824636
num_examples: 175980
download_size: 316251064
dataset_size: 524824636
- config_name: deu_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524819638
num_examples: 176959
download_size: 325239842
dataset_size: 524819638
- config_name: deu_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524823151
num_examples: 178030
download_size: 324952022
dataset_size: 524823151
- config_name: deu_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524823398
num_examples: 177178
download_size: 325038991
dataset_size: 524823398
- config_name: ekk_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524743164
num_examples: 151289
download_size: 332588876
dataset_size: 524743164
- config_name: ekk_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524745124
num_examples: 149969
download_size: 332717417
dataset_size: 524745124
- config_name: ekk_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524741716
num_examples: 151009
download_size: 332883853
dataset_size: 524741716
- config_name: ell_Grek_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524746815
num_examples: 91341
download_size: 256594990
dataset_size: 524746815
- config_name: ell_Grek_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524562903
num_examples: 91212
download_size: 256537680
dataset_size: 524562903
- config_name: ell_Grek_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524564801
num_examples: 91528
download_size: 256540931
dataset_size: 524564801
- config_name: eus_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524838271
num_examples: 183397
download_size: 319677971
dataset_size: 524838271
- config_name: eus_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524842666
num_examples: 184410
download_size: 319760545
dataset_size: 524842666
- config_name: eus_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524844111
num_examples: 185142
download_size: 319632369
dataset_size: 524844111
- config_name: fil_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524728714
num_examples: 146399
download_size: 308525936
dataset_size: 524728714
- config_name: fil_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524736197
num_examples: 147718
download_size: 308699657
dataset_size: 524736197
- config_name: fil_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524725576
num_examples: 145417
download_size: 308440680
dataset_size: 524725576
- config_name: fin_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524732062
num_examples: 147748
download_size: 326222472
dataset_size: 524732062
- config_name: fin_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524734537
num_examples: 148752
download_size: 326921686
dataset_size: 524734537
- config_name: fin_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524743102
num_examples: 147771
download_size: 326528653
dataset_size: 524743102
- config_name: fra_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524770786
num_examples: 160849
download_size: 316325652
dataset_size: 524770786
- config_name: fra_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524767091
num_examples: 159354
download_size: 315558285
dataset_size: 524767091
- config_name: fra_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524767700
num_examples: 159768
download_size: 316057319
dataset_size: 524767700
- config_name: gle_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524730706
num_examples: 147544
download_size: 308306029
dataset_size: 524730706
- config_name: gle_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524736594
num_examples: 147502
download_size: 307717441
dataset_size: 524736594
- config_name: gle_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524875115
num_examples: 148176
download_size: 308118532
dataset_size: 524875115
- config_name: glg_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524867345
num_examples: 192710
download_size: 321975995
dataset_size: 524867345
- config_name: glg_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524866026
num_examples: 191953
download_size: 321527017
dataset_size: 524866026
- config_name: glg_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524870363
num_examples: 193458
download_size: 321837392
dataset_size: 524870363
- config_name: guj_Gujr_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524539466
num_examples: 79769
download_size: 201683960
dataset_size: 524539466
- config_name: guj_Gujr_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524542568
num_examples: 79699
download_size: 201677359
dataset_size: 524542568
- config_name: guj_Gujr_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524543768
num_examples: 79521
download_size: 201702102
dataset_size: 524543768
- config_name: hau_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524936158
num_examples: 215887
download_size: 301620494
dataset_size: 524936158
- config_name: hau_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524939166
num_examples: 215226
download_size: 301805777
dataset_size: 524939166
- config_name: heb_Hebr_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524668738
num_examples: 95454
download_size: 265590306
dataset_size: 524668738
- config_name: heb_Hebr_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524577374
num_examples: 95805
download_size: 265493150
dataset_size: 524577374
- config_name: heb_Hebr_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524579748
num_examples: 95745
download_size: 265537714
dataset_size: 524579748
- config_name: hin_Deva_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524534514
num_examples: 80041
download_size: 197075273
dataset_size: 524534514
- config_name: hin_Deva_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524531150
num_examples: 79875
download_size: 197101870
dataset_size: 524531150
- config_name: hin_Deva_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524531096
num_examples: 80410
download_size: 197003973
dataset_size: 524531096
- config_name: hrv_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524567299
num_examples: 92384
download_size: 337494441
dataset_size: 524567299
- config_name: hrv_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524569665
num_examples: 92082
download_size: 337601691
dataset_size: 524569665
- config_name: hrv_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524568776
num_examples: 92864
download_size: 337843888
dataset_size: 524568776
- config_name: hun_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524663929
num_examples: 124505
download_size: 326890587
dataset_size: 524663929
- config_name: hun_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524663919
num_examples: 125118
download_size: 327063486
dataset_size: 524663919
- config_name: hun_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524663449
num_examples: 124877
download_size: 326498182
dataset_size: 524663449
- config_name: hye_Armn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524619993
num_examples: 106871
download_size: 242734776
dataset_size: 524619993
- config_name: hye_Armn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524616111
num_examples: 108964
download_size: 242941455
dataset_size: 524616111
- config_name: hye_Armn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524618276
num_examples: 107564
download_size: 242916332
dataset_size: 524618276
- config_name: ind_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524686217
num_examples: 131929
download_size: 288872799
dataset_size: 524686217
- config_name: ind_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524690238
num_examples: 130705
download_size: 288785873
dataset_size: 524690238
- config_name: ind_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524685014
num_examples: 131809
download_size: 289028155
dataset_size: 524685014
- config_name: isl_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524797050
num_examples: 168452
download_size: 318118719
dataset_size: 524797050
- config_name: isl_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524797993
num_examples: 169529
download_size: 318227255
dataset_size: 524797993
- config_name: isl_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524794592
num_examples: 168813
download_size: 318270423
dataset_size: 524794592
- config_name: ita_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524788332
num_examples: 165957
download_size: 327785128
dataset_size: 524788332
- config_name: ita_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524812480
num_examples: 166226
download_size: 327911748
dataset_size: 524812480
- config_name: ita_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524788914
num_examples: 166655
download_size: 328031868
dataset_size: 524788914
- config_name: jav_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524643709
num_examples: 118128
download_size: 297863494
dataset_size: 524643709
- config_name: jpn_Jpan_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524701219
num_examples: 135997
download_size: 297392716
dataset_size: 524701219
- config_name: jpn_Jpan_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524692517
num_examples: 134573
download_size: 297226102
dataset_size: 524692517
- config_name: jpn_Jpan_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524692608
num_examples: 134450
download_size: 297437214
dataset_size: 524692608
- config_name: kan_Knda_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524524761
num_examples: 78743
download_size: 197106112
dataset_size: 524524761
- config_name: kan_Knda_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524532808
num_examples: 79228
download_size: 196878166
dataset_size: 524532808
- config_name: kan_Knda_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524645263
num_examples: 78005
download_size: 197090006
dataset_size: 524645263
- config_name: kat_Geor_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524484548
num_examples: 64688
download_size: 188047999
dataset_size: 524484548
- config_name: kat_Geor_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524487248
num_examples: 64665
download_size: 187570761
dataset_size: 524487248
- config_name: kat_Geor_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524489551
num_examples: 64466
download_size: 187832707
dataset_size: 524489551
- config_name: kaz_Cyrl_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524538810
num_examples: 73041
download_size: 243046780
dataset_size: 524538810
- config_name: kaz_Cyrl_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524513385
num_examples: 73184
download_size: 242963533
dataset_size: 524513385
- config_name: kaz_Cyrl_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524532339
num_examples: 73467
download_size: 243057312
dataset_size: 524532339
- config_name: khk_Cyrl_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524551513
num_examples: 85741
download_size: 239787256
dataset_size: 524551513
- config_name: khk_Cyrl_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524550235
num_examples: 86213
download_size: 239686032
dataset_size: 524550235
- config_name: khk_Cyrl_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524545805
num_examples: 85518
download_size: 239946650
dataset_size: 524545805
- config_name: khm_Khmr_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524554034
num_examples: 86753
download_size: 190658486
dataset_size: 524554034
- config_name: khm_Khmr_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524550061
num_examples: 87176
download_size: 190685416
dataset_size: 524550061
- config_name: khm_Khmr_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524615586
num_examples: 87366
download_size: 190672039
dataset_size: 524615586
- config_name: kir_Cyrl_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524646153
num_examples: 119375
download_size: 249407445
dataset_size: 524646153
- config_name: kir_Cyrl_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524656521
num_examples: 120292
download_size: 249747788
dataset_size: 524656521
- config_name: kir_Cyrl_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524651776
num_examples: 120729
download_size: 249699337
dataset_size: 524651776
- config_name: kmr_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524902239
num_examples: 186853
download_size: 308778457
dataset_size: 524902239
- config_name: kmr_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524879926
num_examples: 186415
download_size: 309062104
dataset_size: 524879926
- config_name: kor_Hang_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524721784
num_examples: 137032
download_size: 308079826
dataset_size: 524721784
- config_name: kor_Hang_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524706894
num_examples: 139207
download_size: 308442552
dataset_size: 524706894
- config_name: kor_Hang_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524699874
num_examples: 137279
download_size: 308196300
dataset_size: 524699874
- config_name: lao_Laoo_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524538314
num_examples: 83274
download_size: 199264254
dataset_size: 524538314
- config_name: lao_Laoo_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524546814
num_examples: 83746
download_size: 199511267
dataset_size: 524546814
- config_name: lao_Laoo_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524547913
num_examples: 83864
download_size: 199341360
dataset_size: 524547913
- config_name: lit_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524730004
num_examples: 131439
download_size: 328153580
dataset_size: 524730004
- config_name: lit_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524684709
num_examples: 131365
download_size: 327698114
dataset_size: 524684709
- config_name: lit_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524686968
num_examples: 130384
download_size: 327679158
dataset_size: 524686968
- config_name: lvs_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524708098
num_examples: 137499
download_size: 318917665
dataset_size: 524708098
- config_name: lvs_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524719744
num_examples: 138638
download_size: 319070808
dataset_size: 524719744
- config_name: lvs_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524726869
num_examples: 137691
download_size: 318650665
dataset_size: 524726869
- config_name: mal_Mlym_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524476304
num_examples: 60105
download_size: 190706776
dataset_size: 524476304
- config_name: mal_Mlym_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524534762
num_examples: 61216
download_size: 190839272
dataset_size: 524534762
- config_name: mal_Mlym_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524485468
num_examples: 60385
download_size: 190660423
dataset_size: 524485468
- config_name: mar_Deva_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524510534
num_examples: 71804
download_size: 197846024
dataset_size: 524510534
- config_name: mar_Deva_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524509281
num_examples: 71762
download_size: 197774313
dataset_size: 524509281
- config_name: mar_Deva_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524535770
num_examples: 71196
download_size: 197850643
dataset_size: 524535770
- config_name: mkd_Cyrl_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524670222
num_examples: 127007
download_size: 245366845
dataset_size: 524670222
- config_name: mkd_Cyrl_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524676251
num_examples: 127528
download_size: 245469996
dataset_size: 524676251
- config_name: mkd_Cyrl_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524674473
num_examples: 126695
download_size: 245164854
dataset_size: 524674473
- config_name: mlt_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524752263
num_examples: 154593
download_size: 318656092
dataset_size: 524752263
- config_name: mlt_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524763377
num_examples: 156878
download_size: 318708389
dataset_size: 524763377
- config_name: mlt_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524772894
num_examples: 157843
download_size: 317303693
dataset_size: 524772894
- config_name: mya_Mymr_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524449571
num_examples: 52556
download_size: 177627667
dataset_size: 524449571
- config_name: mya_Mymr_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524452360
num_examples: 53419
download_size: 177772224
dataset_size: 524452360
- config_name: mya_Mymr_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524482211
num_examples: 53825
download_size: 177716038
dataset_size: 524482211
- config_name: nld_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524880875
num_examples: 196719
download_size: 316152641
dataset_size: 524880875
- config_name: nld_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524883581
num_examples: 198400
download_size: 316792288
dataset_size: 524883581
- config_name: nld_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524894699
num_examples: 199214
download_size: 316678445
dataset_size: 524894699
- config_name: nno_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524930575
num_examples: 212979
download_size: 326569847
dataset_size: 524930575
- config_name: nno_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524933498
num_examples: 213242
download_size: 326493233
dataset_size: 524933498
- config_name: nno_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524931650
num_examples: 214477
download_size: 327063086
dataset_size: 524931650
- config_name: nob_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524769026
num_examples: 158122
download_size: 325490792
dataset_size: 524769026
- config_name: nob_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524762549
num_examples: 155909
download_size: 325280396
dataset_size: 524762549
- config_name: nob_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524767158
num_examples: 156837
download_size: 325778618
dataset_size: 524767158
- config_name: npi_Deva_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524559117
num_examples: 87647
download_size: 192698705
dataset_size: 524559117
- config_name: ory_Orya_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524661974
num_examples: 124410
download_size: 195024659
dataset_size: 524661974
- config_name: ory_Orya_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524662451
num_examples: 124736
download_size: 195114348
dataset_size: 524662451
- config_name: ory_Orya_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524661852
num_examples: 124465
download_size: 195172976
dataset_size: 524661852
- config_name: pan_Guru_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524524453
num_examples: 77211
download_size: 200030739
dataset_size: 524524453
- config_name: pan_Guru_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524552035
num_examples: 78143
download_size: 200132358
dataset_size: 524552035
- config_name: pan_Guru_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524517627
num_examples: 76070
download_size: 200166058
dataset_size: 524517627
- config_name: pap_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524939893
num_examples: 179170
download_size: 317811078
dataset_size: 524939893
- config_name: pbt_Arab_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524683234
num_examples: 128350
download_size: 249444502
dataset_size: 524683234
- config_name: pbt_Arab_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524679719
num_examples: 129429
download_size: 249353006
dataset_size: 524679719
- config_name: pbt_Arab_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524683432
num_examples: 128552
download_size: 249338725
dataset_size: 524683432
- config_name: plt_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524734281
num_examples: 148669
download_size: 287196413
dataset_size: 524734281
- config_name: pol_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524786166
num_examples: 163025
download_size: 338269353
dataset_size: 524786166
- config_name: pol_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524814223
num_examples: 165105
download_size: 338387666
dataset_size: 524814223
- config_name: pol_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524782067
num_examples: 164648
download_size: 338365832
dataset_size: 524782067
- config_name: por_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524783529
num_examples: 163325
download_size: 322395684
dataset_size: 524783529
- config_name: por_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524786482
num_examples: 165356
download_size: 322816982
dataset_size: 524786482
- config_name: por_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524940756
num_examples: 166127
download_size: 322877421
dataset_size: 524940756
- config_name: ron_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524752115
num_examples: 153677
download_size: 326267751
dataset_size: 524752115
- config_name: ron_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524749611
num_examples: 153662
download_size: 326090542
dataset_size: 524749611
- config_name: ron_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524754694
num_examples: 154370
download_size: 326444268
dataset_size: 524754694
- config_name: rus_Cyrl_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524520631
num_examples: 75078
download_size: 257159562
dataset_size: 524520631
- config_name: rus_Cyrl_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524517101
num_examples: 75040
download_size: 257145189
dataset_size: 524517101
- config_name: rus_Cyrl_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524515313
num_examples: 75099
download_size: 257601544
dataset_size: 524515313
- config_name: sin_Sinh_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524505001
num_examples: 71167
download_size: 209807001
dataset_size: 524505001
- config_name: sin_Sinh_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524500995
num_examples: 70710
download_size: 209641411
dataset_size: 524500995
- config_name: sin_Sinh_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524515145
num_examples: 70110
download_size: 209888295
dataset_size: 524515145
- config_name: slk_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524831024
num_examples: 180270
download_size: 346470262
dataset_size: 524831024
- config_name: slk_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524826998
num_examples: 178370
download_size: 346332733
dataset_size: 524826998
- config_name: slk_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524828689
num_examples: 179835
download_size: 346532463
dataset_size: 524828689
- config_name: slv_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524776938
num_examples: 162099
download_size: 336183443
dataset_size: 524776938
- config_name: slv_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524786719
num_examples: 165633
download_size: 336594890
dataset_size: 524786719
- config_name: slv_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524803395
num_examples: 162682
download_size: 336224577
dataset_size: 524803395
- config_name: snd_Arab_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524609830
num_examples: 106733
download_size: 253377938
dataset_size: 524609830
- config_name: snd_Arab_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524613155
num_examples: 106885
download_size: 253804507
dataset_size: 524613155
- config_name: som_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 525053208
num_examples: 254924
download_size: 313243226
dataset_size: 525053208
- config_name: som_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 525078335
num_examples: 254830
download_size: 313074850
dataset_size: 525078335
- config_name: som_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 525066431
num_examples: 258920
download_size: 313417157
dataset_size: 525066431
- config_name: srp_Cyrl_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524527460
num_examples: 78452
download_size: 256264280
dataset_size: 524527460
- config_name: srp_Cyrl_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524527331
num_examples: 78049
download_size: 256376941
dataset_size: 524527331
- config_name: srp_Cyrl_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524644570
num_examples: 79010
download_size: 256228157
dataset_size: 524644570
- config_name: swe_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524838185
num_examples: 183267
download_size: 316925602
dataset_size: 524838185
- config_name: swe_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524835513
num_examples: 182207
download_size: 317234875
dataset_size: 524835513
- config_name: swe_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524837009
num_examples: 182914
download_size: 317209339
dataset_size: 524837009
- config_name: swh_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524868247
num_examples: 192252
download_size: 304754665
dataset_size: 524868247
- config_name: swh_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524867380
num_examples: 191766
download_size: 304769626
dataset_size: 524867380
- config_name: swh_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524981967
num_examples: 191948
download_size: 304876289
dataset_size: 524981967
- config_name: tam_Taml_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524459756
num_examples: 56554
download_size: 183197708
dataset_size: 524459756
- config_name: tam_Taml_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524456693
num_examples: 55822
download_size: 183390987
dataset_size: 524456693
- config_name: tam_Taml_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524463275
num_examples: 56554
download_size: 183471050
dataset_size: 524463275
- config_name: tat_Cyrl_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524451675
num_examples: 51780
download_size: 256095078
dataset_size: 524451675
- config_name: tat_Cyrl_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524446080
num_examples: 51297
download_size: 256283881
dataset_size: 524446080
- config_name: tat_Cyrl_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524460425
num_examples: 51758
download_size: 256059680
dataset_size: 524460425
- config_name: tel_Telu_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524463310
num_examples: 57121
download_size: 199268829
dataset_size: 524463310
- config_name: tel_Telu_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524460680
num_examples: 57099
download_size: 199462642
dataset_size: 524460680
- config_name: tel_Telu_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524463257
num_examples: 56999
download_size: 199348439
dataset_size: 524463257
- config_name: tgk_Cyrl_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524585687
num_examples: 93609
download_size: 240988948
dataset_size: 524585687
- config_name: tgk_Cyrl_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524575130
num_examples: 94795
download_size: 241244130
dataset_size: 524575130
- config_name: tgk_Cyrl_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524579484
num_examples: 94512
download_size: 241302961
dataset_size: 524579484
- config_name: tha_Thai_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524490580
num_examples: 66189
download_size: 200398822
dataset_size: 524490580
- config_name: tha_Thai_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524505299
num_examples: 66180
download_size: 200558418
dataset_size: 524505299
- config_name: tha_Thai_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524503991
num_examples: 66275
download_size: 200568488
dataset_size: 524503991
- config_name: uig_Arab_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524523251
num_examples: 77527
download_size: 237746451
dataset_size: 524523251
- config_name: uig_Arab_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524530733
num_examples: 77872
download_size: 237703728
dataset_size: 524530733
- config_name: ukr_Cyrl_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524542663
num_examples: 84655
download_size: 257243615
dataset_size: 524542663
- config_name: ukr_Cyrl_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524550000
num_examples: 85302
download_size: 257645625
dataset_size: 524550000
- config_name: ukr_Cyrl_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524545922
num_examples: 84444
download_size: 257415681
dataset_size: 524545922
- config_name: urd_Arab_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524586669
num_examples: 99322
download_size: 246910655
dataset_size: 524586669
- config_name: urd_Arab_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524591482
num_examples: 99759
download_size: 246947544
dataset_size: 524591482
- config_name: urd_Arab_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524602868
num_examples: 100400
download_size: 247100994
dataset_size: 524602868
- config_name: uzn_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524719413
num_examples: 139988
download_size: 300933712
dataset_size: 524719413
- config_name: uzn_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524708639
num_examples: 139792
download_size: 300648679
dataset_size: 524708639
- config_name: uzn_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524699782
num_examples: 137088
download_size: 300563984
dataset_size: 524699782
- config_name: vie_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524574434
num_examples: 93849
download_size: 267511936
dataset_size: 524574434
- config_name: vie_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524573206
num_examples: 94047
download_size: 267449640
dataset_size: 524573206
- config_name: vie_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524624961
num_examples: 93103
download_size: 267242223
dataset_size: 524624961
- config_name: zsm_Latn_subset_1
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524726164
num_examples: 146017
download_size: 300345810
dataset_size: 524726164
- config_name: zsm_Latn_subset_2
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524742477
num_examples: 147106
download_size: 300182121
dataset_size: 524742477
- config_name: zsm_Latn_subset_3
features:
- name: text
dtype: string
splits:
- name: train
num_bytes: 524730163
num_examples: 147287
download_size: 300299387
dataset_size: 524730163
configs:
- config_name: afr_Latn_subset_1
data_files:
- split: train
path: afr_Latn_subset_1/train-*
- config_name: afr_Latn_subset_2
data_files:
- split: train
path: afr_Latn_subset_2/train-*
- config_name: afr_Latn_subset_3
data_files:
- split: train
path: afr_Latn_subset_3/train-*
- config_name: als_Latn_subset_1
data_files:
- split: train
path: als_Latn_subset_1/train-*
- config_name: als_Latn_subset_2
data_files:
- split: train
path: als_Latn_subset_2/train-*
- config_name: als_Latn_subset_3
data_files:
- split: train
path: als_Latn_subset_3/train-*
- config_name: amh_Ethi_subset_1
data_files:
- split: train
path: amh_Ethi_subset_1/train-*
- config_name: amh_Ethi_subset_2
data_files:
- split: train
path: amh_Ethi_subset_2/train-*
- config_name: amh_Ethi_subset_3
data_files:
- split: train
path: amh_Ethi_subset_3/train-*
- config_name: arb_Arab_subset_1
data_files:
- split: train
path: arb_Arab_subset_1/train-*
- config_name: arb_Arab_subset_2
data_files:
- split: train
path: arb_Arab_subset_2/train-*
- config_name: arb_Arab_subset_3
data_files:
- split: train
path: arb_Arab_subset_3/train-*
- config_name: arz_Arab_subset_1
data_files:
- split: train
path: arz_Arab_subset_1/train-*
- config_name: arz_Arab_subset_2
data_files:
- split: train
path: arz_Arab_subset_2/train-*
- config_name: arz_Arab_subset_3
data_files:
- split: train
path: arz_Arab_subset_3/train-*
- config_name: asm_Beng_subset_1
data_files:
- split: train
path: asm_Beng_subset_1/train-*
- config_name: asm_Beng_subset_2
data_files:
- split: train
path: asm_Beng_subset_2/train-*
- config_name: azj_Latn_subset_1
data_files:
- split: train
path: azj_Latn_subset_1/train-*
- config_name: azj_Latn_subset_2
data_files:
- split: train
path: azj_Latn_subset_2/train-*
- config_name: azj_Latn_subset_3
data_files:
- split: train
path: azj_Latn_subset_3/train-*
- config_name: bel_Cyrl_subset_1
data_files:
- split: train
path: bel_Cyrl_subset_1/train-*
- config_name: bel_Cyrl_subset_2
data_files:
- split: train
path: bel_Cyrl_subset_2/train-*
- config_name: bel_Cyrl_subset_3
data_files:
- split: train
path: bel_Cyrl_subset_3/train-*
- config_name: ben_Beng_subset_1
data_files:
- split: train
path: ben_Beng_subset_1/train-*
- config_name: ben_Beng_subset_2
data_files:
- split: train
path: ben_Beng_subset_2/train-*
- config_name: ben_Beng_subset_3
data_files:
- split: train
path: ben_Beng_subset_3/train-*
- config_name: bod_Tibt_subset_1
data_files:
- split: train
path: bod_Tibt_subset_1/train-*
- config_name: bod_Tibt_subset_2
data_files:
- split: train
path: bod_Tibt_subset_2/train-*
- config_name: bod_Tibt_subset_3
data_files:
- split: train
path: bod_Tibt_subset_3/train-*
- config_name: bos_Latn_subset_1
data_files:
- split: train
path: bos_Latn_subset_1/train-*
- config_name: bos_Latn_subset_2
data_files:
- split: train
path: bos_Latn_subset_2/train-*
- config_name: bos_Latn_subset_3
data_files:
- split: train
path: bos_Latn_subset_3/train-*
- config_name: bul_Cyrl_subset_1
data_files:
- split: train
path: bul_Cyrl_subset_1/train-*
- config_name: bul_Cyrl_subset_2
data_files:
- split: train
path: bul_Cyrl_subset_2/train-*
- config_name: bul_Cyrl_subset_3
data_files:
- split: train
path: bul_Cyrl_subset_3/train-*
- config_name: cat_Latn_subset_1
data_files:
- split: train
path: cat_Latn_subset_1/train-*
- config_name: cat_Latn_subset_2
data_files:
- split: train
path: cat_Latn_subset_2/train-*
- config_name: cat_Latn_subset_3
data_files:
- split: train
path: cat_Latn_subset_3/train-*
- config_name: ces_Latn_subset_1
data_files:
- split: train
path: ces_Latn_subset_1/train-*
- config_name: ces_Latn_subset_2
data_files:
- split: train
path: ces_Latn_subset_2/train-*
- config_name: ces_Latn_subset_3
data_files:
- split: train
path: ces_Latn_subset_3/train-*
- config_name: ckb_Arab_subset_1
data_files:
- split: train
path: ckb_Arab_subset_1/train-*
- config_name: ckb_Arab_subset_2
data_files:
- split: train
path: ckb_Arab_subset_2/train-*
- config_name: ckb_Arab_subset_3
data_files:
- split: train
path: ckb_Arab_subset_3/train-*
- config_name: cym_Latn_subset_1
data_files:
- split: train
path: cym_Latn_subset_1/train-*
- config_name: cym_Latn_subset_2
data_files:
- split: train
path: cym_Latn_subset_2/train-*
- config_name: cym_Latn_subset_3
data_files:
- split: train
path: cym_Latn_subset_3/train-*
- config_name: dan_Latn_subset_1
data_files:
- split: train
path: dan_Latn_subset_1/train-*
- config_name: dan_Latn_subset_2
data_files:
- split: train
path: dan_Latn_subset_2/train-*
- config_name: dan_Latn_subset_3
data_files:
- split: train
path: dan_Latn_subset_3/train-*
- config_name: deu_Latn_subset_1
data_files:
- split: train
path: deu_Latn_subset_1/train-*
- config_name: deu_Latn_subset_2
data_files:
- split: train
path: deu_Latn_subset_2/train-*
- config_name: deu_Latn_subset_3
data_files:
- split: train
path: deu_Latn_subset_3/train-*
- config_name: ekk_Latn_subset_1
data_files:
- split: train
path: ekk_Latn_subset_1/train-*
- config_name: ekk_Latn_subset_2
data_files:
- split: train
path: ekk_Latn_subset_2/train-*
- config_name: ekk_Latn_subset_3
data_files:
- split: train
path: ekk_Latn_subset_3/train-*
- config_name: ell_Grek_subset_1
data_files:
- split: train
path: ell_Grek_subset_1/train-*
- config_name: ell_Grek_subset_2
data_files:
- split: train
path: ell_Grek_subset_2/train-*
- config_name: ell_Grek_subset_3
data_files:
- split: train
path: ell_Grek_subset_3/train-*
- config_name: eus_Latn_subset_1
data_files:
- split: train
path: eus_Latn_subset_1/train-*
- config_name: eus_Latn_subset_2
data_files:
- split: train
path: eus_Latn_subset_2/train-*
- config_name: eus_Latn_subset_3
data_files:
- split: train
path: eus_Latn_subset_3/train-*
- config_name: fil_Latn_subset_1
data_files:
- split: train
path: fil_Latn_subset_1/train-*
- config_name: fil_Latn_subset_2
data_files:
- split: train
path: fil_Latn_subset_2/train-*
- config_name: fil_Latn_subset_3
data_files:
- split: train
path: fil_Latn_subset_3/train-*
- config_name: fin_Latn_subset_1
data_files:
- split: train
path: fin_Latn_subset_1/train-*
- config_name: fin_Latn_subset_2
data_files:
- split: train
path: fin_Latn_subset_2/train-*
- config_name: fin_Latn_subset_3
data_files:
- split: train
path: fin_Latn_subset_3/train-*
- config_name: fra_Latn_subset_1
data_files:
- split: train
path: fra_Latn_subset_1/train-*
- config_name: fra_Latn_subset_2
data_files:
- split: train
path: fra_Latn_subset_2/train-*
- config_name: fra_Latn_subset_3
data_files:
- split: train
path: fra_Latn_subset_3/train-*
- config_name: gle_Latn_subset_1
data_files:
- split: train
path: gle_Latn_subset_1/train-*
- config_name: gle_Latn_subset_2
data_files:
- split: train
path: gle_Latn_subset_2/train-*
- config_name: gle_Latn_subset_3
data_files:
- split: train
path: gle_Latn_subset_3/train-*
- config_name: glg_Latn_subset_1
data_files:
- split: train
path: glg_Latn_subset_1/train-*
- config_name: glg_Latn_subset_2
data_files:
- split: train
path: glg_Latn_subset_2/train-*
- config_name: glg_Latn_subset_3
data_files:
- split: train
path: glg_Latn_subset_3/train-*
- config_name: guj_Gujr_subset_1
data_files:
- split: train
path: guj_Gujr_subset_1/train-*
- config_name: guj_Gujr_subset_2
data_files:
- split: train
path: guj_Gujr_subset_2/train-*
- config_name: guj_Gujr_subset_3
data_files:
- split: train
path: guj_Gujr_subset_3/train-*
- config_name: hau_Latn_subset_1
data_files:
- split: train
path: hau_Latn_subset_1/train-*
- config_name: hau_Latn_subset_2
data_files:
- split: train
path: hau_Latn_subset_2/train-*
- config_name: heb_Hebr_subset_1
data_files:
- split: train
path: heb_Hebr_subset_1/train-*
- config_name: heb_Hebr_subset_2
data_files:
- split: train
path: heb_Hebr_subset_2/train-*
- config_name: heb_Hebr_subset_3
data_files:
- split: train
path: heb_Hebr_subset_3/train-*
- config_name: hin_Deva_subset_1
data_files:
- split: train
path: hin_Deva_subset_1/train-*
- config_name: hin_Deva_subset_2
data_files:
- split: train
path: hin_Deva_subset_2/train-*
- config_name: hin_Deva_subset_3
data_files:
- split: train
path: hin_Deva_subset_3/train-*
- config_name: hrv_Latn_subset_1
data_files:
- split: train
path: hrv_Latn_subset_1/train-*
- config_name: hrv_Latn_subset_2
data_files:
- split: train
path: hrv_Latn_subset_2/train-*
- config_name: hrv_Latn_subset_3
data_files:
- split: train
path: hrv_Latn_subset_3/train-*
- config_name: hun_Latn_subset_1
data_files:
- split: train
path: hun_Latn_subset_1/train-*
- config_name: hun_Latn_subset_2
data_files:
- split: train
path: hun_Latn_subset_2/train-*
- config_name: hun_Latn_subset_3
data_files:
- split: train
path: hun_Latn_subset_3/train-*
- config_name: hye_Armn_subset_1
data_files:
- split: train
path: hye_Armn_subset_1/train-*
- config_name: hye_Armn_subset_2
data_files:
- split: train
path: hye_Armn_subset_2/train-*
- config_name: hye_Armn_subset_3
data_files:
- split: train
path: hye_Armn_subset_3/train-*
- config_name: ind_Latn_subset_1
data_files:
- split: train
path: ind_Latn_subset_1/train-*
- config_name: ind_Latn_subset_2
data_files:
- split: train
path: ind_Latn_subset_2/train-*
- config_name: ind_Latn_subset_3
data_files:
- split: train
path: ind_Latn_subset_3/train-*
- config_name: isl_Latn_subset_1
data_files:
- split: train
path: isl_Latn_subset_1/train-*
- config_name: isl_Latn_subset_2
data_files:
- split: train
path: isl_Latn_subset_2/train-*
- config_name: isl_Latn_subset_3
data_files:
- split: train
path: isl_Latn_subset_3/train-*
- config_name: ita_Latn_subset_1
data_files:
- split: train
path: ita_Latn_subset_1/train-*
- config_name: ita_Latn_subset_2
data_files:
- split: train
path: ita_Latn_subset_2/train-*
- config_name: ita_Latn_subset_3
data_files:
- split: train
path: ita_Latn_subset_3/train-*
- config_name: jav_Latn_subset_1
data_files:
- split: train
path: jav_Latn_subset_1/train-*
- config_name: jpn_Jpan_subset_1
data_files:
- split: train
path: jpn_Jpan_subset_1/train-*
- config_name: jpn_Jpan_subset_2
data_files:
- split: train
path: jpn_Jpan_subset_2/train-*
- config_name: jpn_Jpan_subset_3
data_files:
- split: train
path: jpn_Jpan_subset_3/train-*
- config_name: kan_Knda_subset_1
data_files:
- split: train
path: kan_Knda_subset_1/train-*
- config_name: kan_Knda_subset_2
data_files:
- split: train
path: kan_Knda_subset_2/train-*
- config_name: kan_Knda_subset_3
data_files:
- split: train
path: kan_Knda_subset_3/train-*
- config_name: kat_Geor_subset_1
data_files:
- split: train
path: kat_Geor_subset_1/train-*
- config_name: kat_Geor_subset_2
data_files:
- split: train
path: kat_Geor_subset_2/train-*
- config_name: kat_Geor_subset_3
data_files:
- split: train
path: kat_Geor_subset_3/train-*
- config_name: kaz_Cyrl_subset_1
data_files:
- split: train
path: kaz_Cyrl_subset_1/train-*
- config_name: kaz_Cyrl_subset_2
data_files:
- split: train
path: kaz_Cyrl_subset_2/train-*
- config_name: kaz_Cyrl_subset_3
data_files:
- split: train
path: kaz_Cyrl_subset_3/train-*
- config_name: khk_Cyrl_subset_1
data_files:
- split: train
path: khk_Cyrl_subset_1/train-*
- config_name: khk_Cyrl_subset_2
data_files:
- split: train
path: khk_Cyrl_subset_2/train-*
- config_name: khk_Cyrl_subset_3
data_files:
- split: train
path: khk_Cyrl_subset_3/train-*
- config_name: khm_Khmr_subset_1
data_files:
- split: train
path: khm_Khmr_subset_1/train-*
- config_name: khm_Khmr_subset_2
data_files:
- split: train
path: khm_Khmr_subset_2/train-*
- config_name: khm_Khmr_subset_3
data_files:
- split: train
path: khm_Khmr_subset_3/train-*
- config_name: kir_Cyrl_subset_1
data_files:
- split: train
path: kir_Cyrl_subset_1/train-*
- config_name: kir_Cyrl_subset_2
data_files:
- split: train
path: kir_Cyrl_subset_2/train-*
- config_name: kir_Cyrl_subset_3
data_files:
- split: train
path: kir_Cyrl_subset_3/train-*
- config_name: kmr_Latn_subset_1
data_files:
- split: train
path: kmr_Latn_subset_1/train-*
- config_name: kmr_Latn_subset_2
data_files:
- split: train
path: kmr_Latn_subset_2/train-*
- config_name: kor_Hang_subset_1
data_files:
- split: train
path: kor_Hang_subset_1/train-*
- config_name: kor_Hang_subset_2
data_files:
- split: train
path: kor_Hang_subset_2/train-*
- config_name: kor_Hang_subset_3
data_files:
- split: train
path: kor_Hang_subset_3/train-*
- config_name: lao_Laoo_subset_1
data_files:
- split: train
path: lao_Laoo_subset_1/train-*
- config_name: lao_Laoo_subset_2
data_files:
- split: train
path: lao_Laoo_subset_2/train-*
- config_name: lao_Laoo_subset_3
data_files:
- split: train
path: lao_Laoo_subset_3/train-*
- config_name: lit_Latn_subset_1
data_files:
- split: train
path: lit_Latn_subset_1/train-*
- config_name: lit_Latn_subset_2
data_files:
- split: train
path: lit_Latn_subset_2/train-*
- config_name: lit_Latn_subset_3
data_files:
- split: train
path: lit_Latn_subset_3/train-*
- config_name: lvs_Latn_subset_1
data_files:
- split: train
path: lvs_Latn_subset_1/train-*
- config_name: lvs_Latn_subset_2
data_files:
- split: train
path: lvs_Latn_subset_2/train-*
- config_name: lvs_Latn_subset_3
data_files:
- split: train
path: lvs_Latn_subset_3/train-*
- config_name: mal_Mlym_subset_1
data_files:
- split: train
path: mal_Mlym_subset_1/train-*
- config_name: mal_Mlym_subset_2
data_files:
- split: train
path: mal_Mlym_subset_2/train-*
- config_name: mal_Mlym_subset_3
data_files:
- split: train
path: mal_Mlym_subset_3/train-*
- config_name: mar_Deva_subset_1
data_files:
- split: train
path: mar_Deva_subset_1/train-*
- config_name: mar_Deva_subset_2
data_files:
- split: train
path: mar_Deva_subset_2/train-*
- config_name: mar_Deva_subset_3
data_files:
- split: train
path: mar_Deva_subset_3/train-*
- config_name: mkd_Cyrl_subset_1
data_files:
- split: train
path: mkd_Cyrl_subset_1/train-*
- config_name: mkd_Cyrl_subset_2
data_files:
- split: train
path: mkd_Cyrl_subset_2/train-*
- config_name: mkd_Cyrl_subset_3
data_files:
- split: train
path: mkd_Cyrl_subset_3/train-*
- config_name: mlt_Latn_subset_1
data_files:
- split: train
path: mlt_Latn_subset_1/train-*
- config_name: mlt_Latn_subset_2
data_files:
- split: train
path: mlt_Latn_subset_2/train-*
- config_name: mlt_Latn_subset_3
data_files:
- split: train
path: mlt_Latn_subset_3/train-*
- config_name: mya_Mymr_subset_1
data_files:
- split: train
path: mya_Mymr_subset_1/train-*
- config_name: mya_Mymr_subset_2
data_files:
- split: train
path: mya_Mymr_subset_2/train-*
- config_name: mya_Mymr_subset_3
data_files:
- split: train
path: mya_Mymr_subset_3/train-*
- config_name: nld_Latn_subset_1
data_files:
- split: train
path: nld_Latn_subset_1/train-*
- config_name: nld_Latn_subset_2
data_files:
- split: train
path: nld_Latn_subset_2/train-*
- config_name: nld_Latn_subset_3
data_files:
- split: train
path: nld_Latn_subset_3/train-*
- config_name: nno_Latn_subset_1
data_files:
- split: train
path: nno_Latn_subset_1/train-*
- config_name: nno_Latn_subset_2
data_files:
- split: train
path: nno_Latn_subset_2/train-*
- config_name: nno_Latn_subset_3
data_files:
- split: train
path: nno_Latn_subset_3/train-*
- config_name: nob_Latn_subset_1
data_files:
- split: train
path: nob_Latn_subset_1/train-*
- config_name: nob_Latn_subset_2
data_files:
- split: train
path: nob_Latn_subset_2/train-*
- config_name: nob_Latn_subset_3
data_files:
- split: train
path: nob_Latn_subset_3/train-*
- config_name: npi_Deva_subset_1
data_files:
- split: train
path: npi_Deva_subset_1/train-*
- config_name: ory_Orya_subset_1
data_files:
- split: train
path: ory_Orya_subset_1/train-*
- config_name: ory_Orya_subset_2
data_files:
- split: train
path: ory_Orya_subset_2/train-*
- config_name: ory_Orya_subset_3
data_files:
- split: train
path: ory_Orya_subset_3/train-*
- config_name: pan_Guru_subset_1
data_files:
- split: train
path: pan_Guru_subset_1/train-*
- config_name: pan_Guru_subset_2
data_files:
- split: train
path: pan_Guru_subset_2/train-*
- config_name: pan_Guru_subset_3
data_files:
- split: train
path: pan_Guru_subset_3/train-*
- config_name: pap_Latn_subset_1
data_files:
- split: train
path: pap_Latn_subset_1/train-*
- config_name: pbt_Arab_subset_1
data_files:
- split: train
path: pbt_Arab_subset_1/train-*
- config_name: pbt_Arab_subset_2
data_files:
- split: train
path: pbt_Arab_subset_2/train-*
- config_name: pbt_Arab_subset_3
data_files:
- split: train
path: pbt_Arab_subset_3/train-*
- config_name: plt_Latn_subset_1
data_files:
- split: train
path: plt_Latn_subset_1/train-*
- config_name: pol_Latn_subset_1
data_files:
- split: train
path: pol_Latn_subset_1/train-*
- config_name: pol_Latn_subset_2
data_files:
- split: train
path: pol_Latn_subset_2/train-*
- config_name: pol_Latn_subset_3
data_files:
- split: train
path: pol_Latn_subset_3/train-*
- config_name: por_Latn_subset_1
data_files:
- split: train
path: por_Latn_subset_1/train-*
- config_name: por_Latn_subset_2
data_files:
- split: train
path: por_Latn_subset_2/train-*
- config_name: por_Latn_subset_3
data_files:
- split: train
path: por_Latn_subset_3/train-*
- config_name: ron_Latn_subset_1
data_files:
- split: train
path: ron_Latn_subset_1/train-*
- config_name: ron_Latn_subset_2
data_files:
- split: train
path: ron_Latn_subset_2/train-*
- config_name: ron_Latn_subset_3
data_files:
- split: train
path: ron_Latn_subset_3/train-*
- config_name: rus_Cyrl_subset_1
data_files:
- split: train
path: rus_Cyrl_subset_1/train-*
- config_name: rus_Cyrl_subset_2
data_files:
- split: train
path: rus_Cyrl_subset_2/train-*
- config_name: rus_Cyrl_subset_3
data_files:
- split: train
path: rus_Cyrl_subset_3/train-*
- config_name: sin_Sinh_subset_1
data_files:
- split: train
path: sin_Sinh_subset_1/train-*
- config_name: sin_Sinh_subset_2
data_files:
- split: train
path: sin_Sinh_subset_2/train-*
- config_name: sin_Sinh_subset_3
data_files:
- split: train
path: sin_Sinh_subset_3/train-*
- config_name: slk_Latn_subset_1
data_files:
- split: train
path: slk_Latn_subset_1/train-*
- config_name: slk_Latn_subset_2
data_files:
- split: train
path: slk_Latn_subset_2/train-*
- config_name: slk_Latn_subset_3
data_files:
- split: train
path: slk_Latn_subset_3/train-*
- config_name: slv_Latn_subset_1
data_files:
- split: train
path: slv_Latn_subset_1/train-*
- config_name: slv_Latn_subset_2
data_files:
- split: train
path: slv_Latn_subset_2/train-*
- config_name: slv_Latn_subset_3
data_files:
- split: train
path: slv_Latn_subset_3/train-*
- config_name: snd_Arab_subset_1
data_files:
- split: train
path: snd_Arab_subset_1/train-*
- config_name: snd_Arab_subset_2
data_files:
- split: train
path: snd_Arab_subset_2/train-*
- config_name: som_Latn_subset_1
data_files:
- split: train
path: som_Latn_subset_1/train-*
- config_name: som_Latn_subset_2
data_files:
- split: train
path: som_Latn_subset_2/train-*
- config_name: som_Latn_subset_3
data_files:
- split: train
path: som_Latn_subset_3/train-*
- config_name: srp_Cyrl_subset_1
data_files:
- split: train
path: srp_Cyrl_subset_1/train-*
- config_name: srp_Cyrl_subset_2
data_files:
- split: train
path: srp_Cyrl_subset_2/train-*
- config_name: srp_Cyrl_subset_3
data_files:
- split: train
path: srp_Cyrl_subset_3/train-*
- config_name: swe_Latn_subset_1
data_files:
- split: train
path: swe_Latn_subset_1/train-*
- config_name: swe_Latn_subset_2
data_files:
- split: train
path: swe_Latn_subset_2/train-*
- config_name: swe_Latn_subset_3
data_files:
- split: train
path: swe_Latn_subset_3/train-*
- config_name: swh_Latn_subset_1
data_files:
- split: train
path: swh_Latn_subset_1/train-*
- config_name: swh_Latn_subset_2
data_files:
- split: train
path: swh_Latn_subset_2/train-*
- config_name: swh_Latn_subset_3
data_files:
- split: train
path: swh_Latn_subset_3/train-*
- config_name: tam_Taml_subset_1
data_files:
- split: train
path: tam_Taml_subset_1/train-*
- config_name: tam_Taml_subset_2
data_files:
- split: train
path: tam_Taml_subset_2/train-*
- config_name: tam_Taml_subset_3
data_files:
- split: train
path: tam_Taml_subset_3/train-*
- config_name: tat_Cyrl_subset_1
data_files:
- split: train
path: tat_Cyrl_subset_1/train-*
- config_name: tat_Cyrl_subset_2
data_files:
- split: train
path: tat_Cyrl_subset_2/train-*
- config_name: tat_Cyrl_subset_3
data_files:
- split: train
path: tat_Cyrl_subset_3/train-*
- config_name: tel_Telu_subset_1
data_files:
- split: train
path: tel_Telu_subset_1/train-*
- config_name: tel_Telu_subset_2
data_files:
- split: train
path: tel_Telu_subset_2/train-*
- config_name: tel_Telu_subset_3
data_files:
- split: train
path: tel_Telu_subset_3/train-*
- config_name: tgk_Cyrl_subset_1
data_files:
- split: train
path: tgk_Cyrl_subset_1/train-*
- config_name: tgk_Cyrl_subset_2
data_files:
- split: train
path: tgk_Cyrl_subset_2/train-*
- config_name: tgk_Cyrl_subset_3
data_files:
- split: train
path: tgk_Cyrl_subset_3/train-*
- config_name: tha_Thai_subset_1
data_files:
- split: train
path: tha_Thai_subset_1/train-*
- config_name: tha_Thai_subset_2
data_files:
- split: train
path: tha_Thai_subset_2/train-*
- config_name: tha_Thai_subset_3
data_files:
- split: train
path: tha_Thai_subset_3/train-*
- config_name: uig_Arab_subset_1
data_files:
- split: train
path: uig_Arab_subset_1/train-*
- config_name: uig_Arab_subset_2
data_files:
- split: train
path: uig_Arab_subset_2/train-*
- config_name: ukr_Cyrl_subset_1
data_files:
- split: train
path: ukr_Cyrl_subset_1/train-*
- config_name: ukr_Cyrl_subset_2
data_files:
- split: train
path: ukr_Cyrl_subset_2/train-*
- config_name: ukr_Cyrl_subset_3
data_files:
- split: train
path: ukr_Cyrl_subset_3/train-*
- config_name: urd_Arab_subset_1
data_files:
- split: train
path: urd_Arab_subset_1/train-*
- config_name: urd_Arab_subset_2
data_files:
- split: train
path: urd_Arab_subset_2/train-*
- config_name: urd_Arab_subset_3
data_files:
- split: train
path: urd_Arab_subset_3/train-*
- config_name: uzn_Latn_subset_1
data_files:
- split: train
path: uzn_Latn_subset_1/train-*
- config_name: uzn_Latn_subset_2
data_files:
- split: train
path: uzn_Latn_subset_2/train-*
- config_name: uzn_Latn_subset_3
data_files:
- split: train
path: uzn_Latn_subset_3/train-*
- config_name: vie_Latn_subset_1
data_files:
- split: train
path: vie_Latn_subset_1/train-*
- config_name: vie_Latn_subset_2
data_files:
- split: train
path: vie_Latn_subset_2/train-*
- config_name: vie_Latn_subset_3
data_files:
- split: train
path: vie_Latn_subset_3/train-*
- config_name: zsm_Latn_subset_1
data_files:
- split: train
path: zsm_Latn_subset_1/train-*
- config_name: zsm_Latn_subset_2
data_files:
- split: train
path: zsm_Latn_subset_2/train-*
- config_name: zsm_Latn_subset_3
data_files:
- split: train
path: zsm_Latn_subset_3/train-*
---
提供机构:
catherinearnett



