sproos/wikipedia-title-text-pairs
收藏Hugging Face2024-01-03 更新2024-03-04 收录
下载链接:
https://hf-mirror.com/datasets/sproos/wikipedia-title-text-pairs
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: ab
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1644341
num_examples: 1576
download_size: 709370
dataset_size: 1644341
- config_name: ace
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2173660
num_examples: 8489
download_size: 830268
dataset_size: 2173660
- config_name: ady
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 429339
num_examples: 522
download_size: 205308
dataset_size: 429339
- config_name: af
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 166299481
num_examples: 204317
download_size: 94370515
dataset_size: 166299481
- config_name: alt
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3974030
num_examples: 4240
download_size: 1623904
dataset_size: 3974030
- config_name: am
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 13172809
num_examples: 9782
download_size: 6138866
dataset_size: 13172809
- config_name: ami
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3478701
num_examples: 3629
download_size: 1788840
dataset_size: 3478701
- config_name: an
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 41578614
num_examples: 79302
download_size: 23412452
dataset_size: 41578614
- config_name: ang
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1810454
num_examples: 3531
download_size: 1173048
dataset_size: 1810454
- config_name: anp
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 4637069
num_examples: 2238
download_size: 1783765
dataset_size: 4637069
- config_name: ar
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2074092955
num_examples: 1978419
download_size: 940821033
dataset_size: 2074092955
- config_name: arc
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 337763
num_examples: 752
download_size: 166806
dataset_size: 337763
- config_name: ary
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 8336246
num_examples: 12782
download_size: 3662614
dataset_size: 8336246
- config_name: arz
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 519197847
num_examples: 816075
download_size: 69488970
dataset_size: 519197847
- config_name: as
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 66313275
num_examples: 34743
download_size: 25706302
dataset_size: 66313275
- config_name: ast
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 337276492
num_examples: 339625
download_size: 201934083
dataset_size: 337276492
- config_name: atj
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 520958
num_examples: 1066
download_size: 304143
dataset_size: 520958
- config_name: av
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3528226
num_examples: 3334
download_size: 1573201
dataset_size: 3528226
- config_name: avk
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 11730415
num_examples: 43588
download_size: 4205830
dataset_size: 11730415
- config_name: awa
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1465350
num_examples: 1286
download_size: 559381
dataset_size: 1465350
- config_name: ay
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1168894
num_examples: 4108
download_size: 558421
dataset_size: 1168894
- config_name: az
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 326338236
num_examples: 328201
download_size: 179029928
dataset_size: 326338236
- config_name: azb
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 73335621
num_examples: 171028
download_size: 21435303
dataset_size: 73335621
- config_name: ba
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 207304656
num_examples: 164050
download_size: 87599535
dataset_size: 207304656
- config_name: ban
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 8660809
num_examples: 21085
download_size: 4054193
dataset_size: 8660809
- config_name: bar
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 20055538
num_examples: 39958
download_size: 12418316
dataset_size: 20055538
- config_name: bcl
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 14060191
num_examples: 19021
download_size: 8251132
dataset_size: 14060191
- config_name: be
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 475308039
num_examples: 393613
download_size: 226186389
dataset_size: 475308039
- config_name: bg
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 823033374
num_examples: 587687
download_size: 392570254
dataset_size: 823033374
- config_name: bh
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 11853184
num_examples: 9647
download_size: 4413678
dataset_size: 11853184
- config_name: bi
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 167289
num_examples: 753
download_size: 96076
dataset_size: 167289
- config_name: bjn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3656195
num_examples: 5888
download_size: 2090215
dataset_size: 3656195
- config_name: blk
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 7870709
num_examples: 2605
download_size: 2437996
dataset_size: 7870709
- config_name: bm
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 310793
num_examples: 403
download_size: 186230
dataset_size: 310793
- config_name: bn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 645866435
num_examples: 341688
download_size: 238009185
dataset_size: 645866435
- config_name: bo
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 28460350
num_examples: 1964
download_size: 8232639
dataset_size: 28460350
- config_name: bpy
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 14141921
num_examples: 23173
download_size: 2240817
dataset_size: 14141921
- config_name: br
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 49288575
num_examples: 107785
download_size: 30516246
dataset_size: 49288575
- config_name: bs
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 125483841
num_examples: 164949
download_size: 75988761
dataset_size: 125483841
- config_name: bug
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 149949
num_examples: 225
download_size: 76250
dataset_size: 149949
- config_name: bxr
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 4870117
num_examples: 3674
download_size: 2272586
dataset_size: 4870117
- config_name: ca
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1477498594
num_examples: 1744590
download_size: 864492644
dataset_size: 1477498594
- config_name: cdo
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1707484
num_examples: 4121
download_size: 802979
dataset_size: 1707484
- config_name: ce
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 351174229
num_examples: 547742
download_size: 20301039
dataset_size: 351174229
- config_name: ceb
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3125862799
num_examples: 5742548
download_size: 506330001
dataset_size: 3125862799
- config_name: ch
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 95817
num_examples: 431
download_size: 48870
dataset_size: 95817
- config_name: chr
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 225116
num_examples: 222
download_size: 112167
dataset_size: 225116
- config_name: chy
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 5371
num_examples: 19
download_size: 5758
dataset_size: 5371
- config_name: ckb
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 72168105
num_examples: 69734
download_size: 31637686
dataset_size: 72168105
- config_name: co
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 7949031
num_examples: 16222
download_size: 4202194
dataset_size: 7949031
- config_name: cr
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 10480
num_examples: 16
download_size: 10921
dataset_size: 10480
- config_name: crh
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2216671
num_examples: 6776
download_size: 1197154
dataset_size: 2216671
- config_name: cs
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1074548923
num_examples: 1280691
download_size: 708665136
dataset_size: 1074548923
- config_name: csb
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2213549
num_examples: 5079
download_size: 1281331
dataset_size: 2213549
- config_name: cu
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 599755
num_examples: 1179
download_size: 261505
dataset_size: 599755
- config_name: cv
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 43459611
num_examples: 69645
download_size: 15638924
dataset_size: 43459611
- config_name: cy
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 202006160
num_examples: 424538
download_size: 80173108
dataset_size: 202006160
- config_name: da
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 396508940
num_examples: 512093
download_size: 245775043
dataset_size: 396508940
- config_name: dag
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 8458584
num_examples: 16082
download_size: 3376234
dataset_size: 8458584
- config_name: de
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 6838430796
num_examples: 7579295
download_size: 4217275524
dataset_size: 6838430796
- config_name: din
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 426665
num_examples: 375
download_size: 264031
dataset_size: 426665
- config_name: diq
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 6946487
num_examples: 23344
download_size: 3350828
dataset_size: 6946487
- config_name: dsb
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1641782
num_examples: 3975
download_size: 1025014
dataset_size: 1641782
- config_name: dty
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 4176627
num_examples: 3235
download_size: 1571607
dataset_size: 4176627
- config_name: dv
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 7902913
num_examples: 5391
download_size: 3040165
dataset_size: 7902913
- config_name: dz
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 5391618
num_examples: 787
download_size: 1566348
dataset_size: 5391618
- config_name: ee
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 587818
num_examples: 932
download_size: 322541
dataset_size: 587818
- config_name: el
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1031633409
num_examples: 618034
download_size: 496451447
dataset_size: 1031633409
- config_name: eml
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1277442
num_examples: 3518
download_size: 740639
dataset_size: 1277442
- config_name: en
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 14600063665
num_examples: 17322940
download_size: 8662069030
dataset_size: 14600063665
- config_name: eo
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 347506297
num_examples: 582605
download_size: 203719257
dataset_size: 347506297
- config_name: es
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 4617081021
num_examples: 5113016
download_size: 2738218845
dataset_size: 4617081021
- config_name: et
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 284897387
num_examples: 383833
download_size: 179215107
dataset_size: 284897387
- config_name: eu
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 383519807
num_examples: 649924
download_size: 199647059
dataset_size: 383519807
- config_name: ext
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3165541
num_examples: 5542
download_size: 2042220
dataset_size: 3165541
- config_name: fa
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1096086753
num_examples: 1283144
download_size: 493145784
dataset_size: 1096086753
- config_name: fat
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1680763
num_examples: 2816
download_size: 919605
dataset_size: 1680763
- config_name: ff
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1291343
num_examples: 2249
download_size: 772063
dataset_size: 1291343
- config_name: fi
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 849468386
num_examples: 996170
download_size: 519805131
dataset_size: 849468386
- config_name: fj
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 428943
num_examples: 685
download_size: 235797
dataset_size: 428943
- config_name: fo
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 10043279
num_examples: 15695
download_size: 6034945
dataset_size: 10043279
- config_name: fon
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 385359
num_examples: 1115
download_size: 221743
dataset_size: 385359
- config_name: fr
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 5575377829
num_examples: 7112007
download_size: 3278305869
dataset_size: 5575377829
- config_name: frp
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1904784
num_examples: 5630
download_size: 1083541
dataset_size: 1904784
- config_name: frr
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 6328685
num_examples: 20163
download_size: 3279258
dataset_size: 6328685
- config_name: fur
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2783416
num_examples: 4984
download_size: 1713552
dataset_size: 2783416
- config_name: fy
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 96632006
num_examples: 105637
download_size: 58009209
dataset_size: 96632006
- config_name: ga
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 40862887
num_examples: 66531
download_size: 24570434
dataset_size: 40862887
- config_name: gag
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 875669
num_examples: 1033
download_size: 529465
dataset_size: 875669
- config_name: gan
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 23979
num_examples: 52
download_size: 18975
dataset_size: 23979
- config_name: gcr
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1902645
num_examples: 3012
download_size: 1178395
dataset_size: 1902645
- config_name: gd
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 9729455
num_examples: 19211
download_size: 5132470
dataset_size: 9729455
- config_name: gl
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 376632040
num_examples: 464044
download_size: 229599676
dataset_size: 376632040
- config_name: glk
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3457535
num_examples: 5649
download_size: 1507823
dataset_size: 3457535
- config_name: gn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 4739434
num_examples: 6347
download_size: 2644351
dataset_size: 4739434
- config_name: gom
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 20042141
num_examples: 8011
download_size: 8233978
dataset_size: 20042141
- config_name: gor
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2442089
num_examples: 7911
download_size: 1098415
dataset_size: 2442089
- config_name: got
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 620942
num_examples: 531
download_size: 254411
dataset_size: 620942
- config_name: gpe
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1467531
num_examples: 2767
download_size: 848079
dataset_size: 1467531
- config_name: gu
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 87847917
num_examples: 49271
download_size: 27931216
dataset_size: 87847917
- config_name: guc
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 811934
num_examples: 1056
download_size: 485619
dataset_size: 811934
- config_name: gur
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1865901
num_examples: 4578
download_size: 844014
dataset_size: 1865901
- config_name: guw
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1506144
num_examples: 2372
download_size: 820171
dataset_size: 1506144
- config_name: gv
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3773746
num_examples: 6852
download_size: 2172556
dataset_size: 3773746
- config_name: ha
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 62346946
num_examples: 85903
download_size: 34795509
dataset_size: 62346946
- config_name: hak
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1882896
num_examples: 4921
download_size: 928842
dataset_size: 1882896
- config_name: haw
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1154015
num_examples: 2708
download_size: 503601
dataset_size: 1154015
- config_name: he
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1598405786
num_examples: 1133198
download_size: 817911019
dataset_size: 1598405786
- config_name: hi
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 417168754
num_examples: 236273
download_size: 154826318
dataset_size: 417168754
- config_name: hif
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3199201
num_examples: 9039
download_size: 1694952
dataset_size: 3199201
- config_name: hr
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 308518527
num_examples: 367795
download_size: 200551112
dataset_size: 308518527
- config_name: hsb
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 8182922
num_examples: 20300
download_size: 4332181
dataset_size: 8182922
- config_name: ht
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 9051570
num_examples: 33834
download_size: 4842294
dataset_size: 9051570
- config_name: hu
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 985324031
num_examples: 1041410
download_size: 618699673
dataset_size: 985324031
- config_name: hy
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 706380440
num_examples: 510514
download_size: 318467176
dataset_size: 706380440
- config_name: hyw
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 40402046
num_examples: 27591
download_size: 19127848
dataset_size: 40402046
- config_name: ia
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 10829579
num_examples: 26267
download_size: 5773537
dataset_size: 10829579
- config_name: id
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 767096741
num_examples: 1073986
download_size: 403956151
dataset_size: 767096741
- config_name: ie
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 4595259
num_examples: 13921
download_size: 2151099
dataset_size: 4595259
- config_name: ig
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 50591894
num_examples: 57369
download_size: 26579676
dataset_size: 50591894
- config_name: ik
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 48243
num_examples: 124
download_size: 36789
dataset_size: 48243
- config_name: ilo
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 10128691
num_examples: 18937
download_size: 4914236
dataset_size: 10128691
- config_name: inh
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1626368
num_examples: 2031
download_size: 800094
dataset_size: 1626368
- config_name: io
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 30202126
num_examples: 55372
download_size: 12908101
dataset_size: 30202126
- config_name: is
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 67181567
num_examples: 83150
download_size: 41496542
dataset_size: 67181567
- config_name: it
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3467024449
num_examples: 3881485
download_size: 2141854901
dataset_size: 3467024449
- config_name: iu
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 97431
num_examples: 123
download_size: 52124
dataset_size: 97431
- config_name: ja
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 12649923
num_examples: 15648
download_size: 8090054
dataset_size: 12649923
- config_name: jam
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 790137
num_examples: 1345
download_size: 507703
dataset_size: 790137
- config_name: jbo
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2234632
num_examples: 1901
download_size: 752804
dataset_size: 2234632
- config_name: jv
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 43914783
num_examples: 66704
download_size: 24394070
dataset_size: 43914783
- config_name: ka
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 531577814
num_examples: 261455
download_size: 189689216
dataset_size: 531577814
- config_name: kaa
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3616940
num_examples: 3706
download_size: 2085423
dataset_size: 3616940
- config_name: kab
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2734070
num_examples: 5949
download_size: 1709648
dataset_size: 2734070
- config_name: kbd
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1945099
num_examples: 2478
download_size: 893504
dataset_size: 1945099
- config_name: kbp
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2788850
num_examples: 1792
download_size: 1450152
dataset_size: 2788850
- config_name: kcg
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 634937
num_examples: 1090
download_size: 349354
dataset_size: 634937
- config_name: kg
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 123309
num_examples: 360
download_size: 77334
dataset_size: 123309
- config_name: ki
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 190936
num_examples: 542
download_size: 119539
dataset_size: 190936
- config_name: kk
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 335654188
num_examples: 343677
download_size: 130273599
dataset_size: 335654188
- config_name: kl
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 216357
num_examples: 299
download_size: 131302
dataset_size: 216357
- config_name: km
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 32479695
num_examples: 9303
download_size: 11359760
dataset_size: 32479695
- config_name: kn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 259663346
num_examples: 95614
download_size: 96296440
dataset_size: 259663346
- config_name: ko
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 781900316
num_examples: 886867
download_size: 460692322
dataset_size: 781900316
- config_name: koi
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1629343
num_examples: 2785
download_size: 707126
dataset_size: 1629343
- config_name: krc
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3209048
num_examples: 2405
download_size: 1472435
dataset_size: 3209048
- config_name: ks
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1492138
num_examples: 2582
download_size: 652747
dataset_size: 1492138
- config_name: ksh
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2384434
num_examples: 4374
download_size: 1586634
dataset_size: 2384434
- config_name: ku
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 25265131
num_examples: 43303
download_size: 14891891
dataset_size: 25265131
- config_name: kv
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 4614568
num_examples: 5718
download_size: 1955088
dataset_size: 4614568
- config_name: kw
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2943801
num_examples: 6333
download_size: 1803891
dataset_size: 2943801
- config_name: ky
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 123060265
num_examples: 118822
download_size: 49413332
dataset_size: 123060265
- config_name: la
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 77981311
num_examples: 165009
download_size: 45551768
dataset_size: 77981311
- config_name: lad
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3642672
num_examples: 6132
download_size: 2166767
dataset_size: 3642672
- config_name: lb
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 50437132
num_examples: 83578
download_size: 30482715
dataset_size: 50437132
- config_name: lbe
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 320431
num_examples: 477
download_size: 143006
dataset_size: 320431
- config_name: lez
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 6646648
num_examples: 7344
download_size: 2718761
dataset_size: 6646648
- config_name: lfn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 6557011
num_examples: 6997
download_size: 3864822
dataset_size: 6557011
- config_name: lg
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 4409804
num_examples: 6170
download_size: 2439074
dataset_size: 4409804
- config_name: li
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 19709458
num_examples: 26613
download_size: 11987601
dataset_size: 19709458
- config_name: lij
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 7594804
num_examples: 9445
download_size: 4792854
dataset_size: 7594804
- config_name: lld
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 23873089
num_examples: 142049
download_size: 7030839
dataset_size: 23873089
- config_name: lmo
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 27999544
num_examples: 70178
download_size: 13330712
dataset_size: 27999544
- config_name: ln
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 891430
num_examples: 2193
download_size: 521890
dataset_size: 891430
- config_name: lo
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 5661219
num_examples: 2835
download_size: 2242396
dataset_size: 5661219
- config_name: lt
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 244956278
num_examples: 357966
download_size: 148852123
dataset_size: 244956278
- config_name: ltg
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 536844
num_examples: 1089
download_size: 339989
dataset_size: 536844
- config_name: lv
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 169387546
num_examples: 212549
download_size: 101333752
dataset_size: 169387546
- config_name: mad
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1105661
num_examples: 1710
download_size: 633828
dataset_size: 1105661
- config_name: mai
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 11120476
num_examples: 10476
download_size: 3961783
dataset_size: 11120476
- config_name: mdf
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1418161
num_examples: 2044
download_size: 678094
dataset_size: 1418161
- config_name: mg
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 42905755
num_examples: 105277
download_size: 12829807
dataset_size: 42905755
- config_name: mhr
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 11085411
num_examples: 13488
download_size: 3976421
dataset_size: 11085411
- config_name: mi
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2381297
num_examples: 8446
download_size: 598043
dataset_size: 2381297
- config_name: min
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 86297406
num_examples: 281845
download_size: 16954958
dataset_size: 86297406
- config_name: mk
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 496793528
num_examples: 356631
download_size: 209848511
dataset_size: 496793528
- config_name: ml
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 366041285
num_examples: 176956
download_size: 136534719
dataset_size: 366041285
- config_name: mn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 59687084
num_examples: 42154
download_size: 28104275
dataset_size: 59687084
- config_name: mni
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 4688288
num_examples: 6674
download_size: 1207849
dataset_size: 4688288
- config_name: mnw
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 15145600
num_examples: 6023
download_size: 5212159
dataset_size: 15145600
- config_name: mr
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 158877153
num_examples: 113296
download_size: 52740126
dataset_size: 158877153
- config_name: mrj
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3944304
num_examples: 7644
download_size: 1594941
dataset_size: 3944304
- config_name: ms
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 257532994
num_examples: 372645
download_size: 137054834
dataset_size: 257532994
- config_name: mt
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 23920680
num_examples: 20141
download_size: 14214841
dataset_size: 23920680
- config_name: mwl
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 13712607
num_examples: 11445
download_size: 8270955
dataset_size: 13712607
- config_name: my
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 134406207
num_examples: 78009
download_size: 38068743
dataset_size: 134406207
- config_name: myv
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 5542938
num_examples: 6850
download_size: 2561738
dataset_size: 5542938
- config_name: mzn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 9993632
num_examples: 19508
download_size: 3680929
dataset_size: 9993632
- config_name: nah
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 858487
num_examples: 1852
download_size: 489323
dataset_size: 858487
- config_name: nap
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2777568
num_examples: 5660
download_size: 1775858
dataset_size: 2777568
- config_name: nds
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 71564450
num_examples: 131803
download_size: 37374996
dataset_size: 71564450
- config_name: ne
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 72067748
num_examples: 44610
download_size: 26192931
dataset_size: 72067748
- config_name: new
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 80829185
num_examples: 96846
download_size: 5843115
dataset_size: 80829185
- config_name: nia
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1578312
num_examples: 2086
download_size: 859633
dataset_size: 1578312
- config_name: nl
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1882105897
num_examples: 3188360
download_size: 1049995445
dataset_size: 1882105897
- config_name: nn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 166728290
num_examples: 254167
download_size: 100464171
dataset_size: 166728290
- config_name: 'no'
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 676713742
num_examples: 991401
download_size: 413310332
dataset_size: 676713742
- config_name: nov
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 435282
num_examples: 987
download_size: 265049
dataset_size: 435282
- config_name: nqo
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 6097594
num_examples: 3524
download_size: 2590230
dataset_size: 6097594
- config_name: nso
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1802069
num_examples: 7454
download_size: 610051
dataset_size: 1802069
- config_name: nv
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 13323915
num_examples: 36791
download_size: 2572537
dataset_size: 13323915
- config_name: ny
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1271601
num_examples: 1168
download_size: 719811
dataset_size: 1271601
- config_name: oc
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 84558682
num_examples: 138057
download_size: 48860368
dataset_size: 84558682
- config_name: olo
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2155866
num_examples: 4698
download_size: 1279978
dataset_size: 2155866
- config_name: om
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1996601
num_examples: 2084
download_size: 1127579
dataset_size: 1996601
- config_name: or
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 52828943
num_examples: 32074
download_size: 19402056
dataset_size: 52828943
- config_name: os
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 7163774
num_examples: 7808
download_size: 3448814
dataset_size: 7163774
- config_name: pa
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 158365259
num_examples: 98517
download_size: 61737048
dataset_size: 158365259
- config_name: pag
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 582639
num_examples: 2218
download_size: 220978
dataset_size: 582639
- config_name: pam
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3715136
num_examples: 8014
download_size: 2007875
dataset_size: 3715136
- config_name: pap
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2995239
num_examples: 4911
download_size: 1795611
dataset_size: 2995239
- config_name: pcd
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2042528
num_examples: 6201
download_size: 1264830
dataset_size: 2042528
- config_name: pcm
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1375325
num_examples: 2496
download_size: 846190
dataset_size: 1375325
- config_name: pdc
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 628853
num_examples: 1736
download_size: 380509
dataset_size: 628853
- config_name: pfl
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2436990
num_examples: 5003
download_size: 1446263
dataset_size: 2436990
- config_name: pi
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 82809
num_examples: 132
download_size: 40325
dataset_size: 82809
- config_name: pih
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 127708
num_examples: 502
download_size: 85181
dataset_size: 127708
- config_name: pl
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2010160906
num_examples: 2631708
download_size: 1260442689
dataset_size: 2010160906
- config_name: pms
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 20653982
num_examples: 67727
download_size: 7968650
dataset_size: 20653982
- config_name: pnb
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 189688323
num_examples: 126071
download_size: 86057950
dataset_size: 189688323
- config_name: pnt
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 349665
num_examples: 416
download_size: 180075
dataset_size: 349665
- config_name: ps
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 83802012
num_examples: 44211
download_size: 39978475
dataset_size: 83802012
- config_name: pt
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2034932935
num_examples: 2344840
download_size: 1219829216
dataset_size: 2034932935
- config_name: pwn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 647735
num_examples: 992
download_size: 357871
dataset_size: 647735
- config_name: qu
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 5828517
num_examples: 19537
download_size: 2902726
dataset_size: 5828517
- config_name: rm
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 13579320
num_examples: 11229
download_size: 8015673
dataset_size: 13579320
- config_name: rmy
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 248619
num_examples: 503
download_size: 160629
dataset_size: 248619
- config_name: rn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 316166
num_examples: 350
download_size: 188144
dataset_size: 316166
- config_name: ro
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 545948116
num_examples: 700042
download_size: 318430221
dataset_size: 545948116
- config_name: ru
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 7759635097
num_examples: 5003570
download_size: 3753889504
dataset_size: 7759635097
- config_name: rue
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 9287344
num_examples: 11090
download_size: 4795201
dataset_size: 9287344
- config_name: rw
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 8977697
num_examples: 14467
download_size: 5084740
dataset_size: 8977697
- config_name: sa
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 32949173
num_examples: 16177
download_size: 11951852
dataset_size: 32949173
- config_name: sah
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 33849150
num_examples: 21615
download_size: 15515331
dataset_size: 33849150
- config_name: sat
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 32665784
num_examples: 25055
download_size: 11182638
dataset_size: 32665784
- config_name: sc
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 9738121
num_examples: 10658
download_size: 5985006
dataset_size: 9738121
- config_name: scn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 10327065
num_examples: 20298
download_size: 6577724
dataset_size: 10327065
- config_name: sco
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 25802215
num_examples: 46219
download_size: 15762256
dataset_size: 25802215
- config_name: sd
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 28066243
num_examples: 23470
download_size: 13812745
dataset_size: 28066243
- config_name: se
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1761185
num_examples: 4045
download_size: 1016098
dataset_size: 1761185
- config_name: sg
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 41098
num_examples: 157
download_size: 20525
dataset_size: 41098
- config_name: shi
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1532180
num_examples: 2558
download_size: 896262
dataset_size: 1532180
- config_name: shn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 13432089
num_examples: 8063
download_size: 3496596
dataset_size: 13432089
- config_name: si
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 92239795
num_examples: 39107
download_size: 36672229
dataset_size: 92239795
- config_name: sk
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 240678500
num_examples: 324632
download_size: 155181911
dataset_size: 240678500
- config_name: skr
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 12872603
num_examples: 9273
download_size: 5985896
dataset_size: 12872603
- config_name: sl
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 293875472
num_examples: 345828
download_size: 188307158
dataset_size: 293875472
- config_name: sm
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 580087
num_examples: 968
download_size: 313287
dataset_size: 580087
- config_name: smn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3031036
num_examples: 7695
download_size: 1626048
dataset_size: 3031036
- config_name: sn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 6725868
num_examples: 15853
download_size: 3470476
dataset_size: 6725868
- config_name: so
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 7805320
num_examples: 10672
download_size: 4637543
dataset_size: 7805320
- config_name: sq
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 146996247
num_examples: 156631
download_size: 85730721
dataset_size: 146996247
- config_name: sr
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1100964218
num_examples: 1068267
download_size: 501616775
dataset_size: 1100964218
- config_name: srn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 422510
num_examples: 1180
download_size: 108349
dataset_size: 422510
- config_name: ss
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 778167
num_examples: 1185
download_size: 451534
dataset_size: 778167
- config_name: st
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 461960
num_examples: 570
download_size: 263541
dataset_size: 461960
- config_name: stq
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3204151
num_examples: 5239
download_size: 1932332
dataset_size: 3204151
- config_name: su
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 27523996
num_examples: 59688
download_size: 12050250
dataset_size: 27523996
- config_name: sv
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1477480782
num_examples: 3037984
download_size: 687965680
dataset_size: 1477480782
- config_name: sw
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 45061769
num_examples: 87838
download_size: 24462885
dataset_size: 45061769
- config_name: szl
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 14393038
num_examples: 51878
download_size: 4723617
dataset_size: 14393038
- config_name: szy
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3529218
num_examples: 6201
download_size: 1951037
dataset_size: 3529218
- config_name: ta
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 552188178
num_examples: 309536
download_size: 187677501
dataset_size: 552188178
- config_name: tay
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1885022
num_examples: 4259
download_size: 877353
dataset_size: 1885022
- config_name: tcy
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 7559863
num_examples: 5407
download_size: 2901019
dataset_size: 7559863
- config_name: te
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 498690123
num_examples: 353456
download_size: 135670194
dataset_size: 498690123
- config_name: tet
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 873221
num_examples: 1392
download_size: 466131
dataset_size: 873221
- config_name: tg
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 83443035
num_examples: 100061
download_size: 34180411
dataset_size: 83443035
- config_name: th
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 336701485
num_examples: 164709
download_size: 132379654
dataset_size: 336701485
- config_name: ti
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 481131
num_examples: 296
download_size: 248148
dataset_size: 481131
- config_name: tk
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 9508702
num_examples: 5642
download_size: 5421160
dataset_size: 9508702
- config_name: tl
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 66514500
num_examples: 109797
download_size: 35000269
dataset_size: 66514500
- config_name: tly
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1330085
num_examples: 3854
download_size: 680242
dataset_size: 1330085
- config_name: tn
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2377043
num_examples: 2751
download_size: 1313067
dataset_size: 2377043
- config_name: to
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 480434
num_examples: 1253
download_size: 258608
dataset_size: 480434
- config_name: tpi
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 209536
num_examples: 611
download_size: 112851
dataset_size: 209536
- config_name: tr
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 648280660
num_examples: 775135
download_size: 383472360
dataset_size: 648280660
- config_name: trv
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 2507999
num_examples: 3596
download_size: 1366401
dataset_size: 2507999
- config_name: ts
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 681222
num_examples: 872
download_size: 373073
dataset_size: 681222
- config_name: tt
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 348346744
num_examples: 579628
download_size: 64120738
dataset_size: 348346744
- config_name: tum
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 8246192
num_examples: 16025
download_size: 3777855
dataset_size: 8246192
- config_name: tw
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 6409746
num_examples: 9981
download_size: 3328825
dataset_size: 6409746
- config_name: ty
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 80707
num_examples: 197
download_size: 53075
dataset_size: 80707
- config_name: tyv
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 9711306
num_examples: 7025
download_size: 4480311
dataset_size: 9711306
- config_name: udm
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3820873
num_examples: 4504
download_size: 1805995
dataset_size: 3820873
- config_name: ug
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 29487564
num_examples: 11777
download_size: 12611086
dataset_size: 29487564
- config_name: uk
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3604654400
num_examples: 2838256
download_size: 1704669574
dataset_size: 3604654400
- config_name: ur
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 235896915
num_examples: 270915
download_size: 106346098
dataset_size: 235896915
- config_name: uz
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 294031767
num_examples: 366178
download_size: 165625367
dataset_size: 294031767
- config_name: ve
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 146094
num_examples: 377
download_size: 78967
dataset_size: 146094
- config_name: vec
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 22389528
num_examples: 76129
download_size: 10918596
dataset_size: 22389528
- config_name: vep
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 9173814
num_examples: 18308
download_size: 5200281
dataset_size: 9173814
- config_name: vi
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1105540235
num_examples: 1582776
download_size: 527099253
dataset_size: 1105540235
- config_name: vls
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 7033519
num_examples: 12090
download_size: 4475822
dataset_size: 7033519
- config_name: vo
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 13186136
num_examples: 32012
download_size: 4481037
dataset_size: 13186136
- config_name: wa
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 6129322
num_examples: 17279
download_size: 3877434
dataset_size: 6129322
- config_name: war
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 301393399
num_examples: 1128985
download_size: 60603756
dataset_size: 301393399
- config_name: wo
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1594273
num_examples: 2018
download_size: 975581
dataset_size: 1594273
- config_name: wuu
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 48662
num_examples: 91
download_size: 38315
dataset_size: 48662
- config_name: xal
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 562983
num_examples: 1103
download_size: 241231
dataset_size: 562983
- config_name: xh
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 1526785
num_examples: 1679
download_size: 975370
dataset_size: 1526785
- config_name: xmf
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 24640607
num_examples: 18943
download_size: 8854837
dataset_size: 24640607
- config_name: yi
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 27936672
num_examples: 25145
download_size: 12753119
dataset_size: 27936672
- config_name: yo
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 10177873
num_examples: 15344
download_size: 5571492
dataset_size: 10177873
- config_name: za
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 472569
num_examples: 1115
download_size: 254556
dataset_size: 472569
- config_name: zea
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3339570
num_examples: 8814
download_size: 1752470
dataset_size: 3339570
- config_name: zh
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 11480498
num_examples: 19526
download_size: 7986641
dataset_size: 11480498
- config_name: zu
features:
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 3819166
num_examples: 4449
download_size: 2353401
dataset_size: 3819166
configs:
- config_name: ab
data_files:
- split: train
path: ab/train-*
- config_name: ace
data_files:
- split: train
path: ace/train-*
- config_name: ady
data_files:
- split: train
path: ady/train-*
- config_name: af
data_files:
- split: train
path: af/train-*
- config_name: alt
data_files:
- split: train
path: alt/train-*
- config_name: am
data_files:
- split: train
path: am/train-*
- config_name: ami
data_files:
- split: train
path: ami/train-*
- config_name: an
data_files:
- split: train
path: an/train-*
- config_name: ang
data_files:
- split: train
path: ang/train-*
- config_name: anp
data_files:
- split: train
path: anp/train-*
- config_name: ar
data_files:
- split: train
path: ar/train-*
- config_name: arc
data_files:
- split: train
path: arc/train-*
- config_name: ary
data_files:
- split: train
path: ary/train-*
- config_name: arz
data_files:
- split: train
path: arz/train-*
- config_name: as
data_files:
- split: train
path: as/train-*
- config_name: ast
data_files:
- split: train
path: ast/train-*
- config_name: atj
data_files:
- split: train
path: atj/train-*
- config_name: av
data_files:
- split: train
path: av/train-*
- config_name: avk
data_files:
- split: train
path: avk/train-*
- config_name: awa
data_files:
- split: train
path: awa/train-*
- config_name: ay
data_files:
- split: train
path: ay/train-*
- config_name: az
data_files:
- split: train
path: az/train-*
- config_name: azb
data_files:
- split: train
path: azb/train-*
- config_name: ba
data_files:
- split: train
path: ba/train-*
- config_name: ban
data_files:
- split: train
path: ban/train-*
- config_name: bar
data_files:
- split: train
path: bar/train-*
- config_name: bcl
data_files:
- split: train
path: bcl/train-*
- config_name: be
data_files:
- split: train
path: be/train-*
- config_name: bg
data_files:
- split: train
path: bg/train-*
- config_name: bh
data_files:
- split: train
path: bh/train-*
- config_name: bi
data_files:
- split: train
path: bi/train-*
- config_name: bjn
data_files:
- split: train
path: bjn/train-*
- config_name: blk
data_files:
- split: train
path: blk/train-*
- config_name: bm
data_files:
- split: train
path: bm/train-*
- config_name: bn
data_files:
- split: train
path: bn/train-*
- config_name: bo
data_files:
- split: train
path: bo/train-*
- config_name: bpy
data_files:
- split: train
path: bpy/train-*
- config_name: br
data_files:
- split: train
path: br/train-*
- config_name: bs
data_files:
- split: train
path: bs/train-*
- config_name: bug
data_files:
- split: train
path: bug/train-*
- config_name: bxr
data_files:
- split: train
path: bxr/train-*
- config_name: ca
data_files:
- split: train
path: ca/train-*
- config_name: cdo
data_files:
- split: train
path: cdo/train-*
- config_name: ce
data_files:
- split: train
path: ce/train-*
- config_name: ceb
data_files:
- split: train
path: ceb/train-*
- config_name: ch
data_files:
- split: train
path: ch/train-*
- config_name: chr
data_files:
- split: train
path: chr/train-*
- config_name: chy
data_files:
- split: train
path: chy/train-*
- config_name: ckb
data_files:
- split: train
path: ckb/train-*
- config_name: co
data_files:
- split: train
path: co/train-*
- config_name: cr
data_files:
- split: train
path: cr/train-*
- config_name: crh
data_files:
- split: train
path: crh/train-*
- config_name: cs
data_files:
- split: train
path: cs/train-*
- config_name: csb
data_files:
- split: train
path: csb/train-*
- config_name: cu
data_files:
- split: train
path: cu/train-*
- config_name: cv
data_files:
- split: train
path: cv/train-*
- config_name: cy
data_files:
- split: train
path: cy/train-*
- config_name: da
data_files:
- split: train
path: da/train-*
- config_name: dag
data_files:
- split: train
path: dag/train-*
- config_name: de
data_files:
- split: train
path: de/train-*
- config_name: din
data_files:
- split: train
path: din/train-*
- config_name: diq
data_files:
- split: train
path: diq/train-*
- config_name: dsb
data_files:
- split: train
path: dsb/train-*
- config_name: dty
data_files:
- split: train
path: dty/train-*
- config_name: dv
data_files:
- split: train
path: dv/train-*
- config_name: dz
data_files:
- split: train
path: dz/train-*
- config_name: ee
data_files:
- split: train
path: ee/train-*
- config_name: el
data_files:
- split: train
path: el/train-*
- config_name: eml
data_files:
- split: train
path: eml/train-*
- config_name: en
data_files:
- split: train
path: en/train-*
- config_name: eo
data_files:
- split: train
path: eo/train-*
- config_name: es
data_files:
- split: train
path: es/train-*
- config_name: et
data_files:
- split: train
path: et/train-*
- config_name: eu
data_files:
- split: train
path: eu/train-*
- config_name: ext
data_files:
- split: train
path: ext/train-*
- config_name: fa
data_files:
- split: train
path: fa/train-*
- config_name: fat
data_files:
- split: train
path: fat/train-*
- config_name: ff
data_files:
- split: train
path: ff/train-*
- config_name: fi
data_files:
- split: train
path: fi/train-*
- config_name: fj
data_files:
- split: train
path: fj/train-*
- config_name: fo
data_files:
- split: train
path: fo/train-*
- config_name: fon
data_files:
- split: train
path: fon/train-*
- config_name: fr
data_files:
- split: train
path: fr/train-*
- config_name: frp
data_files:
- split: train
path: frp/train-*
- config_name: frr
data_files:
- split: train
path: frr/train-*
- config_name: fur
data_files:
- split: train
path: fur/train-*
- config_name: fy
data_files:
- split: train
path: fy/train-*
- config_name: ga
data_files:
- split: train
path: ga/train-*
- config_name: gag
data_files:
- split: train
path: gag/train-*
- config_name: gan
data_files:
- split: train
path: gan/train-*
- config_name: gcr
data_files:
- split: train
path: gcr/train-*
- config_name: gd
data_files:
- split: train
path: gd/train-*
- config_name: gl
data_files:
- split: train
path: gl/train-*
- config_name: glk
data_files:
- split: train
path: glk/train-*
- config_name: gn
data_files:
- split: train
path: gn/train-*
- config_name: gom
data_files:
- split: train
path: gom/train-*
- config_name: gor
data_files:
- split: train
path: gor/train-*
- config_name: got
data_files:
- split: train
path: got/train-*
- config_name: gpe
data_files:
- split: train
path: gpe/train-*
- config_name: gu
data_files:
- split: train
path: gu/train-*
- config_name: guc
data_files:
- split: train
path: guc/train-*
- config_name: gur
data_files:
- split: train
path: gur/train-*
- config_name: guw
data_files:
- split: train
path: guw/train-*
- config_name: gv
data_files:
- split: train
path: gv/train-*
- config_name: ha
data_files:
- split: train
path: ha/train-*
- config_name: hak
data_files:
- split: train
path: hak/train-*
- config_name: haw
data_files:
- split: train
path: haw/train-*
- config_name: he
data_files:
- split: train
path: he/train-*
- config_name: hi
data_files:
- split: train
path: hi/train-*
- config_name: hif
data_files:
- split: train
path: hif/train-*
- config_name: hr
data_files:
- split: train
path: hr/train-*
- config_name: hsb
data_files:
- split: train
path: hsb/train-*
- config_name: ht
data_files:
- split: train
path: ht/train-*
- config_name: hu
data_files:
- split: train
path: hu/train-*
- config_name: hy
data_files:
- split: train
path: hy/train-*
- config_name: hyw
data_files:
- split: train
path: hyw/train-*
- config_name: ia
data_files:
- split: train
path: ia/train-*
- config_name: id
data_files:
- split: train
path: id/train-*
- config_name: ie
data_files:
- split: train
path: ie/train-*
- config_name: ig
data_files:
- split: train
path: ig/train-*
- config_name: ik
data_files:
- split: train
path: ik/train-*
- config_name: ilo
data_files:
- split: train
path: ilo/train-*
- config_name: inh
data_files:
- split: train
path: inh/train-*
- config_name: io
data_files:
- split: train
path: io/train-*
- config_name: is
data_files:
- split: train
path: is/train-*
- config_name: it
data_files:
- split: train
path: it/train-*
- config_name: iu
data_files:
- split: train
path: iu/train-*
- config_name: ja
data_files:
- split: train
path: ja/train-*
- config_name: jam
data_files:
- split: train
path: jam/train-*
- config_name: jbo
data_files:
- split: train
path: jbo/train-*
- config_name: jv
data_files:
- split: train
path: jv/train-*
- config_name: ka
data_files:
- split: train
path: ka/train-*
- config_name: kaa
data_files:
- split: train
path: kaa/train-*
- config_name: kab
data_files:
- split: train
path: kab/train-*
- config_name: kbd
data_files:
- split: train
path: kbd/train-*
- config_name: kbp
data_files:
- split: train
path: kbp/train-*
- config_name: kcg
data_files:
- split: train
path: kcg/train-*
- config_name: kg
data_files:
- split: train
path: kg/train-*
- config_name: ki
data_files:
- split: train
path: ki/train-*
- config_name: kk
data_files:
- split: train
path: kk/train-*
- config_name: kl
data_files:
- split: train
path: kl/train-*
- config_name: km
data_files:
- split: train
path: km/train-*
- config_name: kn
data_files:
- split: train
path: kn/train-*
- config_name: ko
data_files:
- split: train
path: ko/train-*
- config_name: koi
data_files:
- split: train
path: koi/train-*
- config_name: krc
data_files:
- split: train
path: krc/train-*
- config_name: ks
data_files:
- split: train
path: ks/train-*
- config_name: ksh
data_files:
- split: train
path: ksh/train-*
- config_name: ku
data_files:
- split: train
path: ku/train-*
- config_name: kv
data_files:
- split: train
path: kv/train-*
- config_name: kw
data_files:
- split: train
path: kw/train-*
- config_name: ky
data_files:
- split: train
path: ky/train-*
- config_name: la
data_files:
- split: train
path: la/train-*
- config_name: lad
data_files:
- split: train
path: lad/train-*
- config_name: lb
data_files:
- split: train
path: lb/train-*
- config_name: lbe
data_files:
- split: train
path: lbe/train-*
- config_name: lez
data_files:
- split: train
path: lez/train-*
- config_name: lfn
data_files:
- split: train
path: lfn/train-*
- config_name: lg
data_files:
- split: train
path: lg/train-*
- config_name: li
data_files:
- split: train
path: li/train-*
- config_name: lij
data_files:
- split: train
path: lij/train-*
- config_name: lld
data_files:
- split: train
path: lld/train-*
- config_name: lmo
data_files:
- split: train
path: lmo/train-*
- config_name: ln
data_files:
- split: train
path: ln/train-*
- config_name: lo
data_files:
- split: train
path: lo/train-*
- config_name: lt
data_files:
- split: train
path: lt/train-*
- config_name: ltg
data_files:
- split: train
path: ltg/train-*
- config_name: lv
data_files:
- split: train
path: lv/train-*
- config_name: mad
data_files:
- split: train
path: mad/train-*
- config_name: mai
data_files:
- split: train
path: mai/train-*
- config_name: mdf
data_files:
- split: train
path: mdf/train-*
- config_name: mg
data_files:
- split: train
path: mg/train-*
- config_name: mhr
data_files:
- split: train
path: mhr/train-*
- config_name: mi
data_files:
- split: train
path: mi/train-*
- config_name: min
data_files:
- split: train
path: min/train-*
- config_name: mk
data_files:
- split: train
path: mk/train-*
- config_name: ml
data_files:
- split: train
path: ml/train-*
- config_name: mn
data_files:
- split: train
path: mn/train-*
- config_name: mni
data_files:
- split: train
path: mni/train-*
- config_name: mnw
data_files:
- split: train
path: mnw/train-*
- config_name: mr
data_files:
- split: train
path: mr/train-*
- config_name: mrj
data_files:
- split: train
path: mrj/train-*
- config_name: ms
data_files:
- split: train
path: ms/train-*
- config_name: mt
data_files:
- split: train
path: mt/train-*
- config_name: mwl
data_files:
- split: train
path: mwl/train-*
- config_name: my
data_files:
- split: train
path: my/train-*
- config_name: myv
data_files:
- split: train
path: myv/train-*
- config_name: mzn
data_files:
- split: train
path: mzn/train-*
- config_name: nah
data_files:
- split: train
path: nah/train-*
- config_name: nap
data_files:
- split: train
path: nap/train-*
- config_name: nds
data_files:
- split: train
path: nds/train-*
- config_name: ne
data_files:
- split: train
path: ne/train-*
- config_name: new
data_files:
- split: train
path: new/train-*
- config_name: nia
data_files:
- split: train
path: nia/train-*
- config_name: nl
data_files:
- split: train
path: nl/train-*
- config_name: nn
data_files:
- split: train
path: nn/train-*
- config_name: 'no'
data_files:
- split: train
path: no/train-*
- config_name: nov
data_files:
- split: train
path: nov/train-*
- config_name: nqo
data_files:
- split: train
path: nqo/train-*
- config_name: nso
data_files:
- split: train
path: nso/train-*
- config_name: nv
data_files:
- split: train
path: nv/train-*
- config_name: ny
data_files:
- split: train
path: ny/train-*
- config_name: oc
data_files:
- split: train
path: oc/train-*
- config_name: olo
data_files:
- split: train
path: olo/train-*
- config_name: om
data_files:
- split: train
path: om/train-*
- config_name: or
data_files:
- split: train
path: or/train-*
- config_name: os
data_files:
- split: train
path: os/train-*
- config_name: pa
data_files:
- split: train
path: pa/train-*
- config_name: pag
data_files:
- split: train
path: pag/train-*
- config_name: pam
data_files:
- split: train
path: pam/train-*
- config_name: pap
data_files:
- split: train
path: pap/train-*
- config_name: pcd
data_files:
- split: train
path: pcd/train-*
- config_name: pcm
data_files:
- split: train
path: pcm/train-*
- config_name: pdc
data_files:
- split: train
path: pdc/train-*
- config_name: pfl
data_files:
- split: train
path: pfl/train-*
- config_name: pi
data_files:
- split: train
path: pi/train-*
- config_name: pih
data_files:
- split: train
path: pih/train-*
- config_name: pl
data_files:
- split: train
path: pl/train-*
- config_name: pms
data_files:
- split: train
path: pms/train-*
- config_name: pnb
data_files:
- split: train
path: pnb/train-*
- config_name: pnt
data_files:
- split: train
path: pnt/train-*
- config_name: ps
data_files:
- split: train
path: ps/train-*
- config_name: pt
data_files:
- split: train
path: pt/train-*
- config_name: pwn
data_files:
- split: train
path: pwn/train-*
- config_name: qu
data_files:
- split: train
path: qu/train-*
- config_name: rm
data_files:
- split: train
path: rm/train-*
- config_name: rmy
data_files:
- split: train
path: rmy/train-*
- config_name: rn
data_files:
- split: train
path: rn/train-*
- config_name: ro
data_files:
- split: train
path: ro/train-*
- config_name: ru
data_files:
- split: train
path: ru/train-*
- config_name: rue
data_files:
- split: train
path: rue/train-*
- config_name: rw
data_files:
- split: train
path: rw/train-*
- config_name: sa
data_files:
- split: train
path: sa/train-*
- config_name: sah
data_files:
- split: train
path: sah/train-*
- config_name: sat
data_files:
- split: train
path: sat/train-*
- config_name: sc
data_files:
- split: train
path: sc/train-*
- config_name: scn
data_files:
- split: train
path: scn/train-*
- config_name: sco
data_files:
- split: train
path: sco/train-*
- config_name: sd
data_files:
- split: train
path: sd/train-*
- config_name: se
data_files:
- split: train
path: se/train-*
- config_name: sg
data_files:
- split: train
path: sg/train-*
- config_name: shi
data_files:
- split: train
path: shi/train-*
- config_name: shn
data_files:
- split: train
path: shn/train-*
- config_name: si
data_files:
- split: train
path: si/train-*
- config_name: sk
data_files:
- split: train
path: sk/train-*
- config_name: skr
data_files:
- split: train
path: skr/train-*
- config_name: sl
data_files:
- split: train
path: sl/train-*
- config_name: sm
data_files:
- split: train
path: sm/train-*
- config_name: smn
data_files:
- split: train
path: smn/train-*
- config_name: sn
data_files:
- split: train
path: sn/train-*
- config_name: so
data_files:
- split: train
path: so/train-*
- config_name: sq
data_files:
- split: train
path: sq/train-*
- config_name: sr
data_files:
- split: train
path: sr/train-*
- config_name: srn
data_files:
- split: train
path: srn/train-*
- config_name: ss
data_files:
- split: train
path: ss/train-*
- config_name: st
data_files:
- split: train
path: st/train-*
- config_name: stq
data_files:
- split: train
path: stq/train-*
- config_name: su
data_files:
- split: train
path: su/train-*
- config_name: sv
data_files:
- split: train
path: sv/train-*
- config_name: sw
data_files:
- split: train
path: sw/train-*
- config_name: szl
data_files:
- split: train
path: szl/train-*
- config_name: szy
data_files:
- split: train
path: szy/train-*
- config_name: ta
data_files:
- split: train
path: ta/train-*
- config_name: tay
data_files:
- split: train
path: tay/train-*
- config_name: tcy
data_files:
- split: train
path: tcy/train-*
- config_name: te
data_files:
- split: train
path: te/train-*
- config_name: tet
data_files:
- split: train
path: tet/train-*
- config_name: tg
data_files:
- split: train
path: tg/train-*
- config_name: th
data_files:
- split: train
path: th/train-*
- config_name: ti
data_files:
- split: train
path: ti/train-*
- config_name: tk
data_files:
- split: train
path: tk/train-*
- config_name: tl
data_files:
- split: train
path: tl/train-*
- config_name: tly
data_files:
- split: train
path: tly/train-*
- config_name: tn
data_files:
- split: train
path: tn/train-*
- config_name: to
data_files:
- split: train
path: to/train-*
- config_name: tpi
data_files:
- split: train
path: tpi/train-*
- config_name: tr
data_files:
- split: train
path: tr/train-*
- config_name: trv
data_files:
- split: train
path: trv/train-*
- config_name: ts
data_files:
- split: train
path: ts/train-*
- config_name: tt
data_files:
- split: train
path: tt/train-*
- config_name: tum
data_files:
- split: train
path: tum/train-*
- config_name: tw
data_files:
- split: train
path: tw/train-*
- config_name: ty
data_files:
- split: train
path: ty/train-*
- config_name: tyv
data_files:
- split: train
path: tyv/train-*
- config_name: udm
data_files:
- split: train
path: udm/train-*
- config_name: ug
data_files:
- split: train
path: ug/train-*
- config_name: uk
data_files:
- split: train
path: uk/train-*
- config_name: ur
data_files:
- split: train
path: ur/train-*
- config_name: uz
data_files:
- split: train
path: uz/train-*
- config_name: ve
data_files:
- split: train
path: ve/train-*
- config_name: vec
data_files:
- split: train
path: vec/train-*
- config_name: vep
data_files:
- split: train
path: vep/train-*
- config_name: vi
data_files:
- split: train
path: vi/train-*
- config_name: vls
data_files:
- split: train
path: vls/train-*
- config_name: vo
data_files:
- split: train
path: vo/train-*
- config_name: wa
data_files:
- split: train
path: wa/train-*
- config_name: war
data_files:
- split: train
path: war/train-*
- config_name: wo
data_files:
- split: train
path: wo/train-*
- config_name: wuu
data_files:
- split: train
path: wuu/train-*
- config_name: xal
data_files:
- split: train
path: xal/train-*
- config_name: xh
data_files:
- split: train
path: xh/train-*
- config_name: xmf
data_files:
- split: train
path: xmf/train-*
- config_name: yi
data_files:
- split: train
path: yi/train-*
- config_name: yo
data_files:
- split: train
path: yo/train-*
- config_name: za
data_files:
- split: train
path: za/train-*
- config_name: zea
data_files:
- split: train
path: zea/train-*
- config_name: zh
data_files:
- split: train
path: zh/train-*
- config_name: zu
data_files:
- split: train
path: zu/train-*
---
提供机构:
sproos
原始信息汇总
数据集概述
数据集配置
配置 ab
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 1644341
- 样本数: 1576
- train:
- 下载大小: 709370 字节
- 数据集大小: 1644341 字节
配置 ace
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 2173660
- 样本数: 8489
- train:
- 下载大小: 830268 字节
- 数据集大小: 2173660 字节
配置 ady
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 429339
- 样本数: 522
- train:
- 下载大小: 205308 字节
- 数据集大小: 429339 字节
配置 af
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 166299481
- 样本数: 204317
- train:
- 下载大小: 94370515 字节
- 数据集大小: 166299481 字节
配置 alt
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 3974030
- 样本数: 4240
- train:
- 下载大小: 1623904 字节
- 数据集大小: 3974030 字节
配置 am
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 13172809
- 样本数: 9782
- train:
- 下载大小: 6138866 字节
- 数据集大小: 13172809 字节
配置 ami
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 3478701
- 样本数: 3629
- train:
- 下载大小: 1788840 字节
- 数据集大小: 3478701 字节
配置 an
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 41578614
- 样本数: 79302
- train:
- 下载大小: 23412452 字节
- 数据集大小: 41578614 字节
配置 ang
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 1810454
- 样本数: 3531
- train:
- 下载大小: 1173048 字节
- 数据集大小: 1810454 字节
配置 anp
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 4637069
- 样本数: 2238
- train:
- 下载大小: 1783765 字节
- 数据集大小: 4637069 字节
配置 ar
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 2074092955
- 样本数: 1978419
- train:
- 下载大小: 940821033 字节
- 数据集大小: 2074092955 字节
配置 arc
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 337763
- 样本数: 752
- train:
- 下载大小: 166806 字节
- 数据集大小: 337763 字节
配置 ary
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 8336246
- 样本数: 12782
- train:
- 下载大小: 3662614 字节
- 数据集大小: 8336246 字节
配置 arz
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 519197847
- 样本数: 816075
- train:
- 下载大小: 69488970 字节
- 数据集大小: 519197847 字节
配置 as
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 66313275
- 样本数: 34743
- train:
- 下载大小: 25706302 字节
- 数据集大小: 66313275 字节
配置 ast
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 337276492
- 样本数: 339625
- train:
- 下载大小: 201934083 字节
- 数据集大小: 337276492 字节
配置 atj
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 520958
- 样本数: 1066
- train:
- 下载大小: 304143 字节
- 数据集大小: 520958 字节
配置 av
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 3528226
- 样本数: 3334
- train:
- 下载大小: 1573201 字节
- 数据集大小: 3528226 字节
配置 avk
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 11730415
- 样本数: 43588
- train:
- 下载大小: 4205830 字节
- 数据集大小: 11730415 字节
配置 awa
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 1465350
- 样本数: 1286
- train:
- 下载大小: 559381 字节
- 数据集大小: 1465350 字节
配置 ay
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 1168894
- 样本数: 4108
- train:
- 下载大小: 558421 字节
- 数据集大小: 1168894 字节
配置 az
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 326338236
- 样本数: 328201
- train:
- 下载大小: 179029928 字节
- 数据集大小: 326338236 字节
配置 azb
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 73335621
- 样本数: 171028
- train:
- 下载大小: 21435303 字节
- 数据集大小: 73335621 字节
配置 ba
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 207304656
- 样本数: 164050
- train:
- 下载大小: 87599535 字节
- 数据集大小: 207304656 字节
配置 ban
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 8660809
- 样本数: 21085
- train:
- 下载大小: 4054193 字节
- 数据集大小: 8660809 字节
配置 bar
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 20055538
- 样本数: 39958
- train:
- 下载大小: 12418316 字节
- 数据集大小: 20055538 字节
配置 bcl
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 14060191
- 样本数: 19021
- train:
- 下载大小: 8251132 字节
- 数据集大小: 14060191 字节
配置 be
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 475308039
- 样本数: 393613
- train:
- 下载大小: 226186389 字节
- 数据集大小: 475308039 字节
配置 bg
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 823033374
- 样本数: 587687
- train:
- 下载大小: 392570254 字节
- 数据集大小: 823033374 字节
配置 bh
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 11853184
- 样本数: 9647
- train:
- 下载大小: 4413678 字节
- 数据集大小: 11853184 字节
配置 bi
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 167289
- 样本数: 753
- train:
- 下载大小: 96076 字节
- 数据集大小: 167289 字节
配置 bjn
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 3656195
- 样本数: 5888
- train:
- 下载大小: 2090215 字节
- 数据集大小: 3656195 字节
配置 blk
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 7870709
- 样本数: 2605
- train:
- 下载大小: 2437996 字节
- 数据集大小: 7870709 字节
配置 bm
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 310793
- 样本数: 403
- train:
- 下载大小: 186230 字节
- 数据集大小: 310793 字节
配置 bn
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 645866435
- 样本数: 341688
- train:
- 下载大小: 238009185 字节
- 数据集大小: 645866435 字节
配置 bo
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 28460350
- 样本数: 1964
- train:
- 下载大小: 8232639 字节
- 数据集大小: 28460350 字节
配置 bpy
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 14141921
- 样本数: 23173
- train:
- 下载大小: 2240817 字节
- 数据集大小: 14141921 字节
配置 br
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 49288575
- 样本数: 107785
- train:
- 下载大小: 30516246 字节
- 数据集大小: 49288575 字节
配置 bs
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 125483841
- 样本数: 164949
- train:
- 下载大小: 75988761 字节
- 数据集大小: 125483841 字节
配置 bug
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 149949
- 样本数: 225
- train:
- 下载大小: 76250 字节
- 数据集大小: 149949 字节
配置 bxr
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 4870117
- 样本数: 3674
- train:
- 下载大小: 2272586 字节
- 数据集大小: 4870117 字节
配置 ca
- 特征:
- title: string
- text: string
- 分割:
- train:
- 字节数: 1
- train:



