hotchpotch/multilingual_cc_news
收藏Hugging Face2026-02-13 更新2026-03-29 收录
下载链接:
https://hf-mirror.com/datasets/hotchpotch/multilingual_cc_news
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: af
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 15178818
num_examples: 5212
download_size: 9056961
dataset_size: 15178818
- config_name: als
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1477705
num_examples: 652
download_size: 1230685
dataset_size: 1477705
- config_name: am
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 6524410
num_examples: 22672
download_size: 2981607
dataset_size: 6524410
- config_name: an
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 3973
num_examples: 23
download_size: 5659
dataset_size: 3973
- config_name: arz
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 8436054
num_examples: 9806
download_size: 2293566
dataset_size: 8436054
- config_name: as
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 12989931
num_examples: 31679
download_size: 5350512
dataset_size: 12989931
- config_name: ast
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 692802
num_examples: 338
download_size: 437002
dataset_size: 692802
- config_name: av
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 7988
num_examples: 41
download_size: 7365
dataset_size: 7988
- config_name: az
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 251107466
num_examples: 573044
download_size: 126239850
dataset_size: 251107466
- config_name: azb
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 62976
num_examples: 61
download_size: 35146
dataset_size: 62976
- config_name: ba
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 13933201
num_examples: 6915
download_size: 6933747
dataset_size: 13933201
- config_name: bar
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 7725
num_examples: 37
download_size: 8829
dataset_size: 7725
- config_name: bcl
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 57242
num_examples: 34
download_size: 30684
dataset_size: 57242
- config_name: be
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 121612602
num_examples: 72654
download_size: 60323310
dataset_size: 121612602
- config_name: bg
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 6744424698
num_examples: 2924019
download_size: 3185200934
dataset_size: 6744424698
- config_name: bh
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 46366
num_examples: 141
download_size: 23432
dataset_size: 46366
- config_name: bn
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 337570141
num_examples: 719102
download_size: 105519183
dataset_size: 337570141
- config_name: bo
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 191789
num_examples: 486
download_size: 60871
dataset_size: 191789
- config_name: bpy
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 18365
num_examples: 65
download_size: 9379
dataset_size: 18365
- config_name: br
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1188912
num_examples: 1103
download_size: 726549
dataset_size: 1188912
- config_name: bs
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 9512643
num_examples: 30804
download_size: 5539808
dataset_size: 9512643
- config_name: bxr
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 37934
num_examples: 165
download_size: 23079
dataset_size: 37934
- config_name: ca
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1959603172
num_examples: 851043
download_size: 1162112944
dataset_size: 1959603172
- config_name: cbk
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 2331
num_examples: 13
download_size: 5255
dataset_size: 2331
- config_name: ce
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 82833
num_examples: 39
download_size: 41778
dataset_size: 82833
- config_name: ceb
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 2330042
num_examples: 2449
download_size: 1292674
dataset_size: 2330042
- config_name: ckb
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 13722257
num_examples: 31864
download_size: 4794661
dataset_size: 13722257
- config_name: co
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 4168
num_examples: 6
download_size: 11886
dataset_size: 4168
- config_name: cs
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 4585729650
num_examples: 2695727
download_size: 2935908417
dataset_size: 4585729650
- config_name: cv
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 500992
num_examples: 54
download_size: 190845
dataset_size: 500992
- config_name: cy
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 26090705
num_examples: 37692
download_size: 14968538
dataset_size: 26090705
- config_name: da
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 2281283281
num_examples: 1187189
download_size: 1306864188
dataset_size: 2281283281
- config_name: de
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 5001393489
num_examples: 2242000
download_size: 2988006067
dataset_size: 5001393489
- config_name: diq
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 31769
num_examples: 64
download_size: 23415
dataset_size: 31769
- config_name: dsb
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 401
num_examples: 4
download_size: 2705
dataset_size: 401
- config_name: dty
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 538
num_examples: 2
download_size: 3429
dataset_size: 538
- config_name: dv
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 762
num_examples: 6
download_size: 3101
dataset_size: 762
- config_name: el
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 22030652795
num_examples: 6772358
download_size: 10611960833
dataset_size: 22030652795
- config_name: eml
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 49386
num_examples: 81
download_size: 29089
dataset_size: 49386
- config_name: en
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 5001966732
num_examples: 1899000
download_size: 2913392637
dataset_size: 5001966732
- config_name: eo
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1550499
num_examples: 4929
download_size: 863690
dataset_size: 1550499
- config_name: et
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1593654454
num_examples: 1098270
download_size: 981863659
dataset_size: 1593654454
- config_name: eu
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 23032427
num_examples: 76444
download_size: 12134346
dataset_size: 23032427
- config_name: fa
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 13021373243
num_examples: 3443176
download_size: 5766060109
dataset_size: 13021373243
- config_name: fi
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 3014330140
num_examples: 1536679
download_size: 1788027040
dataset_size: 3014330140
- config_name: fy
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 44316612
num_examples: 39731
download_size: 27053374
dataset_size: 44316612
- config_name: ga
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 3796491
num_examples: 1652
download_size: 2185133
dataset_size: 3796491
- config_name: gd
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 11958312
num_examples: 10988
download_size: 6305732
dataset_size: 11958312
- config_name: gl
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 280120565
num_examples: 113208
download_size: 166722820
dataset_size: 280120565
- config_name: gn
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 61949
num_examples: 140
download_size: 40758
dataset_size: 61949
- config_name: gom
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 43748
num_examples: 173
download_size: 25269
dataset_size: 43748
- config_name: gu
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 44062106
num_examples: 75382
download_size: 18727093
dataset_size: 44062106
- config_name: gv
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 17979
num_examples: 98
download_size: 13474
dataset_size: 17979
- config_name: he
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1669376584
num_examples: 530738
download_size: 834401503
dataset_size: 1669376584
- config_name: hi
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 34147211501
num_examples: 10859572
download_size: 12679482179
dataset_size: 34147211501
- config_name: hif
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 181
num_examples: 1
download_size: 2751
dataset_size: 181
- config_name: hr
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 3071903377
num_examples: 1481087
download_size: 1972653731
dataset_size: 3071903377
- config_name: hsb
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 2683
num_examples: 10
download_size: 5053
dataset_size: 2683
- config_name: ht
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1926217
num_examples: 3790
download_size: 1011869
dataset_size: 1926217
- config_name: hu
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 6078040554
num_examples: 2485688
download_size: 3630413202
dataset_size: 6078040554
- config_name: hy
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 69699577
num_examples: 223261
download_size: 26079721
dataset_size: 69699577
- config_name: ia
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 17866
num_examples: 109
download_size: 13674
dataset_size: 17866
- config_name: id
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 8433793004
num_examples: 4483457
download_size: 4682984460
dataset_size: 8433793004
- config_name: ie
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 3634
num_examples: 33
download_size: 5042
dataset_size: 3634
- config_name: ilo
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 329208
num_examples: 222
download_size: 187661
dataset_size: 329208
- config_name: io
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 176984
num_examples: 936
download_size: 100264
dataset_size: 176984
- config_name: is
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 64098397
num_examples: 196123
download_size: 35717054
dataset_size: 64098397
- config_name: ja
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 4273036518
num_examples: 4306405
download_size: 2290775657
dataset_size: 4273036518
- config_name: jbo
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 18846
num_examples: 59
download_size: 19317
dataset_size: 18846
- config_name: jv
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 210349
num_examples: 562
download_size: 108090
dataset_size: 210349
- config_name: ka
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 107742327
num_examples: 91811
download_size: 32816756
dataset_size: 107742327
- config_name: kk
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 23340984
num_examples: 55996
download_size: 10453380
dataset_size: 23340984
- config_name: km
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 3368262
num_examples: 2630
download_size: 1069332
dataset_size: 3368262
- config_name: kn
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 126060528
num_examples: 290071
download_size: 52596233
dataset_size: 126060528
- config_name: ko
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 13483436712
num_examples: 5572465
download_size: 7171568217
dataset_size: 13483436712
- config_name: krc
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 101968
num_examples: 134
download_size: 41812
dataset_size: 101968
- config_name: ku
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 13280623
num_examples: 7566
download_size: 7294787
dataset_size: 13280623
- config_name: kv
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 458933
num_examples: 116
download_size: 232976
dataset_size: 458933
- config_name: kw
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 32273
num_examples: 210
download_size: 23428
dataset_size: 32273
- config_name: ky
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 23570366
num_examples: 100863
download_size: 9802320
dataset_size: 23570366
- config_name: la
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 104421620
num_examples: 31572
download_size: 47880182
dataset_size: 104421620
- config_name: lb
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 65276119
num_examples: 41252
download_size: 38681628
dataset_size: 65276119
- config_name: lez
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 211683
num_examples: 35
download_size: 103516
dataset_size: 211683
- config_name: li
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1709
num_examples: 9
download_size: 4473
dataset_size: 1709
- config_name: lmo
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 201566
num_examples: 461
download_size: 124936
dataset_size: 201566
- config_name: lo
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 23021638
num_examples: 13915
download_size: 10057954
dataset_size: 23021638
- config_name: lt
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 417574820
num_examples: 590555
download_size: 241685772
dataset_size: 417574820
- config_name: lv
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 534111745
num_examples: 557302
download_size: 311759964
dataset_size: 534111745
- config_name: mai
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 4353
num_examples: 18
download_size: 4887
dataset_size: 4353
- config_name: mg
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 608452
num_examples: 442
download_size: 330484
dataset_size: 608452
- config_name: mhr
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 6371372
num_examples: 1600
download_size: 3019448
dataset_size: 6371372
- config_name: min
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 114526
num_examples: 37
download_size: 17048
dataset_size: 114526
- config_name: mk
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 689180160
num_examples: 216221
download_size: 308405880
dataset_size: 689180160
- config_name: ml
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 480846220
num_examples: 536337
download_size: 185789458
dataset_size: 480846220
- config_name: mn
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 20693254
num_examples: 40425
download_size: 7327891
dataset_size: 20693254
- config_name: mr
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 353146989
num_examples: 362224
download_size: 139594547
dataset_size: 353146989
- config_name: mrj
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 2136
num_examples: 6
download_size: 6131
dataset_size: 2136
- config_name: ms
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 9370328
num_examples: 31690
download_size: 4903825
dataset_size: 9370328
- config_name: mt
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 548889
num_examples: 990
download_size: 324370
dataset_size: 548889
- config_name: mwl
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 913
num_examples: 7
download_size: 3493
dataset_size: 913
- config_name: my
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 35227677
num_examples: 71512
download_size: 12050797
dataset_size: 35227677
- config_name: myv
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 55644
num_examples: 10
download_size: 35298
dataset_size: 55644
- config_name: mzn
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 9645
num_examples: 43
download_size: 9124
dataset_size: 9645
- config_name: nah
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 349
num_examples: 2
download_size: 2885
dataset_size: 349
- config_name: nap
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 5966
num_examples: 28
download_size: 7820
dataset_size: 5966
- config_name: nds
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1569643
num_examples: 2125
download_size: 772047
dataset_size: 1569643
- config_name: ne
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 5118398
num_examples: 17766
download_size: 1798018
dataset_size: 5118398
- config_name: new
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 19722
num_examples: 65
download_size: 15530
dataset_size: 19722
- config_name: nl
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 9873537514
num_examples: 5616536
download_size: 5740710214
dataset_size: 9873537514
- config_name: nn
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 200431791
num_examples: 171271
download_size: 118355965
dataset_size: 200431791
- config_name: 'no'
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 3296965210
num_examples: 1738632
download_size: 1950971992
dataset_size: 3296965210
- config_name: oc
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 583043
num_examples: 437
download_size: 393008
dataset_size: 583043
- config_name: or
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 23755608
num_examples: 50586
download_size: 9750658
dataset_size: 23755608
- config_name: os
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 394
num_examples: 3
download_size: 2611
dataset_size: 394
- config_name: pa
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 41249905
num_examples: 48191
download_size: 15865490
dataset_size: 41249905
- config_name: pam
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 230133
num_examples: 80
download_size: 145546
dataset_size: 230133
- config_name: pfl
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 116
num_examples: 1
download_size: 2419
dataset_size: 116
- config_name: pl
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 8731938730
num_examples: 3508134
download_size: 5134342521
dataset_size: 8731938730
- config_name: pms
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 160435
num_examples: 275
download_size: 103703
dataset_size: 160435
- config_name: pnb
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 2436328
num_examples: 12793
download_size: 1068323
dataset_size: 2436328
- config_name: ps
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 18211144
num_examples: 40628
download_size: 7984275
dataset_size: 18211144
- config_name: pt
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 25625173658
num_examples: 10677210
download_size: 14955872508
dataset_size: 25625173658
- config_name: qu
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 52701
num_examples: 280
download_size: 34615
dataset_size: 52701
- config_name: rm
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 10898748
num_examples: 8652
download_size: 6106620
dataset_size: 10898748
- config_name: ro
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 13405789018
num_examples: 6847940
download_size: 7812913616
dataset_size: 13405789018
- config_name: ru
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 5000742250
num_examples: 2385000
download_size: 2436965214
dataset_size: 5000742250
- config_name: sa
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 323567
num_examples: 178
download_size: 137282
dataset_size: 323567
- config_name: sah
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 12560873
num_examples: 890
download_size: 6007483
dataset_size: 12560873
- config_name: sc
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 7022
num_examples: 32
download_size: 8716
dataset_size: 7022
- config_name: scn
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 10468
num_examples: 26
download_size: 10076
dataset_size: 10468
- config_name: sco
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 8338
num_examples: 56
download_size: 8386
dataset_size: 8338
- config_name: sd
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1342648
num_examples: 6078
download_size: 561041
dataset_size: 1342648
- config_name: sh
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 219209567
num_examples: 235356
download_size: 132718402
dataset_size: 219209567
- config_name: si
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 12839032
num_examples: 19193
download_size: 4123650
dataset_size: 12839032
- config_name: sk
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1661787841
num_examples: 1241434
download_size: 1053180893
dataset_size: 1661787841
- config_name: sl
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1414846117
num_examples: 889313
download_size: 895295408
dataset_size: 1414846117
- config_name: so
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 27153249
num_examples: 15956
download_size: 15166016
dataset_size: 27153249
- config_name: sq
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 845804443
num_examples: 367998
download_size: 477582383
dataset_size: 845804443
- config_name: sr
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1672216558
num_examples: 1014086
download_size: 1063306268
dataset_size: 1672216558
- config_name: su
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 176643
num_examples: 579
download_size: 107536
dataset_size: 176643
- config_name: sv
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 4798022444
num_examples: 3285173
download_size: 2811667006
dataset_size: 4798022444
- config_name: sw
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 96420302
num_examples: 58793
download_size: 53192402
dataset_size: 96420302
- config_name: ta
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1197464642
num_examples: 1937554
download_size: 381488817
dataset_size: 1197464642
- config_name: te
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 244106511
num_examples: 616215
download_size: 104606841
dataset_size: 244106511
- config_name: tg
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 30602330
num_examples: 59882
download_size: 13412283
dataset_size: 30602330
- config_name: th
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 209679771
num_examples: 174151
download_size: 79437695
dataset_size: 209679771
- config_name: tk
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 4954225
num_examples: 5600
download_size: 2433728
dataset_size: 4954225
- config_name: tl
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 98861412
num_examples: 61195
download_size: 58227137
dataset_size: 98861412
- config_name: tt
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 14960956
num_examples: 14760
download_size: 7078363
dataset_size: 14960956
- config_name: tyv
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 273
num_examples: 2
download_size: 2643
dataset_size: 273
- config_name: ug
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 2716
num_examples: 18
download_size: 4250
dataset_size: 2716
- config_name: uk
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 10539045797
num_examples: 3892400
download_size: 5099288448
dataset_size: 10539045797
- config_name: ur
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 702079151
num_examples: 1767337
download_size: 305645108
dataset_size: 702079151
- config_name: uz
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 987726
num_examples: 5916
download_size: 479989
dataset_size: 987726
- config_name: vec
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 43496
num_examples: 55
download_size: 29850
dataset_size: 43496
- config_name: vep
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 10515
num_examples: 53
download_size: 10467
dataset_size: 10515
- config_name: vi
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 10989622833
num_examples: 4386142
download_size: 5572562529
dataset_size: 10989622833
- config_name: vls
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 541
num_examples: 3
download_size: 3262
dataset_size: 541
- config_name: vo
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 4568
num_examples: 35
download_size: 6085
dataset_size: 4568
- config_name: wa
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 16079
num_examples: 53
download_size: 14372
dataset_size: 16079
- config_name: war
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 203723
num_examples: 602
download_size: 128550
dataset_size: 203723
- config_name: wuu
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 5780
num_examples: 20
download_size: 12667
dataset_size: 5780
- config_name: xal
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 1302
num_examples: 8
download_size: 3400
dataset_size: 1302
- config_name: xmf
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 125
num_examples: 1
download_size: 2464
dataset_size: 125
- config_name: yi
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 201189
num_examples: 115
download_size: 121750
dataset_size: 201189
- config_name: yo
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 3811900
num_examples: 5264
download_size: 1956057
dataset_size: 3811900
- config_name: yue
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 2850
num_examples: 16
download_size: 4316
dataset_size: 2850
- config_name: zh
features:
- name: title
dtype: string
- name: maintext
dtype: string
- name: url
dtype: string
- name: date_publish
dtype: string
splits:
- name: train
num_bytes: 5342542006
num_examples: 6133244
download_size: 3315529632
dataset_size: 5342542006
configs:
- config_name: af
data_files:
- split: train
path: af/train-*
- config_name: als
data_files:
- split: train
path: als/train-*
- config_name: am
data_files:
- split: train
path: am/train-*
- config_name: an
data_files:
- split: train
path: an/train-*
- config_name: arz
data_files:
- split: train
path: arz/train-*
- config_name: as
data_files:
- split: train
path: as/train-*
- config_name: ast
data_files:
- split: train
path: ast/train-*
- config_name: av
data_files:
- split: train
path: av/train-*
- config_name: az
data_files:
- split: train
path: az/train-*
- config_name: azb
data_files:
- split: train
path: azb/train-*
- config_name: ba
data_files:
- split: train
path: ba/train-*
- config_name: bar
data_files:
- split: train
path: bar/train-*
- config_name: bcl
data_files:
- split: train
path: bcl/train-*
- config_name: be
data_files:
- split: train
path: be/train-*
- config_name: bg
data_files:
- split: train
path: bg/train-*
- config_name: bh
data_files:
- split: train
path: bh/train-*
- config_name: bn
data_files:
- split: train
path: bn/train-*
- config_name: bo
data_files:
- split: train
path: bo/train-*
- config_name: bpy
data_files:
- split: train
path: bpy/train-*
- config_name: br
data_files:
- split: train
path: br/train-*
- config_name: bs
data_files:
- split: train
path: bs/train-*
- config_name: bxr
data_files:
- split: train
path: bxr/train-*
- config_name: ca
data_files:
- split: train
path: ca/train-*
- config_name: cbk
data_files:
- split: train
path: cbk/train-*
- config_name: ce
data_files:
- split: train
path: ce/train-*
- config_name: ceb
data_files:
- split: train
path: ceb/train-*
- config_name: ckb
data_files:
- split: train
path: ckb/train-*
- config_name: co
data_files:
- split: train
path: co/train-*
- config_name: cs
data_files:
- split: train
path: cs/train-*
- config_name: cv
data_files:
- split: train
path: cv/train-*
- config_name: cy
data_files:
- split: train
path: cy/train-*
- config_name: da
data_files:
- split: train
path: da/train-*
- config_name: de
data_files:
- split: train
path: de/train-*
- config_name: diq
data_files:
- split: train
path: diq/train-*
- config_name: dsb
data_files:
- split: train
path: dsb/train-*
- config_name: dty
data_files:
- split: train
path: dty/train-*
- config_name: dv
data_files:
- split: train
path: dv/train-*
- config_name: el
data_files:
- split: train
path: el/train-*
- config_name: eml
data_files:
- split: train
path: eml/train-*
- config_name: en
data_files:
- split: train
path: en/train-*
- config_name: eo
data_files:
- split: train
path: eo/train-*
- config_name: et
data_files:
- split: train
path: et/train-*
- config_name: eu
data_files:
- split: train
path: eu/train-*
- config_name: fa
data_files:
- split: train
path: fa/train-*
- config_name: fi
data_files:
- split: train
path: fi/train-*
- config_name: fy
data_files:
- split: train
path: fy/train-*
- config_name: ga
data_files:
- split: train
path: ga/train-*
- config_name: gd
data_files:
- split: train
path: gd/train-*
- config_name: gl
data_files:
- split: train
path: gl/train-*
- config_name: gn
data_files:
- split: train
path: gn/train-*
- config_name: gom
data_files:
- split: train
path: gom/train-*
- config_name: gu
data_files:
- split: train
path: gu/train-*
- config_name: gv
data_files:
- split: train
path: gv/train-*
- config_name: he
data_files:
- split: train
path: he/train-*
- config_name: hi
data_files:
- split: train
path: hi/train-*
- config_name: hif
data_files:
- split: train
path: hif/train-*
- config_name: hr
data_files:
- split: train
path: hr/train-*
- config_name: hsb
data_files:
- split: train
path: hsb/train-*
- config_name: ht
data_files:
- split: train
path: ht/train-*
- config_name: hu
data_files:
- split: train
path: hu/train-*
- config_name: hy
data_files:
- split: train
path: hy/train-*
- config_name: ia
data_files:
- split: train
path: ia/train-*
- config_name: id
data_files:
- split: train
path: id/train-*
- config_name: ie
data_files:
- split: train
path: ie/train-*
- config_name: ilo
data_files:
- split: train
path: ilo/train-*
- config_name: io
data_files:
- split: train
path: io/train-*
- config_name: is
data_files:
- split: train
path: is/train-*
- config_name: ja
data_files:
- split: train
path: ja/train-*
- config_name: jbo
data_files:
- split: train
path: jbo/train-*
- config_name: jv
data_files:
- split: train
path: jv/train-*
- config_name: ka
data_files:
- split: train
path: ka/train-*
- config_name: kk
data_files:
- split: train
path: kk/train-*
- config_name: km
data_files:
- split: train
path: km/train-*
- config_name: kn
data_files:
- split: train
path: kn/train-*
- config_name: ko
data_files:
- split: train
path: ko/train-*
- config_name: krc
data_files:
- split: train
path: krc/train-*
- config_name: ku
data_files:
- split: train
path: ku/train-*
- config_name: kv
data_files:
- split: train
path: kv/train-*
- config_name: kw
data_files:
- split: train
path: kw/train-*
- config_name: ky
data_files:
- split: train
path: ky/train-*
- config_name: la
data_files:
- split: train
path: la/train-*
- config_name: lb
data_files:
- split: train
path: lb/train-*
- config_name: lez
data_files:
- split: train
path: lez/train-*
- config_name: li
data_files:
- split: train
path: li/train-*
- config_name: lmo
data_files:
- split: train
path: lmo/train-*
- config_name: lo
data_files:
- split: train
path: lo/train-*
- config_name: lt
data_files:
- split: train
path: lt/train-*
- config_name: lv
data_files:
- split: train
path: lv/train-*
- config_name: mai
data_files:
- split: train
path: mai/train-*
- config_name: mg
data_files:
- split: train
path: mg/train-*
- config_name: mhr
data_files:
- split: train
path: mhr/train-*
- config_name: min
data_files:
- split: train
path: min/train-*
- config_name: mk
data_files:
- split: train
path: mk/train-*
- config_name: ml
data_files:
- split: train
path: ml/train-*
- config_name: mn
data_files:
- split: train
path: mn/train-*
- config_name: mr
data_files:
- split: train
path: mr/train-*
- config_name: mrj
data_files:
- split: train
path: mrj/train-*
- config_name: ms
data_files:
- split: train
path: ms/train-*
- config_name: mt
data_files:
- split: train
path: mt/train-*
- config_name: mwl
data_files:
- split: train
path: mwl/train-*
- config_name: my
data_files:
- split: train
path: my/train-*
- config_name: myv
data_files:
- split: train
path: myv/train-*
- config_name: mzn
data_files:
- split: train
path: mzn/train-*
- config_name: nah
data_files:
- split: train
path: nah/train-*
- config_name: nap
data_files:
- split: train
path: nap/train-*
- config_name: nds
data_files:
- split: train
path: nds/train-*
- config_name: ne
data_files:
- split: train
path: ne/train-*
- config_name: new
data_files:
- split: train
path: new/train-*
- config_name: nl
data_files:
- split: train
path: nl/train-*
- config_name: nn
data_files:
- split: train
path: nn/train-*
- config_name: 'no'
data_files:
- split: train
path: no/train-*
- config_name: oc
data_files:
- split: train
path: oc/train-*
- config_name: or
data_files:
- split: train
path: or/train-*
- config_name: os
data_files:
- split: train
path: os/train-*
- config_name: pa
data_files:
- split: train
path: pa/train-*
- config_name: pam
data_files:
- split: train
path: pam/train-*
- config_name: pfl
data_files:
- split: train
path: pfl/train-*
- config_name: pl
data_files:
- split: train
path: pl/train-*
- config_name: pms
data_files:
- split: train
path: pms/train-*
- config_name: pnb
data_files:
- split: train
path: pnb/train-*
- config_name: ps
data_files:
- split: train
path: ps/train-*
- config_name: pt
data_files:
- split: train
path: pt/train-*
- config_name: qu
data_files:
- split: train
path: qu/train-*
- config_name: rm
data_files:
- split: train
path: rm/train-*
- config_name: ro
data_files:
- split: train
path: ro/train-*
- config_name: ru
data_files:
- split: train
path: ru/train-*
- config_name: sa
data_files:
- split: train
path: sa/train-*
- config_name: sah
data_files:
- split: train
path: sah/train-*
- config_name: sc
data_files:
- split: train
path: sc/train-*
- config_name: scn
data_files:
- split: train
path: scn/train-*
- config_name: sco
data_files:
- split: train
path: sco/train-*
- config_name: sd
data_files:
- split: train
path: sd/train-*
- config_name: sh
data_files:
- split: train
path: sh/train-*
- config_name: si
data_files:
- split: train
path: si/train-*
- config_name: sk
data_files:
- split: train
path: sk/train-*
- config_name: sl
data_files:
- split: train
path: sl/train-*
- config_name: so
data_files:
- split: train
path: so/train-*
- config_name: sq
data_files:
- split: train
path: sq/train-*
- config_name: sr
data_files:
- split: train
path: sr/train-*
- config_name: su
data_files:
- split: train
path: su/train-*
- config_name: sv
data_files:
- split: train
path: sv/train-*
- config_name: sw
data_files:
- split: train
path: sw/train-*
- config_name: ta
data_files:
- split: train
path: ta/train-*
- config_name: te
data_files:
- split: train
path: te/train-*
- config_name: tg
data_files:
- split: train
path: tg/train-*
- config_name: th
data_files:
- split: train
path: th/train-*
- config_name: tk
data_files:
- split: train
path: tk/train-*
- config_name: tl
data_files:
- split: train
path: tl/train-*
- config_name: tt
data_files:
- split: train
path: tt/train-*
- config_name: tyv
data_files:
- split: train
path: tyv/train-*
- config_name: ug
data_files:
- split: train
path: ug/train-*
- config_name: uk
data_files:
- split: train
path: uk/train-*
- config_name: ur
data_files:
- split: train
path: ur/train-*
- config_name: uz
data_files:
- split: train
path: uz/train-*
- config_name: vec
data_files:
- split: train
path: vec/train-*
- config_name: vep
data_files:
- split: train
path: vep/train-*
- config_name: vi
data_files:
- split: train
path: vi/train-*
- config_name: vls
data_files:
- split: train
path: vls/train-*
- config_name: vo
data_files:
- split: train
path: vo/train-*
- config_name: wa
data_files:
- split: train
path: wa/train-*
- config_name: war
data_files:
- split: train
path: war/train-*
- config_name: wuu
data_files:
- split: train
path: wuu/train-*
- config_name: xal
data_files:
- split: train
path: xal/train-*
- config_name: xmf
data_files:
- split: train
path: xmf/train-*
- config_name: yi
data_files:
- split: train
path: yi/train-*
- config_name: yo
data_files:
- split: train
path: yo/train-*
- config_name: yue
data_files:
- split: train
path: yue/train-*
- config_name: zh
data_files:
- split: train
path: zh/train-*
---
# hotchpotch/multilingual_cc_news
## Dataset Summary
This dataset republishes the multilingual CC-News data in a Hugging Face-friendly layout, with one configuration per language.
Source datasets on the Hugging Face Hub:
- [CloverSearch/cc-news-mutlilingual](https://huggingface.co/datasets/CloverSearch/cc-news-mutlilingual)
- [intfloat/multilingual_cc_news](https://huggingface.co/datasets/intfloat/multilingual_cc_news)
The intfloat version ships a loading script that pulls raw JSONL files through a custom builder, which can make it difficult to load directly with the `datasets` library. This repo provides the same content as pre-sharded Parquet files, one config per language, so it can be loaded without any custom code.
## Data Fields
- title: string
- maintext: string
- url: string
- date_publish: string
## How to use this dataset
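Each language is a separate dataset config, named by its language code (for example `af` or `ja`). Pass the config name as the second argument to `load_dataset` to load one language at a time: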
```python
from datasets import load_dataset
# Single language
train = load_dataset("hotchpotch/multilingual_cc_news", "af", split="train")
# Another language
train_ja = load_dataset("hotchpotch/multilingual_cc_news", "ja", split="train")
```
## References
No dedicated paper is listed by the source dataset cards. For the CC-News dataset announcement, see:
- [Common Crawl mailing list announcement](https://groups.google.com/g/common-crawl/c/eQC0nLVqmQs)
- [commoncrawl/news-crawl repository on GitHub](https://github.com/commoncrawl/news-crawl/)
## Supported Languages (train split)
| Language | Rows |
|---|---:|
| af | 5,212 |
| als | 652 |
| am | 22,672 |
| an | 23 |
| arz | 9,806 |
| as | 31,679 |
| ast | 338 |
| av | 41 |
| az | 573,044 |
| azb | 61 |
| ba | 6,915 |
| bar | 37 |
| bcl | 34 |
| be | 72,654 |
| bg | 2,924,019 |
| bh | 141 |
| bn | 719,102 |
| bo | 486 |
| bpy | 65 |
| br | 1,103 |
| bs | 30,804 |
| bxr | 165 |
| ca | 851,043 |
| cbk | 13 |
| ce | 39 |
| ceb | 2,449 |
| ckb | 31,864 |
| co | 6 |
| cs | 2,695,727 |
| cv | 54 |
| cy | 37,692 |
| da | 1,187,189 |
| de | 2,242,000 |
| diq | 64 |
| dsb | 4 |
| dty | 2 |
| dv | 6 |
| el | 6,772,358 |
| eml | 81 |
| en | 1,899,000 |
| eo | 4,929 |
| et | 1,098,270 |
| eu | 76,444 |
| fa | 3,443,176 |
| fi | 1,536,679 |
| fy | 39,731 |
| ga | 1,652 |
| gd | 10,988 |
| gl | 113,208 |
| gn | 140 |
| gom | 173 |
| gu | 75,382 |
| gv | 98 |
| he | 530,738 |
| hi | 10,859,572 |
| hif | 1 |
| hr | 1,481,087 |
| hsb | 10 |
| ht | 3,790 |
| hu | 2,485,688 |
| hy | 223,261 |
| ia | 109 |
| id | 4,483,457 |
| ie | 33 |
| ilo | 222 |
| io | 936 |
| is | 196,123 |
| ja | 4,306,405 |
| jbo | 59 |
| jv | 562 |
| ka | 91,811 |
| kk | 55,996 |
| km | 2,630 |
| kn | 290,071 |
| ko | 5,572,465 |
| krc | 134 |
| ku | 7,566 |
| kv | 116 |
| kw | 210 |
| ky | 100,863 |
| la | 31,572 |
| lb | 41,252 |
| lez | 35 |
| li | 9 |
| lmo | 461 |
| lo | 13,915 |
| lt | 590,555 |
| lv | 557,302 |
| mai | 18 |
| mg | 442 |
| mhr | 1,600 |
| min | 37 |
| mk | 216,221 |
| ml | 536,337 |
| mn | 40,425 |
| mr | 362,224 |
| mrj | 6 |
| ms | 31,690 |
| mt | 990 |
| mwl | 7 |
| my | 71,512 |
| myv | 10 |
| mzn | 43 |
| nah | 2 |
| nap | 28 |
| nds | 2,125 |
| ne | 17,766 |
| new | 65 |
| nl | 5,616,536 |
| nn | 171,271 |
| no | 1,738,632 |
| oc | 437 |
| or | 50,586 |
| os | 3 |
| pa | 48,191 |
| pam | 80 |
| pfl | 1 |
| pl | 3,508,134 |
| pms | 275 |
| pnb | 12,793 |
| ps | 40,628 |
| pt | 10,677,210 |
| qu | 280 |
| rm | 8,652 |
| ro | 6,847,940 |
| ru | 2,385,000 |
| sa | 178 |
| sah | 890 |
| sc | 32 |
| scn | 26 |
| sco | 56 |
| sd | 6,078 |
| sh | 235,356 |
| si | 19,193 |
| sk | 1,241,434 |
| sl | 889,313 |
| so | 15,956 |
| sq | 367,998 |
| sr | 1,014,086 |
| su | 579 |
| sv | 3,285,173 |
| sw | 58,793 |
| ta | 1,937,554 |
| te | 616,215 |
| tg | 59,882 |
| th | 174,151 |
| tk | 5,600 |
| tl | 61,195 |
| tt | 14,760 |
| tyv | 2 |
| ug | 18 |
| uk | 3,892,400 |
| ur | 1,767,337 |
| uz | 5,916 |
| vec | 55 |
| vep | 53 |
| vi | 4,386,142 |
| vls | 3 |
| vo | 35 |
| wa | 53 |
| war | 602 |
| wuu | 20 |
| xal | 8 |
| xmf | 1 |
| yi | 115 |
| yo | 5,264 |
| yue | 16 |
| zh | 6,133,244 |
提供机构:
hotchpotch



