OpenLLM-France/wikipedia.fr
收藏Hugging Face2024-02-06 更新2024-07-06 收录
下载链接:
https://hf-mirror.com/datasets/OpenLLM-France/wikipedia.fr
下载链接
链接失效反馈官方服务:
资源简介:
---
# Dataset card metadata: language and license of the data.
language: [fr]
license: [cc-by-sa-4.0]
# Task categories and task ids declared for this dataset.
task_categories: [text-generation, fill-mask]
task_ids: [language-modeling, masked-language-modeling]
# Loadable configurations. Each entry maps a config name to the data files
# that make up its single `train` split.
#   - `default` and "20240201" both select every file under 20240201/.
#   - "20240201.N" (N = 1..127) selects exactly one parquet shard. The config
#     suffix is 1-based while the shard index in the file name is 0-based,
#     so "20240201.N" points at train-<N-1>-of-000127.parquet.
# Config names are quoted so they stay strings ("20240201.10" would otherwise
# be read as the float 20240201.1).
configs:
- config_name: default
  data_files:
  - split: train
    path: 20240201/*
- config_name: "20240201"
  data_files:
  - split: train
    path: 20240201/*
- config_name: "20240201.1"
  data_files:
  - split: train
    path: 20240201/train-000000-of-000127.parquet
- config_name: "20240201.2"
  data_files:
  - split: train
    path: 20240201/train-000001-of-000127.parquet
- config_name: "20240201.3"
  data_files:
  - split: train
    path: 20240201/train-000002-of-000127.parquet
- config_name: "20240201.4"
  data_files:
  - split: train
    path: 20240201/train-000003-of-000127.parquet
- config_name: "20240201.5"
  data_files:
  - split: train
    path: 20240201/train-000004-of-000127.parquet
- config_name: "20240201.6"
  data_files:
  - split: train
    path: 20240201/train-000005-of-000127.parquet
- config_name: "20240201.7"
  data_files:
  - split: train
    path: 20240201/train-000006-of-000127.parquet
- config_name: "20240201.8"
  data_files:
  - split: train
    path: 20240201/train-000007-of-000127.parquet
- config_name: "20240201.9"
  data_files:
  - split: train
    path: 20240201/train-000008-of-000127.parquet
- config_name: "20240201.10"
  data_files:
  - split: train
    path: 20240201/train-000009-of-000127.parquet
- config_name: "20240201.11"
  data_files:
  - split: train
    path: 20240201/train-000010-of-000127.parquet
- config_name: "20240201.12"
  data_files:
  - split: train
    path: 20240201/train-000011-of-000127.parquet
- config_name: "20240201.13"
  data_files:
  - split: train
    path: 20240201/train-000012-of-000127.parquet
- config_name: "20240201.14"
  data_files:
  - split: train
    path: 20240201/train-000013-of-000127.parquet
- config_name: "20240201.15"
  data_files:
  - split: train
    path: 20240201/train-000014-of-000127.parquet
- config_name: "20240201.16"
  data_files:
  - split: train
    path: 20240201/train-000015-of-000127.parquet
- config_name: "20240201.17"
  data_files:
  - split: train
    path: 20240201/train-000016-of-000127.parquet
- config_name: "20240201.18"
  data_files:
  - split: train
    path: 20240201/train-000017-of-000127.parquet
- config_name: "20240201.19"
  data_files:
  - split: train
    path: 20240201/train-000018-of-000127.parquet
- config_name: "20240201.20"
  data_files:
  - split: train
    path: 20240201/train-000019-of-000127.parquet
- config_name: "20240201.21"
  data_files:
  - split: train
    path: 20240201/train-000020-of-000127.parquet
- config_name: "20240201.22"
  data_files:
  - split: train
    path: 20240201/train-000021-of-000127.parquet
- config_name: "20240201.23"
  data_files:
  - split: train
    path: 20240201/train-000022-of-000127.parquet
- config_name: "20240201.24"
  data_files:
  - split: train
    path: 20240201/train-000023-of-000127.parquet
- config_name: "20240201.25"
  data_files:
  - split: train
    path: 20240201/train-000024-of-000127.parquet
- config_name: "20240201.26"
  data_files:
  - split: train
    path: 20240201/train-000025-of-000127.parquet
- config_name: "20240201.27"
  data_files:
  - split: train
    path: 20240201/train-000026-of-000127.parquet
- config_name: "20240201.28"
  data_files:
  - split: train
    path: 20240201/train-000027-of-000127.parquet
- config_name: "20240201.29"
  data_files:
  - split: train
    path: 20240201/train-000028-of-000127.parquet
- config_name: "20240201.30"
  data_files:
  - split: train
    path: 20240201/train-000029-of-000127.parquet
- config_name: "20240201.31"
  data_files:
  - split: train
    path: 20240201/train-000030-of-000127.parquet
- config_name: "20240201.32"
  data_files:
  - split: train
    path: 20240201/train-000031-of-000127.parquet
- config_name: "20240201.33"
  data_files:
  - split: train
    path: 20240201/train-000032-of-000127.parquet
- config_name: "20240201.34"
  data_files:
  - split: train
    path: 20240201/train-000033-of-000127.parquet
- config_name: "20240201.35"
  data_files:
  - split: train
    path: 20240201/train-000034-of-000127.parquet
- config_name: "20240201.36"
  data_files:
  - split: train
    path: 20240201/train-000035-of-000127.parquet
- config_name: "20240201.37"
  data_files:
  - split: train
    path: 20240201/train-000036-of-000127.parquet
- config_name: "20240201.38"
  data_files:
  - split: train
    path: 20240201/train-000037-of-000127.parquet
- config_name: "20240201.39"
  data_files:
  - split: train
    path: 20240201/train-000038-of-000127.parquet
- config_name: "20240201.40"
  data_files:
  - split: train
    path: 20240201/train-000039-of-000127.parquet
- config_name: "20240201.41"
  data_files:
  - split: train
    path: 20240201/train-000040-of-000127.parquet
- config_name: "20240201.42"
  data_files:
  - split: train
    path: 20240201/train-000041-of-000127.parquet
- config_name: "20240201.43"
  data_files:
  - split: train
    path: 20240201/train-000042-of-000127.parquet
- config_name: "20240201.44"
  data_files:
  - split: train
    path: 20240201/train-000043-of-000127.parquet
- config_name: "20240201.45"
  data_files:
  - split: train
    path: 20240201/train-000044-of-000127.parquet
- config_name: "20240201.46"
  data_files:
  - split: train
    path: 20240201/train-000045-of-000127.parquet
- config_name: "20240201.47"
  data_files:
  - split: train
    path: 20240201/train-000046-of-000127.parquet
- config_name: "20240201.48"
  data_files:
  - split: train
    path: 20240201/train-000047-of-000127.parquet
- config_name: "20240201.49"
  data_files:
  - split: train
    path: 20240201/train-000048-of-000127.parquet
- config_name: "20240201.50"
  data_files:
  - split: train
    path: 20240201/train-000049-of-000127.parquet
- config_name: "20240201.51"
  data_files:
  - split: train
    path: 20240201/train-000050-of-000127.parquet
- config_name: "20240201.52"
  data_files:
  - split: train
    path: 20240201/train-000051-of-000127.parquet
- config_name: "20240201.53"
  data_files:
  - split: train
    path: 20240201/train-000052-of-000127.parquet
- config_name: "20240201.54"
  data_files:
  - split: train
    path: 20240201/train-000053-of-000127.parquet
- config_name: "20240201.55"
  data_files:
  - split: train
    path: 20240201/train-000054-of-000127.parquet
- config_name: "20240201.56"
  data_files:
  - split: train
    path: 20240201/train-000055-of-000127.parquet
- config_name: "20240201.57"
  data_files:
  - split: train
    path: 20240201/train-000056-of-000127.parquet
- config_name: "20240201.58"
  data_files:
  - split: train
    path: 20240201/train-000057-of-000127.parquet
- config_name: "20240201.59"
  data_files:
  - split: train
    path: 20240201/train-000058-of-000127.parquet
- config_name: "20240201.60"
  data_files:
  - split: train
    path: 20240201/train-000059-of-000127.parquet
- config_name: "20240201.61"
  data_files:
  - split: train
    path: 20240201/train-000060-of-000127.parquet
- config_name: "20240201.62"
  data_files:
  - split: train
    path: 20240201/train-000061-of-000127.parquet
- config_name: "20240201.63"
  data_files:
  - split: train
    path: 20240201/train-000062-of-000127.parquet
- config_name: "20240201.64"
  data_files:
  - split: train
    path: 20240201/train-000063-of-000127.parquet
- config_name: "20240201.65"
  data_files:
  - split: train
    path: 20240201/train-000064-of-000127.parquet
- config_name: "20240201.66"
  data_files:
  - split: train
    path: 20240201/train-000065-of-000127.parquet
- config_name: "20240201.67"
  data_files:
  - split: train
    path: 20240201/train-000066-of-000127.parquet
- config_name: "20240201.68"
  data_files:
  - split: train
    path: 20240201/train-000067-of-000127.parquet
- config_name: "20240201.69"
  data_files:
  - split: train
    path: 20240201/train-000068-of-000127.parquet
- config_name: "20240201.70"
  data_files:
  - split: train
    path: 20240201/train-000069-of-000127.parquet
- config_name: "20240201.71"
  data_files:
  - split: train
    path: 20240201/train-000070-of-000127.parquet
- config_name: "20240201.72"
  data_files:
  - split: train
    path: 20240201/train-000071-of-000127.parquet
- config_name: "20240201.73"
  data_files:
  - split: train
    path: 20240201/train-000072-of-000127.parquet
- config_name: "20240201.74"
  data_files:
  - split: train
    path: 20240201/train-000073-of-000127.parquet
- config_name: "20240201.75"
  data_files:
  - split: train
    path: 20240201/train-000074-of-000127.parquet
- config_name: "20240201.76"
  data_files:
  - split: train
    path: 20240201/train-000075-of-000127.parquet
- config_name: "20240201.77"
  data_files:
  - split: train
    path: 20240201/train-000076-of-000127.parquet
- config_name: "20240201.78"
  data_files:
  - split: train
    path: 20240201/train-000077-of-000127.parquet
- config_name: "20240201.79"
  data_files:
  - split: train
    path: 20240201/train-000078-of-000127.parquet
- config_name: "20240201.80"
  data_files:
  - split: train
    path: 20240201/train-000079-of-000127.parquet
- config_name: "20240201.81"
  data_files:
  - split: train
    path: 20240201/train-000080-of-000127.parquet
- config_name: "20240201.82"
  data_files:
  - split: train
    path: 20240201/train-000081-of-000127.parquet
- config_name: "20240201.83"
  data_files:
  - split: train
    path: 20240201/train-000082-of-000127.parquet
- config_name: "20240201.84"
  data_files:
  - split: train
    path: 20240201/train-000083-of-000127.parquet
- config_name: "20240201.85"
  data_files:
  - split: train
    path: 20240201/train-000084-of-000127.parquet
- config_name: "20240201.86"
  data_files:
  - split: train
    path: 20240201/train-000085-of-000127.parquet
- config_name: "20240201.87"
  data_files:
  - split: train
    path: 20240201/train-000086-of-000127.parquet
- config_name: "20240201.88"
  data_files:
  - split: train
    path: 20240201/train-000087-of-000127.parquet
- config_name: "20240201.89"
  data_files:
  - split: train
    path: 20240201/train-000088-of-000127.parquet
- config_name: "20240201.90"
  data_files:
  - split: train
    path: 20240201/train-000089-of-000127.parquet
- config_name: "20240201.91"
  data_files:
  - split: train
    path: 20240201/train-000090-of-000127.parquet
- config_name: "20240201.92"
  data_files:
  - split: train
    path: 20240201/train-000091-of-000127.parquet
- config_name: "20240201.93"
  data_files:
  - split: train
    path: 20240201/train-000092-of-000127.parquet
- config_name: "20240201.94"
  data_files:
  - split: train
    path: 20240201/train-000093-of-000127.parquet
- config_name: "20240201.95"
  data_files:
  - split: train
    path: 20240201/train-000094-of-000127.parquet
- config_name: "20240201.96"
  data_files:
  - split: train
    path: 20240201/train-000095-of-000127.parquet
- config_name: "20240201.97"
  data_files:
  - split: train
    path: 20240201/train-000096-of-000127.parquet
- config_name: "20240201.98"
  data_files:
  - split: train
    path: 20240201/train-000097-of-000127.parquet
- config_name: "20240201.99"
  data_files:
  - split: train
    path: 20240201/train-000098-of-000127.parquet
- config_name: "20240201.100"
  data_files:
  - split: train
    path: 20240201/train-000099-of-000127.parquet
- config_name: "20240201.101"
  data_files:
  - split: train
    path: 20240201/train-000100-of-000127.parquet
- config_name: "20240201.102"
  data_files:
  - split: train
    path: 20240201/train-000101-of-000127.parquet
- config_name: "20240201.103"
  data_files:
  - split: train
    path: 20240201/train-000102-of-000127.parquet
- config_name: "20240201.104"
  data_files:
  - split: train
    path: 20240201/train-000103-of-000127.parquet
- config_name: "20240201.105"
  data_files:
  - split: train
    path: 20240201/train-000104-of-000127.parquet
- config_name: "20240201.106"
  data_files:
  - split: train
    path: 20240201/train-000105-of-000127.parquet
- config_name: "20240201.107"
  data_files:
  - split: train
    path: 20240201/train-000106-of-000127.parquet
- config_name: "20240201.108"
  data_files:
  - split: train
    path: 20240201/train-000107-of-000127.parquet
- config_name: "20240201.109"
  data_files:
  - split: train
    path: 20240201/train-000108-of-000127.parquet
- config_name: "20240201.110"
  data_files:
  - split: train
    path: 20240201/train-000109-of-000127.parquet
- config_name: "20240201.111"
  data_files:
  - split: train
    path: 20240201/train-000110-of-000127.parquet
- config_name: "20240201.112"
  data_files:
  - split: train
    path: 20240201/train-000111-of-000127.parquet
- config_name: "20240201.113"
  data_files:
  - split: train
    path: 20240201/train-000112-of-000127.parquet
- config_name: "20240201.114"
  data_files:
  - split: train
    path: 20240201/train-000113-of-000127.parquet
- config_name: "20240201.115"
  data_files:
  - split: train
    path: 20240201/train-000114-of-000127.parquet
- config_name: "20240201.116"
  data_files:
  - split: train
    path: 20240201/train-000115-of-000127.parquet
- config_name: "20240201.117"
  data_files:
  - split: train
    path: 20240201/train-000116-of-000127.parquet
- config_name: "20240201.118"
  data_files:
  - split: train
    path: 20240201/train-000117-of-000127.parquet
- config_name: "20240201.119"
  data_files:
  - split: train
    path: 20240201/train-000118-of-000127.parquet
- config_name: "20240201.120"
  data_files:
  - split: train
    path: 20240201/train-000119-of-000127.parquet
- config_name: "20240201.121"
  data_files:
  - split: train
    path: 20240201/train-000120-of-000127.parquet
- config_name: "20240201.122"
  data_files:
  - split: train
    path: 20240201/train-000121-of-000127.parquet
- config_name: "20240201.123"
  data_files:
  - split: train
    path: 20240201/train-000122-of-000127.parquet
- config_name: "20240201.124"
  data_files:
  - split: train
    path: 20240201/train-000123-of-000127.parquet
- config_name: "20240201.125"
  data_files:
  - split: train
    path: 20240201/train-000124-of-000127.parquet
- config_name: "20240201.126"
  data_files:
  - split: train
    path: 20240201/train-000125-of-000127.parquet
- config_name: "20240201.127"
  data_files:
  - split: train
    path: 20240201/train-000126-of-000127.parquet
dataset_info:
- config_name: default
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 10734253565
num_examples: 2647717
download_size: 5990349749
dataset_size: 10734253565
- config_name: "20240201"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 10734253565
num_examples: 2647717
download_size: 5990349749
dataset_size: 10734253565
- config_name: "20240201.1"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 34196428
num_examples: 16396
download_size: 16506904
dataset_size: 34196428
- config_name: "20240201.2"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 27054791
num_examples: 18219
download_size: 12592449
dataset_size: 27054791
- config_name: "20240201.3"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 43830534
num_examples: 20789
download_size: 24005506
dataset_size: 43830534
- config_name: "20240201.4"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 71119058
num_examples: 31684
download_size: 41577213
dataset_size: 71119058
- config_name: "20240201.5"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 70841264
num_examples: 34160
download_size: 41421942
dataset_size: 70841264
- config_name: "20240201.6"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 69276072
num_examples: 29334
download_size: 39067795
dataset_size: 69276072
- config_name: "20240201.7"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 68674209
num_examples: 30908
download_size: 38793995
dataset_size: 68674209
- config_name: "20240201.8"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 66707411
num_examples: 19863
download_size: 33783086
dataset_size: 66707411
- config_name: "20240201.9"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 78359256
num_examples: 33781
download_size: 45286766
dataset_size: 78359256
- config_name: "20240201.10"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 72491164
num_examples: 29415
download_size: 40051853
dataset_size: 72491164
- config_name: "20240201.11"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 52088660
num_examples: 27038
download_size: 29422313
dataset_size: 52088660
- config_name: "20240201.12"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 53922030
num_examples: 27711
download_size: 30782847
dataset_size: 53922030
- config_name: "20240201.13"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 71147815
num_examples: 30115
download_size: 41109199
dataset_size: 71147815
- config_name: "20240201.14"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 75584185
num_examples: 34222
download_size: 44591235
dataset_size: 75584185
- config_name: "20240201.15"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 61102022
num_examples: 29388
download_size: 35345427
dataset_size: 61102022
- config_name: "20240201.16"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 52669405
num_examples: 25886
download_size: 29765352
dataset_size: 52669405
- config_name: "20240201.17"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 63002951
num_examples: 30210
download_size: 37237317
dataset_size: 63002951
- config_name: "20240201.18"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 72018629
num_examples: 33039
download_size: 42190421
dataset_size: 72018629
- config_name: "20240201.19"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 77047993
num_examples: 34730
download_size: 45416262
dataset_size: 77047993
- config_name: "20240201.20"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 71403591
num_examples: 29422
download_size: 40555745
dataset_size: 71403591
- config_name: "20240201.21"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 72784477
num_examples: 25282
download_size: 38386186
dataset_size: 72784477
- config_name: "20240201.22"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 77132028
num_examples: 28813
download_size: 40667522
dataset_size: 77132028
- config_name: "20240201.23"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 76783997
num_examples: 33275
download_size: 44706134
dataset_size: 76783997
- config_name: "20240201.24"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 74967948
num_examples: 26814
download_size: 39893670
dataset_size: 74967948
- config_name: "20240201.25"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 76874943
num_examples: 29343
download_size: 41627736
dataset_size: 76874943
- config_name: "20240201.26"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 70748683
num_examples: 27868
download_size: 38910526
dataset_size: 70748683
- config_name: "20240201.27"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 69525255
num_examples: 32615
download_size: 40576993
dataset_size: 69525255
- config_name: "20240201.28"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 75815913
num_examples: 34698
download_size: 44353299
dataset_size: 75815913
- config_name: "20240201.29"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 73241748
num_examples: 33102
download_size: 42403968
dataset_size: 73241748
- config_name: "20240201.30"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 73268105
num_examples: 35290
download_size: 42915636
dataset_size: 73268105
- config_name: "20240201.31"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 71784047
num_examples: 33478
download_size: 41483863
dataset_size: 71784047
- config_name: "20240201.32"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 73563401
num_examples: 33673
download_size: 42904317
dataset_size: 73563401
- config_name: "20240201.33"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 76305162
num_examples: 31663
download_size: 43808315
dataset_size: 76305162
- config_name: "20240201.34"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 73540193
num_examples: 29207
download_size: 41448910
dataset_size: 73540193
- config_name: "20240201.35"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 72914512
num_examples: 30460
download_size: 41771375
dataset_size: 72914512
- config_name: "20240201.36"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 65165255
num_examples: 32540
download_size: 37573941
dataset_size: 65165255
- config_name: "20240201.37"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 69989838
num_examples: 30901
download_size: 40275809
dataset_size: 69989838
- config_name: "20240201.38"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 71760117
num_examples: 30623
download_size: 41374992
dataset_size: 71760117
- config_name: "20240201.39"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 59012252
num_examples: 23924
download_size: 33266709
dataset_size: 59012252
- config_name: "20240201.40"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 71498864
num_examples: 24568
download_size: 37194152
dataset_size: 71498864
- config_name: "20240201.41"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 73351642
num_examples: 30071
download_size: 41397164
dataset_size: 73351642
- config_name: "20240201.42"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 69757641
num_examples: 33447
download_size: 41221344
dataset_size: 69757641
- config_name: "20240201.43"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 56131020
num_examples: 25953
download_size: 32083283
dataset_size: 56131020
- config_name: "20240201.44"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 69901910
num_examples: 29543
download_size: 39603099
dataset_size: 69901910
- config_name: "20240201.45"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 67320642
num_examples: 30496
download_size: 38572169
dataset_size: 67320642
- config_name: "20240201.46"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 75201697
num_examples: 24785
download_size: 38698472
dataset_size: 75201697
- config_name: "20240201.47"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 71647763
num_examples: 29341
download_size: 40281819
dataset_size: 71647763
- config_name: "20240201.48"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 74845409
num_examples: 33847
download_size: 43277340
dataset_size: 74845409
- config_name: "20240201.49"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 70275758
num_examples: 28664
download_size: 39130233
dataset_size: 70275758
- config_name: "20240201.50"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 70627618
num_examples: 33106
download_size: 41078652
dataset_size: 70627618
- config_name: "20240201.51"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 64080976
num_examples: 27496
download_size: 36516720
dataset_size: 64080976
- config_name: "20240201.52"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 64633349
num_examples: 28159
download_size: 36695690
dataset_size: 64633349
- config_name: "20240201.53"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 65937451
num_examples: 29331
download_size: 37886853
dataset_size: 65937451
- config_name: "20240201.54"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 63068048
num_examples: 27428
download_size: 36157934
dataset_size: 63068048
- config_name: "20240201.55"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 80624541
num_examples: 23177
download_size: 45015741
dataset_size: 80624541
- config_name: "20240201.56"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 91668895
num_examples: 21747
download_size: 51680423
dataset_size: 91668895
- config_name: "20240201.57"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 92928216
num_examples: 21120
download_size: 52439105
dataset_size: 92928216
- config_name: "20240201.58"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 92122763
num_examples: 20936
download_size: 51557933
dataset_size: 92122763
- config_name: "20240201.59"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 92896352
num_examples: 20141
download_size: 51601121
dataset_size: 92896352
- config_name: "20240201.60"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 95847565
num_examples: 20354
download_size: 52370169
dataset_size: 95847565
- config_name: "20240201.61"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 94892098
num_examples: 19699
download_size: 51987114
dataset_size: 94892098
- config_name: "20240201.62"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 94733883
num_examples: 21424
download_size: 53629896
dataset_size: 94733883
- config_name: "20240201.63"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 84760330
num_examples: 26724
download_size: 47727272
dataset_size: 84760330
- config_name: "20240201.64"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 76862741
num_examples: 29630
download_size: 42451391
dataset_size: 76862741
- config_name: "20240201.65"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 86835866
num_examples: 24746
download_size: 47559783
dataset_size: 86835866
- config_name: "20240201.66"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 97419505
num_examples: 20803
download_size: 54167375
dataset_size: 97419505
- config_name: "20240201.67"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 101240105
num_examples: 20508
download_size: 56988938
dataset_size: 101240105
- config_name: "20240201.68"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 100530987
num_examples: 20296
download_size: 56648848
dataset_size: 100530987
- config_name: "20240201.69"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 99765384
num_examples: 20504
download_size: 56629497
dataset_size: 99765384
- config_name: "20240201.70"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 99413760
num_examples: 19799
download_size: 55896759
dataset_size: 99413760
- config_name: "20240201.71"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 96699449
num_examples: 19261
download_size: 54160771
dataset_size: 96699449
- config_name: "20240201.72"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 96074130
num_examples: 18109
download_size: 53563085
dataset_size: 96074130
- config_name: "20240201.73"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 97872010
num_examples: 18950
download_size: 55104774
dataset_size: 97872010
- config_name: "20240201.74"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 103660117
num_examples: 18087
download_size: 57735545
dataset_size: 103660117
- config_name: "20240201.75"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 102927235
num_examples: 17259
download_size: 57769819
dataset_size: 102927235
- config_name: "20240201.76"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 101830529
num_examples: 17203
download_size: 57364131
dataset_size: 101830529
- config_name: "20240201.77"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 99300851
num_examples: 17022
download_size: 55394788
dataset_size: 99300851
- config_name: "20240201.78"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 89076628
num_examples: 17012
download_size: 49712869
dataset_size: 89076628
- config_name: "20240201.79"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 87018887
num_examples: 15667
download_size: 48560709
dataset_size: 87018887
- config_name: "20240201.80"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 102342716
num_examples: 15699
download_size: 56890574
dataset_size: 102342716
- config_name: "20240201.81"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 107543111
num_examples: 15723
download_size: 59819205
dataset_size: 107543111
- config_name: "20240201.82"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 105027998
num_examples: 15747
download_size: 58690957
dataset_size: 105027998
- config_name: "20240201.83"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 108813281
num_examples: 15867
download_size: 60887398
dataset_size: 108813281
- config_name: "20240201.84"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 108509538
num_examples: 15994
download_size: 60947424
dataset_size: 108509538
- config_name: "20240201.85"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 110251685
num_examples: 15836
download_size: 61438675
dataset_size: 110251685
- config_name: "20240201.86"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 108453666
num_examples: 15933
download_size: 60501214
dataset_size: 108453666
- config_name: "20240201.87"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 104511861
num_examples: 15725
download_size: 57654758
dataset_size: 104511861
- config_name: "20240201.88"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 93987463
num_examples: 15443
download_size: 51101131
dataset_size: 93987463
- config_name: "20240201.89"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 88103739
num_examples: 14825
download_size: 47103909
dataset_size: 88103739
- config_name: "20240201.90"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 101651589
num_examples: 16649
download_size: 56353221
dataset_size: 101651589
- config_name: "20240201.91"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 108162134
num_examples: 15811
download_size: 60694800
dataset_size: 108162134
- config_name: "20240201.92"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 106743929
num_examples: 14611
download_size: 59912440
dataset_size: 106743929
- config_name: "20240201.93"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 108525717
num_examples: 14309
download_size: 60885284
dataset_size: 108525717
- config_name: "20240201.94"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 108815335
num_examples: 14162
download_size: 60886861
dataset_size: 108815335
- config_name: "20240201.95"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 110701667
num_examples: 14093
download_size: 62307593
dataset_size: 110701667
- config_name: "20240201.96"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 110487897
num_examples: 14666
download_size: 62142956
dataset_size: 110487897
- config_name: "20240201.97"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 110022382
num_examples: 13955
download_size: 61626408
dataset_size: 110022382
- config_name: "20240201.98"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 103291309
num_examples: 13078
download_size: 57610204
dataset_size: 103291309
- config_name: "20240201.99"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 100143988
num_examples: 12452
download_size: 55137359
dataset_size: 100143988
- config_name: "20240201.100"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 98674110
num_examples: 12092
download_size: 54635938
dataset_size: 98674110
- config_name: "20240201.101"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 95599168
num_examples: 12972
download_size: 53196972
dataset_size: 95599168
- config_name: "20240201.102"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 106494406
num_examples: 12784
download_size: 59358058
dataset_size: 106494406
- config_name: "20240201.103"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 112642058
num_examples: 13182
download_size: 63217625
dataset_size: 112642058
- config_name: "20240201.104"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 114080181
num_examples: 12846
download_size: 63291654
dataset_size: 114080181
- config_name: "20240201.105"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 114514106
num_examples: 12574
download_size: 63203902
dataset_size: 114514106
- config_name: "20240201.106"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 113416864
num_examples: 12387
download_size: 62338080
dataset_size: 113416864
- config_name: "20240201.107"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 118319372
num_examples: 12158
download_size: 63800873
dataset_size: 118319372
- config_name: "20240201.108"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 114829730
num_examples: 12578
download_size: 63735252
dataset_size: 114829730
- config_name: "20240201.109"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 119250917
num_examples: 11721
download_size: 66271787
dataset_size: 119250917
- config_name: "20240201.110"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 117233559
num_examples: 11910
download_size: 65164043
dataset_size: 117233559
- config_name: "20240201.111"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 116855974
num_examples: 11730
download_size: 65042961
dataset_size: 116855974
- config_name: "20240201.112"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 118134356
num_examples: 10579
download_size: 65094506
dataset_size: 118134356
- config_name: "20240201.113"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 118623669
num_examples: 10037
download_size: 64871384
dataset_size: 118623669
- config_name: "20240201.114"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 117393757
num_examples: 9770
download_size: 63845856
dataset_size: 117393757
- config_name: "20240201.115"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 106111598
num_examples: 9071
download_size: 56797624
dataset_size: 106111598
- config_name: "20240201.116"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 90226981
num_examples: 7375
download_size: 48549040
dataset_size: 90226981
- config_name: "20240201.117"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 81116757
num_examples: 6312
download_size: 43881415
dataset_size: 81116757
- config_name: "20240201.118"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 69767796
num_examples: 5669
download_size: 37464950
dataset_size: 69767796
- config_name: "20240201.119"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 68532234
num_examples: 5141
download_size: 36501941
dataset_size: 68532234
- config_name: "20240201.120"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 67940220
num_examples: 5200
download_size: 35946156
dataset_size: 67940220
- config_name: "20240201.121"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 67517717
num_examples: 5147
download_size: 36402180
dataset_size: 67517717
- config_name: "20240201.122"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 64165695
num_examples: 5036
download_size: 34640708
dataset_size: 64165695
- config_name: "20240201.123"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 63067405
num_examples: 4790
download_size: 33983912
dataset_size: 63067405
- config_name: "20240201.124"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 64134100
num_examples: 4637
download_size: 34973221
dataset_size: 64134100
- config_name: "20240201.125"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 67893676
num_examples: 4804
download_size: 37099621
dataset_size: 67893676
- config_name: "20240201.126"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 56660069
num_examples: 4462
download_size: 30787576
dataset_size: 56660069
- config_name: "20240201.127"
features:
- name: id
dtype: int32
- name: url
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 51968128
num_examples: 3883
download_size: 28293835
dataset_size: 51968128
---
# Plain text of French Wikipedia
This dataset was created by [OpenLLM France](https://openllmfrance.org/) from the [Wikimedia dumps](https://dumps.wikimedia.org/other/enterprise_html/runs).
It is a plain text version of pages from [fr.wikipedia.org/wiki](https://fr.wikipedia.org/wiki)
without HTML tags or wiki templates.
The text just includes markdown syntax for headers, lists and tables.
See [Notes on data formatting](#notes-on-data-formatting) for more details.
* [Statistics](#statistics)
* [Example use (python)](#example-use-python)
* [Data fields](#data-fields)
* [Notes on data formatting](#notes-on-data-formatting)
* [License](#license)
* [Acknowledgements](#aknowledgements)
* [Citation](#citation)
## Statistics
The amount of data for the latest dump (20240201) is:
| **Unit** | **Size** |
| :---------------|--------: |
| # documents | 2 647 717 |
| # paragraphs | 24 943 988 |
| # lines | 97 295 694 |
| # words | 1 758 293 925 |
| # characters | 10 141 322 856 |
| size in memory | 10237.0 MB |
| size on disk | 5712.8 MB |
## Example use (python)
Load the full dataset:
```python
import datasets
ds = datasets.load_dataset("OpenLLM-France/wikipedia.fr", streaming=True)
```
Load only a small subset:
```python
import datasets
i = 1 # Can be a number between 1 and 127
subset = f"20240201.{i}"
ds = datasets.load_dataset("OpenLLM-France/wikipedia.fr", subset, streaming=True)
```
## Data fields
The data fields are the same among all configurations:
- `id` (`int`): ID of the article.
- `url` (`str`): URL of the article.
- `title` (`str`): Title of the article.
- `text` (`str`): Text content of the article.
For example:
```
{'id': 847027,
'url': 'https://fr.wikipedia.org/wiki/D%C3%A9partement_de_Rinconada',
'title': 'Département de Rinconada',
'text': "# Département de Rinconada\n\nLe département de Rinconada est une des 16 subdivisions de la province de Jujuy, en Argentine. Son chef-lieu est la ville de Rinconada.\nLe département a une superficie de 6 407 km². Sa population extrêmement clairsemée s'élevait à 2 298 habitants en 2001, soit une densité de 0,3 hab./km².\n\n# Département de Rinconada\n## Localités\n\nOutre le chef-lieu d'arrondissement, Rinconada, il faut citer :\n* Mina Pirquitas, localité la plus élevée d'Argentine (4 271 mètres d'altitude).\nC'est sur le territoire de ce département qu'est situé le site Ramsar des Lagunas de Vilama."
}
```
## Notes on data formatting
### Special characters
Superscripts and subscripts are kept as unicode characters when possible, e.g. `XIIIᵉ siècle`, `3 000 m²`, `P₂O₇⁴⁻`.
Non-breaking spaces are kept as unicode characters
(in python, use `text.replace("\u00A0", " ")` to replace them with normal spaces).
### Markdown syntax
Tables are kept as markdown tables.
For instance:
<div style="border: 2px solid black; padding: 10px;"><code>
| Français | Espéranto | IPA |<br/>
| Salut, bonjour | Saluton | [sa.ˈlu.ton] |<br/>
| Oui | Jes | [ˈjes] |<br/>
| Non | Ne | [ˈne] |<br/>
| Bonsoir | Bonan vesperon | [ˈbo.nan ves.ˈpe.ron] |<br/>
| Bonne nuit | Bonan nokton | [ˈbo.nan ˈnok.ton] |
</code></div>
Lists are kept as markdown lists.
For instance:
<div style="border: 2px solid black; padding: 10px;"><code>
* 1 000 personnes ont l'espéranto comme langue maternelle ;<br/>
* 10 000 personnes parlent l'espéranto avec un niveau proche d'une langue maternelle ;<br/>
* 100 000 personnes parlent couramment l'espéranto ;
</code></div>
Headers are kept as markdown headers, and the full hierarchy of parent headers (from the page title down to the current section) is repeated before each paragraph.
The first header always corresponds to the page title.
For instance:
<div style="border: 2px solid black; padding: 10px;"><code>
# Espéranto<br />
<br />
L'espéranto est une langue construite internationale utilisée comme langue véhiculaire par des personnes […]<br />
Fondée sur une grammaire régulière sans exception, l'espéranto est une langue globalement agglutinante où[…]<br />
C’est en 1887 que Louis-Lazare Zamenhof, sous le pseudonyme Doktoro Esperanto (Docteur « Espérant ») qui […]<br />
L’Association universelle d’espéranto, fondée en 1908, est en relation officielle avec l’Organisation des[…]<br />
<br />
# Espéranto<br />
## Définition<br />
### Nom<br />
<br />
Le pseudonyme « Doktoro Esperanto » (Docteur « Espérant »), utilisé par Zamenhof pour présenter son proje[…]<br />
<br />
# Espéranto<br />
## Définition<br />
### Nom<br />
#### Utilisation du mot espéranto en tant que métaphore<br />
<br />
Le nom espéranto fonctionne comme un nom propre quand il désigne la langue même, mais est parfois utilisé[…]<br />
Dans le domaine de l'informatique, Java fut qualifié d'« espéranto des langages de programmation », en pa[…]
</code></div>
## License
This dataset is distributed under the [Creative Commons Attribution-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-sa/4.0/).
<a id="aknowledgements"></a>
## Acknowledgements
This dataset was created by [Jérôme Louradour](https://huggingface.co/Jeronymous) on behalf of [OpenLLM France](https://openllmfrance.org/).
Many thanks to the [Wikimedia Foundation](https://wikimediafoundation.org/)
for providing the data and useful advice,
in particular Isaac Johnson, Albert Villanova and Rémy Gerbet.
## Citation
```
@online{wikipedia_fr_dump,
author = "OpenLLM France",
title = "Plain text of French Wikipedia",
url = "https://huggingface.co/datasets/OpenLLM-France/wikipedia.fr"
}
```
This dataset includes multiple configuration files, each corresponding to different data files and features. The main features include id, url, title, and text, with data types of int32 and string. The dataset is primarily used for text generation and masked language modeling tasks. The dataset is in French and uses the CC BY-SA 4.0 license.
提供机构:
OpenLLM-France
原始信息汇总
数据集概述
语言
- 法语(fr)
许可证
- CC BY-SA 4.0
任务类别
- 文本生成
- 填充掩码
任务ID
- 语言建模
- 掩码语言建模
配置信息
默认配置
- 配置名称: default
- 数据文件路径: 20240201/*
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 10734253565
- 样本数: 2647717
- 训练集:
- 下载大小: 5990349749
- 数据集大小: 10734253565
20240201 配置
- 配置名称: 20240201
- 数据文件路径: 20240201/*
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 10734253565
- 样本数: 2647717
- 训练集:
- 下载大小: 5990349749
- 数据集大小: 10734253565
20240201.1 配置
- 配置名称: 20240201.1
- 数据文件路径: 20240201/train-000000-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 34196428
- 样本数: 16396
- 训练集:
- 下载大小: 16506904
- 数据集大小: 34196428
20240201.2 配置
- 配置名称: 20240201.2
- 数据文件路径: 20240201/train-000001-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 27054791
- 样本数: 18219
- 训练集:
- 下载大小: 12592449
- 数据集大小: 27054791
20240201.3 配置
- 配置名称: 20240201.3
- 数据文件路径: 20240201/train-000002-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 43830534
- 样本数: 20789
- 训练集:
- 下载大小: 24005506
- 数据集大小: 43830534
20240201.4 配置
- 配置名称: 20240201.4
- 数据文件路径: 20240201/train-000003-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 71119058
- 样本数: 31684
- 训练集:
- 下载大小: 41577213
- 数据集大小: 71119058
20240201.5 配置
- 配置名称: 20240201.5
- 数据文件路径: 20240201/train-000004-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 70841264
- 样本数: 34160
- 训练集:
- 下载大小: 41421942
- 数据集大小: 70841264
20240201.6 配置
- 配置名称: 20240201.6
- 数据文件路径: 20240201/train-000005-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 69276072
- 样本数: 29334
- 训练集:
- 下载大小: 39067795
- 数据集大小: 69276072
20240201.7 配置
- 配置名称: 20240201.7
- 数据文件路径: 20240201/train-000006-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 68674209
- 样本数: 30908
- 训练集:
- 下载大小: 38793995
- 数据集大小: 68674209
20240201.8 配置
- 配置名称: 20240201.8
- 数据文件路径: 20240201/train-000007-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 66707411
- 样本数: 19863
- 训练集:
- 下载大小: 33783086
- 数据集大小: 66707411
20240201.9 配置
- 配置名称: 20240201.9
- 数据文件路径: 20240201/train-000008-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 78359256
- 样本数: 33781
- 训练集:
- 下载大小: 45286766
- 数据集大小: 78359256
20240201.10 配置
- 配置名称: 20240201.10
- 数据文件路径: 20240201/train-000009-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 72491164
- 样本数: 29415
- 训练集:
- 下载大小: 40051853
- 数据集大小: 72491164
20240201.11 配置
- 配置名称: 20240201.11
- 数据文件路径: 20240201/train-000010-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 52088660
- 样本数: 27038
- 训练集:
- 下载大小: 29422313
- 数据集大小: 52088660
20240201.12 配置
- 配置名称: 20240201.12
- 数据文件路径: 20240201/train-000011-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 53922030
- 样本数: 27711
- 训练集:
- 下载大小: 30782847
- 数据集大小: 53922030
20240201.13 配置
- 配置名称: 20240201.13
- 数据文件路径: 20240201/train-000012-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 71147815
- 样本数: 30115
- 训练集:
- 下载大小: 41109199
- 数据集大小: 71147815
20240201.14 配置
- 配置名称: 20240201.14
- 数据文件路径: 20240201/train-000013-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 75584185
- 样本数: 34222
- 训练集:
- 下载大小: 44591235
- 数据集大小: 75584185
20240201.15 配置
- 配置名称: 20240201.15
- 数据文件路径: 20240201/train-000014-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 61102022
- 样本数: 29388
- 训练集:
- 下载大小: 35345427
- 数据集大小: 61102022
20240201.16 配置
- 配置名称: 20240201.16
- 数据文件路径: 20240201/train-000015-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 52669405
- 样本数: 25886
- 训练集:
- 下载大小: 29765352
- 数据集大小: 52669405
20240201.17 配置
- 配置名称: 20240201.17
- 数据文件路径: 20240201/train-000016-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 63002951
- 样本数: 30210
- 训练集:
- 下载大小: 37237317
- 数据集大小: 63002951
20240201.18 配置
- 配置名称: 20240201.18
- 数据文件路径: 20240201/train-000017-of-000127.parquet
- 特征:
- id: int32
- url: string
- title: string
- text: string
- 分割:
- 训练集:
- 字节数: 63002951
- 样本数: 30210
- 训练集:
- 下载大小: 37237317
- 数据集大小: 63002951



