dododo1234/fineweb_edu
收藏 Hugging Face 2024-06-03 更新 2024-06-12 收录
下载链接:
https://hf-mirror.com/datasets/dododo1234/fineweb_edu
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: sample10bt-0
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 925700532
num_examples: 193442
download_size: 550580187
dataset_size: 925700532
- config_name: sample10bt-1
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 924290167
num_examples: 193442
download_size: 549843020
dataset_size: 924290167
- config_name: sample10bt-10
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 932556823
num_examples: 193442
download_size: 552673193
dataset_size: 932556823
- config_name: sample10bt-11
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 926438193
num_examples: 193442
download_size: 549038441
dataset_size: 926438193
- config_name: sample10bt-12
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 912121145
num_examples: 193442
download_size: 540693059
dataset_size: 912121145
- config_name: sample10bt-13
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 918403300
num_examples: 193442
download_size: 543811914
dataset_size: 918403300
- config_name: sample10bt-14
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 929283866
num_examples: 193442
download_size: 549880214
dataset_size: 929283866
- config_name: sample10bt-15
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 916033918
num_examples: 193442
download_size: 542077971
dataset_size: 916033918
- config_name: sample10bt-16
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 891530928
num_examples: 193442
download_size: 532190679
dataset_size: 891530928
- config_name: sample10bt-17
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 901026027
num_examples: 193442
download_size: 537720236
dataset_size: 901026027
- config_name: sample10bt-18
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 899439135
num_examples: 193442
download_size: 536756057
dataset_size: 899439135
- config_name: sample10bt-19
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 895955691
num_examples: 193442
download_size: 534529714
dataset_size: 895955691
- config_name: sample10bt-2
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 931603243
num_examples: 193442
download_size: 554185931
dataset_size: 931603243
- config_name: sample10bt-20
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 914486560
num_examples: 193442
download_size: 544706739
dataset_size: 914486560
- config_name: sample10bt-21
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 912656162
num_examples: 193442
download_size: 543540306
dataset_size: 912656162
- config_name: sample10bt-22
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 916819520
num_examples: 193442
download_size: 545340927
dataset_size: 916819520
- config_name: sample10bt-23
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 914713596
num_examples: 193442
download_size: 544747564
dataset_size: 914713596
- config_name: sample10bt-24
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 927404707
num_examples: 193442
download_size: 551511716
dataset_size: 927404707
- config_name: sample10bt-25
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 919943354
num_examples: 193442
download_size: 547251385
dataset_size: 919943354
- config_name: sample10bt-26
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 915334691
num_examples: 193442
download_size: 545008675
dataset_size: 915334691
- config_name: sample10bt-27
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 919573945
num_examples: 193442
download_size: 546794057
dataset_size: 919573945
- config_name: sample10bt-28
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 917638559
num_examples: 193442
download_size: 545885056
dataset_size: 917638559
- config_name: sample10bt-29
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 949232381
num_examples: 193442
download_size: 564537836
dataset_size: 949232381
- config_name: sample10bt-3
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 929145151
num_examples: 193442
download_size: 552204902
dataset_size: 929145151
- config_name: sample10bt-30
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 934225271
num_examples: 193442
download_size: 556164166
dataset_size: 934225271
- config_name: sample10bt-31
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 922212973
num_examples: 193442
download_size: 549107995
dataset_size: 922212973
- config_name: sample10bt-32
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 924727979
num_examples: 193442
download_size: 550784342
dataset_size: 924727979
- config_name: sample10bt-33
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 934987736
num_examples: 193442
download_size: 556762357
dataset_size: 934987736
- config_name: sample10bt-34
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 940839028
num_examples: 193442
download_size: 559881256
dataset_size: 940839028
- config_name: sample10bt-35
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 958320835
num_examples: 193442
download_size: 569789256
dataset_size: 958320835
- config_name: sample10bt-36
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 944648724
num_examples: 193442
download_size: 561577837
dataset_size: 944648724
- config_name: sample10bt-37
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 932104870
num_examples: 193442
download_size: 554000385
dataset_size: 932104870
- config_name: sample10bt-38
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 920199770
num_examples: 193442
download_size: 547046038
dataset_size: 920199770
- config_name: sample10bt-39
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 926504635
num_examples: 193442
download_size: 550593267
dataset_size: 926504635
- config_name: sample10bt-4
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 914377355
num_examples: 193442
download_size: 543427962
dataset_size: 914377355
- config_name: sample10bt-40
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 918530196
num_examples: 193442
download_size: 545569375
dataset_size: 918530196
- config_name: sample10bt-41
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 918710582
num_examples: 193442
download_size: 545833366
dataset_size: 918710582
- config_name: sample10bt-42
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 922428578
num_examples: 193442
download_size: 547645954
dataset_size: 922428578
- config_name: sample10bt-43
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 908568139
num_examples: 193442
download_size: 539343702
dataset_size: 908568139
- config_name: sample10bt-44
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 919020498
num_examples: 193442
download_size: 546353356
dataset_size: 919020498
- config_name: sample10bt-45
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 915364726
num_examples: 193442
download_size: 543648897
dataset_size: 915364726
- config_name: sample10bt-46
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 925697269
num_examples: 193442
download_size: 550067366
dataset_size: 925697269
- config_name: sample10bt-47
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 917774963
num_examples: 193442
download_size: 544876461
dataset_size: 917774963
- config_name: sample10bt-48
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 926040677
num_examples: 193442
download_size: 549318532
dataset_size: 926040677
- config_name: sample10bt-49
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 931903751
num_examples: 193442
download_size: 552402900
dataset_size: 931903751
- config_name: sample10bt-5
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 925468581
num_examples: 193442
download_size: 550168626
dataset_size: 925468581
- config_name: sample10bt-6
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 929543073
num_examples: 193442
download_size: 551506614
dataset_size: 929543073
- config_name: sample10bt-7
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 934462780
num_examples: 193442
download_size: 554165748
dataset_size: 934462780
- config_name: sample10bt-8
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 923570983
num_examples: 193442
download_size: 547313063
dataset_size: 923570983
- config_name: sample10bt-9
features:
- name: text
sequence: string
- name: avg_chars_per_chunk
dtype: int64
splits:
- name: train
num_bytes: 926282155
num_examples: 193442
download_size: 548624840
dataset_size: 926282155
configs:
- config_name: sample10bt-0
data_files:
- split: train
path: sample10bt-0/train-*
- config_name: sample10bt-1
data_files:
- split: train
path: sample10bt-1/train-*
- config_name: sample10bt-10
data_files:
- split: train
path: sample10bt-10/train-*
- config_name: sample10bt-11
data_files:
- split: train
path: sample10bt-11/train-*
- config_name: sample10bt-12
data_files:
- split: train
path: sample10bt-12/train-*
- config_name: sample10bt-13
data_files:
- split: train
path: sample10bt-13/train-*
- config_name: sample10bt-14
data_files:
- split: train
path: sample10bt-14/train-*
- config_name: sample10bt-15
data_files:
- split: train
path: sample10bt-15/train-*
- config_name: sample10bt-16
data_files:
- split: train
path: sample10bt-16/train-*
- config_name: sample10bt-17
data_files:
- split: train
path: sample10bt-17/train-*
- config_name: sample10bt-18
data_files:
- split: train
path: sample10bt-18/train-*
- config_name: sample10bt-19
data_files:
- split: train
path: sample10bt-19/train-*
- config_name: sample10bt-2
data_files:
- split: train
path: sample10bt-2/train-*
- config_name: sample10bt-20
data_files:
- split: train
path: sample10bt-20/train-*
- config_name: sample10bt-21
data_files:
- split: train
path: sample10bt-21/train-*
- config_name: sample10bt-22
data_files:
- split: train
path: sample10bt-22/train-*
- config_name: sample10bt-23
data_files:
- split: train
path: sample10bt-23/train-*
- config_name: sample10bt-24
data_files:
- split: train
path: sample10bt-24/train-*
- config_name: sample10bt-25
data_files:
- split: train
path: sample10bt-25/train-*
- config_name: sample10bt-26
data_files:
- split: train
path: sample10bt-26/train-*
- config_name: sample10bt-27
data_files:
- split: train
path: sample10bt-27/train-*
- config_name: sample10bt-28
data_files:
- split: train
path: sample10bt-28/train-*
- config_name: sample10bt-29
data_files:
- split: train
path: sample10bt-29/train-*
- config_name: sample10bt-3
data_files:
- split: train
path: sample10bt-3/train-*
- config_name: sample10bt-30
data_files:
- split: train
path: sample10bt-30/train-*
- config_name: sample10bt-31
data_files:
- split: train
path: sample10bt-31/train-*
- config_name: sample10bt-32
data_files:
- split: train
path: sample10bt-32/train-*
- config_name: sample10bt-33
data_files:
- split: train
path: sample10bt-33/train-*
- config_name: sample10bt-34
data_files:
- split: train
path: sample10bt-34/train-*
- config_name: sample10bt-35
data_files:
- split: train
path: sample10bt-35/train-*
- config_name: sample10bt-36
data_files:
- split: train
path: sample10bt-36/train-*
- config_name: sample10bt-37
data_files:
- split: train
path: sample10bt-37/train-*
- config_name: sample10bt-38
data_files:
- split: train
path: sample10bt-38/train-*
- config_name: sample10bt-39
data_files:
- split: train
path: sample10bt-39/train-*
- config_name: sample10bt-4
data_files:
- split: train
path: sample10bt-4/train-*
- config_name: sample10bt-40
data_files:
- split: train
path: sample10bt-40/train-*
- config_name: sample10bt-41
data_files:
- split: train
path: sample10bt-41/train-*
- config_name: sample10bt-42
data_files:
- split: train
path: sample10bt-42/train-*
- config_name: sample10bt-43
data_files:
- split: train
path: sample10bt-43/train-*
- config_name: sample10bt-44
data_files:
- split: train
path: sample10bt-44/train-*
- config_name: sample10bt-45
data_files:
- split: train
path: sample10bt-45/train-*
- config_name: sample10bt-46
data_files:
- split: train
path: sample10bt-46/train-*
- config_name: sample10bt-47
data_files:
- split: train
path: sample10bt-47/train-*
- config_name: sample10bt-48
data_files:
- split: train
path: sample10bt-48/train-*
- config_name: sample10bt-49
data_files:
- split: train
path: sample10bt-49/train-*
- config_name: sample10bt-5
data_files:
- split: train
path: sample10bt-5/train-*
- config_name: sample10bt-6
data_files:
- split: train
path: sample10bt-6/train-*
- config_name: sample10bt-7
data_files:
- split: train
path: sample10bt-7/train-*
- config_name: sample10bt-8
data_files:
- split: train
path: sample10bt-8/train-*
- config_name: sample10bt-9
data_files:
- split: train
path: sample10bt-9/train-*
---
This dataset consists of 50 configurations (sample10bt-0 through sample10bt-49), each identified by a unique name. Each configuration includes two features: text (a sequence of strings) and avg_chars_per_chunk (an int64 integer). Each configuration contains only a train split with 193,442 examples; the metadata above gives each split's size in bytes, along with the download size and dataset size of each configuration. The data files for each configuration follow the path pattern <config_name>/train-*.
提供机构:
dododo1234
原始信息汇总
数据集概述
本数据集包含多个配置,每个配置对应不同的数据集子集。每个子集包含两个主要特征:text 和 avg_chars_per_chunk,其中 text 是字符串类型的序列,avg_chars_per_chunk 是整数类型。所有子集均只包含训练数据。
数据集配置详情
以下是各配置的基本信息:
| 配置名称 | 特征名称 | 数据类型 | 训练数据大小(字节) | 训练示例数量 | 下载大小(字节) |
|---|---|---|---|---|---|
| sample10bt-0 | text | string | 925700532 | 193442 | 550580187 |
| sample10bt-1 | text | string | 924290167 | 193442 | 549843020 |
| sample10bt-10 | text | string | 932556823 | 193442 | 552673193 |
| sample10bt-11 | text | string | 926438193 | 193442 | 549038441 |
| sample10bt-12 | text | string | 912121145 | 193442 | 540693059 |
| sample10bt-13 | text | string | 918403300 | 193442 | 543811914 |
| sample10bt-14 | text | string | 929283866 | 193442 | 549880214 |
| sample10bt-15 | text | string | 916033918 | 193442 | 542077971 |
| sample10bt-16 | text | string | 891530928 | 193442 | 532190679 |
| sample10bt-17 | text | string | 901026027 | 193442 | 537720236 |
| sample10bt-18 | text | string | 899439135 | 193442 | 536756057 |
| sample10bt-19 | text | string | 895955691 | 193442 | 534529714 |
| sample10bt-2 | text | string | 931603243 | 193442 | 554185931 |
| sample10bt-20 | text | string | 914486560 | 193442 | 544706739 |
| sample10bt-21 | text | string | 912656162 | 193442 | 543540306 |
| sample10bt-22 | text | string | 916819520 | 193442 | 545340927 |
| sample10bt-23 | text | string | 914713596 | 193442 | 544747564 |
| sample10bt-24 | text | string | 927404707 | 193442 | 551511716 |
| sample10bt-25 | text | string | 919943354 | 193442 | 547251385 |
| sample10bt-26 | text | string | 915334691 | 193442 | 545008675 |
| sample10bt-27 | text | string | 919573945 | 193442 | 546794057 |
| sample10bt-28 | text | string | 917638559 | 193442 | 545885056 |
| sample10bt-29 | text | string | 949232381 | 193442 | 564537836 |
| sample10bt-3 | text | string | 929145151 | 193442 | 552204902 |
| sample10bt-30 | text | string | 934225271 | 193442 | 556164166 |
| sample10bt-31 | text | string | 922212973 | 193442 | 549107995 |
| sample10bt-32 | text | string | 924727979 | 193442 | 550784342 |
| sample10bt-33 | text | string | 934987736 | 193442 | 556762357 |
| sample10bt-34 | text | string | 940839028 | 193442 | 559881256 |
| sample10bt-35 | text | string | 958320835 | 193442 | 569789256 |
| sample10bt-36 | text | string | 944648724 | 193442 | 561577837 |
| sample10bt-37 | text | string | 932104870 | 193442 | 554000385 |
| sample10bt-38 | text | string | 920199770 | 193442 | 547046038 |
| sample10bt-39 | text | string | 926504635 | 193442 | 550593267 |
| sample10bt-4 | text | string | 914377355 | 193442 | 543427962 |
| sample10bt-40 | text | string | 918530196 | 193442 | 545569375 |
| sample10bt-41 | text | string | 918710582 | 193442 | 545833366 |
| sample10bt-42 | text | string | 922428578 | 193442 | 547645954 |
| sample10bt-43 | text | string | 908568139 | 193442 | 539343702 |
| sample10bt-44 | text | string | 919020498 | 193442 | 546353356 |
| sample10bt-45 | text | string | 915364726 | 193442 | 543648897 |
| sample10bt-46 | text | string | 925697269 | 193442 | 550067366 |
| sample10bt-47 | text | string | 917774963 | 193442 | 544876461 |
| sample10bt-48 | text | string | 926040677 | 193442 | 549318532 |
| sample10bt-49 | text | string | 931903751 | 193442 | 552402900 |
| sample10bt-5 | text | string | 925468581 | 193442 | 550168626 |
| sample10bt-6 | text | string | 929543073 | 193442 | 551506614 |
| sample10bt-7 | text | string | 934462780 | 193442 | 554165748 |
| sample10bt-8 | text | string | 923570983 | 193442 | 547313063 |
| sample10bt-9 | text | string | 926282155 | 193442 | 548624840 |
每个配置的训练数据文件路径遵循模式:配置名称/train-*。
注:上表仅列出 text 特征,其实际类型为字符串序列(sequence of string);每个配置还包含 int64 类型的 avg_chars_per_chunk 特征。



