Rijgersberg/GPT-NL_public_corpus-new
收藏Hugging Face2026-04-08 更新2026-04-12 收录
下载链接:
https://hf-mirror.com/datasets/Rijgersberg/GPT-NL_public_corpus-new
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: auditdienstrijk
features:
- name: avg_word_length
dtype: float64
- name: title
dtype: string
- name: n_char
dtype: int64
- name: license
dtype: string
- name: id
dtype: string
- name: dataset_url
dtype: string
- name: source
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: text
dtype: string
- name: language_score
dtype: float64
- name: dataset_name
dtype: string
- name: author
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 23187709
num_examples: 560
download_size: 10942976
dataset_size: 23187709
- config_name: belgian-journal
features:
- name: n_char
dtype: int64
- name: dataset_name
dtype: string
- name: text
dtype: string
- name: language
dtype: string
- name: language_score
dtype: float64
- name: source
dtype: string
- name: avg_word_length
dtype: float64
- name: title
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: dataset_url
dtype: string
- name: id
dtype: string
- name: author
dtype: string
- name: license
dtype: string
splits:
- name: train
num_bytes: 3251295721
num_examples: 209088
download_size: 1508426165
dataset_size: 3251295721
- config_name: dansknaw
features:
- name: author
dtype: string
- name: dataset_url
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: text
dtype: string
- name: dataset_name
dtype: string
- name: language
dtype: string
- name: id
dtype: string
- name: license
dtype: string
- name: title
dtype: string
- name: source
dtype: string
- name: language_score
dtype: float64
- name: n_char
dtype: int64
- name: avg_word_length
dtype: float64
splits:
- name: train
num_bytes: 113752383
num_examples: 57104
download_size: 52164844
dataset_size: 113752383
- config_name: dpc
features:
- name: license
dtype: string
- name: language
dtype: string
- name: dataset_name
dtype: string
- name: title
dtype: string
- name: text
dtype: string
- name: dataset_url
dtype: string
- name: language_score
dtype: float64
- name: id
dtype: string
- name: n_char
dtype: int64
- name: author
dtype: string
- name: avg_word_length
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: source
dtype: string
splits:
- name: train
num_bytes: 413712941
num_examples: 127715
download_size: 205060509
dataset_size: 413712941
- config_name: european-parliament
features:
- name: source
dtype: string
- name: text
dtype: string
- name: author
dtype: string
- name: id
dtype: string
- name: avg_word_length
dtype: float64
- name: license
dtype: string
- name: dataset_name
dtype: string
- name: language_score
dtype: float64
- name: n_char
dtype: int64
- name: title
dtype: string
- name: dataset_url
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: language
dtype: string
splits:
- name: train
num_bytes: 380818073
num_examples: 983
download_size: 217126173
dataset_size: 380818073
- config_name: kb
features:
- name: text
dtype: string
- name: avg_word_length
dtype: float64
- name: source
dtype: string
- name: language_score
dtype: float64
- name: author
dtype: string
- name: language
dtype: string
- name: id
dtype: string
- name: dataset_name
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: title
dtype: string
- name: license
dtype: string
- name: n_char
dtype: int64
- name: dataset_url
dtype: string
splits:
- name: train
num_bytes: 1741004189
num_examples: 6705
download_size: 1042703115
dataset_size: 1741004189
- config_name: kb-open-kranten
features:
- name: avg_word_length
dtype: float64
- name: text
dtype: string
- name: source
dtype: string
- name: n_char
dtype: int64
- name: license
dtype: string
- name: dataset_name
dtype: string
- name: title
dtype: string
- name: author
dtype: string
- name: language_score
dtype: float64
- name: dataset_url
dtype: string
- name: id
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: language
dtype: string
splits:
- name: train
num_bytes: 6918478654
num_examples: 1573792
download_size: 4152867845
dataset_size: 6918478654
- config_name: kb-pd-books
features:
- name: avg_word_length
dtype: float64
- name: id
dtype: string
- name: dataset_name
dtype: string
- name: language
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: dataset_url
dtype: string
- name: author
dtype: string
- name: source
dtype: string
- name: n_char
dtype: int64
- name: language_score
dtype: float64
- name: license
dtype: string
- name: text
dtype: string
- name: title
dtype: string
splits:
- name: train
num_bytes: 1414444213
num_examples: 2250
download_size: 839096865
dataset_size: 1414444213
- config_name: nationaal-archief
features:
- name: language
dtype: string
- name: id
dtype: string
- name: language_score
dtype: float64
- name: n_char
dtype: int64
- name: title
dtype: string
- name: source
dtype: string
- name: avg_word_length
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: dataset_name
dtype: string
- name: text
dtype: string
- name: author
dtype: string
- name: license
dtype: string
- name: dataset_url
dtype: string
splits:
- name: train
num_bytes: 4134721757
num_examples: 1932515
download_size: 2178049596
dataset_size: 4134721757
- config_name: naturalis
features:
- name: id
dtype: string
- name: source
dtype: string
- name: dataset_url
dtype: string
- name: language_score
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: title
dtype: string
- name: text
dtype: string
- name: n_char
dtype: int64
- name: author
dtype: string
- name: language
dtype: string
- name: avg_word_length
dtype: float64
- name: license
dtype: string
- name: dataset_name
dtype: string
splits:
- name: train
num_bytes: 541923755
num_examples: 12430
download_size: 303362354
dataset_size: 541923755
- config_name: noordhollandsarchief
features:
- name: dataset_url
dtype: string
- name: avg_word_length
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: id
dtype: string
- name: source
dtype: string
- name: title
dtype: string
- name: license
dtype: string
- name: dataset_name
dtype: string
- name: author
dtype: string
- name: language_score
dtype: float64
- name: text
dtype: string
- name: n_char
dtype: int64
- name: language
dtype: string
splits:
- name: train
num_bytes: 98812117
num_examples: 38755
download_size: 52843962
dataset_size: 98812117
- config_name: officiele-bekendmakingen
features:
- name: language
dtype: string
- name: dataset_name
dtype: string
- name: source
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: n_char
dtype: int64
- name: author
dtype: string
- name: title
dtype: string
- name: license
dtype: string
- name: dataset_url
dtype: string
- name: id
dtype: string
- name: text
dtype: string
- name: avg_word_length
dtype: float64
- name: language_score
dtype: float64
splits:
- name: train
num_bytes: 13387894877
num_examples: 1826487
download_size: 6278602697
dataset_size: 13387894877
- config_name: openraadsinformatie-part1
features:
- name: avg_word_length
dtype: float64
- name: dataset_url
dtype: string
- name: dataset_name
dtype: string
- name: author
dtype: string
- name: text
dtype: string
- name: license
dtype: string
- name: language_score
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: title
dtype: string
- name: n_char
dtype: int64
- name: source
dtype: string
- name: language
dtype: string
- name: id
dtype: string
splits:
- name: train
num_bytes: 31716829624
num_examples: 1425477
download_size: 15051464510
dataset_size: 31716829624
- config_name: openraadsinformatie-part2
features:
- name: dataset_url
dtype: string
- name: license
dtype: string
- name: language_score
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: text
dtype: string
- name: avg_word_length
dtype: float64
- name: id
dtype: string
- name: language
dtype: string
- name: author
dtype: string
- name: source
dtype: string
- name: title
dtype: string
- name: n_char
dtype: int64
- name: dataset_name
dtype: string
splits:
- name: train
num_bytes: 27960474517
num_examples: 1292830
download_size: 14054675431
dataset_size: 27960474517
- config_name: pbl
features:
- name: author
dtype: string
- name: text
dtype: string
- name: n_char
dtype: int64
- name: dataset_url
dtype: string
- name: language_score
dtype: float64
- name: license
dtype: string
- name: id
dtype: string
- name: source
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: language
dtype: string
- name: avg_word_length
dtype: float64
- name: dataset_name
dtype: string
- name: title
dtype: string
splits:
- name: train
num_bytes: 82303924
num_examples: 489
download_size: 42177532
dataset_size: 82303924
- config_name: rechtspraak
features:
- name: license
dtype: string
- name: author
dtype: string
- name: title
dtype: string
- name: id
dtype: string
- name: n_char
dtype: int64
- name: dataset_name
dtype: string
- name: dataset_url
dtype: string
- name: language_score
dtype: float64
- name: text
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: source
dtype: string
- name: language
dtype: string
- name: avg_word_length
dtype: float64
splits:
- name: train
num_bytes: 10472025363
num_examples: 918799
download_size: 4471010824
dataset_size: 10472025363
- config_name: tweedekamer
features:
- name: dataset_name
dtype: string
- name: n_char
dtype: int64
- name: text
dtype: string
- name: license
dtype: string
- name: source
dtype: string
- name: language
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: dataset_url
dtype: string
- name: language_score
dtype: float64
- name: author
dtype: string
- name: avg_word_length
dtype: float64
- name: id
dtype: string
- name: title
dtype: string
splits:
- name: train
num_bytes: 5952064651
num_examples: 233676
download_size: 3048620586
dataset_size: 5952064651
- config_name: utrechts-archief
features:
- name: id
dtype: string
- name: language_score
dtype: float64
- name: n_char
dtype: int64
- name: language
dtype: string
- name: dataset_url
dtype: string
- name: author
dtype: string
- name: avg_word_length
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: license
dtype: string
- name: source
dtype: string
- name: dataset_name
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 817194681
num_examples: 526029
download_size: 426942852
dataset_size: 817194681
- config_name: wikidata
features:
- name: language_score
dtype: float64
- name: title
dtype: string
- name: language
dtype: string
- name: author
dtype: string
- name: id
dtype: string
- name: source
dtype: string
- name: dataset_name
dtype: string
- name: text
dtype: string
- name: license
dtype: string
- name: dataset_url
dtype: string
splits:
- name: train
num_bytes: 7629089475
num_examples: 14582856
download_size: 1998317930
dataset_size: 7629089475
- config_name: wikiwijs
features:
- name: dataset_url
dtype: string
- name: source
dtype: string
- name: language
dtype: string
- name: language_score
dtype: float64
- name: n_char
dtype: int64
- name: text
dtype: string
- name: license
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: dataset_name
dtype: string
- name: id
dtype: string
- name: author
dtype: string
- name: title
dtype: string
- name: avg_word_length
dtype: float64
splits:
- name: train
num_bytes: 146245710
num_examples: 124501
download_size: 67485017
dataset_size: 146245710
- config_name: woogle
features:
- name: id
dtype: string
- name: language_score
dtype: float64
- name: title
dtype: string
- name: n_char
dtype: int64
- name: avg_word_length
dtype: float64
- name: author
dtype: string
- name: source
dtype: string
- name: text
dtype: string
- name: license
dtype: string
- name: dataset_name
dtype: string
- name: language
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: dataset_url
dtype: string
splits:
- name: train
num_bytes: 14073719901
num_examples: 4358740
download_size: 6682500335
dataset_size: 14073719901
- config_name: youtube-commons
features:
- name: source
dtype: string
- name: title
dtype: string
- name: avg_word_length
dtype: float64
- name: id
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: text
dtype: string
- name: dataset_name
dtype: string
- name: n_char
dtype: int64
- name: language_score
dtype: float64
- name: dataset_url
dtype: string
- name: license
dtype: string
- name: language
dtype: string
- name: author
dtype: string
splits:
- name: train
num_bytes: 30121152149
num_examples: 2147061
download_size: 16456172914
dataset_size: 30121152149
configs:
- config_name: auditdienstrijk
data_files:
- split: train
path: auditdienstrijk/train-*
- config_name: belgian-journal
data_files:
- split: train
path: belgian-journal/train-*
- config_name: dansknaw
data_files:
- split: train
path: dansknaw/train-*
- config_name: dpc
data_files:
- split: train
path: dpc/train-*
- config_name: european-parliament
data_files:
- split: train
path: european-parliament/train-*
- config_name: kb
data_files:
- split: train
path: kb/train-*
- config_name: kb-open-kranten
data_files:
- split: train
path: kb-open-kranten/train-*
- config_name: kb-pd-books
data_files:
- split: train
path: kb-pd-books/train-*
- config_name: nationaal-archief
data_files:
- split: train
path: nationaal-archief/train-*
- config_name: naturalis
data_files:
- split: train
path: naturalis/train-*
- config_name: noordhollandsarchief
data_files:
- split: train
path: noordhollandsarchief/train-*
- config_name: officiele-bekendmakingen
data_files:
- split: train
path: officiele-bekendmakingen/train-*
- config_name: openraadsinformatie-part1
data_files:
- split: train
path: openraadsinformatie-part1/train-*
- config_name: openraadsinformatie-part2
data_files:
- split: train
path: openraadsinformatie-part2/train-*
- config_name: pbl
data_files:
- split: train
path: pbl/train-*
- config_name: rechtspraak
data_files:
- split: train
path: rechtspraak/train-*
- config_name: tweedekamer
data_files:
- split: train
path: tweedekamer/train-*
- config_name: utrechts-archief
data_files:
- split: train
path: utrechts-archief/train-*
- config_name: wikidata
data_files:
- split: train
path: wikidata/train-*
- config_name: wikiwijs
data_files:
- split: train
path: wikiwijs/train-*
- config_name: woogle
data_files:
- split: train
path: woogle/train-*
- config_name: youtube-commons
data_files:
- split: train
path: youtube-commons/train-*
---
提供机构:
Rijgersberg



