Rijgersberg/GPT-NL_public_corpus-sample
收藏Hugging Face2026-04-11 更新2026-04-26 收录
下载链接:
https://hf-mirror.com/datasets/Rijgersberg/GPT-NL_public_corpus-sample
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: all
features:
- name: id
dtype: string
- name: dataset_name
dtype: string
- name: source
dtype: string
- name: title
dtype: string
- name: language
dtype: string
- name: license
dtype: string
- name: language_score
dtype: float64
- name: dataset_url
dtype: string
- name: avg_word_length
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: author
dtype: string
- name: n_char
dtype: int64
- name: text
dtype: string
splits:
- name: train
num_bytes: 322608207
num_examples: 50000
download_size: 153425428
dataset_size: 322608207
- config_name: american-stories
features:
- name: id
dtype: string
- name: dataset_name
dtype: string
- name: source
dtype: string
- name: title
dtype: string
- name: language
dtype: string
- name: license
dtype: string
- name: language_score
dtype: float64
- name: dataset_url
dtype: string
- name: avg_word_length
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: author
dtype: string
- name: n_char
dtype: int64
- name: text
dtype: string
splits:
- name: train
num_bytes: 6081075
num_examples: 5000
download_size: 3402816
dataset_size: 6081075
- config_name: auditdienstrijk
features:
- name: avg_word_length
dtype: float64
- name: title
dtype: string
- name: n_char
dtype: int64
- name: license
dtype: string
- name: id
dtype: string
- name: dataset_url
dtype: string
- name: source
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: text
dtype: string
- name: language_score
dtype: float64
- name: dataset_name
dtype: string
- name: author
dtype: string
- name: language
dtype: string
splits:
- name: train
num_bytes: 23187709
num_examples: 560
download_size: 10942976
dataset_size: 23187709
- config_name: belgian-journal
features:
- name: n_char
dtype: int64
- name: dataset_name
dtype: string
- name: text
dtype: string
- name: language
dtype: string
- name: language_score
dtype: float64
- name: source
dtype: string
- name: avg_word_length
dtype: float64
- name: title
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: dataset_url
dtype: string
- name: id
dtype: string
- name: author
dtype: string
- name: license
dtype: string
splits:
- name: train
num_bytes: 77749457
num_examples: 5000
download_size: 36952777
dataset_size: 77749457
- config_name: cc_english-pd
features:
- name: author
dtype: string
- name: text
dtype: string
- name: n_char
dtype: int64
- name: title
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: language_score
dtype: float64
- name: id
dtype: string
- name: dataset_url
dtype: string
- name: avg_word_length
dtype: float64
- name: language
dtype: string
- name: dataset_name
dtype: string
- name: license
dtype: string
- name: source
dtype: string
splits:
- name: train
num_bytes: 185349284
num_examples: 5000
download_size: 114220410
dataset_size: 185349284
- config_name: cc_eurovoc
features:
- name: license
dtype: string
- name: language_score
dtype: float64
- name: title
dtype: string
- name: text
dtype: string
- name: avg_word_length
dtype: float64
- name: dataset_name
dtype: string
- name: n_char
dtype: int64
- name: source
dtype: string
- name: dataset_url
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: id
dtype: string
- name: language
dtype: string
- name: author
dtype: string
splits:
- name: train
num_bytes: 383548622
num_examples: 5000
download_size: 188468580
dataset_size: 383548622
- config_name: cc_german-pd
features:
- name: title
dtype: string
- name: language
dtype: string
- name: id
dtype: string
- name: author
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: n_char
dtype: int64
- name: license
dtype: string
- name: source
dtype: string
- name: avg_word_length
dtype: float64
- name: dataset_url
dtype: string
- name: language_score
dtype: float64
- name: dataset_name
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 244695538
num_examples: 5000
download_size: 156385945
dataset_size: 244695538
- config_name: cc_github_open_source
features:
- name: language
dtype: string
- name: avg_word_length
dtype: float64
- name: id
dtype: string
- name: author
dtype: string
- name: n_char
dtype: int64
- name: source
dtype: string
- name: license
dtype: string
- name: title
dtype: string
- name: language_score
dtype: float64
- name: dataset_url
dtype: string
- name: dataset_name
dtype: string
- name: text
dtype: string
- name: n_non_symbol_words
dtype: int64
splits:
- name: train
num_bytes: 22296061
num_examples: 5000
download_size: 7162490
dataset_size: 22296061
- config_name: cc_loc-pd-books
features:
- name: language_score
dtype: float64
- name: text
dtype: string
- name: license
dtype: string
- name: avg_word_length
dtype: float64
- name: source
dtype: string
- name: title
dtype: string
- name: n_char
dtype: int64
- name: dataset_name
dtype: string
- name: id
dtype: string
- name: dataset_url
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: language
dtype: string
- name: author
dtype: string
splits:
- name: train
num_bytes: 178468205
num_examples: 5000
download_size: 107107044
dataset_size: 178468205
- config_name: cc_openalex
features:
- name: license
dtype: string
- name: source
dtype: string
- name: title
dtype: string
- name: n_char
dtype: int64
- name: language
dtype: string
- name: author
dtype: string
- name: language_score
dtype: float64
- name: avg_word_length
dtype: float64
- name: dataset_name
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: text
dtype: string
- name: dataset_url
dtype: string
- name: id
dtype: string
splits:
- name: train
num_bytes: 276639015
num_examples: 5000
download_size: 141958793
dataset_size: 276639015
- config_name: common-crawl
features:
- name: license
dtype: string
- name: title
dtype: string
- name: id
dtype: string
- name: language
dtype: string
- name: avg_word_length
dtype: float64
- name: source
dtype: string
- name: dataset_url
dtype: string
- name: dataset_name
dtype: string
- name: author
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: text
dtype: string
- name: language_score
dtype: float64
- name: n_char
dtype: int64
splits:
- name: train
num_bytes: 14371322
num_examples: 5000
download_size: 8379232
dataset_size: 14371322
- config_name: dansknaw
features:
- name: author
dtype: string
- name: dataset_url
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: text
dtype: string
- name: dataset_name
dtype: string
- name: language
dtype: string
- name: id
dtype: string
- name: license
dtype: string
- name: title
dtype: string
- name: source
dtype: string
- name: language_score
dtype: float64
- name: n_char
dtype: int64
- name: avg_word_length
dtype: float64
splits:
- name: train
num_bytes: 9960106
num_examples: 5000
download_size: 4639751
dataset_size: 9960106
- config_name: dpc
features:
- name: license
dtype: string
- name: language
dtype: string
- name: dataset_name
dtype: string
- name: title
dtype: string
- name: text
dtype: string
- name: dataset_url
dtype: string
- name: language_score
dtype: float64
- name: id
dtype: string
- name: n_char
dtype: int64
- name: author
dtype: string
- name: avg_word_length
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: source
dtype: string
splits:
- name: train
num_bytes: 16196724
num_examples: 5000
download_size: 8434354
dataset_size: 16196724
- config_name: european-parliament
features:
- name: source
dtype: string
- name: text
dtype: string
- name: author
dtype: string
- name: id
dtype: string
- name: avg_word_length
dtype: float64
- name: license
dtype: string
- name: dataset_name
dtype: string
- name: language_score
dtype: float64
- name: n_char
dtype: int64
- name: title
dtype: string
- name: dataset_url
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: language
dtype: string
splits:
- name: train
num_bytes: 380818073
num_examples: 983
download_size: 217126173
dataset_size: 380818073
- config_name: kb
features:
- name: text
dtype: string
- name: avg_word_length
dtype: float64
- name: source
dtype: string
- name: language_score
dtype: float64
- name: author
dtype: string
- name: language
dtype: string
- name: id
dtype: string
- name: dataset_name
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: title
dtype: string
- name: license
dtype: string
- name: n_char
dtype: int64
- name: dataset_url
dtype: string
splits:
- name: train
num_bytes: 1298287985
num_examples: 5000
download_size: 773572767
dataset_size: 1298287985
- config_name: kb-open-kranten
features:
- name: avg_word_length
dtype: float64
- name: text
dtype: string
- name: source
dtype: string
- name: n_char
dtype: int64
- name: license
dtype: string
- name: dataset_name
dtype: string
- name: title
dtype: string
- name: author
dtype: string
- name: language_score
dtype: float64
- name: dataset_url
dtype: string
- name: id
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: language
dtype: string
splits:
- name: train
num_bytes: 21980282
num_examples: 5000
download_size: 13432181
dataset_size: 21980282
- config_name: kb-pd-books
features:
- name: avg_word_length
dtype: float64
- name: id
dtype: string
- name: dataset_name
dtype: string
- name: language
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: dataset_url
dtype: string
- name: author
dtype: string
- name: source
dtype: string
- name: n_char
dtype: int64
- name: language_score
dtype: float64
- name: license
dtype: string
- name: text
dtype: string
- name: title
dtype: string
splits:
- name: train
num_bytes: 1414444213
num_examples: 2250
download_size: 839096865
dataset_size: 1414444213
- config_name: multi-eurlex
features:
- name: avg_word_length
dtype: float64
- name: text
dtype: string
- name: source
dtype: string
- name: language
dtype: string
- name: dataset_url
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: n_char
dtype: int64
- name: id
dtype: string
- name: license
dtype: string
- name: title
dtype: string
- name: dataset_name
dtype: string
- name: author
dtype: string
- name: language_score
dtype: float64
splits:
- name: train
num_bytes: 53492517
num_examples: 5000
download_size: 24823896
dataset_size: 53492517
- config_name: nationaal-archief
features:
- name: language
dtype: string
- name: id
dtype: string
- name: language_score
dtype: float64
- name: n_char
dtype: int64
- name: title
dtype: string
- name: source
dtype: string
- name: avg_word_length
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: dataset_name
dtype: string
- name: text
dtype: string
- name: author
dtype: string
- name: license
dtype: string
- name: dataset_url
dtype: string
splits:
- name: train
num_bytes: 10697774
num_examples: 5000
download_size: 6132120
dataset_size: 10697774
- config_name: naturalis
features:
- name: id
dtype: string
- name: source
dtype: string
- name: dataset_url
dtype: string
- name: language_score
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: title
dtype: string
- name: text
dtype: string
- name: n_char
dtype: int64
- name: author
dtype: string
- name: language
dtype: string
- name: avg_word_length
dtype: float64
- name: license
dtype: string
- name: dataset_name
dtype: string
splits:
- name: train
num_bytes: 217990247
num_examples: 5000
download_size: 122469583
dataset_size: 217990247
- config_name: noordhollandsarchief
features:
- name: dataset_url
dtype: string
- name: avg_word_length
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: id
dtype: string
- name: source
dtype: string
- name: title
dtype: string
- name: license
dtype: string
- name: dataset_name
dtype: string
- name: author
dtype: string
- name: language_score
dtype: float64
- name: text
dtype: string
- name: n_char
dtype: int64
- name: language
dtype: string
splits:
- name: train
num_bytes: 12748305
num_examples: 5000
download_size: 7108800
dataset_size: 12748305
- config_name: officiele-bekendmakingen
features:
- name: language
dtype: string
- name: dataset_name
dtype: string
- name: source
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: n_char
dtype: int64
- name: author
dtype: string
- name: title
dtype: string
- name: license
dtype: string
- name: dataset_url
dtype: string
- name: id
dtype: string
- name: text
dtype: string
- name: avg_word_length
dtype: float64
- name: language_score
dtype: float64
splits:
- name: train
num_bytes: 36649302
num_examples: 5000
download_size: 17967005
dataset_size: 36649302
- config_name: openraadsinformatie-part1
features:
- name: avg_word_length
dtype: float64
- name: dataset_url
dtype: string
- name: dataset_name
dtype: string
- name: author
dtype: string
- name: text
dtype: string
- name: license
dtype: string
- name: language_score
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: title
dtype: string
- name: n_char
dtype: int64
- name: source
dtype: string
- name: language
dtype: string
- name: id
dtype: string
splits:
- name: train
num_bytes: 111249882
num_examples: 5000
download_size: 53722131
dataset_size: 111249882
- config_name: openraadsinformatie-part2
features:
- name: dataset_url
dtype: string
- name: license
dtype: string
- name: language_score
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: text
dtype: string
- name: avg_word_length
dtype: float64
- name: id
dtype: string
- name: language
dtype: string
- name: author
dtype: string
- name: source
dtype: string
- name: title
dtype: string
- name: n_char
dtype: int64
- name: dataset_name
dtype: string
splits:
- name: train
num_bytes: 108136702
num_examples: 5000
download_size: 54685101
dataset_size: 108136702
- config_name: pbl
features:
- name: author
dtype: string
- name: text
dtype: string
- name: n_char
dtype: int64
- name: dataset_url
dtype: string
- name: language_score
dtype: float64
- name: license
dtype: string
- name: id
dtype: string
- name: source
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: language
dtype: string
- name: avg_word_length
dtype: float64
- name: dataset_name
dtype: string
- name: title
dtype: string
splits:
- name: train
num_bytes: 82303924
num_examples: 489
download_size: 42177532
dataset_size: 82303924
- config_name: rechtspraak
features:
- name: license
dtype: string
- name: author
dtype: string
- name: title
dtype: string
- name: id
dtype: string
- name: n_char
dtype: int64
- name: dataset_name
dtype: string
- name: dataset_url
dtype: string
- name: language_score
dtype: float64
- name: text
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: source
dtype: string
- name: language
dtype: string
- name: avg_word_length
dtype: float64
splits:
- name: train
num_bytes: 56987574
num_examples: 5000
download_size: 27098862
dataset_size: 56987574
- config_name: tweedekamer
features:
- name: dataset_name
dtype: string
- name: n_char
dtype: int64
- name: text
dtype: string
- name: license
dtype: string
- name: source
dtype: string
- name: language
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: dataset_url
dtype: string
- name: language_score
dtype: float64
- name: author
dtype: string
- name: avg_word_length
dtype: float64
- name: id
dtype: string
- name: title
dtype: string
splits:
- name: train
num_bytes: 127357209
num_examples: 5000
download_size: 63737723
dataset_size: 127357209
- config_name: utrechts-archief
features:
- name: id
dtype: string
- name: language_score
dtype: float64
- name: n_char
dtype: int64
- name: language
dtype: string
- name: dataset_url
dtype: string
- name: author
dtype: string
- name: avg_word_length
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: license
dtype: string
- name: source
dtype: string
- name: dataset_name
dtype: string
- name: title
dtype: string
- name: text
dtype: string
splits:
- name: train
num_bytes: 7767582
num_examples: 5000
download_size: 4516990
dataset_size: 7767582
- config_name: wikidata
features:
- name: language_score
dtype: float64
- name: title
dtype: string
- name: language
dtype: string
- name: author
dtype: string
- name: id
dtype: string
- name: source
dtype: string
- name: dataset_name
dtype: string
- name: text
dtype: string
- name: license
dtype: string
- name: dataset_url
dtype: string
splits:
- name: train
num_bytes: 2615773
num_examples: 5000
download_size: 861396
dataset_size: 2615773
- config_name: wikiwijs
features:
- name: dataset_url
dtype: string
- name: source
dtype: string
- name: language
dtype: string
- name: language_score
dtype: float64
- name: n_char
dtype: int64
- name: text
dtype: string
- name: license
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: dataset_name
dtype: string
- name: id
dtype: string
- name: author
dtype: string
- name: title
dtype: string
- name: avg_word_length
dtype: float64
splits:
- name: train
num_bytes: 5873274
num_examples: 5000
download_size: 3104848
dataset_size: 5873274
- config_name: woogle
features:
- name: id
dtype: string
- name: language_score
dtype: float64
- name: title
dtype: string
- name: n_char
dtype: int64
- name: avg_word_length
dtype: float64
- name: author
dtype: string
- name: source
dtype: string
- name: text
dtype: string
- name: license
dtype: string
- name: dataset_name
dtype: string
- name: language
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: dataset_url
dtype: string
splits:
- name: train
num_bytes: 16144252
num_examples: 5000
download_size: 8587272
dataset_size: 16144252
- config_name: youtube-commons
features:
- name: source
dtype: string
- name: title
dtype: string
- name: avg_word_length
dtype: float64
- name: id
dtype: string
- name: n_non_symbol_words
dtype: int64
- name: text
dtype: string
- name: dataset_name
dtype: string
- name: n_char
dtype: int64
- name: language_score
dtype: float64
- name: dataset_url
dtype: string
- name: license
dtype: string
- name: language
dtype: string
- name: author
dtype: string
splits:
- name: train
num_bytes: 70145077
num_examples: 5000
download_size: 38068064
dataset_size: 70145077
- config_name: zeeuws-archief
features:
- name: dataset_url
dtype: string
- name: n_char
dtype: int64
- name: author
dtype: string
- name: text
dtype: string
- name: title
dtype: string
- name: language
dtype: string
- name: license
dtype: string
- name: id
dtype: string
- name: source
dtype: string
- name: language_score
dtype: float64
- name: n_non_symbol_words
dtype: int64
- name: dataset_name
dtype: string
- name: avg_word_length
dtype: float64
splits:
- name: train
num_bytes: 22393365
num_examples: 5000
download_size: 12410472
dataset_size: 22393365
configs:
- config_name: all
data_files:
- split: train
path: all/train-*
- config_name: american-stories
data_files:
- split: train
path: american-stories/train-*
- config_name: auditdienstrijk
data_files:
- split: train
path: auditdienstrijk/train-*
- config_name: belgian-journal
data_files:
- split: train
path: belgian-journal/train-*
- config_name: cc_english-pd
data_files:
- split: train
path: cc_english-pd/train-*
- config_name: cc_eurovoc
data_files:
- split: train
path: cc_eurovoc/train-*
- config_name: cc_german-pd
data_files:
- split: train
path: cc_german-pd/train-*
- config_name: cc_github_open_source
data_files:
- split: train
path: cc_github_open_source/train-*
- config_name: cc_loc-pd-books
data_files:
- split: train
path: cc_loc-pd-books/train-*
- config_name: cc_openalex
data_files:
- split: train
path: cc_openalex/train-*
- config_name: common-crawl
data_files:
- split: train
path: common-crawl/train-*
- config_name: dansknaw
data_files:
- split: train
path: dansknaw/train-*
- config_name: dpc
data_files:
- split: train
path: dpc/train-*
- config_name: european-parliament
data_files:
- split: train
path: european-parliament/train-*
- config_name: kb
data_files:
- split: train
path: kb/train-*
- config_name: kb-open-kranten
data_files:
- split: train
path: kb-open-kranten/train-*
- config_name: kb-pd-books
data_files:
- split: train
path: kb-pd-books/train-*
- config_name: multi-eurlex
data_files:
- split: train
path: multi-eurlex/train-*
- config_name: nationaal-archief
data_files:
- split: train
path: nationaal-archief/train-*
- config_name: naturalis
data_files:
- split: train
path: naturalis/train-*
- config_name: noordhollandsarchief
data_files:
- split: train
path: noordhollandsarchief/train-*
- config_name: officiele-bekendmakingen
data_files:
- split: train
path: officiele-bekendmakingen/train-*
- config_name: openraadsinformatie-part1
data_files:
- split: train
path: openraadsinformatie-part1/train-*
- config_name: openraadsinformatie-part2
data_files:
- split: train
path: openraadsinformatie-part2/train-*
- config_name: pbl
data_files:
- split: train
path: pbl/train-*
- config_name: rechtspraak
data_files:
- split: train
path: rechtspraak/train-*
- config_name: tweedekamer
data_files:
- split: train
path: tweedekamer/train-*
- config_name: utrechts-archief
data_files:
- split: train
path: utrechts-archief/train-*
- config_name: wikidata
data_files:
- split: train
path: wikidata/train-*
- config_name: wikiwijs
data_files:
- split: train
path: wikiwijs/train-*
- config_name: woogle
data_files:
- split: train
path: woogle/train-*
- config_name: youtube-commons
data_files:
- split: train
path: youtube-commons/train-*
- config_name: zeeuws-archief
data_files:
- split: train
path: zeeuws-archief/train-*
---
提供机构:
Rijgersberg



