allenai/scirepeval
收藏 | Hugging Face | 2024-01-16 更新 | 2024-03-04 收录
下载链接:
https://hf-mirror.com/datasets/allenai/scirepeval
下载链接
链接失效反馈 | 官方服务:
资源简介:
---
dataset_info:
- config_name: biomimicry
features:
- name: doc_id
dtype: string
- name: doi
dtype: string
- name: corpus_id
dtype: uint64
- name: title
dtype: string
- name: abstract
dtype: string
- name: label
dtype: uint32
- name: venue
dtype: string
splits:
- name: evaluation
num_bytes: 16652415
num_examples: 10991
download_size: 9314032
dataset_size: 16652415
- config_name: cite_count
features:
- name: doc_id
dtype: string
- name: corpus_id
dtype: uint64
- name: title
dtype: string
- name: abstract
dtype: string
- name: venue
dtype: string
- name: n_citations
dtype: int32
- name: log_citations
dtype: float32
splits:
- name: evaluation
num_bytes: 45741032
num_examples: 30058
- name: train
num_bytes: 265390284
num_examples: 175944
- name: validation
num_bytes: 40997159
num_examples: 26830
download_size: 204760850
dataset_size: 352128475
- config_name: cite_prediction
features:
- name: query
struct:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: sha
dtype: string
- name: corpus_id
dtype: uint64
- name: pos
struct:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: sha
dtype: string
- name: corpus_id
dtype: uint64
- name: neg
struct:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: sha
dtype: string
- name: corpus_id
dtype: uint64
splits:
- name: train
num_bytes: 2582594392
num_examples: 676150
- name: validation
num_bytes: 549599739
num_examples: 143686
download_size: 1854909838
dataset_size: 3132194131
- config_name: cite_prediction_aug2023refresh
features:
- name: query
struct:
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: pos
struct:
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: neg
struct:
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
splits:
- name: train
num_bytes: 2069439948
num_examples: 475656
download_size: 1222814801
dataset_size: 2069439948
- config_name: cite_prediction_new
features:
- name: query
struct:
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: pos
struct:
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: neg
struct:
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: score
dtype: int8
splits:
- name: train
num_bytes: 23829782726
num_examples: 6197963
- name: validation
num_bytes: 609822308
num_examples: 176430
download_size: 14512970071
dataset_size: 24439605034
- config_name: drsm
features:
- name: doc_id
dtype: string
- name: corpus_id
dtype: uint64
- name: title
dtype: string
- name: abstract
dtype: string
- name: label_type
dtype: string
- name: label
dtype: string
- name: class
dtype: uint32
splits:
- name: evaluation
num_bytes: 12757612
num_examples: 8813
download_size: 7021949
dataset_size: 12757612
- config_name: feeds_1
features:
- name: query
struct:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: feed_id
dtype: string
- name: candidates
list:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: score
dtype: uint32
splits:
- name: evaluation
num_bytes: 6488182
num_examples: 423
download_size: 6911928
dataset_size: 6488182
- config_name: feeds_m
features:
- name: query
struct:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: feed_id
dtype: string
- name: candidates
list:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: score
dtype: uint32
splits:
- name: evaluation
num_bytes: 135219457
num_examples: 9025
download_size: 149126628
dataset_size: 135219457
- config_name: feeds_title
features:
- name: query
dtype: string
- name: doc_id
dtype: string
- name: feed_id
dtype: string
- name: abbreviations
dtype: string
- name: candidates
list:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: score
dtype: uint32
splits:
- name: evaluation
num_bytes: 5923757
num_examples: 424
download_size: 6228046
dataset_size: 5923757
- config_name: fos
features:
- name: doc_id
dtype: string
- name: corpus_id
dtype: uint64
- name: title
dtype: string
- name: abstract
dtype: string
- name: labels
sequence: int32
- name: labels_text
sequence: string
splits:
- name: evaluation
num_bytes: 63854253
num_examples: 68147
- name: train
num_bytes: 509154623
num_examples: 541218
- name: validation
num_bytes: 63947785
num_examples: 67631
download_size: 382411779
dataset_size: 636956661
- config_name: high_influence_cite
features:
- name: query
struct:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: candidates
list:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: score
dtype: uint32
splits:
- name: evaluation
num_bytes: 85746699
num_examples: 1199
- name: train
num_bytes: 2607643584
num_examples: 58626
- name: validation
num_bytes: 329589399
num_examples: 7356
download_size: 1622948830
dataset_size: 3022979682
- config_name: mesh_descriptors
features:
- name: doc_id
dtype: string
- name: mag_id
dtype: uint64
- name: corpus_id
dtype: uint64
- name: title
dtype: string
- name: abstract
dtype: string
- name: descriptor
dtype: string
- name: qualifier
dtype: string
splits:
- name: evaluation
num_bytes: 390178523
num_examples: 258678
- name: train
num_bytes: 3120119117
num_examples: 2069065
- name: validation
num_bytes: 390161743
num_examples: 258678
download_size: 2259106030
dataset_size: 3900459383
- config_name: nfcorpus
features:
- name: query
dtype: string
- name: doc_id
dtype: string
- name: candidates
list:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: score
dtype: uint32
splits:
- name: evaluation
num_bytes: 72184049
num_examples: 323
download_size: 37626800
dataset_size: 72184049
- config_name: paper_reviewer_matching
features:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
splits:
- name: evaluation
num_bytes: 76005977
num_examples: 73364
download_size: 41557009
dataset_size: 76005977
- config_name: peer_review_score_hIndex
features:
- name: doc_id
dtype: string
- name: corpus_id
dtype: uint64
- name: title
dtype: string
- name: abstract
dtype: string
- name: rating
sequence: int32
- name: confidence
dtype: string
- name: authors
sequence: string
- name: decision
dtype: string
- name: mean_rating
dtype: float32
- name: hIndex
sequence: string
splits:
- name: evaluation
num_bytes: 18233937
num_examples: 12668
download_size: 10163532
dataset_size: 18233937
- config_name: pub_year
features:
- name: doc_id
dtype: string
- name: corpus_id
dtype: uint64
- name: title
dtype: string
- name: abstract
dtype: string
- name: year
dtype: int32
- name: venue
dtype: string
- name: norm_year
dtype: float32
- name: scaled_year
dtype: float32
- name: n_authors
dtype: int32
- name: norm_authors
dtype: float32
splits:
- name: evaluation
num_bytes: 46195045
num_examples: 30000
- name: train
num_bytes: 301313882
num_examples: 198995
- name: validation
num_bytes: 30493617
num_examples: 19869
download_size: 224105260
dataset_size: 378002544
- config_name: relish
features:
- name: query
struct:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: int64
- name: candidates
list:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: int64
- name: score
dtype: uint32
splits:
- name: evaluation
num_bytes: 338282942
num_examples: 3190
download_size: 171723654
dataset_size: 338282942
- config_name: same_author
features:
- name: dataset
dtype: string
- name: query
struct:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: candidates
list:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: score
dtype: uint32
splits:
- name: evaluation
num_bytes: 126843745
num_examples: 13585
- name: train
num_bytes: 602167333
num_examples: 67493
- name: validation
num_bytes: 84426967
num_examples: 8996
download_size: 104055242
dataset_size: 813438045
- config_name: scidocs_mag_mesh
features:
- name: doc_id
dtype: string
- name: corpus_id
dtype: uint64
- name: title
dtype: string
- name: abstract
dtype: string
- name: authors
sequence: string
- name: cited_by
sequence: string
- name: references
sequence: string
- name: year
dtype: int32
splits:
- name: evaluation
num_bytes: 74030118
num_examples: 48473
download_size: 47773142
dataset_size: 74030118
- config_name: scidocs_view_cite_read
features:
- name: doc_id
dtype: string
- name: corpus_id
dtype: uint64
- name: title
dtype: string
- name: abstract
dtype: string
- name: authors
sequence: string
- name: cited_by
sequence: string
- name: references
sequence: string
- name: year
dtype: int32
splits:
- name: evaluation
num_bytes: 240569108
num_examples: 142009
download_size: 159403764
dataset_size: 240569108
- config_name: search
features:
- name: query
dtype: string
- name: doc_id
dtype: string
- name: candidates
list:
- name: doc_id
dtype: string
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: uint64
- name: venue
dtype: string
- name: year
dtype: float64
- name: author_names
sequence: string
- name: n_citations
dtype: int32
- name: n_key_citations
dtype: int32
- name: score
dtype: uint32
splits:
- name: evaluation
num_bytes: 39417912
num_examples: 2637
- name: train
num_bytes: 6889691036
num_examples: 399878
- name: validation
num_bytes: 1221360738
num_examples: 75382
download_size: 4495463131
dataset_size: 8150469686
- config_name: trec_covid
features:
- name: query
dtype: string
- name: doc_id
dtype: string
- name: candidates
list:
- name: title
dtype: string
- name: abstract
dtype: string
- name: corpus_id
dtype: string
- name: doc_id
dtype: string
- name: date
dtype: string
- name: doi
dtype: string
- name: iteration
dtype: string
- name: score
dtype: int32
splits:
- name: evaluation
num_bytes: 98757931
num_examples: 50
download_size: 52359825
dataset_size: 98757931
- config_name: tweet_mentions
features:
- name: doc_id
dtype: string
- name: corpus_id
dtype: uint64
- name: title
dtype: string
- name: abstract
dtype: string
- name: index
dtype: int32
- name: retweets
dtype: float32
- name: count
dtype: int32
- name: mentions
dtype: float32
splits:
- name: evaluation
num_bytes: 25895172
num_examples: 25655
download_size: 14991004
dataset_size: 25895172
configs:
- config_name: biomimicry
data_files:
- split: evaluation
path: biomimicry/evaluation-*
- config_name: cite_count
data_files:
- split: evaluation
path: cite_count/evaluation-*
- split: train
path: cite_count/train-*
- split: validation
path: cite_count/validation-*
- config_name: cite_prediction
data_files:
- split: train
path: cite_prediction/train-*
- split: validation
path: cite_prediction/validation-*
- config_name: cite_prediction_aug2023refresh
data_files:
- split: train
path: cite_prediction_aug2023refresh/train-*
- config_name: cite_prediction_new
data_files:
- split: train
path: cite_prediction_new/train-*
- split: validation
path: cite_prediction_new/validation-*
- config_name: drsm
data_files:
- split: evaluation
path: drsm/evaluation-*
- config_name: fos
data_files:
- split: evaluation
path: fos/evaluation-*
- split: train
path: fos/train-*
- split: validation
path: fos/validation-*
- config_name: high_influence_cite
data_files:
- split: evaluation
path: high_influence_cite/evaluation-*
- split: train
path: high_influence_cite/train-*
- split: validation
path: high_influence_cite/validation-*
- config_name: mesh_descriptors
data_files:
- split: evaluation
path: mesh_descriptors/evaluation-*
- split: train
path: mesh_descriptors/train-*
- split: validation
path: mesh_descriptors/validation-*
- config_name: nfcorpus
data_files:
- split: evaluation
path: nfcorpus/evaluation-*
- config_name: paper_reviewer_matching
data_files:
- split: evaluation
path: paper_reviewer_matching/evaluation-*
- config_name: peer_review_score_hIndex
data_files:
- split: evaluation
path: peer_review_score_hIndex/evaluation-*
- config_name: pub_year
data_files:
- split: evaluation
path: pub_year/evaluation-*
- split: train
path: pub_year/train-*
- split: validation
path: pub_year/validation-*
- config_name: relish
data_files:
- split: evaluation
path: relish/evaluation-*
- config_name: same_author
data_files:
- split: evaluation
path: same_author/evaluation-*
- split: train
path: same_author/train-*
- split: validation
path: same_author/validation-*
- config_name: scidocs_mag_mesh
data_files:
- split: evaluation
path: scidocs_mag_mesh/evaluation-*
- config_name: scidocs_view_cite_read
data_files:
- split: evaluation
path: scidocs_view_cite_read/evaluation-*
- config_name: search
data_files:
- split: evaluation
path: search/evaluation-*
- split: train
path: search/train-*
- split: validation
path: search/validation-*
- config_name: trec_covid
data_files:
- split: evaluation
path: trec_covid/evaluation-*
- config_name: tweet_mentions
data_files:
- split: evaluation
path: tweet_mentions/evaluation-*
---
提供机构:
allenai
原始信息汇总
数据集概述
数据集配置
1. biomimicry
- 特征:
doc_id: string, doi: string, corpus_id: uint64, title: string, abstract: string, label: uint32, venue: string
- 分割:
evaluation: 16652415 bytes, 10991 examples
- 下载大小: 9314032 bytes
- 数据集大小: 16652415 bytes
2. cite_count
- 特征:
doc_id: string, corpus_id: uint64, title: string, abstract: string, venue: string, n_citations: int32, log_citations: float32
- 分割:
evaluation: 45741032 bytes, 30058 examples; train: 265390284 bytes, 175944 examples; validation: 40997159 bytes, 26830 examples
- 下载大小: 204760850 bytes
- 数据集大小: 352128475 bytes
3. cite_prediction
- 特征:
query: struct {doc_id: string, title: string, abstract: string, sha: string, corpus_id: uint64}
pos: struct {doc_id: string, title: string, abstract: string, sha: string, corpus_id: uint64}
neg: struct {doc_id: string, title: string, abstract: string, sha: string, corpus_id: uint64}
- 分割:
train: 2582594392 bytes, 676150 examples; validation: 549599739 bytes, 143686 examples
- 下载大小: 1854909838 bytes
- 数据集大小: 3132194131 bytes
4. cite_prediction_aug2023refresh
- 特征:
query: struct {title: string, abstract: string, corpus_id: uint64}
pos: struct {title: string, abstract: string, corpus_id: uint64}
neg: struct {title: string, abstract: string, corpus_id: uint64}
- 分割:
train: 2069439948 bytes, 475656 examples
- 下载大小: 1222814801 bytes
- 数据集大小: 2069439948 bytes
5. cite_prediction_new
- 特征:
query: struct {title: string, abstract: string, corpus_id: uint64}
pos: struct {title: string, abstract: string, corpus_id: uint64}
neg: struct {title: string, abstract: string, corpus_id: uint64, score: int8}
- 分割:
train: 23829782726 bytes, 6197963 examples; validation: 609822308 bytes, 176430 examples
- 下载大小: 14512970071 bytes
- 数据集大小: 24439605034 bytes
6. drsm
- 特征:
doc_id: string, corpus_id: uint64, title: string, abstract: string, label_type: string, label: string, class: uint32
- 分割:
evaluation: 12757612 bytes, 8813 examples
- 下载大小: 7021949 bytes
- 数据集大小: 12757612 bytes
7. feeds_1
- 特征:
query: struct {doc_id: string, title: string, abstract: string, corpus_id: uint64}
feed_id: string, candidates: list {doc_id: string, title: string, abstract: string, corpus_id: uint64, score: uint32}
- 分割:
evaluation: 6488182 bytes, 423 examples
- 下载大小: 6911928 bytes
- 数据集大小: 6488182 bytes
8. feeds_m
- 特征:
query: struct {doc_id: string, title: string, abstract: string, corpus_id: uint64}
feed_id: string, candidates: list {doc_id: string, title: string, abstract: string, corpus_id: uint64, score: uint32}
- 分割:
evaluation: 135219457 bytes, 9025 examples
- 下载大小: 149126628 bytes
- 数据集大小: 135219457 bytes
9. feeds_title
- 特征:
query: string, doc_id: string, feed_id: string, abbreviations: string, candidates: list {doc_id: string, title: string, abstract: string, corpus_id: uint64, score: uint32}
- 分割:
evaluation: 5923757 bytes, 424 examples
- 下载大小: 6228046 bytes
- 数据集大小: 5923757 bytes
10. fos
- 特征:
doc_id: string, corpus_id: uint64, title: string, abstract: string, labels: sequence of int32, labels_text: sequence of string
- 分割:
evaluation: 63854253 bytes, 68147 examples; train: 509154623 bytes, 541218 examples; validation: 63947785 bytes, 67631 examples
- 下载大小: 382411779 bytes
- 数据集大小: 636956661 bytes
11. high_influence_cite
- 特征:
query: struct {doc_id: string, title: string, abstract: string, corpus_id: uint64}
candidates: list {doc_id: string, title: string, abstract: string, corpus_id: uint64, score: uint32}
- 分割:
evaluation: 85746699 bytes, 1199 examples; train: 2607643584 bytes, 58626 examples; validation: 329589399 bytes, 7356 examples
- 下载大小: 1622948830 bytes
- 数据集大小: 3022979682 bytes
12. mesh_descriptors
- 特征:
doc_id: string, mag_id: uint64, corpus_id: uint64, title: string, abstract: string, descriptor: string, qualifier: string
- 分割:
evaluation: 390178523 bytes, 258678 examples; train: 3120119117 bytes, 2069065 examples; validation: 390161743 bytes, 258678 examples
- 下载大小: 2259106030 bytes
- 数据集大小: 3900459383 bytes
13. nfcorpus
- 特征:
query: string, doc_id: string, candidates: list {doc_id: string, title: string, abstract: string, score: uint32}
- 分割:
evaluation: 72184049 bytes, 323 examples
- 下载大小: 37626800 bytes
- 数据集大小: 72184049 bytes
14. paper_reviewer_matching
- 特征:
doc_id: string, title: string, abstract: string, corpus_id: uint64
- 分割:
evaluation: 76005977 bytes, 73364 examples
- 下载大小: 41557009 bytes
- 数据集大小: 76005977 bytes
15. peer_review_score_hIndex
- 特征:
doc_id: string, corpus_id: uint64, title: string, abstract: string, rating: sequence of int32, confidence: string, authors: sequence of string, decision: string, mean_rating: float32, hIndex: sequence of string
- 分割:
evaluation: 18233937 bytes, 12668 examples
- 下载大小: 10163532 bytes
- 数据集大小: 18233937 bytes
16. pub_year
- 特征:
doc_id: string, corpus_id: uint64, title: string, abstract: string, year: int32, venue: string, norm_year: float32, scaled_year: float32, n_authors: int32, norm_authors: float32
- 分割:
evaluation: 46195045 bytes, 30000 examples; train: 301313882 bytes, 198995 examples; validation: 30493617 bytes, 19869 examples
- 下载大小: 224105260 bytes
- 数据集大小: 378002544 bytes
17. relish
- 特征:
query: struct {doc_id: string, title: string, abstract: string, corpus_id: int64}
candidates: list {doc_id: string, title: string, abstract: string, corpus_id: int64, score: uint32}
- 分割:
evaluation: 338282942 bytes, 3190 examples
- 下载大小: 171723654 bytes
- 数据集大小: 338282942 bytes
18. same_author
- 特征:
dataset: string, query: struct {doc_id: string, title: string, abstract: string, corpus_id: uint64}
candidates: list {doc_id: string, title: string, abstract: string, corpus_id: uint64, score: uint32}
- 分割:
evaluation: 126843745 bytes, 13585 examples; train: 602167333 bytes, 67493 examples; validation: 84426967 bytes, 8996 examples
- 下载大小: 104055242 bytes
- 数据集大小: 813438045 bytes
19. scidocs_mag_mesh
- 特征:
doc_id: string, corpus_id: uint64, title: string, abstract: string, authors: sequence of string, cited_by: sequence of string, references: sequence of string, year: int32
- 分割:
evaluation: 74030118 bytes, 48473 examples
- 下载大小: 47773142 bytes
- 数据集大小: 74030118 bytes
20. scidocs_view_cite_read
- 特征:
doc_id: string, corpus_id: uint64, title: string, abstract: string, authors: sequence of string, cited_by: sequence of string, references: sequence of string, year: int32
- 分割:
evaluation: 240569108 bytes, 142009 examples
- 下载大小: 159403764 bytes
- 数据集大小: 240569108 bytes
21. search
- 特征:
query: string, doc_id: string, candidates: list {doc_id: string, title: string, abstract: string, corpus_id: uint64, venue: string, year: float64, author_names: sequence of string, n_citations: int32, n_key_citations: int32, score: uint32}
- 分割:
evaluation: 39417912 bytes, 2637 examples; train: 6889691036 bytes, 399878 examples; validation: 1221360738 bytes, 75382 examples
- 下载大小: 4495463131 bytes
- 数据集大小: 8150469686 bytes
22. trec_covid
- 特征:
query: string, doc_id: string, candidates: list {title: string, abstract: string, corpus_id: string, doc_id: string, date: string, doi: string, iteration: string, score: int32}
- 分割:
evaluation: 98757931 bytes, 50 examples
- 下载大小: 52359825 bytes
- 数据集大小: 98757931 bytes
23. tweet_mentions
- 特征:
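doc_id: string, corpus_id: uint64, title: string, abstract: string, index: int32, retweets: float32, count: int32, mentions: float32
- 分割:
evaluation: 25895172 bytes, 25655 examples
- 下载大小: 14991004 bytes
- 数据集大小: 25895172 bytes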



