Shuu12121/coir_hard_negative_datasets_kd
收藏Hugging Face2026-03-29 更新2026-04-12 收录
下载链接:
https://hf-mirror.com/datasets/Shuu12121/coir_hard_negative_datasets_kd
下载链接
链接失效反馈官方服务:
资源简介:
---
dataset_info:
- config_name: documents_apps-python
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 5252033
num_examples: 8755
download_size: 2441099
dataset_size: 5252033
- config_name: documents_codefeedback-mt-mixed
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 98959739
num_examples: 66371
download_size: 50614519
dataset_size: 98959739
- config_name: documents_codefeedback-st-mixed
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 241780674
num_examples: 156493
download_size: 119840558
dataset_size: 241780674
- config_name: documents_codesearchnet-ccr-go
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 35151378
num_examples: 182608
download_size: 16033868
dataset_size: 35151378
- config_name: documents_codesearchnet-ccr-java
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 49663715
num_examples: 180814
download_size: 19717956
dataset_size: 49663715
- config_name: documents_codesearchnet-ccr-javascript
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 19241952
num_examples: 65124
download_size: 8448048
dataset_size: 19241952
- config_name: documents_codesearchnet-ccr-php
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 70931762
num_examples: 268139
download_size: 28062510
dataset_size: 70931762
- config_name: documents_codesearchnet-ccr-python
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 108300841
num_examples: 276732
download_size: 44279546
dataset_size: 108300841
- config_name: documents_codesearchnet-ccr-ruby
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 5597619
num_examples: 27580
download_size: 2558547
dataset_size: 5597619
- config_name: documents_codesearchnet-go
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 21448139
num_examples: 182394
download_size: 9695160
dataset_size: 21448139
- config_name: documents_codesearchnet-java
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 35998195
num_examples: 180834
download_size: 14939015
dataset_size: 35998195
- config_name: documents_codesearchnet-javascript
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 13142849
num_examples: 64839
download_size: 5907776
dataset_size: 13142849
- config_name: documents_codesearchnet-php
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 49202162
num_examples: 267701
download_size: 20971743
dataset_size: 49202162
- config_name: documents_codesearchnet-python
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 79455238
num_examples: 280140
download_size: 32521978
dataset_size: 79455238
- config_name: documents_codesearchnet-ruby
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 6921123
num_examples: 27569
download_size: 3089665
dataset_size: 6921123
- config_name: documents_codetrans-contest-mixed
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 1541254
num_examples: 1008
download_size: 657617
dataset_size: 1541254
- config_name: documents_codetrans-dl-mixed
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 443046
num_examples: 266
download_size: 128629
dataset_size: 443046
- config_name: documents_cosqa-python
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 2072945
num_examples: 6267
download_size: 1106059
dataset_size: 2072945
- config_name: documents_stackoverflow-qa-mixed
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 24429289
num_examples: 19930
download_size: 13175543
dataset_size: 24429289
- config_name: documents_synthetic-text2sql-sql
features:
- name: document_id
dtype: string
- name: document
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 15668333
num_examples: 105851
download_size: 7085169
dataset_size: 15668333
- config_name: queries_apps-python
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 6410447
num_examples: 5000
download_size: 3263749
dataset_size: 6410447
- config_name: queries_codefeedback-mt-mixed
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 235947161
num_examples: 53106
download_size: 99226011
dataset_size: 235947161
- config_name: queries_codefeedback-st-mixed
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 93421835
num_examples: 125220
download_size: 46801238
dataset_size: 93421835
- config_name: queries_codesearchnet-ccr-go
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 43207673
num_examples: 167288
download_size: 18678358
dataset_size: 43207673
- config_name: queries_codesearchnet-ccr-java
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 62513115
num_examples: 164923
download_size: 23948866
dataset_size: 62513115
- config_name: queries_codesearchnet-ccr-javascript
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 23515337
num_examples: 58025
download_size: 10128890
dataset_size: 23515337
- config_name: queries_codesearchnet-ccr-php
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 87492528
num_examples: 241241
download_size: 33347946
dataset_size: 87492528
- config_name: queries_codesearchnet-ccr-python
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 137102794
num_examples: 251820
download_size: 57069617
dataset_size: 137102794
- config_name: queries_codesearchnet-ccr-ruby
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 6757179
num_examples: 24927
download_size: 3053272
dataset_size: 6757179
- config_name: queries_codesearchnet-go
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 72165539
num_examples: 167288
download_size: 29465690
dataset_size: 72165539
- config_name: queries_codesearchnet-java
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 104328618
num_examples: 164923
download_size: 37258933
dataset_size: 104328618
- config_name: queries_codesearchnet-javascript
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 38975411
num_examples: 58025
download_size: 15762412
dataset_size: 38975411
- config_name: queries_codesearchnet-php
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 146123377
num_examples: 241241
download_size: 51891853
dataset_size: 146123377
- config_name: queries_codesearchnet-python
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 229222143
num_examples: 251820
download_size: 89277813
dataset_size: 229222143
- config_name: queries_codesearchnet-ruby
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 11273759
num_examples: 24927
download_size: 4742042
dataset_size: 11273759
- config_name: queries_codetrans-contest-mixed
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 401024
num_examples: 561
download_size: 196326
dataset_size: 401024
- config_name: queries_codetrans-dl-mixed
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 789134
num_examples: 564
download_size: 81011
dataset_size: 789134
- config_name: queries_cosqa-python
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 547571
num_examples: 9020
download_size: 253852
dataset_size: 547571
- config_name: queries_stackoverflow-qa-mixed
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 19787799
num_examples: 13951
download_size: 9951440
dataset_size: 19787799
- config_name: queries_synthetic-text2sql-sql
features:
- name: query_id
dtype: string
- name: query
dtype: string
- name: split
dtype: string
splits:
- name: train
num_bytes: 10403642
num_examples: 100000
download_size: 5039559
dataset_size: 10403642
- config_name: scores_apps-python
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 8645114
num_examples: 5000
download_size: 5002371
dataset_size: 8645114
- config_name: scores_codefeedback-mt-mixed
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 97074328
num_examples: 53106
download_size: 65747726
dataset_size: 97074328
- config_name: scores_codefeedback-st-mixed
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 234412832
num_examples: 125220
download_size: 169380932
dataset_size: 234412832
- config_name: scores_codesearchnet-ccr-go
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 314698366
num_examples: 167288
download_size: 202321699
dataset_size: 314698366
- config_name: scores_codesearchnet-ccr-java
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 310862243
num_examples: 164923
download_size: 200693853
dataset_size: 310862243
- config_name: scores_codesearchnet-ccr-javascript
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 106362070
num_examples: 58025
download_size: 66087872
dataset_size: 106362070
- config_name: scores_codesearchnet-ccr-php
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 458913278
num_examples: 241241
download_size: 317444974
dataset_size: 458913278
- config_name: scores_codesearchnet-ccr-python
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 480385067
num_examples: 251820
download_size: 303193068
dataset_size: 480385067
- config_name: scores_codesearchnet-ccr-ruby
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 44971695
num_examples: 24927
download_size: 26739998
dataset_size: 44971695
- config_name: scores_codesearchnet-go
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 314746146
num_examples: 167288
download_size: 208168428
dataset_size: 314746146
- config_name: scores_codesearchnet-java
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 310749085
num_examples: 164923
download_size: 205058131
dataset_size: 310749085
- config_name: scores_codesearchnet-javascript
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 106335220
num_examples: 58025
download_size: 67841144
dataset_size: 106335220
- config_name: scores_codesearchnet-php
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 458901659
num_examples: 241241
download_size: 320873447
dataset_size: 458901659
- config_name: scores_codesearchnet-python
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 480603219
num_examples: 251820
download_size: 317376169
dataset_size: 480603219
- config_name: scores_codesearchnet-ruby
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 44912420
num_examples: 24927
download_size: 26981910
dataset_size: 44912420
- config_name: scores_codetrans-contest-mixed
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 917468
num_examples: 561
download_size: 522522
dataset_size: 917468
- config_name: scores_codetrans-dl-mixed
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 898326
num_examples: 564
download_size: 263988
dataset_size: 898326
- config_name: scores_cosqa-python
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 15839259
num_examples: 9020
download_size: 8894704
dataset_size: 15839259
- config_name: scores_stackoverflow-qa-mixed
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 24935270
num_examples: 13951
download_size: 15047920
dataset_size: 24935270
- config_name: scores_synthetic-text2sql-sql
features:
- name: query_id
dtype: string
- name: document_ids
sequence: string
- name: scores
sequence: float64
- name: split
dtype: string
splits:
- name: train
num_bytes: 183789209
num_examples: 100000
download_size: 128552991
dataset_size: 183789209
configs:
- config_name: documents_apps-python
data_files:
- split: train
path: documents_apps-python/train-*
- config_name: documents_codefeedback-mt-mixed
data_files:
- split: train
path: documents_codefeedback-mt-mixed/train-*
- config_name: documents_codefeedback-st-mixed
data_files:
- split: train
path: documents_codefeedback-st-mixed/train-*
- config_name: documents_codesearchnet-ccr-go
data_files:
- split: train
path: documents_codesearchnet-ccr-go/train-*
- config_name: documents_codesearchnet-ccr-java
data_files:
- split: train
path: documents_codesearchnet-ccr-java/train-*
- config_name: documents_codesearchnet-ccr-javascript
data_files:
- split: train
path: documents_codesearchnet-ccr-javascript/train-*
- config_name: documents_codesearchnet-ccr-php
data_files:
- split: train
path: documents_codesearchnet-ccr-php/train-*
- config_name: documents_codesearchnet-ccr-python
data_files:
- split: train
path: documents_codesearchnet-ccr-python/train-*
- config_name: documents_codesearchnet-ccr-ruby
data_files:
- split: train
path: documents_codesearchnet-ccr-ruby/train-*
- config_name: documents_codesearchnet-go
data_files:
- split: train
path: documents_codesearchnet-go/train-*
- config_name: documents_codesearchnet-java
data_files:
- split: train
path: documents_codesearchnet-java/train-*
- config_name: documents_codesearchnet-javascript
data_files:
- split: train
path: documents_codesearchnet-javascript/train-*
- config_name: documents_codesearchnet-php
data_files:
- split: train
path: documents_codesearchnet-php/train-*
- config_name: documents_codesearchnet-python
data_files:
- split: train
path: documents_codesearchnet-python/train-*
- config_name: documents_codesearchnet-ruby
data_files:
- split: train
path: documents_codesearchnet-ruby/train-*
- config_name: documents_codetrans-contest-mixed
data_files:
- split: train
path: documents_codetrans-contest-mixed/train-*
- config_name: documents_codetrans-dl-mixed
data_files:
- split: train
path: documents_codetrans-dl-mixed/train-*
- config_name: documents_cosqa-python
data_files:
- split: train
path: documents_cosqa-python/train-*
- config_name: documents_stackoverflow-qa-mixed
data_files:
- split: train
path: documents_stackoverflow-qa-mixed/train-*
- config_name: documents_synthetic-text2sql-sql
data_files:
- split: train
path: documents_synthetic-text2sql-sql/train-*
- config_name: queries_apps-python
data_files:
- split: train
path: queries_apps-python/train-*
- config_name: queries_codefeedback-mt-mixed
data_files:
- split: train
path: queries_codefeedback-mt-mixed/train-*
- config_name: queries_codefeedback-st-mixed
data_files:
- split: train
path: queries_codefeedback-st-mixed/train-*
- config_name: queries_codesearchnet-ccr-go
data_files:
- split: train
path: queries_codesearchnet-ccr-go/train-*
- config_name: queries_codesearchnet-ccr-java
data_files:
- split: train
path: queries_codesearchnet-ccr-java/train-*
- config_name: queries_codesearchnet-ccr-javascript
data_files:
- split: train
path: queries_codesearchnet-ccr-javascript/train-*
- config_name: queries_codesearchnet-ccr-php
data_files:
- split: train
path: queries_codesearchnet-ccr-php/train-*
- config_name: queries_codesearchnet-ccr-python
data_files:
- split: train
path: queries_codesearchnet-ccr-python/train-*
- config_name: queries_codesearchnet-ccr-ruby
data_files:
- split: train
path: queries_codesearchnet-ccr-ruby/train-*
- config_name: queries_codesearchnet-go
data_files:
- split: train
path: queries_codesearchnet-go/train-*
- config_name: queries_codesearchnet-java
data_files:
- split: train
path: queries_codesearchnet-java/train-*
- config_name: queries_codesearchnet-javascript
data_files:
- split: train
path: queries_codesearchnet-javascript/train-*
- config_name: queries_codesearchnet-php
data_files:
- split: train
path: queries_codesearchnet-php/train-*
- config_name: queries_codesearchnet-python
data_files:
- split: train
path: queries_codesearchnet-python/train-*
- config_name: queries_codesearchnet-ruby
data_files:
- split: train
path: queries_codesearchnet-ruby/train-*
- config_name: queries_codetrans-contest-mixed
data_files:
- split: train
path: queries_codetrans-contest-mixed/train-*
- config_name: queries_codetrans-dl-mixed
data_files:
- split: train
path: queries_codetrans-dl-mixed/train-*
- config_name: queries_cosqa-python
data_files:
- split: train
path: queries_cosqa-python/train-*
- config_name: queries_stackoverflow-qa-mixed
data_files:
- split: train
path: queries_stackoverflow-qa-mixed/train-*
- config_name: queries_synthetic-text2sql-sql
data_files:
- split: train
path: queries_synthetic-text2sql-sql/train-*
- config_name: scores_apps-python
data_files:
- split: train
path: scores_apps-python/train-*
- config_name: scores_codefeedback-mt-mixed
data_files:
- split: train
path: scores_codefeedback-mt-mixed/train-*
- config_name: scores_codefeedback-st-mixed
data_files:
- split: train
path: scores_codefeedback-st-mixed/train-*
- config_name: scores_codesearchnet-ccr-go
data_files:
- split: train
path: scores_codesearchnet-ccr-go/train-*
- config_name: scores_codesearchnet-ccr-java
data_files:
- split: train
path: scores_codesearchnet-ccr-java/train-*
- config_name: scores_codesearchnet-ccr-javascript
data_files:
- split: train
path: scores_codesearchnet-ccr-javascript/train-*
- config_name: scores_codesearchnet-ccr-php
data_files:
- split: train
path: scores_codesearchnet-ccr-php/train-*
- config_name: scores_codesearchnet-ccr-python
data_files:
- split: train
path: scores_codesearchnet-ccr-python/train-*
- config_name: scores_codesearchnet-ccr-ruby
data_files:
- split: train
path: scores_codesearchnet-ccr-ruby/train-*
- config_name: scores_codesearchnet-go
data_files:
- split: train
path: scores_codesearchnet-go/train-*
- config_name: scores_codesearchnet-java
data_files:
- split: train
path: scores_codesearchnet-java/train-*
- config_name: scores_codesearchnet-javascript
data_files:
- split: train
path: scores_codesearchnet-javascript/train-*
- config_name: scores_codesearchnet-php
data_files:
- split: train
path: scores_codesearchnet-php/train-*
- config_name: scores_codesearchnet-python
data_files:
- split: train
path: scores_codesearchnet-python/train-*
- config_name: scores_codesearchnet-ruby
data_files:
- split: train
path: scores_codesearchnet-ruby/train-*
- config_name: scores_codetrans-contest-mixed
data_files:
- split: train
path: scores_codetrans-contest-mixed/train-*
- config_name: scores_codetrans-dl-mixed
data_files:
- split: train
path: scores_codetrans-dl-mixed/train-*
- config_name: scores_cosqa-python
data_files:
- split: train
path: scores_cosqa-python/train-*
- config_name: scores_stackoverflow-qa-mixed
data_files:
- split: train
path: scores_stackoverflow-qa-mixed/train-*
- config_name: scores_synthetic-text2sql-sql
data_files:
- split: train
path: scores_synthetic-text2sql-sql/train-*
---
提供机构:
Shuu12121



