jenyag/repo-code-completion
收藏 · Hugging Face · 2024-01-18 更新 · 2024-03-04 收录
下载链接:
https://hf-mirror.com/datasets/jenyag/repo-code-completion
下载链接
链接失效反馈官方服务:
资源简介:
---
license: apache-2.0
dataset_info:
- config_name: alphabetical_composer_all_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 590554966
num_examples: 224
download_size: 236538429
dataset_size: 590554966
- config_name: alphabetical_composer_non_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 560157388
num_examples: 224
download_size: 226511858
dataset_size: 560157388
- config_name: alphabetical_composer_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 114370147
num_examples: 224
download_size: 22096586
dataset_size: 114370147
- config_name: file_length_composer_all_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 590554966
num_examples: 224
download_size: 239093262
dataset_size: 590554966
- config_name: file_length_composer_non_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 560157388
num_examples: 224
download_size: 228632512
dataset_size: 560157388
- config_name: file_length_composer_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 114370147
num_examples: 224
download_size: 22181715
dataset_size: 114370147
- config_name: function_class_mask_half_composer_all_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 316335006
num_examples: 224
download_size: 0
dataset_size: 316335006
- config_name: function_class_mask_half_composer_non_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 315664977
num_examples: 224
download_size: 127938122
dataset_size: 315664977
- config_name: function_class_mask_half_composer_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 101260211
num_examples: 224
download_size: 17862587
dataset_size: 101260211
- config_name: function_class_mask_one_composer_all_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 90116249
num_examples: 224
download_size: 13554986
dataset_size: 90116249
- config_name: function_class_mask_one_composer_non_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 105054619
num_examples: 224
download_size: 15624970
dataset_size: 105054619
- config_name: function_class_mask_one_composer_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 87046937
num_examples: 224
download_size: 12999652
dataset_size: 87046937
- config_name: half_memory_composer_all_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 334960024
num_examples: 224
download_size: 123799195
dataset_size: 334960024
- config_name: half_memory_composer_non_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 311325289
num_examples: 224
download_size: 115444406
dataset_size: 311325289
- config_name: half_memory_composer_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 99351776
num_examples: 224
download_size: 18008844
dataset_size: 99351776
- config_name: imports_first_composer_all_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 590554966
num_examples: 224
download_size: 236389259
dataset_size: 590554966
- config_name: imports_first_composer_non_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 560157388
num_examples: 224
download_size: 226465503
dataset_size: 560157388
- config_name: imports_first_composer_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 114370147
num_examples: 224
download_size: 22077336
dataset_size: 114370147
- config_name: naive_composer_all_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 590554966
num_examples: 224
download_size: 236382094
dataset_size: 590554966
- config_name: naive_composer_non_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 560157388
num_examples: 224
download_size: 226480268
dataset_size: 560157388
- config_name: naive_composer_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 114370147
num_examples: 224
download_size: 22084803
dataset_size: 114370147
- config_name: path_distance_composer_all_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 590554966
num_examples: 224
download_size: 236585246
dataset_size: 590554966
- config_name: path_distance_composer_non_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 560157388
num_examples: 224
download_size: 226460548
dataset_size: 560157388
- config_name: path_distance_composer_py_context
features:
- name: repo_id
dtype: int64
- name: repo_name
dtype: string
- name: project_context
dtype: string
- name: file_context
list:
- name: content
dtype: string
- name: type
dtype: string
- name: gt
sequence: string
- name: metainfo_separator
dtype: string
splits:
- name: test
num_bytes: 114370147
num_examples: 224
download_size: 22014753
dataset_size: 114370147
- config_name: function_class_mask_half_composer_all_context
data_files:
- split: test
path: data/function_class_mask_half_composer/all_context/test-*
- config_name: function_class_mask_half_composer_non_py_context
data_files:
- split: test
path: data/function_class_mask_half_composer/non_py_context/test-*
- config_name: function_class_mask_half_composer_py_context
data_files:
- split: test
path: data/function_class_mask_half_composer/py_context/test-*
- config_name: imports_first_composer_all_context
data_files:
- split: test
path: data/imports_first_composer/all_context/test-*
- config_name: imports_first_composer_non_py_context
data_files:
- split: test
path: data/imports_first_composer/non_py_context/test-*
- config_name: imports_first_composer_py_context
data_files:
- split: test
path: data/imports_first_composer/py_context/test-*
- config_name: alphabetical_composer_all_context
data_files:
- split: test
path: data/alphabetical_composer/all_context/test-*
- config_name: alphabetical_composer_non_py_context
data_files:
- split: test
path: data/alphabetical_composer/non_py_context/test-*
- config_name: alphabetical_composer_py_context
data_files:
- split: test
path: data/alphabetical_composer/py_context/test-*
- config_name: naive_composer_all_context
data_files:
- split: test
path: data/naive_composer/all_context/test-*
- config_name: naive_composer_non_py_context
data_files:
- split: test
path: data/naive_composer/non_py_context/test-*
- config_name: naive_composer_py_context
data_files:
- split: test
path: data/naive_composer/py_context/test-*
- config_name: path_distance_composer_all_context
data_files:
- split: test
path: data/path_distance_composer/all_context/test-*
- config_name: path_distance_composer_non_py_context
data_files:
- split: test
path: data/path_distance_composer/non_py_context/test-*
- config_name: path_distance_composer_py_context
data_files:
- split: test
path: data/path_distance_composer/py_context/test-*
default: true
- config_name: file_length_composer_all_context
data_files:
- split: test
path: data/file_length_composer/all_context/test-*
- config_name: file_length_composer_non_py_context
data_files:
- split: test
path: data/file_length_composer/non_py_context/test-*
- config_name: file_length_composer_py_context
data_files:
- split: test
path: data/file_length_composer/py_context/test-*
- config_name: half_memory_composer_all_context
data_files:
- split: test
path: data/half_memory_composer/all_context/test-*
- config_name: half_memory_composer_non_py_context
data_files:
- split: test
path: data/half_memory_composer/non_py_context/test-*
- config_name: half_memory_composer_py_context
data_files:
- split: test
path: data/half_memory_composer/py_context/test-*
- config_name: function_class_mask_one_composer_all_context
data_files:
- split: test
path: data/function_class_mask_one_composer/all_context/test-*
- config_name: function_class_mask_one_composer_non_py_context
data_files:
- split: test
path: data/function_class_mask_one_composer/non_py_context/test-*
- config_name: function_class_mask_one_composer_py_context
data_files:
- split: test
path: data/function_class_mask_one_composer/py_context/test-*
---
# Repository Level Code Completion Dataset for Evaluation
This is a dataset of repository snapshots taken just before a commit in which a Python file was added. The task is to complete the added file, given the content of the repository composed in different ways.
## How to load the data
1. via [`load_dataset`](https://huggingface.co/docs/datasets/v2.14.3/en/package_reference/loading_methods#datasets.load_dataset):
```
from datasets import load_dataset
data_files = "data/path_distance_composer/py_context/test-*"  # or any other path from the table below
dataset = load_dataset("jenyag/repo-code-completion", data_files=data_files, split="train")
```
#### Options for `data_files`:
| | **all_context** | **non_py_context** | **py_context** |
|----|----|----|----|
| **function class mask half composer** | data/function_class_mask_half_composer/all_context/test-* | data/function_class_mask_half_composer/non_py_context/test-* | data/function_class_mask_half_composer/py_context/test-* |
| **imports first composer** | data/imports_first_composer/all_context/test-* | data/imports_first_composer/non_py_context/test-* | data/imports_first_composer/py_context/test-* |
| **alphabetical composer** | data/alphabetical_composer/all_context/test-* | data/alphabetical_composer/non_py_context/test-* | data/alphabetical_composer/py_context/test-* |
| **naive composer** | data/naive_composer/all_context/test-* | data/naive_composer/non_py_context/test-* | data/naive_composer/py_context/test-* |
| **path distance composer** | data/path_distance_composer/all_context/test-* | data/path_distance_composer/non_py_context/test-* | data/path_distance_composer/py_context/test-* |
| **file length composer** | data/file_length_composer/all_context/test-* | data/file_length_composer/non_py_context/test-* | data/file_length_composer/py_context/test-* |
| **half memory composer** | data/half_memory_composer/all_context/test-* | data/half_memory_composer/non_py_context/test-* | data/half_memory_composer/py_context/test-* |
| **function class mask one composer** | data/function_class_mask_one_composer/all_context/test-* | data/function_class_mask_one_composer/non_py_context/test-* | data/function_class_mask_one_composer/py_context/test-* |
## How to get the full context for a specific line
```
for datapoint in dataset:
    project_context = datapoint['project_context']  # The project context may be quite long
    for file_context_dict, ground_truth in zip(datapoint['file_context'], datapoint['gt']):
        file_context = file_context_dict['content']
        full_context = project_context + file_context
```
提供机构:
jenyag
原始信息汇总
数据集概述
数据集配置
配置名称:alphabetical_composer_all_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 590554966, num_examples: 224
- 下载大小:236538429
- 数据集大小:590554966
配置名称:alphabetical_composer_non_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 560157388, num_examples: 224
- 下载大小:226511858
- 数据集大小:560157388
配置名称:alphabetical_composer_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 114370147, num_examples: 224
- 下载大小:22096586
- 数据集大小:114370147
配置名称:file_length_composer_all_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 590554966, num_examples: 224
- 下载大小:239093262
- 数据集大小:590554966
配置名称:file_length_composer_non_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 560157388, num_examples: 224
- 下载大小:228632512
- 数据集大小:560157388
配置名称:file_length_composer_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 114370147, num_examples: 224
- 下载大小:22181715
- 数据集大小:114370147
配置名称:function_class_mask_half_composer_all_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 316335006, num_examples: 224
- 下载大小:0
- 数据集大小:316335006
配置名称:function_class_mask_half_composer_non_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 315664977, num_examples: 224
- 下载大小:127938122
- 数据集大小:315664977
配置名称:function_class_mask_half_composer_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 101260211, num_examples: 224
- 下载大小:17862587
- 数据集大小:101260211
配置名称:function_class_mask_one_composer_all_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 90116249, num_examples: 224
- 下载大小:13554986
- 数据集大小:90116249
配置名称:function_class_mask_one_composer_non_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 105054619, num_examples: 224
- 下载大小:15624970
- 数据集大小:105054619
配置名称:function_class_mask_one_composer_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 87046937, num_examples: 224
- 下载大小:12999652
- 数据集大小:87046937
配置名称:half_memory_composer_all_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 334960024, num_examples: 224
- 下载大小:123799195
- 数据集大小:334960024
配置名称:half_memory_composer_non_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 311325289, num_examples: 224
- 下载大小:115444406
- 数据集大小:311325289
配置名称:half_memory_composer_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 99351776, num_examples: 224
- 下载大小:18008844
- 数据集大小:99351776
配置名称:imports_first_composer_all_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 590554966, num_examples: 224
- 下载大小:236389259
- 数据集大小:590554966
配置名称:imports_first_composer_non_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 560157388, num_examples: 224
- 下载大小:226465503
- 数据集大小:560157388
配置名称:imports_first_composer_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 114370147, num_examples: 224
- 下载大小:22077336
- 数据集大小:114370147
配置名称:naive_composer_all_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 590554966, num_examples: 224
- 下载大小:236382094
- 数据集大小:590554966
配置名称:naive_composer_non_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 560157388, num_examples: 224
- 下载大小:226480268
- 数据集大小:560157388
配置名称:naive_composer_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 114370147, num_examples: 224
- 下载大小:22084803
- 数据集大小:114370147
配置名称:path_distance_composer_all_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 590554966, num_examples: 224
- 下载大小:236585246
- 数据集大小:590554966
配置名称:path_distance_composer_non_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 560157388, num_examples: 224
- 下载大小:226460548
- 数据集大小:560157388
配置名称:path_distance_composer_py_context
- 特征:
repo_id: int64, repo_name: string, project_context: string, file_context: list (content: string, type: string),
gt: sequence of string, metainfo_separator: string
- 分割:
test: num_bytes: 114370147, num_examples: 224
- 下载大小:22014753
- 数据集大小:114370147



