COIR
数据集概述
CoIR (Code Information Retrieval) 是一个用于评估代码检索能力的基准测试。CoIR 包含 10 个精选的代码数据集,涵盖 8 个检索任务,跨越 7 个领域。总共包含两百万个文档。它还提供了一个通用的、易于使用的 Python 框架,可通过 pip 安装,并与 MTEB 和 BEIR 等基准测试共享相同的数据模式,以便于跨基准评估。
数据集统计信息
以下是 CoIR 基准测试中数据集的统计信息:
| 主任务 | 子任务 | 领域 | 数据集 | 语言 | #Query (train/dev/test) | #Corpus | L_Query | L_Corpus |
|---|---|---|---|---|---|---|---|---|
| 文本到代码检索 | 代码竞赛检索 | 代码竞赛 | APPS | py | 5k/-/3.8K | 9K | 1.4K | 575 |
| 文本到代码检索 | 网页查询到代码检索 | 网页查询 | CosQA | py | 19k/-/500 | 21K | 37 | 276 |
| 文本到代码检索 | 文本到 SQL 检索 | 数据库 | Synthetic Text2SQL | sql | 100k/-/6K | 106K | 83 | 127 |
| 代码到文本检索 | 代码摘要检索 | Github | CodeSearchNet | go, java, js, php, py, ruby | 905k/41k/53K | 1M | 594 | 156 |
| 代码到代码检索 | 代码上下文检索 | Github | CodeSearchNet-CCR† | go, java, js, php, py, ruby | 905k/41k/53K | 1M | 154 | 113 |
| 代码到代码检索 | 相似代码检索 | 深度学习 | CodeTrans Ocean-DL | py | 564/72/180 | 816 | 1.6K | 1.5K |
| 代码到代码检索 | 相似代码检索 | 竞赛 | CodeTrans Ocean-Contest | c++, py | 561/226/446 | 1K | 770 | 1.5K |
| 混合代码检索 | 单轮代码问答 | Stack Overflow | StackOverflow QA† | 杂项 | 13k/3k/2K | 20K | 1.4K | 1.2K |
| 混合代码检索 | 单轮代码问答 | 代码指令 | CodeFeedBack-ST | html, c, css, sql, js, sql, py, shell, ruby, rust, swift | 125k/-/31K | 156K | 722 | 1.5K |
| 混合代码检索 | 多轮代码问答 | 代码指令 | CodeFeedBack-MT | 杂项 | 53k/-/13K | 66K | 4.4K | 1.5K |
数据集特点
- CoIR 包含总共十个不同的代码检索数据集。
- CoIR 支持与 Hugging Face 和其他库的无缝集成,实现一键加载和模型评估。
- CoIR 支持自定义模型和基于 API 的模型,提供灵活的集成选项以满足不同的需求。
安装
通过 pip 安装 coir-eval 包:
# Install the coir-eval package with pip.
pip install coir-eval
如果需要从源代码构建,请使用:
# Clone the repository and install it in editable mode.
git clone git@github.com:CoIR-team/coir.git
cd coir
pip install -e .
简单使用
如果已安装 coir-eval 包,直接使用以下代码运行评估:
# Minimal example: evaluate the bundled dense model on one CoIR task.
import coir
from coir.data_loader import get_tasks
from coir.evaluation import COIR
from coir.models import YourCustomDEModel

model_name = "intfloat/e5-base-v2"

# Load the model
model = YourCustomDEModel(model_name=model_name)

# Get the tasks
tasks = get_tasks(tasks=["codetrans-dl"])

# Initialize the evaluation
evaluation = COIR(tasks=tasks, batch_size=128)

# Run the evaluation
results = evaluation.run(model, output_folder=f"results/{model_name}")
print(results)
高级使用
自定义密集检索模型
import logging
from typing import Dict, List

import numpy as np
import torch
from tqdm.auto import tqdm
from transformers import AutoModel, AutoTokenizer

import coir
from coir.data_loader import get_tasks
from coir.evaluation import COIR
class YourCustomDEModel:
    """Dense-embedding encoder built on a Hugging Face ``AutoModel``.

    Texts are tokenized in batches, run through the model without gradients,
    mean-pooled over the non-padding tokens and returned as a NumPy array.
    ``encode_queries`` and ``encode_corpus`` are the two entry points; both
    delegate to ``encode_text`` after adding a text prefix.
    """

    def __init__(self, model_name="intfloat/e5-base-v2", **kwargs):
        """Load the tokenizer and model for ``model_name``.

        Args:
            model_name: Checkpoint name or path passed to ``from_pretrained``.
            **kwargs: Optional ``device`` (anything ``torch.device`` accepts).
                When absent, CUDA is used if available, otherwise CPU.
        """
        requested_device = kwargs.get("device")
        if requested_device is None:
            requested_device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(requested_device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).to(self.device)
        self.model_name = model_name
        self.tokenizer.add_eos_token = False

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings, ignoring positions masked out with 0."""
        # First element of model_output contains all token embeddings
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Clamp so a fully masked row divides by a tiny number instead of zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def cls_pooling(self, model_output, attention_mask):
        """Return the embedding at sequence position 0 for every batch item."""
        # First element of model_output contains all token embeddings
        token_embeddings = model_output[0]
        # Extract the CLS token embeddings (index 0) for each sequence in the batch
        cls_embeddings = token_embeddings[:, 0, :]
        return cls_embeddings

    def last_token_pool(self, model_output, attention_mask):
        """Return the hidden state of the last non-padding token per sequence."""
        last_hidden_states = model_output.last_hidden_state
        # If every sequence has a real token in the final position, the batch
        # is left-padded and the last column is the last token for all rows.
        left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
        if left_padding:
            return last_hidden_states[:, -1]
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

    def encode_text(self, texts: List[str], batch_size: int = 12, max_length: int = 128) -> np.ndarray:
        """Embed ``texts`` in batches and return one row per input text.

        Args:
            texts: Strings to embed.
            batch_size: Number of texts tokenized and encoded per forward pass.
            max_length: Truncation length handed to the tokenizer.

        Returns:
            A NumPy array with ``len(texts)`` rows. For empty input an empty
            float32 array with zero rows is returned.
        """
        logging.info(f"Encoding {len(texts)} texts...")
        if not texts:
            # torch.cat raises on an empty list, so return an empty result early.
            hidden_size = getattr(getattr(self.model, "config", None), "hidden_size", 0)
            return np.empty((0, hidden_size), dtype=np.float32)

        batches = []
        for start in tqdm(range(0, len(texts), batch_size), desc="Encoding batches", unit="batch"):
            batch_texts = texts[start:start + batch_size]
            encoded_input = self.tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            ).to(self.device)
            with torch.no_grad():
                model_output = self.model(**encoded_input)
                batch_embeddings = self.mean_pooling(model_output, encoded_input["attention_mask"])
            # Move to CPU per batch so results do not accumulate on the device.
            batches.append(batch_embeddings.cpu())

        embeddings = torch.cat(batches, dim=0)
        logging.info(f"Encoded {len(embeddings)} embeddings.")
        return embeddings.numpy()

    def encode_queries(self, queries: List[str], batch_size: int = 12, max_length: int = 512, **kwargs) -> np.ndarray:
        """Embed queries, each prefixed with the literal ``"query: "``."""
        all_queries = ["query: " + query for query in queries]
        return self.encode_text(all_queries, batch_size, max_length)

    def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int = 12, max_length: int = 512, **kwargs) -> np.ndarray:
        """Embed documents, using each entry's ``"text"`` field prefixed with ``"passage: "``."""
        all_texts = ["passage: " + doc["text"] for doc in corpus]
        return self.encode_text(all_texts, batch_size, max_length)
# Load the model
model = YourCustomDEModel()

# Get the tasks (uses the helper imported from coir.data_loader)
tasks = get_tasks(tasks=["codetrans-dl"])

# Initialize the evaluation
evaluation = COIR(tasks=tasks, batch_size=128)

# Run the evaluation; the output folder is named after the model held by `model`
results = evaluation.run(model, output_folder=f"results/{model.model_name}")
print(results)
使用 Sentence-Transformers 模型
import logging
from typing import Dict, List

import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

import coir
from coir.data_loader import get_tasks
from coir.evaluation import COIR
class YourCustomDEModel:
    """Dense-embedding encoder backed by a ``SentenceTransformer`` model.

    ``encode_queries`` and ``encode_corpus`` add a text prefix and delegate to
    ``encode_text``, which calls the wrapped model's ``encode`` method.
    """

    def __init__(self, model_name="intfloat/e5-base-v2", **kwargs):
        """Load the Sentence-Transformers model named ``model_name``."""
        self.model = SentenceTransformer(model_name)
        # Kept so callers can build paths such as f"results/{model.model_name}".
        self.model_name = model_name

    def encode_text(self, texts: List[str], batch_size: int = 12, show_progress_bar: bool = True, **kwargs) -> np.ndarray:
        """Embed ``texts`` and return the result as a NumPy array.

        Args:
            texts: Strings to embed.
            batch_size: Forwarded to the model's ``encode`` call.
            show_progress_bar: Forwarded to the model's ``encode`` call.
            **kwargs: Any further keyword arguments for ``encode``.
        """
        logging.info(f"Encoding {len(texts)} texts...")
        embeddings = self.model.encode(texts, batch_size=batch_size, show_progress_bar=show_progress_bar, **kwargs)
        if embeddings is None:
            logging.error("Embeddings are None.")
        else:
            logging.info(f"Encoded {len(embeddings)} embeddings.")
        return np.array(embeddings)

    def encode_queries(self, queries: List[str], batch_size: int = 12, show_progress_bar: bool = True, **kwargs) -> np.ndarray:
        """Embed queries, each prefixed with the literal ``"query: "``."""
        all_queries = ["query: " + query for query in queries]
        return self.encode_text(all_queries, batch_size, show_progress_bar, **kwargs)

    def encode_corpus(self, corpus: List[Dict[str, str]], batch_size: int = 12, show_progress_bar: bool = True, **kwargs) -> np.ndarray:
        """Embed documents, using each entry's ``"text"`` field prefixed with ``"passage: "``."""
        all_texts = ["passage: " + doc["text"] for doc in corpus]
        return self.encode_text(all_texts, batch_size, show_progress_bar, **kwargs)
# Load the model
model = YourCustomDEModel()

# Get the tasks (uses the helper imported from coir.data_loader)
tasks = get_tasks(tasks=["codetrans-dl"])

# Initialize the evaluation
evaluation = COIR(tasks=tasks, batch_size=128)

# Run the evaluation
results = evaluation.run(model, output_folder=f"results/{model.model_name}")
print(results)
自定义 API 检索模型
import logging
import time
from typing import Dict, List

import numpy as np
import torch
import voyageai
from tqdm.auto import tqdm
from transformers import AutoModel, AutoTokenizer

import coir
from coir.data_loader import get_tasks
from coir.evaluation import COIR
class APIModel:
    """Embedding model that calls the Voyage AI API instead of a local network.

    Requests are sent batch by batch with a pause between them so that no more
    than ``requests_per_minute`` requests are issued per minute.
    """

    def __init__(self, model_name="voyage-code-2", **kwargs):
        """Create the API client.

        Args:
            model_name: Model identifier sent with every embed request.
            **kwargs: Optional ``api_key``. When omitted, ``None`` is passed to
                the client, which is expected to fall back to the
                ``VOYAGE_API_KEY`` environment variable (see the voyageai
                client documentation).
        """
        # Never hard-code a key here; pass api_key=... or set VOYAGE_API_KEY.
        self.vo = voyageai.Client(api_key=kwargs.get("api_key"))
        self.model_name = model_name
        self.requests_per_minute = 300  # Max requests per minute
        self.delay_between_requests = 60 / self.requests_per_minute  # Delay in seconds

    def encode_text(self, texts: list, batch_size: int = 12, input_type: str = "document") -> np.ndarray:
        """Embed ``texts`` through the API, one request per batch.

        Args:
            texts: Strings to embed.
            batch_size: Number of texts sent per request.
            input_type: Forwarded to the API's ``input_type`` parameter.

        Returns:
            A NumPy array built from all returned embeddings; empty when
            ``texts`` is empty.
        """
        logging.info(f"Encoding {len(texts)} texts...")
        all_embeddings = []

        if texts:
            start_time = time.time()
            # Processing texts in batches
            for i in tqdm(range(0, len(texts), batch_size), desc="Encoding batches", unit="batch"):
                batch_texts = texts[i:i + batch_size]
                result = self.vo.embed(batch_texts, model=self.model_name, input_type=input_type, truncation=True)
                # Assume the API directly returns embeddings
                all_embeddings.extend(result.embeddings)

                # Ensure we do not exceed rate limits: wait out the remainder
                # of the per-request time slot before the next call.
                time_elapsed = time.time() - start_time
                if time_elapsed < self.delay_between_requests:
                    time.sleep(self.delay_between_requests - time_elapsed)
                start_time = time.time()

        # Combine all embeddings into a single numpy array
        embeddings_array = np.array(all_embeddings)

        # Logging after encoding
        if embeddings_array.size == 0:
            logging.error("No embeddings received.")
        else:
            logging.info(f"Encoded {len(embeddings_array)} embeddings.")
        return embeddings_array

    def encode_queries(self, queries: list, batch_size: int = 12, **kwargs) -> np.ndarray:
        """Embed queries: keep the first 256 characters, prefix ``"query: "``."""
        truncated_queries = ["query: " + query[:256] for query in queries]
        return self.encode_text(truncated_queries, batch_size, input_type="query")

    def encode_corpus(self, corpus: list, batch_size: int = 12, **kwargs) -> np.ndarray:
        """Embed documents: first 512 characters of each ``"text"`` field, prefixed ``"passage: "``."""
        texts = ["passage: " + doc["text"][:512] for doc in corpus]
        return self.encode_text(texts, batch_size, input_type="document")
# Load the model
model = APIModel()

# Get the tasks (uses the helper imported from coir.data_loader)
tasks = get_tasks(tasks=["codetrans-dl"])

# Initialize the evaluation
evaluation = COIR(tasks=tasks, batch_size=128)

# Run the evaluation; the output folder is named after the model held by `model`
results = evaluation.run(model, output_folder=f"results/{model.model_name}")
print(results)

- [1] CoIR: A Comprehensive Benchmark for Code Information Retrieval Models. 华为诺亚方舟实验室 · 2024年



