five

wikipedia_de_retival_BGE-m3

收藏
魔搭社区2025-12-05 更新2025-12-06 收录
下载链接:
https://modelscope.cn/datasets/laion/wikipedia_de_retival_BGE-m3
下载链接
链接失效反馈
官方服务:
资源简介:
# Usage example: query two prebuilt retriv dense indexes over German Wikipedia
# and print the matching rows from the accompanying CSV files.
import os
import pandas as pd
from pathlib import Path

import retriv

# Set retriv's base path; this runs before any retriever is loaded below.
retriv.set_base_path("./retriv_wiki_de")

from retriv import DenseRetriever

"""
# Uncomment if you want to build your own index
dr = DenseRetriever(
    index_name="wiki_de-index_sentence_transf-BAAI/bge-m3_title_only_fullarticles",
    model="BAAI/bge-m3",
    normalize=True,
    max_length=512,
    use_ann=True,
)

dr = dr.index_file(
    path="./wikipedia_de_filtered_fullarticles.csv",  # File kind is automatically inferred
    embeddings_path=None,  # Default value
    use_gpu=True,  # Default value
    batch_size=32,  # Default value
    show_progress=True,  # Default value
    callback=lambda doc: {  # Callback defaults to None.
        "id": doc["id"],
        "text": doc["title"],
    },
)
"""

# The same query is sent to both retrievers.
QUERY = "was is der doppelspaltversuch?"


def print_hits(hits, frame, separator, max_chars=None):
    """Print the CSV row, URL and text behind every search hit.

    Args:
        hits: Iterable of search results; each item must expose an ``"id"`` key.
        frame: DataFrame holding the texts the index ids refer to; it must
            provide ``text`` and ``url`` columns.
        separator: Line printed after each hit.
        max_chars: Maximum number of characters of the text to print;
            ``None`` prints the whole text.
    """
    for hit in hits:
        # The "id" values start with 1, not 0, so subtract 1 to get the
        # positional row in the DataFrame.
        row = frame.iloc[int(hit["id"]) - 1]
        print(row)
        # Extract 'url' and 'text' from the resulting row.
        print(row["url"], row["text"][:max_chars])
        print(separator)


# Load the Wikipedia DE text data.
df = pd.read_csv("./wikipedia_de_filtered_fullarticles.csv")  # CSV with full article texts
df2 = pd.read_csv("./wikipedia_de_filtered_300wordchunks.csv")  # CSV with 300-word chunks

# Retriever 1: the embeddings are made from the titles of the Wikipedia pages,
# but can be matched to the full texts in wikipedia_de_filtered_fullarticles.csv.
dr = DenseRetriever.load("wiki_de-index_sentence_transf-BAAI/bge-m3_title_only_fullarticles")

result = dr.search(
    query=QUERY,  # What to search for
    return_docs=True,  # Return the text of the documents
    cutoff=3,  # Number of results to return
)

print(df)
print_hits(result, df, "###################", max_chars=1000)

print("+++++++++++++++++++")

# Retriever 2: the embeddings are made from 300-word segments of the articles.
# The IDs point to wikipedia_de_filtered_300wordchunks.csv.
dr2 = DenseRetriever.load("wiki_de-index_sentence_transf-BAAI/bge-m3")

result = dr2.search(
    query=QUERY,  # What to search for
    return_docs=True,  # Return the text of the documents
    cutoff=3,  # Number of results to return
)

print_hits(result, df2, "########")

首先导入相关依赖模块: import os import pandas as pd from pathlib import Path import retriv retriv.set_base_path("./retriv_wiki_de") from retriv import DenseRetriever """ # 若需自行构建索引,请取消以下代码注释 dr = DenseRetriever( index_name="wiki_de-index_sentence_transf-BAAI/bge-m3_title_only_fullarticles", model="BAAI/bge-m3", normalize=True, max_length=512, use_ann=True, ) dr = dr.index_file( path="./wikipedia_de_filtered_fullarticles.csv", # 文件类型将自动推断 embeddings_path=None, # 默认参数 use_gpu=True, # 默认参数 batch_size=32, # 默认参数 show_progress=True, # 默认参数 callback=lambda doc: { # 回调函数默认为None "id": doc["id"], "text": doc["title"], }, ) """ from retriv import DenseRetriever # 加载德语维基百科文本数据集 file_path = "./wikipedia_de_filtered_fullarticles.csv" # 存储完整文本的CSV文件 df = pd.read_csv(file_path) file_path = "./wikipedia_de_filtered_300wordchunks.csv" # 存储300词分段文本的CSV文件 df2 = pd.read_csv(file_path) # 加载密集检索器(DenseRetriever) dr = DenseRetriever.load("wiki_de-index_sentence_transf-BAAI/bge-m3_title_only_fullarticles") # 该检索器的嵌入由维基百科页面的标题生成,可与wikipedia_de_filtered_fullarticles.csv中的完整文本进行匹配 result = dr.search( query="was is der doppelspaltversuch?", # 搜索查询:什么是双缝实验? return_docs=True, # 默认参数,返回文档文本 cutoff=3, # 默认参数,指定返回的结果数量 ) print(df) for res in result: id_query = int(res["id"]) - 1 row = df.iloc[id_query] print(row) # 从结果行中提取`text`与`url`字段 result_text = row['text'] result_url = row['url'] print(result_url, result_text[:1000]) print("###################") print("+++++++++++++++++++") dr2 = DenseRetriever.load("wiki_de-index_sentence_transf-BAAI/bge-m3") # 该检索器的嵌入由文章的300词分段生成,其ID指向wikipedia_de_filtered_300wordchunks.csv result = dr2.search( query="was is der doppelspaltversuch?", # 搜索查询:什么是双缝实验? return_docs=True, # 默认参数,返回文档文本 cutoff=3, # 默认参数,指定返回的结果数量 ) for res in result: id_query = int(res["id"]) - 1 # 此处`id`值从1开始而非0,因此需将其减1以匹配DataFrame索引 row = df2.iloc[id_query] print(row) # 从结果行中提取`text`与`url`字段 result_text = row['text'] result_url = row['url'] print(result_url, result_text) print("########")
提供机构:
maas
创建时间:
2025-10-02
5,000+
优质数据集
54 个
任务类型
进入经典数据集
二维码
社区交流群

面向社区/商业的数据集话题

二维码
科研交流群

面向高校/科研机构的开源数据集话题

数据驱动未来

携手共赢发展

商业合作