wikipedia_de_retival_BGE-m3
收藏 · 魔搭社区 · 2025-12-05 更新 · 2025-12-06 收录
下载链接:
https://modelscope.cn/datasets/laion/wikipedia_de_retival_BGE-m3
下载链接
链接失效反馈 · 官方服务:
资源简介:
# Example: query two prebuilt retriv dense indexes over German Wikipedia and
# map the returned document ids back to rows of the accompanying CSV files.
import os
import pandas as pd
from pathlib import Path
import retriv

# Set retriv's base path before any index is built or loaded.
retriv.set_base_path("./retriv_wiki_de")
from retriv import DenseRetriever

# Uncomment if you wanna make your own index
# dr = DenseRetriever(
#     index_name="wiki_de-index_sentence_transf-BAAI/bge-m3_title_only_fullarticles",
#     model="BAAI/bge-m3",
#     normalize=True,
#     max_length=512,
#     use_ann=True,
# )
# dr = dr.index_file(
#     path="./wikipedia_de_filtered_fullarticles.csv",  # File kind is automatically inferred
#     embeddings_path=None,  # Default value
#     use_gpu=True,  # Default value
#     batch_size=32,  # Default value
#     show_progress=True,  # Default value
#     callback=lambda doc: {  # Callback defaults to None.
#         "id": doc["id"],
#         "text": doc["title"],
#     },
# )

# Load the German Wikipedia text data.
file_path = "./wikipedia_de_filtered_fullarticles.csv"  # CSV with the full article texts
df = pd.read_csv(file_path)
file_path = "./wikipedia_de_filtered_300wordchunks.csv"  # CSV with the 300-word chunks
df2 = pd.read_csv(file_path)

# Load the retrievers.
# The embeddings of this index are made from the titles of the Wikipedia
# pages, but can be matched to the full texts in
# wikipedia_de_filtered_fullarticles.csv.
dr = DenseRetriever.load("wiki_de-index_sentence_transf-BAAI/bge-m3_title_only_fullarticles")

result = dr.search(
    query="was is der doppelspaltversuch?",  # What to search for
    return_docs=True,  # Default value, return the text of the documents
    cutoff=3,  # Default value, number of results to return
)

print(df)
for res in result:
    # The "id" values start with 1, not 0, so subtract 1 to get the row
    # position (assumes the CSV rows are in id order -- TODO confirm).
    id_query = int(res["id"]) - 1
    row = df.iloc[id_query]
    print(row)
    # Extract 'text' and 'url' from the resulting row; only the first
    # 1000 characters of the full article are printed.
    result_text = row['text']
    result_url = row['url']
    print(result_url, result_text[:1000])
    print("###################")

print("+++++++++++++++++++")

# The embeddings of this index are made from 300-word segments of the
# articles. The ids point to wikipedia_de_filtered_300wordchunks.csv.
dr2 = DenseRetriever.load("wiki_de-index_sentence_transf-BAAI/bge-m3")

result = dr2.search(
    query="was is der doppelspaltversuch?",  # What to search for
    return_docs=True,  # Default value, return the text of the documents
    cutoff=3,  # Default value, number of results to return
)

for res in result:
    # The "id" values start with 1, not 0, so subtract 1 to get the row
    # position (assumes the CSV rows are in id order -- TODO confirm).
    id_query = int(res["id"]) - 1
    row = df2.iloc[id_query]
    print(row)
    # Extract 'text' and 'url' from the resulting row.
    result_text = row['text']
    result_url = row['url']
    print(result_url, result_text)
    print("########")
首先导入相关依赖模块:
# Example: query two prebuilt retriv dense indexes over German Wikipedia and
# map the returned document ids back to rows of the accompanying CSV files.
import os
import pandas as pd
from pathlib import Path
import retriv

# Set retriv's base path before any index is built or loaded.
retriv.set_base_path("./retriv_wiki_de")
from retriv import DenseRetriever

# Uncomment the following code if you want to build the index yourself
# dr = DenseRetriever(
#     index_name="wiki_de-index_sentence_transf-BAAI/bge-m3_title_only_fullarticles",
#     model="BAAI/bge-m3",
#     normalize=True,
#     max_length=512,
#     use_ann=True,
# )
# dr = dr.index_file(
#     path="./wikipedia_de_filtered_fullarticles.csv",  # File kind is automatically inferred
#     embeddings_path=None,  # Default value
#     use_gpu=True,  # Default value
#     batch_size=32,  # Default value
#     show_progress=True,  # Default value
#     callback=lambda doc: {  # Callback defaults to None
#         "id": doc["id"],
#         "text": doc["title"],
#     },
# )

# Load the German Wikipedia text datasets.
file_path = "./wikipedia_de_filtered_fullarticles.csv"  # CSV with the full article texts
df = pd.read_csv(file_path)
file_path = "./wikipedia_de_filtered_300wordchunks.csv"  # CSV with the 300-word chunks
df2 = pd.read_csv(file_path)

# Load the dense retrievers (DenseRetriever).
# The embeddings of this index are generated from the titles of the
# Wikipedia pages and can be matched to the full texts in
# wikipedia_de_filtered_fullarticles.csv.
dr = DenseRetriever.load("wiki_de-index_sentence_transf-BAAI/bge-m3_title_only_fullarticles")

result = dr.search(
    query="was is der doppelspaltversuch?",  # Search query: "What is the double-slit experiment?"
    return_docs=True,  # Default value, return the text of the documents
    cutoff=3,  # Default value, number of results to return
)

print(df)
for res in result:
    # The `id` values start at 1 rather than 0, so subtract 1 to get the
    # row position (assumes the CSV rows are in id order -- TODO confirm).
    id_query = int(res["id"]) - 1
    row = df.iloc[id_query]
    print(row)
    # Extract the `text` and `url` fields from the resulting row; only the
    # first 1000 characters of the full article are printed.
    result_text = row['text']
    result_url = row['url']
    print(result_url, result_text[:1000])
    print("###################")

print("+++++++++++++++++++")

# The embeddings of this index are generated from 300-word segments of the
# articles; its ids point to wikipedia_de_filtered_300wordchunks.csv.
dr2 = DenseRetriever.load("wiki_de-index_sentence_transf-BAAI/bge-m3")

result = dr2.search(
    query="was is der doppelspaltversuch?",  # Search query: "What is the double-slit experiment?"
    return_docs=True,  # Default value, return the text of the documents
    cutoff=3,  # Default value, number of results to return
)

for res in result:
    # The `id` values start at 1 rather than 0, so subtract 1 to get the
    # row position (assumes the CSV rows are in id order -- TODO confirm).
    id_query = int(res["id"]) - 1
    row = df2.iloc[id_query]
    print(row)
    # Extract the `text` and `url` fields from the resulting row.
    result_text = row['text']
    result_url = row['url']
    print(result_url, result_text)
    print("########")
提供机构:
maas
创建时间:
2025-10-02



